From e7ddf4b7b3c5fe64902c4fb9edac92532c87cd75 Mon Sep 17 00:00:00 2001 From: Srinidhi Kasagar Date: Fri, 19 Dec 2014 23:13:51 +0530 Subject: cpufreq: Add SFI based cpufreq driver support This adds the SFI based cpu freq driver for some of the Intel's Silvermont based Atom architectures like Z34xx and Z35xx. Signed-off-by: Rudramuni, Vishwesh M Signed-off-by: Srinidhi Kasagar Acked-by: Viresh Kumar Signed-off-by: Len Brown --- arch/x86/include/uapi/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index e21331ce368f..4f6dae67dd10 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -318,6 +318,7 @@ #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 +#define INTEL_PERF_CTL_MASK 0xffff #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD_PERF_STATUS 0xc0010063 #define MSR_AMD_PERF_CTL 0xc0010062 -- cgit v1.2.3 From b700e7f03df5d92f85fa5247fe1f557528d3363d Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:19 -0600 Subject: livepatch: kernel: add support for live patching This commit introduces code for the live patching core. It implements an ftrace-based mechanism and kernel interface for doing live patching of kernel and kernel module functions. It represents the greatest common functionality set between kpatch and kgraft and can accept patches built using either method. This first version does not implement any consistency mechanism that ensures that old and new code do not run together. In practice, ~90% of CVEs are safe to apply in this way, since they simply add a conditional check. However, any function change that can not execute safely with the old version of the function can _not_ be safely applied in this version. 
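For illustration, a minimal patch module built on this interface might look roughly like the sketch below (the choice of cmdline_proc_show and the replacement body are arbitrary examples, not part of this patch):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

/* replacement for the original cmdline_proc_show() in vmlinux */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "this has been live patched");
	return 0;
}

/* NULL-terminated array of functions to patch within one object */
static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }
};

/* a NULL name means the patched object is vmlinux itself */
static struct klp_object objs[] = {
	{
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int livepatch_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;

	ret = klp_enable_patch(&patch);
	if (ret) {
		WARN_ON(klp_unregister_patch(&patch));
		return ret;
	}

	return 0;
}

static void livepatch_exit(void)
{
	WARN_ON(klp_disable_patch(&patch));
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");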
[ jkosina@suse.cz: due to the number of contributions that got folded into this original patch from Seth Jennings, add SUSE's copyright as well, as discussed via e-mail ] Signed-off-by: Seth Jennings Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Miroslav Benes Signed-off-by: Petr Mladek Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 44 ++ MAINTAINERS | 13 + arch/x86/Kconfig | 3 + arch/x86/include/asm/livepatch.h | 37 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/livepatch.c | 90 +++ include/linux/livepatch.h | 133 ++++ kernel/Makefile | 1 + kernel/livepatch/Kconfig | 18 + kernel/livepatch/Makefile | 3 + kernel/livepatch/core.c | 930 +++++++++++++++++++++++ 11 files changed, 1273 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-livepatch create mode 100644 arch/x86/include/asm/livepatch.h create mode 100644 arch/x86/kernel/livepatch.c create mode 100644 include/linux/livepatch.h create mode 100644 kernel/livepatch/Kconfig create mode 100644 kernel/livepatch/Makefile create mode 100644 kernel/livepatch/core.c (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch new file mode 100644 index 000000000000..5bf42a840b22 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -0,0 +1,44 @@ +What: /sys/kernel/livepatch +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + Interface for kernel live patching + + The /sys/kernel/livepatch directory contains subdirectories for + each loaded live patch module. + +What: /sys/kernel/livepatch/ +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The patch directory contains subdirectories for each kernel + object (vmlinux or a module) in which it patched functions. + +What: /sys/kernel/livepatch//enabled +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + A writable attribute that indicates whether the patched + code is currently applied. Writing 0 will disable the patch + while writing 1 will re-enable the patch. + +What: /sys/kernel/livepatch// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The object directory contains subdirectories for each function + that is patched within the object. + +What: /sys/kernel/livepatch/// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The function directory contains attributes regarding the + properties and state of the patched function. + + There are currently no such attributes. 
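From userspace the "enabled" attribute behaves like an ordinary sysfs file; a minimal sketch (the helper name and the "<patch>" placeholder are illustrative only):

#include <fcntl.h>
#include <unistd.h>

/* write '0' to disable or '1' to re-enable a loaded patch */
static int klp_sysfs_set_enabled(const char *path, char val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, &val, 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

/* e.g. klp_sysfs_set_enabled("/sys/kernel/livepatch/<patch>/enabled", '0'); */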
diff --git a/MAINTAINERS b/MAINTAINERS index ddb9ac8d32b3..df6a0784b466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5784,6 +5784,19 @@ F: Documentation/misc-devices/lis3lv02d F: drivers/misc/lis3lv02d/ F: drivers/platform/x86/hp_accel.c +LIVE PATCHING +M: Josh Poimboeuf +M: Seth Jennings +M: Jiri Kosina +M: Vojtech Pavlik +S: Maintained +F: kernel/livepatch/ +F: include/linux/livepatch.h +F: arch/x86/include/asm/livepatch.h +F: arch/x86/kernel/livepatch.c +F: Documentation/ABI/testing/sysfs-kernel-livepatch +L: live-patching@vger.kernel.org + LLC (802.2) M: Arnaldo Carvalho de Melo S: Maintained diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..460b31b79938 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_HAVE_LIVE_PATCHING ### Arch settings config X86 @@ -2008,6 +2009,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +source "kernel/livepatch/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 000000000000..d529db1b1edf --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,37 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include + +#ifdef CONFIG_LIVE_PATCHING +#ifndef CC_USING_FENTRY +#error Your compiler must support -mfentry for live patching to work +#endif +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +#else +#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c8b983..316b34e74c15 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..ff3c3101d003 --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 
1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h new file mode 100644 index 000000000000..950bc615842f --- /dev/null +++ b/include/linux/livepatch.h @@ -0,0 +1,133 @@ +/* + * livepatch.h - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LINUX_LIVEPATCH_H_ +#define _LINUX_LIVEPATCH_H_ + +#include +#include + +#if IS_ENABLED(CONFIG_LIVE_PATCHING) + +#include + +enum klp_state { + KLP_DISABLED, + KLP_ENABLED +}; + +/** + * struct klp_func - function structure for live patching + * @old_name: name of the function to be patched + * @new_func: pointer to the patched function code + * @old_addr: a hint conveying at what address the old function + * can be found (optional, vmlinux patches only) + * @kobj: kobject for sysfs resources + * @fops: ftrace operations structure + * @state: tracks function-level patch application state + */ +struct klp_func { + /* external */ + const char *old_name; + void *new_func; + /* + * The old_addr field is optional and can be used to resolve + * duplicate symbol names in the vmlinux object. If this + * information is not present, the symbol is located by name + * with kallsyms. If the name is not unique and old_addr is + * not provided, the patch application fails as there is no + * way to resolve the ambiguity. 
+ */ + unsigned long old_addr; + + /* internal */ + struct kobject kobj; + struct ftrace_ops *fops; + enum klp_state state; +}; + +/** + * struct klp_reloc - relocation structure for live patching + * @loc: address where the relocation will be written + * @val: address of the referenced symbol (optional, + * vmlinux patches only) + * @type: ELF relocation type + * @name: name of the referenced symbol (for lookup/verification) + * @addend: offset from the referenced symbol + * @external: symbol is either exported or within the live patch module itself + */ +struct klp_reloc { + unsigned long loc; + unsigned long val; + unsigned long type; + const char *name; + int addend; + int external; +}; + +/** + * struct klp_object - kernel object structure for live patching + * @name: module name (or NULL for vmlinux) + * @relocs: relocation entries to be applied at load time + * @funcs: function entries for functions to be patched in the object + * @kobj: kobject for sysfs resources + * @mod: kernel module associated with the patched object + * (NULL for vmlinux) + * @state: tracks object-level patch application state + */ +struct klp_object { + /* external */ + const char *name; + struct klp_reloc *relocs; + struct klp_func *funcs; + + /* internal */ + struct kobject *kobj; + struct module *mod; + enum klp_state state; +}; + +/** + * struct klp_patch - patch structure for live patching + * @mod: reference to the live patch module + * @objs: object entries for kernel objects to be patched + * @list: list node for global list of registered patches + * @kobj: kobject for sysfs resources + * @state: tracks patch-level application state + */ +struct klp_patch { + /* external */ + struct module *mod; + struct klp_object *objs; + + /* internal */ + struct list_head list; + struct kobject kobj; + enum klp_state state; +}; + +extern int klp_register_patch(struct klp_patch *); +extern int klp_unregister_patch(struct klp_patch *); +extern int klp_enable_patch(struct klp_patch *); +extern int klp_disable_patch(struct klp_patch *); + +#endif /* CONFIG_LIVE_PATCHING */ + +#endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..616994f0a76f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,6 +26,7 @@ obj-y += power/ obj-y += printk/ obj-y += irq/ obj-y += rcu/ +obj-y += livepatch/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..96da00fbc120 --- /dev/null +++ b/kernel/livepatch/Kconfig @@ -0,0 +1,18 @@ +config ARCH_HAVE_LIVE_PATCHING + boolean + help + Arch supports kernel live patching + +config LIVE_PATCHING + boolean "Kernel Live Patching" + depends on DYNAMIC_FTRACE_WITH_REGS + depends on MODULES + depends on SYSFS + depends on KALLSYMS_ALL + depends on ARCH_HAVE_LIVE_PATCHING + help + Say Y here if you want to support kernel live patching. + This option has no runtime impact until a kernel "patch" + module uses the interface provided by this option to register + a patch, causing calls to patched functions to be redirected + to new function code contained in the patch module. 
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..7c1f00861428 --- /dev/null +++ b/kernel/livepatch/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o + +livepatch-objs := core.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..f99fe189d596 --- /dev/null +++ b/kernel/livepatch/core.c @@ -0,0 +1,930 @@ +/* + * core.c - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The klp_mutex protects the klp_patches list and state transitions of any + * structure reachable from the patches list. References to any structure must + * be obtained under mutex protection. + */ + +static DEFINE_MUTEX(klp_mutex); +static LIST_HEAD(klp_patches); + +static struct kobject *klp_root_kobj; + +static bool klp_is_module(struct klp_object *obj) +{ + return obj->name; +} + +static bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +/* sets obj->mod if object is not vmlinux and module is found */ +static void klp_find_object_module(struct klp_object *obj) +{ + if (!klp_is_module(obj)) + return; + + mutex_lock(&module_mutex); + /* + * We don't need to take a reference on the module here because we have + * the klp_mutex, which is also taken by the module notifier. This + * prevents any module from unloading until we release the klp_mutex. + */ + obj->mod = find_module(obj->name); + mutex_unlock(&module_mutex); +} + +/* klp_mutex must be held by caller */ +static bool klp_is_patch_registered(struct klp_patch *patch) +{ + struct klp_patch *mypatch; + + list_for_each_entry(mypatch, &klp_patches, list) + if (mypatch == patch) + return true; + + return false; +} + +static bool klp_initialized(void) +{ + return klp_root_kobj; +} + +struct klp_find_arg { + const char *objname; + const char *name; + unsigned long addr; + /* + * If count == 0, the symbol was not found. If count == 1, a unique + * match was found and addr is set. If count > 1, there is + * unresolvable ambiguity among "count" number of symbols with the same + * name in the same object. + */ + unsigned long count; +}; + +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + /* + * args->addr might be overwritten if another match is found + * but klp_find_object_symbol() handles this and only returns the + * addr if count == 1. 
+ */ + args->addr = addr; + args->count++; + + return 0; +} + +static int klp_find_object_symbol(const char *objname, const char *name, + unsigned long *addr) +{ + struct klp_find_arg args = { + .objname = objname, + .name = name, + .addr = 0, + .count = 0 + }; + + kallsyms_on_each_symbol(klp_find_callback, &args); + + if (args.count == 0) + pr_err("symbol '%s' not found in symbol table\n", name); + else if (args.count > 1) + pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", + args.count, name, objname); + else { + *addr = args.addr; + return 0; + } + + *addr = 0; + return -EINVAL; +} + +struct klp_verify_args { + const char *name; + const unsigned long addr; +}; + +static int klp_verify_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_verify_args *args = data; + + if (!mod && + !strcmp(args->name, name) && + args->addr == addr) + return 1; + + return 0; +} + +static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) +{ + struct klp_verify_args args = { + .name = name, + .addr = addr, + }; + + if (kallsyms_on_each_symbol(klp_verify_callback, &args)) + return 0; + + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?", + name, addr); + return -EINVAL; +} + +static int klp_find_verify_func_addr(struct klp_object *obj, + struct klp_func *func) +{ + int ret; + +#if defined(CONFIG_RANDOMIZE_BASE) + /* KASLR is enabled, disregard old_addr from user */ + func->old_addr = 0; +#endif + + if (!func->old_addr || klp_is_module(obj)) + ret = klp_find_object_symbol(obj->name, func->old_name, + &func->old_addr); + else + ret = klp_verify_vmlinux_symbol(func->old_name, + func->old_addr); + + return ret; +} + +/* + * external symbols are located outside the parent object (where the parent + * object is either vmlinux or the kmod being patched). 
+ */ +static int klp_find_external_symbol(struct module *pmod, const char *name, + unsigned long *addr) +{ + const struct kernel_symbol *sym; + + /* first, check if it's an exported symbol */ + preempt_disable(); + sym = find_symbol(name, NULL, NULL, true, true); + preempt_enable(); + if (sym) { + *addr = sym->value; + return 0; + } + + /* otherwise check if it's in another .o within the patch module */ + return klp_find_object_symbol(pmod->name, name, addr); +} + +static int klp_write_object_relocations(struct module *pmod, + struct klp_object *obj) +{ + int ret; + struct klp_reloc *reloc; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + if (WARN_ON(!obj->relocs)) + return -EINVAL; + + for (reloc = obj->relocs; reloc->name; reloc++) { + if (!klp_is_module(obj)) { + ret = klp_verify_vmlinux_symbol(reloc->name, + reloc->val); + if (ret) + return ret; + } else { + /* module, reloc->val needs to be discovered */ + if (reloc->external) + ret = klp_find_external_symbol(pmod, + reloc->name, + &reloc->val); + else + ret = klp_find_object_symbol(obj->mod->name, + reloc->name, + &reloc->val); + if (ret) + return ret; + } + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, + reloc->val + reloc->addend); + if (ret) { + pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", + reloc->name, reloc->val, ret); + return ret; + } + } + + return 0; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *ops, + struct pt_regs *regs) +{ + struct klp_func *func = ops->private; + + regs->ip = (unsigned long)func->new_func; +} + +static int klp_disable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(func->state != KLP_ENABLED)) + return -EINVAL; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + ret = unregister_ftrace_function(func->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + func->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->state != KLP_DISABLED)) + return -EINVAL; + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = register_ftrace_function(func->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + } else { + func->state = KLP_ENABLED; + } + + return ret; +} + +static int klp_disable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + for (func = obj->funcs; func->old_name; func++) { + if (func->state != KLP_ENABLED) + continue; + + ret = klp_disable_func(func); + if (ret) + return ret; + } + + obj->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (WARN_ON(obj->state != KLP_DISABLED)) + return -EINVAL; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_enable_func(func); + if (ret) + goto unregister; + } + obj->state = KLP_ENABLED; + + return 0; + +unregister: + 
WARN_ON(klp_disable_object(obj)); + return ret; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + pr_notice("disabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if (obj->state != KLP_ENABLED) + continue; + + ret = klp_disable_object(obj); + if (ret) + return ret; + } + + patch->state = KLP_DISABLED; + + return 0; +} + +/** + * klp_disable_patch() - disables a registered patch + * @patch: The registered, enabled patch to be disabled + * + * Unregisters the patched functions from ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_disable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + if (patch->state == KLP_DISABLED) { + ret = -EINVAL; + goto err; + } + + ret = __klp_disable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_disable_patch); + +static int __klp_enable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (WARN_ON(patch->state != KLP_DISABLED)) + return -EINVAL; + + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); + add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + + pr_notice("enabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + klp_find_object_module(obj); + + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_enable_object(obj); + if (ret) + goto unregister; + } + + patch->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(__klp_disable_patch(patch)); + return ret; +} + +/** + * klp_enable_patch() - enables a registered patch + * @patch: The registered, disabled patch to be enabled + * + * Performs the needed symbol lookups and code relocations, + * then registers the patched functions with ftrace. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_enable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + ret = __klp_enable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * Sysfs Interface + * + * /sys/kernel/livepatch + * /sys/kernel/livepatch/ + * /sys/kernel/livepatch//enabled + * /sys/kernel/livepatch// + * /sys/kernel/livepatch/// + */ + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + unsigned long val; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return -EINVAL; + + if (val != KLP_DISABLED && val != KLP_ENABLED) + return -EINVAL; + + patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + if (val == patch->state) { + /* already in requested state */ + ret = -EINVAL; + goto err; + } + + if (val == KLP_ENABLED) { + ret = __klp_enable_patch(patch); + if (ret) + goto err; + } else { + ret = __klp_disable_patch(patch); + if (ret) + goto err; + } + + mutex_unlock(&klp_mutex); + + return count; + +err: + mutex_unlock(&klp_mutex); + return ret; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); +} + +static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct attribute *klp_patch_attrs[] = { + &enabled_kobj_attr.attr, + NULL +}; + +static void klp_kobj_release_patch(struct kobject *kobj) +{ + /* + * Once we have a consistency model we'll need to module_put() the + * patch module here. See klp_register_patch() for more details. + */ +} + +static struct kobj_type klp_ktype_patch = { + .release = klp_kobj_release_patch, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_patch_attrs, +}; + +static void klp_kobj_release_func(struct kobject *kobj) +{ + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + kfree(func->fops); +} + +static struct kobj_type klp_ktype_func = { + .release = klp_kobj_release_func, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* + * Free all functions' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_funcs_limited(struct klp_object *obj, + struct klp_func *limit) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name && func != limit; func++) + kobject_put(&func->kobj); +} + +/* Clean up when a patched object is unloaded */ +static void klp_free_object_loaded(struct klp_object *obj) +{ + struct klp_func *func; + + obj->mod = NULL; + + for (func = obj->funcs; func->old_name; func++) + func->old_addr = 0; +} + +/* + * Free all objects' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. 
+ */ +static void klp_free_objects_limited(struct klp_patch *patch, + struct klp_object *limit) +{ + struct klp_object *obj; + + for (obj = patch->objs; obj->funcs && obj != limit; obj++) { + klp_free_funcs_limited(obj, NULL); + kobject_put(obj->kobj); + } +} + +static void klp_free_patch(struct klp_patch *patch) +{ + klp_free_objects_limited(patch, NULL); + if (!list_empty(&patch->list)) + list_del(&patch->list); + kobject_put(&patch->kobj); +} + +static int klp_init_func(struct klp_object *obj, struct klp_func *func) +{ + struct ftrace_ops *ops; + int ret; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->private = func; + ops->func = klp_ftrace_handler; + ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC; + func->fops = ops; + func->state = KLP_DISABLED; + + ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); + if (ret) { + kfree(func->fops); + return ret; + } + + return 0; +} + +/* parts of the initialization that is done only when the object is loaded */ +static int klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (obj->relocs) { + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; + } + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_find_verify_func_addr(obj, func); + if (ret) + return ret; + } + + return 0; +} + +static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) +{ + struct klp_func *func; + int ret; + const char *name; + + if (!obj->funcs) + return -EINVAL; + + obj->state = KLP_DISABLED; + + klp_find_object_module(obj); + + name = klp_is_module(obj) ? obj->name : "vmlinux"; + obj->kobj = kobject_create_and_add(name, &patch->kobj); + if (!obj->kobj) + return -ENOMEM; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_init_func(obj, func); + if (ret) + goto free; + } + + if (klp_is_object_loaded(obj)) { + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto free; + } + + return 0; + +free: + klp_free_funcs_limited(obj, func); + kobject_put(obj->kobj); + return ret; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (!patch->objs) + return -EINVAL; + + mutex_lock(&klp_mutex); + + patch->state = KLP_DISABLED; + + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, + klp_root_kobj, patch->mod->name); + if (ret) + goto unlock; + + for (obj = patch->objs; obj->funcs; obj++) { + ret = klp_init_object(patch, obj); + if (ret) + goto free; + } + + list_add(&patch->list, &klp_patches); + + mutex_unlock(&klp_mutex); + + return 0; + +free: + klp_free_objects_limited(patch, obj); + kobject_put(&patch->kobj); +unlock: + mutex_unlock(&klp_mutex); + return ret; +} + +/** + * klp_unregister_patch() - unregisters a patch + * @patch: Disabled patch to be unregistered + * + * Frees the data structures and removes the sysfs interface. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_unregister_patch(struct klp_patch *patch) +{ + int ret = 0; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto out; + } + + if (patch->state == KLP_ENABLED) { + ret = -EBUSY; + goto out; + } + + klp_free_patch(patch); + +out: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_unregister_patch); + +/** + * klp_register_patch() - registers a patch + * @patch: Patch to be registered + * + * Initializes the data structure associated with the patch and + * creates the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_register_patch(struct klp_patch *patch) +{ + int ret; + + if (!klp_initialized()) + return -ENODEV; + + if (!patch || !patch->mod) + return -EINVAL; + + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. Right now, we don't allow patch modules to unload since + * there is currently no method to determine if a thread is still + * running in the patched code contained in the patch module once + * the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + + ret = klp_init_patch(patch); + if (ret) + module_put(patch->mod); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_register_patch); + +static void klp_module_notify_coming(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto err; + + if (patch->state == KLP_DISABLED) + return; + + pr_notice("applying patch '%s' to loading module '%s'\n", + pmod->name, mod->name); + + ret = klp_enable_object(obj); + if (!ret) + return; + +err: + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); +} + +static void klp_module_notify_going(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + if (patch->state == KLP_DISABLED) + goto disabled; + + pr_notice("reverting patch '%s' on unloading module '%s'\n", + pmod->name, mod->name); + + ret = klp_disable_object(obj); + if (ret) + pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", + pmod->name, mod->name, ret); + +disabled: + klp_free_object_loaded(obj); +} + +static int klp_module_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct module *mod = data; + struct klp_patch *patch; + struct klp_object *obj; + + if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) + return 0; + + mutex_lock(&klp_mutex); + + list_for_each_entry(patch, &klp_patches, list) { + for (obj = patch->objs; obj->funcs; obj++) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + if (action == MODULE_STATE_COMING) { + obj->mod = mod; + klp_module_notify_coming(patch, obj); + } else /* MODULE_STATE_GOING */ + klp_module_notify_going(patch, obj); + + break; + } + } + + mutex_unlock(&klp_mutex); + + return 0; +} + +static struct notifier_block klp_module_nb = { + .notifier_call = klp_module_notify, + .priority = INT_MIN+1, /* called late but before ftrace notifier */ +}; + +static int klp_init(void) +{ + int ret; + + ret = register_module_notifier(&klp_module_nb); + if (ret) + return ret; + + klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + if (!klp_root_kobj) { + ret = -ENOMEM; + goto unregister; + } + + return 0; + +unregister: + 
unregister_module_notifier(&klp_module_nb); return ret; } +module_init(klp_init); -- cgit v1.2.3 From b5bfc51707f1b56b0b733980bb4fcc0562bf02d8 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 19 Dec 2014 14:11:17 +0800 Subject: livepatch: move x86 specific ftrace handler code to arch/x86 The execution flow redirection in the livepatch ftrace handler depends on the specific architecture. This patch introduces the klp_arch_set_pc() interface (like kgdb_arch_set_pc) to change the pt_regs. Signed-off-by: Li Bin Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 5 +++++ kernel/livepatch/core.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index d529db1b1edf..b5608d7757fd 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -22,6 +22,7 @@ #define _ASM_X86_LIVEPATCH_H #include +#include #ifdef CONFIG_LIVE_PATCHING #ifndef CC_USING_FENTRY @@ -30,6 +31,10 @@ extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} #else #error Live patching support is disabled; check CONFIG_LIVE_PATCHING #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f99fe189d596..07a2db9d01e6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -272,7 +272,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, { struct klp_func *func = ops->private; - regs->ip = (unsigned long)func->new_func; + klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) -- cgit v1.2.3 From 801a55911432f582c8ab82c895d2821dc02b70e3 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 2 Jan 2015 06:11:16 +0100 Subject: x86: init_mem_mapping(): use capital BIOS in comment Use capital BIOS in the comment. It's cleaner, and makes the difference between BIOS and BIOs clear. Signed-off-by: Pavel Machek Signed-off-by: Jiri Kosina --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 66dba36f2343..452f9042e5b2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -582,7 +582,7 @@ void __init init_mem_mapping(void) * * * On x86, access has to be given to the first megabyte of ram because that area - contains bios code and data regions used by X and dosemu and similar apps. + contains BIOS code and data regions used by X and dosemu and similar apps. Access has to be given to non-kernel-ram areas as well, these contain the PCI mmio resources as well as potential bios/acpi data regions. */ -- cgit v1.2.3 From ff651cb613b4cc8aa2e4284525948872b4d77d66 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Thu, 11 Dec 2014 08:52:58 +0300 Subject: KVM: nVMX: Add nested msr load/restore algorithm Several hypervisors need the MSR auto load/restore feature. We read MSRs from the VM-entry MSR load area specified by L1, and load them via kvm_set_msr in the nested entry. When nested exit occurs, we get MSRs via kvm_get_msr, writing them to L1's MSR store area. After this, we read MSRs from the VM-exit MSR load area, and load them via kvm_set_msr.
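Each entry in these load/store areas uses the 16-byte layout defined by VMX; in the kernel it is represented by struct vmx_msr_entry, roughly:

struct vmx_msr_entry {
	u32 index;	/* MSR number */
	u32 reserved;	/* must be zero */
	u64 value;	/* value to load, or slot for the stored value */
};

This is why the store path below reads only the first 2 * sizeof(u32) bytes of an entry (index and reserved) and writes the result back at offsetof(struct vmx_msr_entry, value).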
Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/vmx.h | 5 ++ arch/x86/kvm/vmx.c | 101 +++++++++++++++++++++++++++++++++++----- arch/x86/kvm/x86.c | 1 + virt/kvm/kvm_main.c | 1 + 4 files changed, 97 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b813bf9da1e2..ff2b8e28883e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -56,6 +56,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 @@ -116,10 +117,14 @@ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } +#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 + #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d884838..9137d2ba26a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6143,6 +6143,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); +} + static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = @@ -8286,6 +8293,67 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static inline int nested_vmx_msr_check_common(struct vmx_msr_entry *e) +{ + if (e->index >> 8 == 0x8 || e->reserved != 0) + return -EINVAL; + return 0; +} + +static inline int nested_vmx_load_msr_check(struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + nested_vmx_msr_check_common(e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), &e, sizeof(e)); + if (nested_vmx_load_msr_check(&e)) + goto fail; + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) + goto fail; + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, 2 * sizeof(u32)); + if (nested_vmx_msr_check_common(&e)) + return -EINVAL; + if (kvm_get_msr(vcpu, e.index, &e.value)) + return -EINVAL; + kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &e.value, sizeof(e.value)); + } + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. 
L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -8582,6 +8650,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) int cpu; struct loaded_vmcs *vmcs02; bool ia32e; + u32 msr_entry_idx; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -8629,15 +8698,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, nested_vmx_true_procbased_ctls_low, nested_vmx_procbased_ctls_high) || @@ -8739,10 +8799,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - vmcs12->launch_state = 1; - prepare_vmcs02(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_emulate_halt(vcpu); @@ -9172,6 +9243,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* @@ -9193,6 +9268,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c259814200bd..af9faed270f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2324,6 +2324,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +EXPORT_SYMBOL_GPL(kvm_get_msr); static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cc6e2e19982..167e8c14b143 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1593,6 +1593,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } +EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) -- cgit v1.2.3 From e9ac033e6b6970c7061725fc6824b3933eb5a0e7 Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Thu, 11 Dec 2014 08:53:27 +0300 Subject: KVM: nVMX: Improve nested msr switch checking This patch improve checks required by Intel Software Developer Manual. - SMM MSRs are not allowed. - microcode MSRs are not allowed. - check x2apic MSRs only when LAPIC is in x2apic mode. - MSR switch areas must be aligned to 16 bytes. - address of first and last byte in MSR switch areas should not set any bits beyond the processor's physical-address width. Also it adds warning messages on failures during MSR switch. 
These messages are useful for people who debug their VMMs in nVMX. Signed-off-by: Eugene Korenevsky Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/msr-index.h | 3 + arch/x86/kvm/vmx.c | 128 ++++++++++++++++++++++++++++++---- 2 files changed, 117 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c8aa65d56027..d0050f25ea80 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -356,6 +356,9 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b +#define MSR_IA32_SMBASE 0x0000009e + #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9137d2ba26a2..70bdcf946f95 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8293,18 +8293,80 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } -static inline int nested_vmx_msr_check_common(struct vmx_msr_entry *e) +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field, + int maxphyaddr) { - if (e->index >> 8 == 0x8 || e->reserved != 0) + u64 count, addr; + + if (vmcs12_read_any(vcpu, count_field, &count) || + vmcs12_read_any(vcpu, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_warn_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + return -EINVAL; + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) return -EINVAL; return 0; } -static inline int nested_vmx_load_msr_check(struct vmx_msr_entry *e) +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) { if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE || - nested_vmx_msr_check_common(e)) + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; } @@ -8321,13 +8383,27 @@ static u32 
nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.host_initiated = false; for (i = 0; i < count; i++) { - kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), &e, sizeof(e)); - if (nested_vmx_load_msr_check(&e)) + if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } msr.index = e.index; msr.data = e.value; - if (kvm_set_msr(vcpu, &msr)) + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); goto fail; + } } return 0; fail: @@ -8340,16 +8416,35 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) struct vmx_msr_entry e; for (i = 0; i < count; i++) { - kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), - &e, 2 * sizeof(u32)); - if (nested_vmx_msr_check_common(&e)) + if (kvm_read_guest(vcpu->kvm, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); return -EINVAL; - if (kvm_get_msr(vcpu, e.index, &e.value)) + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); return -EINVAL; - kvm_write_guest(vcpu->kvm, - gpa + i * sizeof(e) + + } + if (kvm_get_msr(vcpu, e.index, &e.value)) { + pr_warn_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), - &e.value, sizeof(e.value)); + &e.value, sizeof(e.value))) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + return -EINVAL; + } } return 0; } @@ -8698,6 +8793,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, nested_vmx_true_procbased_ctls_low, nested_vmx_procbased_ctls_high) || -- cgit v1.2.3 From 19d5f10b3ad08f596d2682404af9a3a9030aa684 Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Tue, 16 Dec 2014 22:35:53 +0300 Subject: KVM: nVMX: consult PFEC_MASK and PFEC_MATCH when generating #PF VM-exit When generating #PF VM-exit, check equality: (PFEC & PFEC_MASK) == PFEC_MATCH If there is equality, the 14 bit of exception bitmap is used to take decision about generating #PF VM-exit. If there is inequality, inverted 14 bit is used. 
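Restated as a small helper (illustrative only; the actual check added by this patch is nested_vmx_is_page_fault_vmexit() in the diff below):

static bool pf_should_cause_vmexit(struct vmcs12 *vmcs12, u16 error_code)
{
	bool eb_pf = vmcs12->exception_bitmap & (1u << PF_VECTOR);	/* PF_VECTOR == 14 */
	bool match = (error_code & vmcs12->page_fault_error_code_mask) ==
		     vmcs12->page_fault_error_code_match;

	/* on a PFEC match honor EB.PF directly, otherwise use its inverse */
	return match ? eb_pf : !eb_pf;
}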
Signed-off-by: Eugene Korenevsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 70bdcf946f95..e14c96e574ff 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8206,6 +8206,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -8213,8 +8225,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); -- cgit v1.2.3 From 5ff22e7ebf2e75e6300ad968a6e529b5e70877f1 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Thu, 18 Dec 2014 21:13:22 -0500 Subject: KVM: x86: Remove FIXMEs in emulate.c for the function,task_switch_32 Remove FIXME comments about needing fault addresses to be returned. These are propaagated from walk_addr_generic to gva_to_gpa and from there to ops->read_std and ops->write_std. Signed-off-by: Nicholas Krause Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 169b09d76ddd..feaba468cce6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2750,7 +2750,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss32(ctxt, &tss_seg); @@ -2759,13 +2758,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2776,7 +2773,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } -- cgit v1.2.3 From b4eef9b36db461ca44832226fbca614db58c0c33 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Mon, 22 Dec 2014 10:32:57 +0100 Subject: kvm: x86: vmx: NULL out hwapic_isr_update() in case of !enable_apicv In most cases calling hwapic_isr_update(), we always check if kvm_apic_vid_enabled() == 1, but actually, kvm_apic_vid_enabled() -> kvm_x86_ops->vm_has_apicv() -> vmx_vm_has_apicv() or '0' in svm case -> return enable_apicv && irqchip_in_kernel(kvm) So its a little cost to recall vmx_vm_has_apicv() inside hwapic_isr_update(), 
here just NULL out hwapic_isr_update() in case of !enable_apicv inside hardware_setup() then make all related stuffs follow this. Note we don't check this under that condition of irqchip_in_kernel() since we should make sure definitely any caller don't work without in-kernel irqchip. Signed-off-by: Tiejun Chen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 8 +++++--- arch/x86/kvm/vmx.c | 4 +--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4f0c0b954686..fe8bae511e99 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -402,7 +402,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -450,7 +450,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -1742,7 +1742,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, if (kvm_x86_ops->hwapic_irr_update) kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + if (unlikely(kvm_x86_ops->hwapic_isr_update)) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e14c96e574ff..6e71fac27d4e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5895,6 +5895,7 @@ static __init int hardware_setup(void) kvm_x86_ops->update_cr8_intercept = NULL; else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } @@ -7478,9 +7479,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) u16 status; u8 old; - if (!vmx_vm_has_apicv(kvm)) - return; - if (isr == -1) isr = 0; -- cgit v1.2.3 From 7c6a98dfa1ba9dc64a62e73624ecea9995736bbd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 16 Dec 2014 09:08:14 -0500 Subject: KVM: x86: add method to test PIR bitmap vector kvm_x86_ops->test_posted_interrupt() returns true/false depending whether 'vector' is set. Next patch makes use of this interface. 
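The intended use, as the following patch does in lapic_timer_int_injected(), is roughly:

	/* has the timer vector already been posted/injected? */
	if (kvm_x86_ops->test_posted_interrupt)
		pending = kvm_x86_ops->test_posted_interrupt(vcpu, vec);
	else
		pending = apic_test_vector(vec, apic->regs + APIC_ISR);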
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d89c6b828c96..cb19d05af3cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -753,6 +753,7 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e71fac27d4e..3b97f8b3065e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -435,6 +435,11 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static int pi_test_pir(int vector, struct pi_desc *pi_desc) +{ + return test_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -5897,6 +5902,7 @@ static __init int hardware_setup(void) kvm_x86_ops->hwapic_irr_update = NULL; kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->test_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } @@ -6968,6 +6974,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return pi_test_pir(vector, &vmx->pi_desc); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -9562,6 +9575,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .test_posted_interrupt = vmx_test_pir, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, -- cgit v1.2.3 From d0659d946be05e098883b6955d2764595997f6a4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 16 Dec 2014 09:08:15 -0500 Subject: KVM: x86: add option to advance tscdeadline hrtimer expiration For the hrtimer which emulates the tscdeadline timer in the guest, add an option to advance expiration, and busy spin on VM-entry waiting for the actual expiration time to elapse. This allows achieving low latencies in cyclictest (or any scenario which requires strict timing regarding timer expiration). Reduces average cyclictest latency from 12us to 8us on Core i5 desktop. Note: this option requires tuning to find the appropriate value for a particular hardware/guest combination. One method is to measure the average delay between apic_timer_fn and VM-entry. Another method is to start with 1000ns, and increase the value in say 500ns increments until avg cyclictest numbers stop decreasing. 
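As a rough sketch of the mechanism, not the kernel code: the hrtimer is programmed lapic_timer_advance_ns early, and the VM-entry path then burns the remaining gap to the real TSC deadline in a spin loop, so the interrupt is delivered almost exactly on time. The TSC frequency and the 7500 ns advance below are made-up example values; ns_to_tsc() is a helper invented for this sketch.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

static unsigned int lapic_timer_advance_ns = 7500;     /* example tuning */

/* convert nanoseconds to TSC ticks for a TSC frequency given in kHz */
static uint64_t ns_to_tsc(uint64_t ns, uint64_t tsc_khz)
{
        return ns * tsc_khz / 1000000ULL;
}

int main(void)
{
        uint64_t tsc_khz = 2900000;     /* assumed 2.9 GHz invariant TSC */
        uint64_t deadline = __rdtsc() + ns_to_tsc(100000, tsc_khz);
        uint64_t fire_early = deadline -
                ns_to_tsc(lapic_timer_advance_ns, tsc_khz);

        /* the "hrtimer" fires here, early ... */
        while (__rdtsc() < fire_early)
                ;
        /* ... and the entry path busy-waits the final stretch */
        while (__rdtsc() < deadline)
                ;
        printf("overshoot: %llu ticks\n",
               (unsigned long long)(__rdtsc() - deadline));
        return 0;
}

The trade-off is explicit in the commit message: CPU time is burned spinning in exchange for a tighter expiration, so the advance value has to be tuned per host/guest combination.
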
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 3 +++ arch/x86/kvm/x86.c | 5 +++++ arch/x86/kvm/x86.h | 2 ++ 4 files changed, 66 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fe8bae511e99..e1c0befaa9f6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "kvm_cache_regs.h" @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; + struct kvm_timer *ktimer = &apic->lapic_timer; /* * Note: KVM_REQ_PENDING_TIMER is implicitly checked in @@ -1087,11 +1089,61 @@ static void apic_timer_expired(struct kvm_lapic *apic) if (waitqueue_active(q)) wake_up_interruptible(q); + + if (apic_lvtt_tscdeadline(apic)) + ktimer->expired_tscdeadline = ktimer->tscdeadline; +} + +/* + * On APICv, this test will cause a busy wait + * during a higher-priority task. + */ + +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + + if (kvm_apic_hw_enabled(apic)) { + int vec = reg & APIC_VECTOR_MASK; + + if (kvm_x86_ops->test_posted_interrupt) + return kvm_x86_ops->test_posted_interrupt(vcpu, vec); + else { + if (apic_test_vector(vec, apic->regs + APIC_ISR)) + return true; + } + } + return false; +} + +void wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; + + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (apic->lapic_timer.expired_tscdeadline == 0) + return; + + if (!lapic_timer_int_injected(vcpu)) + return; + + tsc_deadline = apic->lapic_timer.expired_tscdeadline; + apic->lapic_timer.expired_tscdeadline = 0; + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + + /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ + if (guest_tsc < tsc_deadline) + __delay(tsc_deadline - guest_tsc); } static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; + atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { @@ -1137,6 +1189,7 @@ static void start_apic_timer(struct kvm_lapic *apic) /* lapic timer in tsc deadline mode */ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; + ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; @@ -1151,8 +1204,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c674fce53cf9..7054437944cd 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -14,6 +14,7 @@ struct kvm_timer { u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; + u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ }; @@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +void wait_lapic_expire(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af9faed270f1..559e3fd6c897 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +/* lapic timer advance (tscdeadline mode only) in nanoseconds */ +unsigned int lapic_timer_advance_ns = 0; +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); + static bool backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -6312,6 +6316,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cc1d61af6140..07994f38dacf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -170,5 +170,7 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; +extern unsigned int lapic_timer_advance_ns; + extern struct static_key kvm_no_apic_vcpu; #endif -- cgit v1.2.3 From 6c19b7538f5ae2b6cdf91ab29f7fddf7320ece5b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 16 Dec 2014 09:08:16 -0500 Subject: KVM: x86: add tracepoint to wait_lapic_expire Add tracepoint to wait_lapic_expire. Signed-off-by: Marcelo Tosatti [Remind reader if early or late. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/trace.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e1c0befaa9f6..3eb7f8d9992c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1134,6 +1134,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ if (guest_tsc < tsc_deadline) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c2a34bb5ad93..587149bd6f76 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -914,6 +914,26 @@ TRACE_EVENT(kvm_pvclock_update, __entry->flags) ); +TRACE_EVENT(kvm_wait_lapic_expire, + TP_PROTO(unsigned int vcpu_id, s64 delta), + TP_ARGS(vcpu_id, delta), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( s64, delta ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->delta = delta; + ), + + TP_printk("vcpu %u: delta %lld (%s)", + __entry->vcpu_id, + __entry->delta, + __entry->delta < 0 ? "early" : "late") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From e0c6db3e22f564d91832547a2432ab00f215108e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Dec 2014 13:39:46 +0100 Subject: KVM: x86: mmu: do not use return to tail-call functions that return void This is, pedantically, not valid C. It also looks weird. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c5e0ba..8ddbcb570fce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3900,11 +3900,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu); else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu); else - return init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From ad896af0b50ed656e38a31fca1fdb7bb7533db45 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 2 Oct 2013 16:56:14 +0200 Subject: KVM: x86: mmu: remove argument to kvm_init_shadow_mmu and kvm_init_shadow_ept_mmu The initialization function in mmu.c can always use walk_mmu, which is known to be vcpu->arch.mmu. Only init_kvm_nested_mmu is used to initialize vcpu->arch.nested_mmu. 
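The refactor boils down to dropping a parameter that every caller passes with the same value; a toy, stand-alone version of the before/after shape (the types are stand-ins, not the kernel's definitions):

#include <stdio.h>

struct kvm_mmu { int root_level; };
struct vcpu { struct kvm_mmu mmu; };

/* before: every caller passed &vcpu->mmu explicitly */
static void init_shadow_mmu_old(struct vcpu *vcpu, struct kvm_mmu *context)
{
        (void)vcpu;
        context->root_level = 4;
}

/* after: the function derives the only context it ever operates on */
static void init_shadow_mmu_new(struct vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->root_level = 4;
}

int main(void)
{
        struct vcpu v = { { 0 } };

        init_shadow_mmu_old(&v, &v.mmu);
        init_shadow_mmu_new(&v);
        printf("root_level = %d\n", v.mmu.root_level);
        return 0;
}
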
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 35 ++++++++++++++++++++--------------- arch/x86/kvm/mmu.h | 5 ++--- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- 4 files changed, 26 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8ddbcb570fce..d6d3d6f0ff1b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3763,7 +3763,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; context->page_fault = tdp_page_fault; @@ -3803,11 +3803,13 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_last_pte_bitmap(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + struct kvm_mmu *context = &vcpu->arch.mmu; + ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + ASSERT(!VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3818,19 +3820,20 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else paging32_init_context(vcpu, context); - vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp + context->base_role.nxe = is_nx(vcpu); + context->base_role.cr4_pae = !!is_pae(vcpu); + context->base_role.cr0_wp = is_write_protection(vcpu); + context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { + struct kvm_mmu *context = &vcpu->arch.mmu; + ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + ASSERT(!VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3851,11 +3854,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + struct kvm_mmu *context = &vcpu->arch.mmu; + + kvm_init_shadow_mmu(vcpu); + context->set_cr3 = kvm_x86_ops->set_cr3; + context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; + context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bde8ee725754..a7f9a121690d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -81,9 +81,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 
41dd0387cccb..a17d848c6d42 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_mmu(vcpu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b97f8b3065e..ce350718eb88 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8202,9 +8202,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_ept_mmu(vcpu, nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); - vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; -- cgit v1.2.3 From 4c1a50de9223e1bb7ce5decdd69bdf34a864f03e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 2 Oct 2013 16:56:15 +0200 Subject: KVM: x86: mmu: remove ASSERT(vcpu) Because ASSERT is just a printk, these would oops right away. The assertion thus hardly adds anything. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d6d3d6f0ff1b..b31eff8fa43d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3329,7 +3329,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3396,7 +3395,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { @@ -3808,7 +3806,6 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); struct kvm_mmu *context = &vcpu->arch.mmu; - ASSERT(vcpu); ASSERT(!VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) @@ -3832,7 +3829,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { struct kvm_mmu *context = &vcpu->arch.mmu; - ASSERT(vcpu); ASSERT(!VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3914,8 +3910,6 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } @@ -4271,8 +4265,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - ASSERT(vcpu); - /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 
* Therefore we need to allocate shadow page tables in the first @@ -4291,8 +4283,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; @@ -4303,7 +4293,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); @@ -4611,8 +4600,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); -- cgit v1.2.3 From fa4a2c080e37d362ae603f4ea157fe779bd85cb5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 2 Oct 2013 16:56:16 +0200 Subject: KVM: x86: mmu: replace assertions with MMU_WARN_ON, a conditional WARN_ON This makes the direction of the conditions consistent with code that is already using WARN_ON. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b31eff8fa43d..a0985ebb5512 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -63,30 +63,16 @@ enum { #undef MMU_DEBUG #ifdef MMU_DEBUG +static bool dbg = 0; +module_param(dbg, bool, 0644); #define pgprintk(x...) do { if (dbg) printk(x); } while (0) #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - +#define MMU_WARN_ON(x) WARN_ON(x) #else - #define pgprintk(x...) do { } while (0) #define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } +#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 @@ -1536,7 +1522,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { - ASSERT(is_empty_shadow_page(sp->spt)); + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -3041,7 +3027,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), @@ -3079,7 +3065,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); @@ -3104,7 +3090,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { @@ -3329,7 +3315,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + 
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3395,7 +3381,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, gpa, error_code, true); @@ -3716,7 +3702,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - ASSERT(is_pae(vcpu)); + MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -3806,7 +3792,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); struct kvm_mmu *context = &vcpu->arch.mmu; - ASSERT(!VALID_PAGE(context->root_hpa)); + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3829,7 +3815,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { struct kvm_mmu *context = &vcpu->arch.mmu; - ASSERT(!VALID_PAGE(context->root_hpa)); + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -4293,7 +4279,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); } -- cgit v1.2.3 From 3313bc4ee83c4e2870d8e83800c6064b0d215679 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:17 +0200 Subject: KVM: x86: pop sreg accesses only 2 bytes Although pop sreg updates RSP according to the operand size, only 2 bytes are read. The current behavior may result in incorrect #GP or #PF exceptions. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index feaba468cce6..abe95d2e6848 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1828,12 +1828,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) unsigned long selector; int rc; - rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); + rc = emulate_pop(ctxt, &selector, 2); if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + if (ctxt->op_bytes > 2) + rsp_increment(ctxt, ctxt->op_bytes - 2); rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; -- cgit v1.2.3 From 16bebefe29d8495c89961a9d57ea1947547a5211 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:18 +0200 Subject: KVM: x86: fnstcw and fnstsw may cause spurious exception Since the operand size of fnstcw and fnstsw is updated during the execution, the emulation may cause spurious exceptions as it reads the memory beforehand. Marking these instructions as Mov (since the previous value is ignored) and DstMem16 to simplify the setting of operand size. 
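A stand-alone illustration (x86 user space, not the emulator code) of why DstMem16 is the right operand size: fnstcw and fnstsw always store exactly 16 bits, regardless of the encoding's nominal operand size.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint16_t fcw = 0, fsw = 0;

        asm volatile("fninit");                 /* known x87 state */
        asm volatile("fnstcw %0" : "=m"(fcw));  /* stores 16 bits only */
        asm volatile("fnstsw %0" : "=m"(fsw));  /* stores 16 bits only */

        printf("control word: 0x%04x, status word: 0x%04x\n",
               (unsigned)fcw, (unsigned)fsw);
        return 0;
}
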
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abe95d2e6848..fff11885a3a0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -86,6 +86,7 @@ #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) +#define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) @@ -1057,8 +1058,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstcw %0": "+m"(fcw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fcw; return X86EMUL_CONTINUE; @@ -1075,8 +1074,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstsw %0": "+m"(fsw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fsw; return X86EMUL_CONTINUE; @@ -3863,7 +3860,7 @@ static const struct gprefix pfx_0f_e7 = { }; static const struct escape escape_d9 = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstcw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3905,7 +3902,7 @@ static const struct escape escape_db = { { } }; static const struct escape escape_dd = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstsw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, -- cgit v1.2.3 From 3dc4bc4f6b9265bd05dda007b07eac5a17da0562 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:19 +0200 Subject: KVM: x86: JMP/CALL using call- or task-gate causes exception The KVM emulator does not emulate JMP and CALL that target a call gate or a task gate. This patch does not try to implement these scenario as they are presumably rare; yet it returns X86EMUL_UNHANDLEABLE error in such cases instead of generating an exception. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 54 +++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fff11885a3a0..1fec3ed86cbf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -263,6 +263,13 @@ struct instr_dual { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +enum x86_transfer_type { + X86_TRANSFER_NONE, + X86_TRANSFER_CALL_JMP, + X86_TRANSFER_RET, + X86_TRANSFER_TASK_SWITCH, +}; + static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (!(ctxt->regs_valid & (1 << nr))) { @@ -1472,7 +1479,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, - bool in_task_switch, + enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; @@ -1526,11 +1533,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; + err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? 
TS_VECTOR : + GP_VECTOR; /* can't load system descriptor into segment selector */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) + if (seg <= VCPU_SREG_GS && !seg_desc.s) { + if (transfer == X86_TRANSFER_CALL_JMP) + return X86EMUL_UNHANDLEABLE; goto exception; + } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; @@ -1628,7 +1639,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) @@ -2040,7 +2052,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2129,7 +2142,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, + X86_TRANSFER_RET, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2566,23 +2580,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2704,31 +2718,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, - cpl, true, NULL); + cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, - true, 
NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -3010,8 +3024,8 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, - &new_desc); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return X86EMUL_CONTINUE; -- cgit v1.2.3 From 80976dbb5cb2b64480d7d38981b3220887575728 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:20 +0200 Subject: KVM: x86: em_call_far should return failure result Currently, if em_call_far fails it returns success instead of the resulting error-code. Fix it. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1fec3ed86cbf..8f32c03515ad 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3027,7 +3027,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + return rc; rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) -- cgit v1.2.3 From ab708099a0617e2c37b26d9ecbb373456057ba9b Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:21 +0200 Subject: KVM: x86: POP [ESP] is not emulated correctly According to Intel SDM: "If the ESP register is used as a base register for addressing a destination operand in memory, the POP instruction computes the effective address of the operand after it increments the ESP register." The current emulation does not behave so. The fix required to waste another of the precious instruction flags and to check the flag in decode_modrm. 
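A tiny C model of the quirk being fixed, assuming nothing about the emulator's internals: for POP with an [ESP]-based memory destination, the destination address is computed with the already-incremented ESP, so the popped value lands one slot above where it was read.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t stack[8] = { 0 };
        uint32_t esp = 4 * sizeof(uint32_t);    /* byte offset into stack[] */

        stack[esp / 4] = 0xdeadbeef;            /* value about to be popped */

        /* POP r/m32 with an [ESP] destination: */
        uint32_t value = stack[esp / 4];        /* 1. read from old ESP */
        esp += 4;                               /* 2. increment ESP */
        stack[esp / 4] = value;                 /* 3. store at the *new* ESP */

        printf("popped 0x%08x, written at byte offset %u\n",
               value, (unsigned)esp);
        return 0;
}
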
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8f32c03515ad..cc24b74b7454 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -170,6 +170,7 @@ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ +#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -1227,6 +1228,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); + /* Increment ESP on POP [ESP] */ + if ((ctxt->d & IncSP) && + base_reg == VCPU_REGS_RSP) + modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; @@ -3758,7 +3763,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { -- cgit v1.2.3 From e2cefa746e7e2a1104931d411b6f5de159d98ec6 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:22 +0200 Subject: KVM: x86: Do not set access bit on accessed segments When segment is loaded, the segment access bit is set unconditionally. In fact, it should be set conditionally, based on whether the segment had the accessed bit set before. In addition, it can improve performance. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cc24b74b7454..e36e1fc5bf85 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1618,10 +1618,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (seg_desc.s) { /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (!(seg_desc.type & 1)) { + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, selector, + &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, sizeof(base3), &ctxt->exception); -- cgit v1.2.3 From edccda7ca7e56b335c70ae512f89d0fdf7fb8c69 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:23 +0200 Subject: KVM: x86: Access to LDT/GDT that wraparound is incorrect When access to descriptor in LDT/GDT wraparound outside long-mode, the address of the descriptor should be truncated to 32-bit. Citing Intel SDM 2.1.1.1 "Global and Local Descriptor Tables in IA-32e Mode": "GDTR and LDTR registers are expanded to 64-bits wide in both IA-32e sub-modes (64-bit mode and compatibility mode)." So in other cases, we need to truncate. Creating new function to return a pointer to descriptor table to avoid too much code duplication. Signed-off-by: Nadav Amit [Wrap 64-bit check with #ifdef CONFIG_X86_64, to avoid a "right shift count >= width of type" warning and consequent undefined behavior. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e36e1fc5bf85..d949287ed010 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1444,10 +1444,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, ops->get_gdt(ctxt, dt); } -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc, - ulong *desc_addr_p) +static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, + u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1458,8 +1456,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - *desc_addr_p = addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + addr = dt.address + index * 8; + +#ifdef CONFIG_X86_64 + if (addr >> 32 != 0) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (!(efer & EFER_LMA)) + addr &= (u32)-1; + } +#endif + + *desc_addr_p = addr; + return X86EMUL_CONTINUE; +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) +{ + int rc; + + rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); + if (rc != X86EMUL_CONTINUE) + return rc; + + return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), &ctxt->exception); } @@ -1467,16 +1491,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { - struct desc_ptr dt; - u16 index = selector >> 3; + int rc; ulong addr; - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); + rc = get_descriptor_ptr(ctxt, selector, &addr); + if (rc != X86EMUL_CONTINUE) + return rc; - addr = dt.address + index * 8; return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } -- cgit v1.2.3 From bab5bb398273bb37547a185f7b344b37c700d0b9 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Thu, 1 Jan 2015 22:05:18 -0500 Subject: kvm: x86: Remove kvm_make_request from lapic.c Adds a function kvm_vcpu_set_pending_timer instead of calling kvm_make_request in lapic.c. Signed-off-by: Nicholas Krause Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 +------ arch/x86/kvm/x86.c | 9 +++++++++ arch/x86/kvm/x86.h | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3eb7f8d9992c..a688fbffb34e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1076,16 +1076,11 @@ static void apic_timer_expired(struct kvm_lapic *apic) wait_queue_head_t *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. 
- */ if (atomic_read(&apic->lapic_timer.pending)) return; atomic_inc(&apic->lapic_timer.pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_set_pending_timer(vcpu); if (waitqueue_active(q)) wake_up_interruptible(q); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 559e3fd6c897..49ecda7ca958 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1087,6 +1087,15 @@ static void update_pvclock_gtod(struct timekeeper *tk) } #endif +void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +{ + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. This function is only called from + * the physical CPU that is running vcpu. + */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +} static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 07994f38dacf..f5fef1868096 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); -- cgit v1.2.3 From defcf51fa93929bd5d3ce5b91f8e6a106dae5e46 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Jan 2015 15:59:30 +0100 Subject: KVM: x86: allow TSC deadline timer on all hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Emulation does not utilize the feature. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ecda7ca958..98930c64b06e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2752,6 +2752,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: + case KVM_CAP_TSC_DEADLINE_TIMER: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2790,9 +2791,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; default: r = 0; break; -- cgit v1.2.3 From 7e71a59b250330fd52ee7293eb9d31952f16682e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 9 Jan 2015 16:44:30 +0800 Subject: KVM: x86: flush TLB when D bit is manually changed. When software changes D bit (either from 1 to 0, or 0 to 1), the corresponding TLB entity in the hardware won't be updated immediately. We should flush it to guarantee the consistence of D bit between TLB and MMU page table in memory. This is especially important when clearing the D bit, since it may cause false negatives in reporting dirtiness. Sanity test was done on my machine with Intel processor. Signed-off-by: Kai Huang [Check A bit too. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a0985ebb5512..0d0fdd6f002c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -532,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } +static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) != (new_spte & bit_mask); +} + /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -582,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!shadow_accessed_mask) return ret; + /* + * Flush TLB when accessed/dirty bits are changed in the page tables, + * to guarantee consistency between TLB and page tables. + */ + if (spte_is_bit_changed(old_spte, new_spte, + shadow_accessed_mask | shadow_dirty_mask)) + ret = true; + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) -- cgit v1.2.3 From c205fb7d7d4f81e46fc577b707ceb9e356af1456 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:16 +0200 Subject: KVM: x86: #PF error-code on R/W operations is wrong When emulating an instruction that reads the destination memory operand (i.e., instructions without the Mov flag in the emulator), the operand is first read. If a page-fault is detected in this phase, the error-code which would be delivered to the VM does not indicate that the access that caused the exception is a write one. This does not conform with real hardware, and may cause the VM to enter the page-fault handler twice for no reason (once for read, once for write). 
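A minimal sketch of the resulting error-code fix-up; the constants mirror the PFERR_* definitions the patch moves into kvm_host.h, while the surrounding flow is simplified for illustration.

#include <stdio.h>

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)

int main(void)
{
        /* error code produced by the read of the destination operand */
        unsigned int error_code = PFERR_PRESENT_MASK | PFERR_USER_MASK;

        /* the faulting access belongs to a write instruction, so report it */
        error_code |= PFERR_WRITE_MASK;

        printf("reported #PF error code: 0x%x\n", error_code);
        return 0;
}
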
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ++++++++++++ arch/x86/kvm/emulate.c | 6 +++++- arch/x86/kvm/mmu.h | 12 ------------ 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cb19d05af3cd..97a5dd0222c8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -160,6 +160,18 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d949287ed010..ef23c1e5fa9f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4909,8 +4909,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* optimisation - avoid slow emulated read if Mov */ rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + if (rc == X86EMUL_PROPAGATE_FAULT && + ctxt->exception.vector == PF_VECTOR) + ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; + } } ctxt->dst.orig_val = ctxt->dst.val; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index a7f9a121690d..c7d65637c851 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,18 +44,6 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 - -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) - static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; -- cgit v1.2.3 From b9dfe0bed999d23ee8838d389637dd8aef83fafa Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 9 Jan 2015 10:53:21 +0100 Subject: livepatch: handle ancient compilers with more grace We are aborting a build in case when gcc doesn't support fentry on x86_64 (regs->ip modification can't really reliably work with mcount). This however breaks allmodconfig for people with older gccs that don't support -mfentry. Turn the build-time failure into runtime failure, resulting in the whole infrastructure not being initialized if CC_USING_FENTRY is unset. 
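The shape of the change, as a hedged stand-alone sketch: a compile-time #error on a missing feature macro becomes a runtime probe that the init path can act on. FEATURE_X below stands in for CC_USING_FENTRY and is not a real kernel macro.

#include <stdio.h>

static inline int check_compiler_support(void)
{
#ifndef FEATURE_X
        return 1;       /* unsupported: caller refuses to initialize */
#endif
        return 0;
}

int main(void)
{
        if (check_compiler_support()) {
                fprintf(stderr, "compiler too old; turning feature off\n");
                return 1;
        }
        printf("feature initialized\n");
        return 0;
}

On an old compiler the code still builds; it merely logs and declines to set itself up, which is exactly what the patch wants for allmodconfig builds.
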
Reported-by: Andrew Morton Signed-off-by: Jiri Kosina Signed-off-by: Andrew Morton Acked-by: Josh Poimboeuf --- arch/x86/include/asm/livepatch.h | 6 +++++- kernel/livepatch/core.c | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index b5608d7757fd..26e58134c8cb 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -25,9 +25,13 @@ #include #ifdef CONFIG_LIVE_PATCHING +static inline int klp_check_compiler_support(void) +{ #ifndef CC_USING_FENTRY -#error Your compiler must support -mfentry for live patching to work + return 1; #endif + return 0; +} extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6f6387912da7..ce42d3b930dc 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -911,6 +911,12 @@ static int klp_init(void) { int ret; + ret = klp_check_compiler_support(); + if (ret) { + pr_info("Your compiler is too old; turning off.\n"); + return -EINVAL; + } + ret = register_module_notifier(&klp_module_nb); if (ret) return ret; -- cgit v1.2.3 From 665d92e38f65d70796aad2b8e49e42e80815d4a4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Dec 2014 14:31:24 +0900 Subject: kbuild: do not add $(call ...) to invoke cc-version or cc-fullversion The macros cc-version, cc-fullversion and ld-version take no argument. It is not necessary to add $(call ...) to invoke them. Signed-off-by: Masahiro Yamada Acked-by: Helge Deller [parisc] Signed-off-by: Michal Marek --- Documentation/kbuild/makefiles.txt | 4 ++-- arch/parisc/Makefile | 2 +- arch/powerpc/Makefile | 6 +++--- arch/x86/Makefile.um | 2 +- kernel/gcov/Makefile | 2 +- scripts/Kbuild.include | 7 ++----- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index a311db829e9b..7b3487a67476 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -524,7 +524,7 @@ more details, with real examples. Example: #arch/x86/Makefile cflags-y += $(shell \ - if [ $(call cc-version) -ge 0300 ] ; then \ + if [ $(cc-version) -ge 0300 ] ; then \ echo "-mregparm=3"; fi ;) In the above example, -mregparm=3 is only used for gcc version greater @@ -552,7 +552,7 @@ more details, with real examples. Example: #arch/powerpc/Makefile - $(Q)if test "$(call cc-fullversion)" = "040200" ; then \ + $(Q)if test "$(cc-fullversion)" = "040200" ; then \ echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \ false ; \ fi diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 5db8882f732c..fc1aca379fe2 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -149,7 +149,7 @@ endef # we require gcc 3.3 or above to compile the kernel archprepare: checkbin checkbin: - @if test "$(call cc-version)" -lt "0303"; then \ + @if test "$(cc-version)" -lt "0303"; then \ echo -n "Sorry, GCC v3.3 or above is required to build " ; \ echo "the kernel." 
; \ false ; \ diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 132d9c681d6a..fc502e042438 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -314,7 +314,7 @@ TOUT := .tmp_gas_check # - Require gcc 4.0 or above on 64-bit # - gcc-4.2.0 has issues compiling modules on 64-bit checkbin: - @if test "$(call cc-version)" = "0304" ; then \ + @if test "$(cc-version)" = "0304" ; then \ if ! /bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \ echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \ echo 'correctly with gcc-3.4 and your version of binutils.'; \ @@ -322,13 +322,13 @@ checkbin: false; \ fi ; \ fi - @if test "$(call cc-version)" -lt "0400" \ + @if test "$(cc-version)" -lt "0400" \ && test "x${CONFIG_PPC64}" = "xy" ; then \ echo -n "Sorry, GCC v4.0 or above is required to build " ; \ echo "the 64-bit powerpc kernel." ; \ false ; \ fi - @if test "$(call cc-fullversion)" = "040200" \ + @if test "$(cc-fullversion)" = "040200" \ && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \ echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \ echo 'kernel with modules enabled.' ; \ diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 36b62bc52638..95eba554baf9 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -30,7 +30,7 @@ cflags-y += -ffreestanding # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots. Also, gcc # 4.3.0 needs -funit-at-a-time for extern inline functions. -KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ +KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \ echo $(call cc-option,-fno-unit-at-a-time); \ else echo $(call cc-option,-funit-at-a-time); fi ;) diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 52aa7e8de927..6f01fa3d9ca1 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -21,7 +21,7 @@ else # is not available. We can probably move if-lt to Kbuild.include, so it's also # not defined during clean or to include Kbuild.include in # scripts/Makefile.clean. But the following workaround seems least invasive. 
- cc-ver := $(if $(call cc-version),$(call cc-version),0) + cc-ver := $(if $(cc-version),$(cc-version),0) endif obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 34a87fc77f71..ddf0ebdc2ca8 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -129,17 +129,15 @@ cc-disable-warning = $(call try-run,\ $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # cc-version -# Usage gcc-ver := $(call cc-version) cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) # cc-fullversion -# Usage gcc-ver := $(call cc-fullversion) cc-fullversion = $(shell $(CONFIG_SHELL) \ $(srctree)/scripts/gcc-version.sh -p $(CC)) # cc-ifversion # Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1) -cc-ifversion = $(shell [ $(call cc-version) $(1) $(2) ] && echo $(3)) +cc-ifversion = $(shell [ $(cc-version) $(1) $(2) ] && echo $(3)) # cc-ldoption # Usage: ldflags += $(call cc-ldoption, -Wl$(comma)--hash-style=both) @@ -157,13 +155,12 @@ ld-option = $(call try-run,\ ar-option = $(call try-run, $(AR) rc$(1) "$$TMP",$(1),$(2)) # ld-version -# Usage: $(call ld-version) # Note this is mainly for HJ Lu's 3 number binutil versions ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh) # ld-ifversion # Usage: $(call ld-ifversion, -ge, 22252, y) -ld-ifversion = $(shell [ $(call ld-version) $(1) $(2) ] && echo $(3)) +ld-ifversion = $(shell [ $(ld-version) $(1) $(2) ] && echo $(3)) ###### -- cgit v1.2.3 From d8219f52a72033f84c15cde73294d46578fb2d68 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 11 Jan 2015 18:17:47 +0100 Subject: crypto: x86/des3_ede - drop bogus module aliases This module implements variations of "des3_ede" only. Drop the bogus module aliases for "des". Cc: Jussi Kivilinna Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- arch/x86/crypto/des3_ede_glue.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 38a14f818ef1..d6fc59aaaadf 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -504,6 +504,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); MODULE_ALIAS_CRYPTO("des3_ede"); MODULE_ALIAS_CRYPTO("des3_ede-asm"); -MODULE_ALIAS_CRYPTO("des"); -MODULE_ALIAS_CRYPTO("des-asm"); MODULE_AUTHOR("Jussi Kivilinna "); -- cgit v1.2.3 From e182c570e9953859aee5cb016583217d9e68ea18 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 12 Dec 2014 01:56:04 +0200 Subject: x86/uaccess: fix sparse errors virtio wants to read bitwise types from userspace using get_user. At the moment this triggers sparse errors, since the value is passed through an integer. Fix that up using __force. Signed-off-by: Michael S. 
Tsirkin Acked-by: Thomas Gleixner --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0d592e0a5b84..ace9dec050b1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) asm volatile("call __get_user_%P3" \ : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ - (x) = (__typeof__(*(ptr))) __val_gu; \ + (x) = (__force __typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) -- cgit v1.2.3 From cbf6ab52add20b845f903decc973afbd5463c527 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Jan 2015 19:29:32 +0800 Subject: kprobes: Pass the original kprobe for preparing optimized kprobe Pass the original kprobe for preparing an optimized kprobe arch-dep part, since for some architecture (e.g. ARM32) requires the information in original kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Wang Nan Signed-off-by: Jon Medhurst --- arch/x86/kernel/kprobes/opt.c | 3 ++- include/linux/kprobes.h | 3 ++- kernel/kprobes.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bbf3dc8..0dd8d089c315 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. */ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5297f9fa0ef2..1ab54754a86d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -308,7 +308,8 @@ struct optimized_kprobe { /* Architecture dependent functions for direct jump optimization */ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn); extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); -extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); +extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *orig); extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..bad4e959f2f7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); return &op->kp; } -- cgit v1.2.3 From e31ac32d3bc27c33f002e0c9ffd6ae08b65474e6 Mon Sep 17 00:00:00 2001 From: Timothy McCaffrey Date: Tue, 13 Jan 2015 13:16:43 -0500 Subject: crypto: aesni - Add support for 192 & 256 bit keys to AESNI RFC4106 These patches fix the 
RFC4106 implementation in the aesni-intel module so it supports 192 & 256 bit keys. Since the AVX support that was added to this module also only supports 128 bit keys, and this patch only affects the SSE implementation, changes were also made to use the SSE version if key sizes other than 128 are specified. RFC4106 specifies that 192 & 256 bit keys must be supported (section 8.4). Also, this should fix Strongswan issue 341 where the aesni module needs to be unloaded if 256 bit keys are used: http://wiki.strongswan.org/issues/341 This patch has been tested with Sandy Bridge and Haswell processors. With 128 bit keys and input buffers > 512 bytes a slight performance degradation was noticed (~1%). For input buffers of less than 512 bytes there was no performance impact. Compared to 128 bit keys, 256 bit key size performance is approx. .5 cycles per byte slower on Sandy Bridge, and .37 cycles per byte slower on Haswell (vs. SSE code). This patch has also been tested with StrongSwan IPSec connections where it worked correctly. I created this diff from a git clone of crypto-2.6.git. Any questions, please feel free to contact me. Signed-off-by: Timothy McCaffrey Signed-off-by: Jarod Wilson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 343 +++++++++++++++++++------------------ arch/x86/crypto/aesni-intel_glue.c | 34 +++- 2 files changed, 205 insertions(+), 172 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 477e9d75149b..6bd2c6c95373 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,12 +32,23 @@ #include #include +/* + * The following macros are used to move an (un)aligned 16 byte value to/from + * an XMM register. This can done for either FP or integer values, for FP use + * movaps (move aligned packed single) or integer use movdqa (move double quad + * aligned). It doesn't make a performance difference which instruction is used + * since Nehalem (original Core i7) was released. However, the movaps is a byte + * shorter, so that is the one we'll use for now. (same for unaligned). 
+ */ +#define MOVADQ movaps +#define MOVUDQ movups + #ifdef __x86_64__ + .data .align 16 .Lgf128mul_x_ble_mask: .octa 0x00000000000000010000000000000087 - POLY: .octa 0xC2000000000000000000000000000001 TWOONE: .octa 0x00000001000000000000000000000001 @@ -89,6 +100,7 @@ enc: .octa 0x2 #define arg8 STACK_OFFSET+16(%r14) #define arg9 STACK_OFFSET+24(%r14) #define arg10 STACK_OFFSET+32(%r14) +#define keysize 2*15*16(%arg1) #endif @@ -213,10 +225,12 @@ enc: .octa 0x2 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i + _get_AAD_loop\num_initial_blocks\operation: movd (%r10), \TMP1 pslldq $12, \TMP1 @@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation: add $4, %r10 sub $4, %r12 jne _get_AAD_loop\num_initial_blocks\operation + cmp $16, %r11 je _get_AAD_loop2_done\num_initial_blocks\operation + mov $16, %r12 _get_AAD_loop2\num_initial_blocks\operation: psrldq $4, %xmm\i sub $4, %r12 cmp %r11, %r12 jne _get_AAD_loop2\num_initial_blocks\operation + _get_AAD_loop2_done\num_initial_blocks\operation: - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data xor %r11, %r11 # initialise the data pointer offset as zero @@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation: mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM0 .if (\i == 5) || (\i == 6) || (\i == 7) + MOVADQ ONE(%RIP),\TMP1 + MOVADQ (%arg1),\TMP2 .irpc index, \i_seq - paddd ONE(%rip), \XMM0 # INCR Y0 + paddd \TMP1, \XMM0 # INCR Y0 movdqa \XMM0, %xmm\index - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap - -.endr -.irpc index, \i_seq - pxor 16*0(%arg1), %xmm\index -.endr -.irpc index, \i_seq - movaps 0x10(%rdi), \TMP1 - AESENC \TMP1, %xmm\index # Round 1 -.endr -.irpc index, \i_seq - movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x30(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x40(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x50(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x60(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + pxor \TMP2, %xmm\index .endr -.irpc index, \i_seq - movaps 0x70(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x80(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x90(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + lea 0x10(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + +aes_loop_initial_dec\num_initial_blocks: + MOVADQ (%r10),\TMP1 +.irpc index, \i_seq + AESENC \TMP1, %xmm\index .endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_initial_dec\num_initial_blocks + + MOVADQ (%r10), \TMP1 .irpc index, \i_seq - movaps 0xa0(%arg1), \TMP1 - AESENCLAST \TMP1, %xmm\index # Round 10 + AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg3 , %r11, 1), \TMP1 @@ -305,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation: add $16, %r11 movdqa \TMP1, %xmm\index - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation + # prepare 
plaintext/ciphertext for GHASH computation .endr .endif GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 @@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation: * Precomputations for HashKey parallel with encryption of first 4 blocks. * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i */ - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM1 - movdqa SHUF_MASK(%rip), %xmm14 + MOVADQ ONE(%rip), \TMP1 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM1 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM2 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM2 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM3 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM3 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM4 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM4 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap - pxor 16*0(%arg1), \XMM1 - pxor 16*0(%arg1), \XMM2 - pxor 16*0(%arg1), \XMM3 - pxor 16*0(%arg1), \XMM4 + MOVADQ 0(%arg1),\TMP1 + pxor \TMP1, \XMM1 + pxor \TMP1, \XMM2 + pxor \TMP1, \XMM3 + pxor \TMP1, \XMM4 movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 @@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation: pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_4_k(%rsp) - movaps 0xa0(%arg1), \TMP2 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_pre_dec_done\num_initial_blocks + +aes_loop_pre_dec\num_initial_blocks: + MOVADQ (%r10),\TMP2 +.irpc index, 1234 + AESENC \TMP2, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_pre_dec\num_initial_blocks + +aes_loop_pre_dec_done\num_initial_blocks: + MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 @@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu \XMM4, 16*3(%arg2 , %r11 , 1) movdqa \TMP1, \XMM4 add $64, %r11 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\num_initial_blocks\operation: @@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation: .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 @@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation: cmp %r11, %r12 jne _get_AAD_loop2\num_initial_blocks\operation _get_AAD_loop2_done\num_initial_blocks\operation: - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data xor %r11, %r11 # initialise the data pointer offset as zero @@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation: mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM0 .if (\i == 5) || (\i == 6) || (\i == 7) -.irpc 
index, \i_seq - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, %xmm\index - movdqa SHUF_MASK(%rip), %xmm14 - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap -.endr -.irpc index, \i_seq - pxor 16*0(%arg1), %xmm\index -.endr -.irpc index, \i_seq - movaps 0x10(%rdi), \TMP1 - AESENC \TMP1, %xmm\index # Round 1 -.endr -.irpc index, \i_seq - movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr + MOVADQ ONE(%RIP),\TMP1 + MOVADQ 0(%arg1),\TMP2 .irpc index, \i_seq - movaps 0x30(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, %xmm\index + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pxor \TMP2, %xmm\index .endr -.irpc index, \i_seq - movaps 0x40(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x50(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x60(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x70(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x80(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x90(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + lea 0x10(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + +aes_loop_initial_enc\num_initial_blocks: + MOVADQ (%r10),\TMP1 +.irpc index, \i_seq + AESENC \TMP1, %xmm\index .endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_initial_enc\num_initial_blocks + + MOVADQ (%r10), \TMP1 .irpc index, \i_seq - movaps 0xa0(%arg1), \TMP1 - AESENCLAST \TMP1, %xmm\index # Round 10 + AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg3 , %r11, 1), \TMP1 @@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu %xmm\index, (%arg2 , %r11, 1) # write back plaintext/ciphertext for num_initial_blocks add $16, %r11 - - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation @@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation: * Precomputations for HashKey parallel with encryption of first 4 blocks. 
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i */ - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM1 - movdqa SHUF_MASK(%rip), %xmm14 + MOVADQ ONE(%RIP),\TMP1 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM1 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM2 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM2 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM3 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM3 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM4 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM4 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap - pxor 16*0(%arg1), \XMM1 - pxor 16*0(%arg1), \XMM2 - pxor 16*0(%arg1), \XMM3 - pxor 16*0(%arg1), \XMM4 + MOVADQ 0(%arg1),\TMP1 + pxor \TMP1, \XMM1 + pxor \TMP1, \XMM2 + pxor \TMP1, \XMM3 + pxor \TMP1, \XMM4 movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 @@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation: pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_4_k(%rsp) - movaps 0xa0(%arg1), \TMP2 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_pre_enc_done\num_initial_blocks + +aes_loop_pre_enc\num_initial_blocks: + MOVADQ (%r10),\TMP2 +.irpc index, 1234 + AESENC \TMP2, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_pre_enc\num_initial_blocks + +aes_loop_pre_enc_done\num_initial_blocks: + MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 @@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu \XMM4, 16*3(%arg2 , %r11 , 1) add $64, %r11 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\num_initial_blocks\operation: @@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - movaps 0xa0(%arg1), \TMP3 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_par_enc_done + +aes_loop_par_enc: + MOVADQ (%r10),\TMP3 +.irpc index, 1234 + AESENC \TMP3, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_par_enc + +aes_loop_par_enc_done: + MOVADQ (%r10), \TMP3 AESENCLAST \TMP3, \XMM1 # Round 10 AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 @@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - movaps 0xa0(%arg1), \TMP3 - AESENCLAST \TMP3, \XMM1 # Round 10 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_par_dec_done + +aes_loop_par_dec: + MOVADQ (%r10),\TMP3 +.irpc index, 1234 + AESENC \TMP3, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_par_dec + 
+aes_loop_par_dec_done: + MOVADQ (%r10), \TMP3 + AESENCLAST \TMP3, \XMM1 # last round AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 @@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pxor \TMP6, \XMMDst # reduced result is in XMMDst .endm -/* Encryption of a single block done*/ -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - pxor (%arg1), \XMM0 - movaps 16(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 32(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 48(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 64(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 80(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 96(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 112(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 128(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 144(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 160(%arg1), \TMP1 - AESENCLAST \TMP1, \XMM0 -.endm +/* Encryption of a single block +* uses eax & r10 +*/ +.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 + pxor (%arg1), \XMM0 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + lea 16(%arg1), %r10 # get first expanded key address + +_esb_loop_\@: + MOVADQ (%r10),\TMP1 + AESENC \TMP1,\XMM0 + add $16,%r10 + sub $1,%eax + jnz _esb_loop_\@ + + MOVADQ (%r10),\TMP1 + AESENCLAST \TMP1,\XMM0 +.endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. * u8 *out, // Plaintext output. Encrypt in-place is allowed. diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ae855f4f64b7..947c6bf52c33 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -43,6 +43,7 @@ #include #endif + /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. 
@@ -182,7 +183,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (plaintext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){ aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { @@ -197,7 +199,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (ciphertext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { @@ -231,7 +234,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (plaintext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { @@ -250,7 +254,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (ciphertext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { @@ -511,7 +516,7 @@ static int ctr_crypt(struct blkcipher_desc *desc, kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } @@ -902,7 +907,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, } /*Account for 4 byte nonce at the end.*/ key_len -= 4; - if (key_len != AES_KEYSIZE_128) { + if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) { crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -1013,6 +1019,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); u8 iv_tab[16+AESNI_ALIGN]; @@ -1027,6 +1034,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) /* to 8 or 12 bytes */ if (unlikely(req->assoclen != 8 && req->assoclen != 12)) return -EINVAL; + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) + return -EINVAL; + if (unlikely(key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256)) + return -EINVAL; + /* IV below built */ for (i = 0; i < 4; i++) *(iv+i) 
= ctx->nonce[i]; @@ -1091,6 +1105,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); u8 iv_and_authTag[32+AESNI_ALIGN]; @@ -1104,6 +1119,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (unlikely((req->cryptlen < auth_tag_len) || (req->assoclen != 8 && req->assoclen != 12))) return -EINVAL; + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) + return -EINVAL; + if (unlikely(key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256)) + return -EINVAL; + /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ /* equal to 8 or 12 bytes */ -- cgit v1.2.3 From e108ff2f8033a417ee3e517d9f8730f665646076 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 15 Jan 2015 15:58:54 -0800 Subject: KVM: x86: switch to kvm_get_dirty_log_protect We now have a generic function that does most of the work of kvm_vm_ioctl_get_dirty_log, now use it. Acked-by: Christoffer Dall Signed-off-by: Mario Smarduch --- arch/x86/include/asm/kvm_host.h | 3 -- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/mmu.c | 4 +-- arch/x86/kvm/x86.c | 72 ++++++++--------------------------------- 4 files changed, 16 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cb19d05af3cd..3ceddf41ca74 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -821,9 +821,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f9d16ff56c6b..d07359466d5d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -39,6 +39,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a0985ebb5512..3aa46aaa8cb3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1203,7 +1203,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, } /** - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages + * kvm_arch_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance * @slot: slot to protect * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1212,7 +1212,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. 
*/ -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ecda7ca958..556dfb4efc43 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3748,83 +3748,37 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, * @kvm: kvm instance * @log: slot id and address to which we copy the log * - * We need to keep it in mind that VCPU threads can write to the bitmap - * concurrently. So, to avoid losing data, we keep the following order for - * each bit: + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. - * 3. Flush TLB's if needed. - * 4. Copy the snapshot to the userspace. - * - * Between 2 and 3, the guest may write to the page using the remaining TLB - * entry. This is not a problem because the page will be reported dirty at - * step 4 using the snapshot taken before and step 3 ensures that successive - * writes will be logged for the next call. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - struct kvm_memory_slot *memslot; - unsigned long n, i; - unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_buffer; bool is_dirty = false; + int r; mutex_lock(&kvm->slots_lock); - r = -EINVAL; - if (log->slot >= KVM_USER_MEM_SLOTS) - goto out; - - memslot = id_to_memslot(kvm->memslots, log->slot); - - dirty_bitmap = memslot->dirty_bitmap; - r = -ENOENT; - if (!dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); - - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); - memset(dirty_bitmap_buffer, 0, n); - - spin_lock(&kvm->mmu_lock); - - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; - - if (!dirty_bitmap[i]) - continue; - - is_dirty = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; - - offset = i * BITS_PER_LONG; - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); - } - - spin_unlock(&kvm->mmu_lock); - - /* See the comments in kvm_mmu_slot_remove_write_access(). */ - lockdep_assert_held(&kvm->slots_lock); + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); /* * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). 
*/ + lockdep_assert_held(&kvm->slots_lock); if (is_dirty) kvm_flush_remote_tlbs(kvm); - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - goto out; - - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; } -- cgit v1.2.3 From 0c55d6d931484a747dbba5c9766379c7720a8520 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Sun, 11 Jan 2015 19:02:26 +0100 Subject: x86: kvm: vmx: Remove some unused functions Removes some functions that are not used anywhere: cpu_has_vmx_eptp_writeback() cpu_has_vmx_eptp_uncacheable() This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ce350718eb88..c987374d92c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -964,16 +964,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; -- cgit v1.2.3 From d91ffee9ec633c6523ef1ea39c36ace578760258 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 12 Jan 2015 15:28:54 +0800 Subject: Optimize TLB flush in kvm_mmu_slot_remove_write_access. No TLB flush is needed when there's no valid rmap in memory slot. Signed-off-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0d0fdd6f002c..97898abe8386 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4302,6 +4302,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; + bool flush = false; memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; @@ -4318,7 +4319,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) - __rmap_write_protect(kvm, rmapp, false); + flush |= __rmap_write_protect(kvm, rmapp, + false); if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); @@ -4345,7 +4347,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); } #define BATCH_ZAP_PAGES 10 -- cgit v1.2.3 From 1760f1eb7ec485197bd3a8a9c13e4160bb740275 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Sun, 7 Dec 2014 22:01:59 +0100 Subject: x86/xen/p2m: Replace ACCESS_ONCE with READ_ONCE ACCESS_ONCE does not work reliably on non-scalar types. For example gcc 4.6 and 4.7 might remove the volatile tag for such accesses during the SRA (scalar replacement of aggregates) step (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145) Change the p2m code to replace ACCESS_ONCE with READ_ONCE. 
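For illustration only, a minimal userspace sketch of the difference (this is not the kernel's implementation; the pte_t here is a stand-in type):

	#include <stdio.h>

	typedef struct { unsigned long pte; } pte_t;	/* stand-in for the kernel type */

	/* ACCESS_ONCE()-style read: a volatile cast of the whole object; for
	 * aggregate types gcc 4.6/4.7 may drop the volatile tag during SRA. */
	#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

	/* READ_ONCE()-style read: go through a volatile pointer to the scalar
	 * member, which stays well defined regardless of the aggregate type. */
	static pte_t read_pte_once(const pte_t *p)
	{
		pte_t val;
		val.pte = *(const volatile unsigned long *)&p->pte;
		return val;
	}

	int main(void)
	{
		pte_t pte = { 0x1000 };
		printf("%lx %lx\n", ACCESS_ONCE(pte).pte, read_pte_once(&pte).pte);
		return 0;
	}
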
Signed-off-by: Christian Borntraeger Reviewed-by: Juergen Gross Acked-by: David Vrabel --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index edbc7a63fd73..cb7101692a75 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -554,7 +554,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = NULL; } - p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep)); + p2m_pfn = pte_pfn(READ_ONCE(*ptep)); if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) || p2m_pfn == PFN_DOWN(__pa(p2m_missing))) { /* p2m leaf page is missing */ -- cgit v1.2.3 From bccec2a0a25206cb837e939adab94768a990ffa9 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 6 Jan 2015 22:49:54 +0100 Subject: x86/spinlock: Leftover conversion ACCESS_ONCE->READ_ONCE commit 78bff1c8684f ("x86/ticketlock: Fix spin_unlock_wait() livelock") introduced two additional ACCESS_ONCE cases in x86 spinlock.h. Lets change those as well. Signed-off-by: Christian Borntraeger Cc: Oleg Nesterov --- arch/x86/include/asm/spinlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f8a2fc..7050d864f520 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -183,10 +183,10 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - __ticket_t head = ACCESS_ONCE(lock->tickets.head); + __ticket_t head = READ_ONCE(lock->tickets.head); for (;;) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); /* * We need to check "unlocked" in a loop, tmp.head == head * can be false positive because of overflow. -- cgit v1.2.3 From 32b7eb877165fdb29f1722071c6c64ced1789014 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 20 Jan 2015 12:49:35 +0100 Subject: livepatch: change ARCH_HAVE_LIVE_PATCHING to HAVE_LIVE_PATCHING Change ARCH_HAVE_LIVE_PATCHING to HAVE_LIVE_PATCHING in Kconfigs. HAVE_ bools are prevalent there and we should go with the flow. Suggested-by: Andrew Morton Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- kernel/livepatch/Kconfig | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 460b31b79938..29b095231276 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,7 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF - select ARCH_HAVE_LIVE_PATCHING + select HAVE_LIVE_PATCHING ### Arch settings config X86 diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 66797aa597c0..347ee2221137 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,4 +1,4 @@ -config ARCH_HAVE_LIVE_PATCHING +config HAVE_LIVE_PATCHING bool help Arch supports kernel live patching @@ -9,7 +9,7 @@ config LIVE_PATCHING depends on MODULES depends on SYSFS depends on KALLSYMS_ALL - depends on ARCH_HAVE_LIVE_PATCHING + depends on HAVE_LIVE_PATCHING help Say Y here if you want to support kernel live patching. 
This option has no runtime impact until a kernel "patch" -- cgit v1.2.3 From 94dd85f6a0688245dbb5a452a86a2a545dee96c1 Mon Sep 17 00:00:00 2001 From: "Palik, Imre" Date: Tue, 13 Jan 2015 09:14:22 +0100 Subject: x86/xen: prefer TSC over xen clocksource for dom0 In Dom0's the use of the TSC clocksource (whenever it is stable enough to be used) instead of the Xen clocksource should not cause any issues, as Dom0 VMs never live-migrated. The TSC clocksource is somewhat more efficient than the Xen paravirtualised clocksource, thus it should have higher rating. This patch decreases the rating of the Xen clocksource in Dom0s to 275. Which is half-way between the rating of the TSC clocksource (300) and the hpet clocksource (250). Cc: Anthony Liguori Signed-off-by: Imre Palik Signed-off-by: David Vrabel --- arch/x86/xen/time.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 69087341d9ae..55da33b1d51c 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -479,6 +479,10 @@ static void __init xen_time_init(void) int cpu = smp_processor_id(); struct timespec tp; + /* As Dom0 is never moved, no penalty on using TSC there */ + if (xen_initial_domain()) + xen_clocksource.rating = 275; + clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { -- cgit v1.2.3 From 69b0049a89ad418cd68aa59e7f1e6619a04a4a6f Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 19 Jan 2015 22:33:39 +0800 Subject: KVM: fix "Should it be static?" warnings from sparse arch/x86/kvm/x86.c:495:5: sparse: symbol 'kvm_read_nested_guest_page' was not declared. Should it be static? arch/x86/kvm/x86.c:646:5: sparse: symbol '__kvm_set_xcr' was not declared. Should it be static? arch/x86/kvm/x86.c:1183:15: sparse: symbol 'max_tsc_khz' was not declared. Should it be static? arch/x86/kvm/x86.c:1237:6: sparse: symbol 'kvm_track_tsc_matching' was not declared. Should it be static? 
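These reports can be reproduced locally by running sparse through kbuild, for example (assuming sparse is installed; C=2 checks the file even if it is already up to date):

	make C=2 CHECK=sparse arch/x86/kvm/x86.o
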
Signed-off-by: Fengguang Wu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98930c64b06e..dcb996bfafa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -496,7 +496,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, +static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) { return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, @@ -647,7 +647,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1193,7 +1193,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; +static unsigned long max_tsc_khz; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { @@ -1247,7 +1247,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; -- cgit v1.2.3 From 54750f2cf042c42b4223d67b1bd20138464bde0e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 20 Jan 2015 15:54:52 -0200 Subject: KVM: x86: workaround SuSE's 2.6.16 pvclock vs masterclock issue SuSE's 2.6.16 kernel fails to boot if the delta between tsc_timestamp and rdtsc is larger than a given threshold: * If we get more than the below threshold into the future, we rerequest * the real time from the host again which has only little offset then * that we need to adjust using the TSC. * * For now that threshold is 1/5th of a jiffie. That should be good * enough accuracy for completely broken systems, but also give us swing * to not call out to the host all the time. */ #define PVCLOCK_DELTA_MAX ((1000000000ULL / HZ) / 5) Disable masterclock support (which increases said delta) in case the boot vcpu does not use MSR_KVM_SYSTEM_TIME_NEW. Upstreams kernels which support pvclock vsyscalls (and therefore make use of PVCLOCK_STABLE_BIT) use MSR_KVM_SYSTEM_TIME_NEW. 
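For reference, the guest-side choice between the two interfaces looks roughly like the following sketch (simplified from arch/x86/kernel/kvmclock.c, not the exact code; the per-cpu pvclock area name below is a placeholder):

	/* default to the legacy MSR used by old guests such as 2.6.16 */
	msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;		/* 0x12 */
	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2))
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;	/* 0x4b564d01 */

	pa = __pa(&pvclock_area);				/* placeholder per-cpu area */
	wrmsrl(msr_kvm_system_time, pa | 1);			/* bit 0 enables the clock */

This is why a write to the legacy MSR on the boot vcpu is treated here as a hint that the guest predates the vsyscall-capable kvmclock ABI.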
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97a5dd0222c8..177b2f2ff9fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -627,6 +627,8 @@ struct kvm_arch { #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif + + bool boot_vcpu_runs_old_kvmclock; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dcb996bfafa4..917672f8034a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1542,7 +1542,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed; + && !backwards_tsc_observed + && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -2174,8 +2175,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { u64 gpa_offset; + struct kvm_arch *ka = &vcpu->kvm->arch; + kvmclock_reset(vcpu); + if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { + bool tmp = (msr == MSR_KVM_SYSTEM_TIME); + + if (ka->boot_vcpu_runs_old_kvmclock != tmp) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); + + ka->boot_vcpu_runs_old_kvmclock = tmp; + } + vcpu->arch.time = data; kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); -- cgit v1.2.3 From cfaa790a3fb8a7efa98f4a6457e19dc3a0db35d3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 15 Jan 2015 09:44:56 +0100 Subject: kvm: Fix CR3_PCID_INVD type on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kvm/emulate.c: In function ‘check_cr_write’: arch/x86/kvm/emulate.c:3552:4: warning: left shift count >= width of type rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; happens because sizeof(UL) on 32-bit is 4 bytes but we shift it 63 bits to the left. Signed-off-by: Borislav Petkov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 177b2f2ff9fb..4327af53e544 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -51,7 +51,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL -#define CR3_PCID_INVD (1UL << 63) +#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ -- cgit v1.2.3 From 31928aa5863e71535ee942f506ca9ac8ce1c4315 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 4 Dec 2014 15:47:07 +0100 Subject: KVM: remove unneeded return value of vcpu_postcreate The return value of kvm_arch_vcpu_postcreate is not checked in its caller. This is okay, because only x86 provides vcpu_postcreate right now and it could only fail if vcpu_load failed. But that is not possible during KVM_CREATE_VCPU (kvm_arch_vcpu_load is void, too), so just get rid of the unchecked return value. 
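The call site in question looks roughly like this (simplified from kvm_vm_ioctl_create_vcpu() in virt/kvm/kvm_main.c):

	atomic_inc(&kvm->online_vcpus);
	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_postcreate(vcpu);		/* result was silently ignored */
	return r;
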
Signed-off-by: Dominik Dingel Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/arm/kvm/arm.c | 3 +-- arch/mips/kvm/mips.c | 3 +-- arch/powerpc/kvm/powerpc.c | 3 +-- arch/s390/kvm/kvm-s390.c | 3 +-- arch/x86/kvm/x86.c | 10 +++------- include/linux/kvm_host.h | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 2d6d91001062..1a10e0ce9266 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -241,9 +241,8 @@ out: return ERR_PTR(err); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e3b21e51ff7e..7082481cd108 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -832,9 +832,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c45eaab752b0..27c0face86f4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -623,9 +623,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3e09801e3104..ec004f80ee45 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -615,9 +615,8 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) kvm_s390_clear_local_irqs(vcpu); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ecda7ca958..274fbcbcc180 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7056,15 +7056,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - int r; struct msr_data msr; struct kvm *kvm = vcpu->kvm; - r = vcpu_load(vcpu); - if (r) - return r; + if (vcpu_load(vcpu)) + return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; @@ -7073,8 +7071,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); - - return r; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..a82432c710c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -652,7 +652,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_arch_hardware_enable(void); -- cgit v1.2.3 From 4061ed9e2aaac31daef44f06e9b83143c78b24b2 Mon Sep 17 00:00:00 2001 From: "K. 
Y. Srinivasan" Date: Fri, 9 Jan 2015 23:54:32 -0800 Subject: Drivers: hv: vmbus: Implement a clockevent device Implement a clockevent device based on the timer support available on Hyper-V. In this version of the patch I have addressed Jason's review comments. Signed-off-by: K. Y. Srinivasan Reviewed-by: Jason Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/uapi/asm/hyperv.h | 11 ++++++ drivers/hv/hv.c | 78 ++++++++++++++++++++++++++++++++++++++ drivers/hv/hyperv_vmbus.h | 21 ++++++++++ drivers/hv/vmbus_drv.c | 37 +++++++++++++++++- 4 files changed, 145 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 462efe746d77..90c458e66e13 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -187,6 +187,17 @@ #define HV_X64_MSR_SINT14 0x4000009E #define HV_X64_MSR_SINT15 0x4000009F +/* + * Synthetic Timer MSRs. Four timers per vcpu. + */ +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 3e4235c7a47f..50e51a51ff8b 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -28,7 +28,9 @@ #include #include #include +#include #include +#include #include "hyperv_vmbus.h" /* The one and only */ @@ -37,6 +39,10 @@ struct hv_context hv_context = { .hypercall_page = NULL, }; +#define HV_TIMER_FREQUENCY (10 * 1000 * 1000) /* 100ns period */ +#define HV_MAX_MAX_DELTA_TICKS 0xffffffff +#define HV_MIN_DELTA_TICKS 1 + /* * query_hypervisor_info - Get version info of the windows hypervisor */ @@ -144,6 +150,8 @@ int hv_init(void) sizeof(int) * NR_CPUS); memset(hv_context.event_dpc, 0, sizeof(void *) * NR_CPUS); + memset(hv_context.clk_evt, 0, + sizeof(void *) * NR_CPUS); max_leaf = query_hypervisor_info(); @@ -258,10 +266,63 @@ u16 hv_signal_event(void *con_id) return status; } +static int hv_ce_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + cycle_t current_tick; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + current_tick += delta; + wrmsrl(HV_X64_MSR_STIMER0_COUNT, current_tick); + return 0; +} + +static void hv_ce_setmode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + union hv_timer_config timer_cfg; + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* unsupported */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + timer_cfg.enable = 1; + timer_cfg.auto_enable = 1; + timer_cfg.sintx = VMBUS_MESSAGE_SINT; + wrmsrl(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + wrmsrl(HV_X64_MSR_STIMER0_COUNT, 0); + wrmsrl(HV_X64_MSR_STIMER0_CONFIG, 0); + break; + case CLOCK_EVT_MODE_RESUME: + break; + } +} + +static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu) +{ + dev->name = "Hyper-V clockevent"; + dev->features = CLOCK_EVT_FEAT_ONESHOT; + dev->cpumask = cpumask_of(cpu); + dev->rating = 1000; + dev->owner = THIS_MODULE; + + dev->set_mode = hv_ce_setmode; + dev->set_next_event = hv_ce_set_next_event; +} + int hv_synic_alloc(void) 
{ size_t size = sizeof(struct tasklet_struct); + size_t ced_size = sizeof(struct clock_event_device); int cpu; for_each_online_cpu(cpu) { @@ -272,6 +333,13 @@ int hv_synic_alloc(void) } tasklet_init(hv_context.event_dpc[cpu], vmbus_on_event, cpu); + hv_context.clk_evt[cpu] = kzalloc(ced_size, GFP_ATOMIC); + if (hv_context.clk_evt[cpu] == NULL) { + pr_err("Unable to allocate clock event device\n"); + goto err; + } + hv_init_clockevent_device(hv_context.clk_evt[cpu], cpu); + hv_context.synic_message_page[cpu] = (void *)get_zeroed_page(GFP_ATOMIC); @@ -305,6 +373,7 @@ err: static void hv_synic_free_cpu(int cpu) { kfree(hv_context.event_dpc[cpu]); + kfree(hv_context.clk_evt[cpu]); if (hv_context.synic_event_page[cpu]) free_page((unsigned long)hv_context.synic_event_page[cpu]); if (hv_context.synic_message_page[cpu]) @@ -388,6 +457,15 @@ void hv_synic_init(void *arg) hv_context.vp_index[cpu] = (u32)vp_index; INIT_LIST_HEAD(&hv_context.percpu_list[cpu]); + + /* + * Register the per-cpu clockevent source. + */ + if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE) + clockevents_config_and_register(hv_context.clk_evt[cpu], + HV_TIMER_FREQUENCY, + HV_MIN_DELTA_TICKS, + HV_MAX_MAX_DELTA_TICKS); return; } diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index c386d8dc7223..44b1c9424712 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -178,6 +178,23 @@ struct hv_message_header { }; }; +/* + * Timer configuration register. + */ +union hv_timer_config { + u64 as_uint64; + struct { + u64 enable:1; + u64 periodic:1; + u64 lazy:1; + u64 auto_enable:1; + u64 reserved_z0:12; + u64 sintx:4; + u64 reserved_z1:44; + }; +}; + + /* Define timer message payload structure. */ struct hv_timer_message_payload { u32 timer_index; @@ -519,6 +536,10 @@ struct hv_context { * buffer to post messages to the host. */ void *post_msg_page[NR_CPUS]; + /* + * Support PV clockevent device. + */ + struct clock_event_device *clk_evt[NR_CPUS]; }; extern struct hv_context hv_context; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 4d6b26979fbd..7488111ec057 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -578,6 +579,34 @@ static void vmbus_onmessage_work(struct work_struct *work) kfree(ctx); } +void hv_process_timer_expiration(struct hv_message *msg, int cpu) +{ + struct clock_event_device *dev = hv_context.clk_evt[cpu]; + + if (dev->event_handler) + dev->event_handler(dev); + + msg->header.message_type = HVMSG_NONE; + + /* + * Make sure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. 
Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + mb(); + + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + wrmsrl(HV_X64_MSR_EOM, 0); + } +} + static void vmbus_on_msg_dpc(unsigned long data) { int cpu = smp_processor_id(); @@ -667,8 +696,12 @@ static void vmbus_isr(void) msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ - if (msg->header.message_type != HVMSG_NONE) - tasklet_schedule(&msg_dpc); + if (msg->header.message_type != HVMSG_NONE) { + if (msg->header.message_type == HVMSG_TIMER_EXPIRED) + hv_process_timer_expiration(msg, cpu); + else + tasklet_schedule(&msg_dpc); + } } /* -- cgit v1.2.3 From 57b6b99bac045bece3b3892377e863b571314950 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 26 Jan 2015 02:10:08 -0800 Subject: x86,xen: use current->state helpers Call __set_current_state() instead of assigning the new state directly. These interfaces also aid CONFIG_DEBUG_ATOMIC_SLEEP environments, keeping track of who changed the state. Signed-off-by: Davidlohr Bueso Signed-off-by: David Vrabel --- arch/x86/xen/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4c071aeb8417..08e8489c47f1 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -507,7 +507,7 @@ static int xen_cpu_disable(void) static void xen_cpu_die(unsigned int cpu) { while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { - current->state = TASK_UNINTERRUPTIBLE; + __set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ/10); } -- cgit v1.2.3 From 2fcf5c8ae244b4c298d2111a288d410a719ac626 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:21 +0200 Subject: KVM: x86: Dirty the dest op page on cmpxchg emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel SDM says for CMPXCHG: "To simplify the interface to the processor’s bus, the destination operand receives a write cycle without regard to the result of the comparison.". This means the destination page should be dirtied. Fix it to by writing back the original value if cmpxchg failed. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ef23c1e5fa9f..aa272545402e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2205,12 +2205,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ + /* Success: write back to memory; no update of EAX */ + ctxt->src.type = OP_NONE; ctxt->dst.val = ctxt->src.orig_val; } else { /* Failure: write the value we saw to EAX. 
*/ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.type = OP_REG; + ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; + /* Create write-cycle to dest by writing the same value */ ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; @@ -4157,7 +4160,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), -- cgit v1.2.3 From 16794aaaab66fa74ab19588a8e255a460e8b3ace Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:22 +0200 Subject: KVM: x86: Wrong operand size for far ret Indeed, Intel SDM specifically states that for the RET instruction "In 64-bit mode, the default operation size of this instruction is the stack-address size, i.e. 64 bits." However, experiments show this is not the case. Here is for example objdump of small 64-bit asm: 4004f1: ca 14 00 lret $0x14 4004f4: 48 cb lretq 4004f6: 48 ca 14 00 lretq $0x14 Therefore, remove the Stack flag from far-ret instructions. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aa272545402e..dd7100481aac 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4062,8 +4062,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), - I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | SrcImmU16, em_ret_far_imm), + I(ImplicitOps, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ -- cgit v1.2.3 From 801806d956c2c198b9fdd3afd156a536f9a3a139 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:23 +0200 Subject: KVM: x86: IRET emulation does not clear NMI masking The IRET instruction should clear NMI masking, but the current implementation does not do so. 
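Hardware blocks further NMIs from the moment an NMI is delivered until the next IRET, so the emulator needs a way to drop that masking when it emulates IRET. A minimal sketch of the plumbing added here (names as introduced by this patch):

	/* x86.c: route the new emulator callback to the vendor module */
	static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
	{
		kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
	}

	/* emulate.c: emulated IRET re-opens the NMI window */
	ctxt->ops->set_nmi_mask(ctxt, false);
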
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index eb181178fe0b..57a9d94fe160 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,7 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index dd7100481aac..fa3ca55a50c6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2047,6 +2047,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + ctxt->ops->set_nmi_mask(ctxt, false); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index afa0815bd384..cdd6606e4c54 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4963,6 +4963,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon kvm_register_write(emul_to_vcpu(ctxt), reg, val); } +static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) +{ + kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -4998,6 +5003,7 @@ static const struct x86_emulate_ops emulate_ops = { .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .set_nmi_mask = emulator_set_nmi_mask, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) -- cgit v1.2.3 From 2276b5116e983277073623cd363954e41674c382 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:24 +0200 Subject: KVM: x86: ARPL emulation can cause spurious exceptions ARPL and MOVSXD are encoded the same and their execution depends on the execution mode. The operand sizes of each instruction are different. Currently, ARPL is detected too late, after the decoding was already done, and therefore may result in spurious exception (instead of failed emulation). Introduce a group to the emulator to handle instructions according to execution mode (32/64 bits). Note: in order not to make changes that may affect performance, the new ModeDual can only be applied to instructions with ModRM. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fa3ca55a50c6..db3cf399e39e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -125,6 +125,7 @@ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ +#define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. 
*/ #define ModRM (1<<19) @@ -215,6 +216,7 @@ struct opcode { const struct gprefix *gprefix; const struct escape *esc; const struct instr_dual *idual; + const struct mode_dual *mdual; void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); @@ -242,6 +244,11 @@ struct instr_dual { struct opcode mod3; }; +struct mode_dual { + struct opcode mode32; + struct opcode mode64; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -3530,6 +3537,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movsxd(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = (s32) ctxt->src.val; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3729,6 +3742,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } +#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } @@ -3973,6 +3987,10 @@ static const struct instr_dual instr_dual_0f_c3 = { I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N }; +static const struct mode_dual mode_dual_63 = { + N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -4007,7 +4025,7 @@ static const struct opcode opcode_table[256] = { /* 0x60 - 0x67 */ I(ImplicitOps | Stack | No64, em_pusha), I(ImplicitOps | Stack | No64, em_popa), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, MD(ModRM, &mode_dual_63), N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), @@ -4227,6 +4245,7 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I #undef GP #undef EXT +#undef MD #undef D2bv #undef D2bvIP @@ -4616,6 +4635,12 @@ done_prefixes: else opcode = opcode.u.idual->mod012; break; + case ModeDual: + if (ctxt->mode == X86EMUL_MODE_PROT64) + opcode = opcode.u.mdual->mode64; + else + opcode = opcode.u.mdual->mode32; + break; default: return EMULATION_FAILED; } @@ -4956,11 +4981,6 @@ special_insn: goto threebyte_insn; switch (ctxt->b) { - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - ctxt->dst.val = (s32) ctxt->src.val; - break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); -- cgit v1.2.3 From 2b42fce6954d1730edaf479d02378703e7b821cb Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:25 +0200 Subject: KVM: x86: Fix defines in emulator.c Unnecassary define was left after commit 7d882ffa81d5 ("KVM: x86: Revert NoBigReal patch in the emulator"). Commit 39f062ff51b2 ("KVM: x86: Generate #UD when memory operand is required") was missing undef. Fix it. 
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index db3cf399e39e..997c9ebb70ef 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -167,7 +167,6 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ -#define NoBigReal ((u64)1 << 50) /* No big real mode */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ @@ -4246,6 +4245,7 @@ static const struct opcode opcode_map_0f_38[256] = { #undef GP #undef EXT #undef MD +#undef ID #undef D2bv #undef D2bvIP -- cgit v1.2.3 From bac155310be35e0fa64b066d47625d2a12a75122 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:26 +0200 Subject: KVM: x86: 32-bit wraparound read/write not emulated correctly If a 32-bit operand wraps around and the segment limit is 0xffffffff, reads and writes should be successful. They just need to be done in two segments. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 +++++++--- arch/x86/kvm/x86.c | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 997c9ebb70ef..c3b07574942f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -684,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } if (addr.ea > lim) goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); - if (size > *max_size) - goto bad; + if (lim == 0xffffffff) + *max_size = ~0u; + else { + *max_size = (u64)lim + 1 - addr.ea; + if (size > *max_size) + goto bad; + } la &= (u32)-1; break; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cdd6606e4c54..1e10e3f7f516 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4495,6 +4495,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, if (rc != X86EMUL_CONTINUE) return rc; addr += now; + if (ctxt->mode != X86EMUL_MODE_PROT64) + addr = (u32)addr; val += now; bytes -= now; } -- cgit v1.2.3 From 82268083fa78452c1c8be30a82984e470d9678c7 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:27 +0200 Subject: KVM: x86: Emulation of call may use incorrect stack size In long mode, when a far call that changes cs.l takes place, the stack size is determined by the new mode. For instance, if we go from 32-bit mode to 64-bit mode, the stack size is 64 bits. KVM uses the old stack size. Fix it.
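A minimal sketch of the idea, with hypothetical names and the EFER.LMA check omitted for brevity: once the new CS descriptor has been loaded, the width used to push the return address must follow the mode implied by that descriptor, and the previous mode has to be restored if the push fails:

#include <stdbool.h>

enum emu_mode_example { EXAMPLE_PROT16, EXAMPLE_PROT32, EXAMPLE_PROT64 };

struct emu_ctxt_example { enum emu_mode_example mode; };
struct cs_desc_example { bool l; bool d; };  /* long-mode and default-size bits */

/* stand-in for pushing the return address with the current stack width */
static int push_return_addr_example(struct emu_ctxt_example *ctxt)
{
        (void)ctxt;
        return 0;
}

static int far_call_example(struct emu_ctxt_example *ctxt,
                            const struct cs_desc_example *new_cs)
{
        enum emu_mode_example prev_mode = ctxt->mode;

        /* the new descriptor, not the old mode, decides the stack width */
        ctxt->mode = new_cs->l ? EXAMPLE_PROT64
                               : (new_cs->d ? EXAMPLE_PROT32 : EXAMPLE_PROT16);

        if (push_return_addr_example(ctxt) != 0) {
                ctxt->mode = prev_mode;  /* roll back on failure */
                return -1;
        }
        return 0;
}
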
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c3b07574942f..81dcf7964701 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -741,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, const struct desc_struct *cs_desc) { enum x86emul_mode mode = ctxt->mode; + int rc; #ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { - u64 efer = 0; + if (ctxt->mode >= X86EMUL_MODE_PROT16) { + if (cs_desc->l) { + u64 efer = 0; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + mode = X86EMUL_MODE_PROT64; + } else + mode = X86EMUL_MODE_PROT32; /* temporary value */ } #endif if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - return assign_eip(ctxt, dst, mode); + rc = assign_eip(ctxt, dst, mode); + if (rc == X86EMUL_CONTINUE) + ctxt->mode = mode; + return rc; } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -3062,6 +3069,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) struct desc_struct old_desc, new_desc; const struct x86_emulate_ops *ops = ctxt->ops; int cpl = ctxt->ops->cpl(ctxt); + enum x86emul_mode prev_mode = ctxt->mode; old_eip = ctxt->_eip; ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); @@ -3085,11 +3093,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) rc = em_push(ctxt); /* If we failed, we tainted the memory, but the very least we should restore cs */ - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; + } return rc; fail: ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + ctxt->mode = prev_mode; return rc; } -- cgit v1.2.3 From a45de93eb10ae44446aec2c73d722562ab46092a Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Mon, 26 Jan 2015 16:58:56 +0800 Subject: ACPICA: Resources: Provide common part for struct acpi_resource_address structures. struct acpi_resource_address and struct acpi_resource_extended_address64 share substracts just at different offsets. To unify the parsing functions, OSPMs like Linux need a new ACPI_ADDRESS64_ATTRIBUTE as their substructs, so they can extract the shared data. This patch also synchronizes the structure changes to the Linux kernel. The usages are searched by matching the following keywords: 1. acpi_resource_address 2. acpi_resource_extended_address 3. ACPI_RESOURCE_TYPE_ADDRESS 4. ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS And we found and fixed the usages in the following files: arch/ia64/kernel/acpi-ext.c arch/ia64/pci/pci.c arch/x86/pci/acpi.c arch/x86/pci/mmconfig-shared.c drivers/xen/xen-acpi-memhotplug.c drivers/acpi/acpi_memhotplug.c drivers/acpi/pci_root.c drivers/acpi/resource.c drivers/char/hpet.c drivers/pnp/pnpacpi/rsparser.c drivers/hv/vmbus_drv.c Build tests are passed with defconfig/allnoconfig/allyesconfig and defconfig+CONFIG_ACPI=n. Original-by: Thomas Gleixner Original-by: Jiang Liu Signed-off-by: Lv Zheng Signed-off-by: Rafael J. 
Wysocki --- arch/ia64/kernel/acpi-ext.c | 6 ++-- arch/ia64/pci/pci.c | 14 +++++----- arch/x86/pci/acpi.c | 26 ++++++++--------- arch/x86/pci/mmconfig-shared.c | 6 ++-- drivers/acpi/acpi_memhotplug.c | 8 +++--- drivers/acpi/acpica/rsaddr.c | 9 +++--- drivers/acpi/acpica/rsdumpinfo.c | 59 ++++++++++++++++++++------------------- drivers/acpi/acpica/rsxface.c | 10 +++---- drivers/acpi/pci_root.c | 6 ++-- drivers/acpi/resource.c | 24 ++++++++-------- drivers/char/hpet.c | 4 +-- drivers/hv/vmbus_drv.c | 4 +-- drivers/pnp/pnpacpi/rsparser.c | 16 +++++------ drivers/xen/xen-acpi-memhotplug.c | 8 +++--- include/acpi/acrestyp.h | 40 +++++++++++++++----------- 15 files changed, 125 insertions(+), 115 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c index 8b9318d311a0..bd09bf74f187 100644 --- a/arch/ia64/kernel/acpi-ext.c +++ b/arch/ia64/kernel/acpi-ext.c @@ -69,10 +69,10 @@ static acpi_status find_csr_space(struct acpi_resource *resource, void *data) status = acpi_resource_to_address64(resource, &addr); if (ACPI_SUCCESS(status) && addr.resource_type == ACPI_MEMORY_RANGE && - addr.address_length && + addr.address.address_length && addr.producer_consumer == ACPI_CONSUMER) { - space->base = addr.minimum; - space->length = addr.address_length; + space->base = addr.address.minimum; + space->length = addr.address.address_length; return AE_CTRL_TERMINATE; } return AE_OK; /* keep looking */ diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 291a582777cf..46920aaa7e9e 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -188,12 +188,12 @@ static u64 add_io_space(struct pci_root_info *info, name = (char *)(iospace + 1); - min = addr->minimum; - max = min + addr->address_length - 1; + min = addr->address.minimum; + max = min + addr->address.address_length - 1; if (addr->info.io.translation_type == ACPI_SPARSE_TRANSLATION) sparse = 1; - space_nr = new_space(addr->translation_offset, sparse); + space_nr = new_space(addr->address.translation_offset, sparse); if (space_nr == ~0) goto free_resource; @@ -247,7 +247,7 @@ static acpi_status resource_to_window(struct acpi_resource *resource, if (ACPI_SUCCESS(status) && (addr->resource_type == ACPI_MEMORY_RANGE || addr->resource_type == ACPI_IO_RANGE) && - addr->address_length && + addr->address.address_length && addr->producer_consumer == ACPI_PRODUCER) return AE_OK; @@ -284,7 +284,7 @@ static acpi_status add_window(struct acpi_resource *res, void *data) if (addr.resource_type == ACPI_MEMORY_RANGE) { flags = IORESOURCE_MEM; root = &iomem_resource; - offset = addr.translation_offset; + offset = addr.address.translation_offset; } else if (addr.resource_type == ACPI_IO_RANGE) { flags = IORESOURCE_IO; root = &ioport_resource; @@ -297,8 +297,8 @@ static acpi_status add_window(struct acpi_resource *res, void *data) resource = &info->res[info->res_num]; resource->name = info->name; resource->flags = flags; - resource->start = addr.minimum + offset; - resource->end = resource->start + addr.address_length - 1; + resource->start = addr.address.minimum + offset; + resource->end = resource->start + addr.address.address_length - 1; info->res_offset[info->res_num] = offset; if (insert_resource(root, resource)) { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index cfd1b132b8e3..bb98afd0591e 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -231,23 +231,23 @@ static acpi_status resource_to_addr(struct acpi_resource *resource, case ACPI_RESOURCE_TYPE_MEMORY24: memory24 = 
&resource->data.memory24; addr->resource_type = ACPI_MEMORY_RANGE; - addr->minimum = memory24->minimum; - addr->address_length = memory24->address_length; - addr->maximum = addr->minimum + addr->address_length - 1; + addr->address.minimum = memory24->minimum; + addr->address.address_length = memory24->address_length; + addr->address.maximum = addr->address.minimum + addr->address.address_length - 1; return AE_OK; case ACPI_RESOURCE_TYPE_MEMORY32: memory32 = &resource->data.memory32; addr->resource_type = ACPI_MEMORY_RANGE; - addr->minimum = memory32->minimum; - addr->address_length = memory32->address_length; - addr->maximum = addr->minimum + addr->address_length - 1; + addr->address.minimum = memory32->minimum; + addr->address.address_length = memory32->address_length; + addr->address.maximum = addr->address.minimum + addr->address.address_length - 1; return AE_OK; case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: fixed_memory32 = &resource->data.fixed_memory32; addr->resource_type = ACPI_MEMORY_RANGE; - addr->minimum = fixed_memory32->address; - addr->address_length = fixed_memory32->address_length; - addr->maximum = addr->minimum + addr->address_length - 1; + addr->address.minimum = fixed_memory32->address; + addr->address.address_length = fixed_memory32->address_length; + addr->address.maximum = addr->address.minimum + addr->address.address_length - 1; return AE_OK; case ACPI_RESOURCE_TYPE_ADDRESS16: case ACPI_RESOURCE_TYPE_ADDRESS32: @@ -256,7 +256,7 @@ static acpi_status resource_to_addr(struct acpi_resource *resource, if (ACPI_SUCCESS(status) && (addr->resource_type == ACPI_MEMORY_RANGE || addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0) { + addr->address.address_length > 0) { return AE_OK; } break; @@ -298,8 +298,8 @@ static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; - start = addr.minimum + addr.translation_offset; - orig_end = end = addr.maximum + addr.translation_offset; + start = addr.address.minimum + addr.address.translation_offset; + orig_end = end = addr.address.maximum + addr.address.translation_offset; /* Exclude non-addressable range or non-addressable portion of range */ end = min(end, (u64)iomem_resource.end); @@ -320,7 +320,7 @@ static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) res->flags = flags; res->start = start; res->end = end; - info->res_offset[info->res_num] = addr.translation_offset; + info->res_offset[info->res_num] = addr.address.translation_offset; info->res_num++; if (!pci_use_crs) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 326198a4434e..5a8dceac3094 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -397,12 +397,12 @@ static acpi_status check_mcfg_resource(struct acpi_resource *res, void *data) status = acpi_resource_to_address64(res, &address); if (ACPI_FAILURE(status) || - (address.address_length <= 0) || + (address.address.address_length <= 0) || (address.resource_type != ACPI_MEMORY_RANGE)) return AE_OK; - if ((mcfg_res->start >= address.minimum) && - (mcfg_res->end < (address.minimum + address.address_length))) { + if ((mcfg_res->start >= address.address.minimum) && + (mcfg_res->end < (address.address.minimum + address.address.address_length))) { mcfg_res->flags = 1; return AE_CTRL_TERMINATE; } diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 23e2319ead41..ee28f4d15625 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -101,8 
+101,8 @@ acpi_memory_get_resource(struct acpi_resource *resource, void *context) /* Can we combine the resource range information? */ if ((info->caching == address64.info.mem.caching) && (info->write_protect == address64.info.mem.write_protect) && - (info->start_addr + info->length == address64.minimum)) { - info->length += address64.address_length; + (info->start_addr + info->length == address64.address.minimum)) { + info->length += address64.address.address_length; return AE_OK; } } @@ -114,8 +114,8 @@ acpi_memory_get_resource(struct acpi_resource *resource, void *context) INIT_LIST_HEAD(&new->list); new->caching = address64.info.mem.caching; new->write_protect = address64.info.mem.write_protect; - new->start_addr = address64.minimum; - new->length = address64.address_length; + new->start_addr = address64.address.minimum; + new->length = address64.address.address_length; list_add_tail(&new->list, &mem_device->res_list); return AE_OK; diff --git a/drivers/acpi/acpica/rsaddr.c b/drivers/acpi/acpica/rsaddr.c index 916fd095ff34..94a3a057042c 100644 --- a/drivers/acpi/acpica/rsaddr.c +++ b/drivers/acpi/acpica/rsaddr.c @@ -74,7 +74,7 @@ struct acpi_rsconvert_info acpi_rs_convert_address16[5] = { * Address Translation Offset * Address Length */ - {ACPI_RSC_MOVE16, ACPI_RS_OFFSET(data.address16.granularity), + {ACPI_RSC_MOVE16, ACPI_RS_OFFSET(data.address16.address.granularity), AML_OFFSET(address16.granularity), 5}, @@ -112,7 +112,7 @@ struct acpi_rsconvert_info acpi_rs_convert_address32[5] = { * Address Translation Offset * Address Length */ - {ACPI_RSC_MOVE32, ACPI_RS_OFFSET(data.address32.granularity), + {ACPI_RSC_MOVE32, ACPI_RS_OFFSET(data.address32.address.granularity), AML_OFFSET(address32.granularity), 5}, @@ -150,7 +150,7 @@ struct acpi_rsconvert_info acpi_rs_convert_address64[5] = { * Address Translation Offset * Address Length */ - {ACPI_RSC_MOVE64, ACPI_RS_OFFSET(data.address64.granularity), + {ACPI_RSC_MOVE64, ACPI_RS_OFFSET(data.address64.address.granularity), AML_OFFSET(address64.granularity), 5}, @@ -194,7 +194,8 @@ struct acpi_rsconvert_info acpi_rs_convert_ext_address64[5] = { * Address Length * Type-Specific Attribute */ - {ACPI_RSC_MOVE64, ACPI_RS_OFFSET(data.ext_address64.granularity), + {ACPI_RSC_MOVE64, + ACPI_RS_OFFSET(data.ext_address64.address.granularity), AML_OFFSET(ext_address64.granularity), 6} }; diff --git a/drivers/acpi/acpica/rsdumpinfo.c b/drivers/acpi/acpica/rsdumpinfo.c index 2f9332d5c973..6ba7ad5faa9c 100644 --- a/drivers/acpi/acpica/rsdumpinfo.c +++ b/drivers/acpi/acpica/rsdumpinfo.c @@ -183,15 +183,15 @@ struct acpi_rsdump_info acpi_rs_dump_address16[8] = { {ACPI_RSD_TITLE, ACPI_RSD_TABLE_SIZE(acpi_rs_dump_address16), "16-Bit WORD Address Space", NULL}, {ACPI_RSD_ADDRESS, 0, NULL, NULL}, - {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.granularity), "Granularity", - NULL}, - {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.minimum), "Address Minimum", - NULL}, - {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.maximum), "Address Maximum", - NULL}, - {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.translation_offset), + {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.address.granularity), + "Granularity", NULL}, + {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.address.minimum), + "Address Minimum", NULL}, + {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.address.maximum), + "Address Maximum", NULL}, + {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.address.translation_offset), "Translation Offset", NULL}, - {ACPI_RSD_UINT16, ACPI_RSD_OFFSET(address16.address_length), + {ACPI_RSD_UINT16, 
ACPI_RSD_OFFSET(address16.address.address_length), "Address Length", NULL}, {ACPI_RSD_SOURCE, ACPI_RSD_OFFSET(address16.resource_source), NULL, NULL} }; @@ -200,15 +200,15 @@ struct acpi_rsdump_info acpi_rs_dump_address32[8] = { {ACPI_RSD_TITLE, ACPI_RSD_TABLE_SIZE(acpi_rs_dump_address32), "32-Bit DWORD Address Space", NULL}, {ACPI_RSD_ADDRESS, 0, NULL, NULL}, - {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.granularity), "Granularity", - NULL}, - {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.minimum), "Address Minimum", - NULL}, - {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.maximum), "Address Maximum", - NULL}, - {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.translation_offset), + {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.address.granularity), + "Granularity", NULL}, + {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.address.minimum), + "Address Minimum", NULL}, + {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.address.maximum), + "Address Maximum", NULL}, + {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.address.translation_offset), "Translation Offset", NULL}, - {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.address_length), + {ACPI_RSD_UINT32, ACPI_RSD_OFFSET(address32.address.address_length), "Address Length", NULL}, {ACPI_RSD_SOURCE, ACPI_RSD_OFFSET(address32.resource_source), NULL, NULL} }; @@ -217,15 +217,15 @@ struct acpi_rsdump_info acpi_rs_dump_address64[8] = { {ACPI_RSD_TITLE, ACPI_RSD_TABLE_SIZE(acpi_rs_dump_address64), "64-Bit QWORD Address Space", NULL}, {ACPI_RSD_ADDRESS, 0, NULL, NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.granularity), "Granularity", - NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.minimum), "Address Minimum", - NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.maximum), "Address Maximum", - NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.translation_offset), + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.address.granularity), + "Granularity", NULL}, + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.address.minimum), + "Address Minimum", NULL}, + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.address.maximum), + "Address Maximum", NULL}, + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.address.translation_offset), "Translation Offset", NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.address_length), + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(address64.address.address_length), "Address Length", NULL}, {ACPI_RSD_SOURCE, ACPI_RSD_OFFSET(address64.resource_source), NULL, NULL} }; @@ -234,15 +234,16 @@ struct acpi_rsdump_info acpi_rs_dump_ext_address64[8] = { {ACPI_RSD_TITLE, ACPI_RSD_TABLE_SIZE(acpi_rs_dump_ext_address64), "64-Bit Extended Address Space", NULL}, {ACPI_RSD_ADDRESS, 0, NULL, NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.granularity), + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.address.granularity), "Granularity", NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.minimum), + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.address.minimum), "Address Minimum", NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.maximum), + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.address.maximum), "Address Maximum", NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.translation_offset), + {ACPI_RSD_UINT64, + ACPI_RSD_OFFSET(ext_address64.address.translation_offset), "Translation Offset", NULL}, - {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.address_length), + {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.address.address_length), "Address Length", NULL}, {ACPI_RSD_UINT64, ACPI_RSD_OFFSET(ext_address64.type_specific), 
"Type-Specific Attribute", NULL} diff --git a/drivers/acpi/acpica/rsxface.c b/drivers/acpi/acpica/rsxface.c index 877ab9202133..295287891174 100644 --- a/drivers/acpi/acpica/rsxface.c +++ b/drivers/acpi/acpica/rsxface.c @@ -60,11 +60,11 @@ ACPI_MODULE_NAME("rsxface") ACPI_COPY_FIELD(out, in, min_address_fixed); \ ACPI_COPY_FIELD(out, in, max_address_fixed); \ ACPI_COPY_FIELD(out, in, info); \ - ACPI_COPY_FIELD(out, in, granularity); \ - ACPI_COPY_FIELD(out, in, minimum); \ - ACPI_COPY_FIELD(out, in, maximum); \ - ACPI_COPY_FIELD(out, in, translation_offset); \ - ACPI_COPY_FIELD(out, in, address_length); \ + ACPI_COPY_FIELD(out, in, address.granularity); \ + ACPI_COPY_FIELD(out, in, address.minimum); \ + ACPI_COPY_FIELD(out, in, address.maximum); \ + ACPI_COPY_FIELD(out, in, address.translation_offset); \ + ACPI_COPY_FIELD(out, in, address.address_length); \ ACPI_COPY_FIELD(out, in, resource_source); /* Local prototypes */ static acpi_status diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index c6bcb8c719d8..e53e0f659204 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -112,10 +112,10 @@ get_root_bridge_busnr_callback(struct acpi_resource *resource, void *data) if (ACPI_FAILURE(status)) return AE_OK; - if ((address.address_length > 0) && + if ((address.address.address_length > 0) && (address.resource_type == ACPI_BUS_NUMBER_RANGE)) { - res->start = address.minimum; - res->end = address.minimum + address.address_length - 1; + res->start = address.address.minimum; + res->end = address.address.minimum + address.address.address_length - 1; } return AE_OK; diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 782a0d15c25f..d0a4d90c6bcc 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -202,22 +202,22 @@ bool acpi_dev_resource_address_space(struct acpi_resource *ares, if (ACPI_FAILURE(status)) return false; - res->start = addr.minimum; - res->end = addr.maximum; + res->start = addr.address.minimum; + res->end = addr.address.maximum; window = addr.producer_consumer == ACPI_PRODUCER; switch(addr.resource_type) { case ACPI_MEMORY_RANGE: - len = addr.maximum - addr.minimum + 1; + len = addr.address.maximum - addr.address.minimum + 1; res->flags = acpi_dev_memresource_flags(len, addr.info.mem.write_protect, window); break; case ACPI_IO_RANGE: - io_decode = addr.granularity == 0xfff ? + io_decode = addr.address.granularity == 0xfff ? ACPI_DECODE_10 : ACPI_DECODE_16; - res->flags = acpi_dev_ioresource_flags(addr.minimum, - addr.maximum, + res->flags = acpi_dev_ioresource_flags(addr.address.minimum, + addr.address.maximum, io_decode, window); break; case ACPI_BUS_NUMBER_RANGE: @@ -253,22 +253,22 @@ bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, ext_addr = &ares->data.ext_address64; - res->start = ext_addr->minimum; - res->end = ext_addr->maximum; + res->start = ext_addr->address.minimum; + res->end = ext_addr->address.maximum; window = ext_addr->producer_consumer == ACPI_PRODUCER; switch(ext_addr->resource_type) { case ACPI_MEMORY_RANGE: - len = ext_addr->maximum - ext_addr->minimum + 1; + len = ext_addr->address.maximum - ext_addr->address.minimum + 1; res->flags = acpi_dev_memresource_flags(len, ext_addr->info.mem.write_protect, window); break; case ACPI_IO_RANGE: - io_decode = ext_addr->granularity == 0xfff ? + io_decode = ext_addr->address.granularity == 0xfff ? 
ACPI_DECODE_10 : ACPI_DECODE_16; - res->flags = acpi_dev_ioresource_flags(ext_addr->minimum, - ext_addr->maximum, + res->flags = acpi_dev_ioresource_flags(ext_addr->address.minimum, + ext_addr->address.maximum, io_decode, window); break; case ACPI_BUS_NUMBER_RANGE: diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index d5d4cd82b9f7..5c0baa9ffc64 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -976,8 +976,8 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data) status = acpi_resource_to_address64(res, &addr); if (ACPI_SUCCESS(status)) { - hdp->hd_phys_address = addr.minimum; - hdp->hd_address = ioremap(addr.minimum, addr.address_length); + hdp->hd_phys_address = addr.address.minimum; + hdp->hd_address = ioremap(addr.address.minimum, addr.address.address_length); if (hpet_is_known(hdp)) { iounmap(hdp->hd_address); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 4d6b26979fbd..bb3725b672cf 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -861,8 +861,8 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) break; case ACPI_RESOURCE_TYPE_ADDRESS64: - hyperv_mmio.start = res->data.address64.minimum; - hyperv_mmio.end = res->data.address64.maximum; + hyperv_mmio.start = res->data.address64.address.minimum; + hyperv_mmio.end = res->data.address64.address.maximum; break; } diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 66977ebf13b3..2d9bc789af0f 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -410,12 +410,12 @@ static __init void pnpacpi_parse_address_option(struct pnp_dev *dev, if (p->resource_type == ACPI_MEMORY_RANGE) { if (p->info.mem.write_protect == ACPI_READ_WRITE_MEMORY) flags = IORESOURCE_MEM_WRITEABLE; - pnp_register_mem_resource(dev, option_flags, p->minimum, - p->minimum, 0, p->address_length, + pnp_register_mem_resource(dev, option_flags, p->address.minimum, + p->address.minimum, 0, p->address.address_length, flags); } else if (p->resource_type == ACPI_IO_RANGE) - pnp_register_port_resource(dev, option_flags, p->minimum, - p->minimum, 0, p->address_length, + pnp_register_port_resource(dev, option_flags, p->address.minimum, + p->address.minimum, 0, p->address.address_length, IORESOURCE_IO_FIXED); } @@ -429,12 +429,12 @@ static __init void pnpacpi_parse_ext_address_option(struct pnp_dev *dev, if (p->resource_type == ACPI_MEMORY_RANGE) { if (p->info.mem.write_protect == ACPI_READ_WRITE_MEMORY) flags = IORESOURCE_MEM_WRITEABLE; - pnp_register_mem_resource(dev, option_flags, p->minimum, - p->minimum, 0, p->address_length, + pnp_register_mem_resource(dev, option_flags, p->address.minimum, + p->address.minimum, 0, p->address.address_length, flags); } else if (p->resource_type == ACPI_IO_RANGE) - pnp_register_port_resource(dev, option_flags, p->minimum, - p->minimum, 0, p->address_length, + pnp_register_port_resource(dev, option_flags, p->address.minimum, + p->address.minimum, 0, p->address.address_length, IORESOURCE_IO_FIXED); } diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c index 34e40b733f9a..4fc886cd5586 100644 --- a/drivers/xen/xen-acpi-memhotplug.c +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -117,8 +117,8 @@ acpi_memory_get_resource(struct acpi_resource *resource, void *context) list_for_each_entry(info, &mem_device->res_list, list) { if ((info->caching == address64.info.mem.caching) && (info->write_protect == address64.info.mem.write_protect) && - (info->start_addr + info->length 
== address64.minimum)) { - info->length += address64.address_length; + (info->start_addr + info->length == address64.address.minimum)) { + info->length += address64.address.address_length; return AE_OK; } } @@ -130,8 +130,8 @@ acpi_memory_get_resource(struct acpi_resource *resource, void *context) INIT_LIST_HEAD(&new->list); new->caching = address64.info.mem.caching; new->write_protect = address64.info.mem.write_protect; - new->start_addr = address64.minimum; - new->length = address64.address_length; + new->start_addr = address64.address.minimum; + new->length = address64.address.address_length; list_add_tail(&new->list, &mem_device->res_list); return AE_OK; diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h index eb760ca0b2e0..0b75e01212c7 100644 --- a/include/acpi/acrestyp.h +++ b/include/acpi/acrestyp.h @@ -305,43 +305,51 @@ struct acpi_resource_source { u8 max_address_fixed; \ union acpi_resource_attribute info; -struct acpi_resource_address { -ACPI_RESOURCE_ADDRESS_COMMON}; - -struct acpi_resource_address16 { - ACPI_RESOURCE_ADDRESS_COMMON u16 granularity; +struct acpi_address16_attribute { + u16 granularity; u16 minimum; u16 maximum; u16 translation_offset; u16 address_length; - struct acpi_resource_source resource_source; }; -struct acpi_resource_address32 { - ACPI_RESOURCE_ADDRESS_COMMON u32 granularity; +struct acpi_address32_attribute { + u32 granularity; u32 minimum; u32 maximum; u32 translation_offset; u32 address_length; - struct acpi_resource_source resource_source; }; -struct acpi_resource_address64 { - ACPI_RESOURCE_ADDRESS_COMMON u64 granularity; +struct acpi_address64_attribute { + u64 granularity; u64 minimum; u64 maximum; u64 translation_offset; u64 address_length; +}; + +struct acpi_resource_address { +ACPI_RESOURCE_ADDRESS_COMMON}; + +struct acpi_resource_address16 { + ACPI_RESOURCE_ADDRESS_COMMON struct acpi_address16_attribute address; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_address32 { + ACPI_RESOURCE_ADDRESS_COMMON struct acpi_address32_attribute address; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_address64 { + ACPI_RESOURCE_ADDRESS_COMMON struct acpi_address64_attribute address; struct acpi_resource_source resource_source; }; struct acpi_resource_extended_address64 { ACPI_RESOURCE_ADDRESS_COMMON u8 revision_ID; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; + struct acpi_address64_attribute address; u64 type_specific; }; -- cgit v1.2.3 From 128ca093cc3cf3fc0368fa56b6a98f395090ddb0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jan 2015 11:57:19 +0100 Subject: kvm: iommu: Add cond_resched to legacy device assignment code When assigning devices to large memory guests (>=128GB guest memory in the failure case) the functions to create the IOMMU page-tables for the whole guest might run for a very long time. On non-preemptible kernels this might cause Soft-Lockup warnings. Fix these by adding a cond_resched() to the mapping and unmapping loops. 
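The pattern applied here is the standard one for long-running kernel loops; a minimal sketch, with the per-page mapping work reduced to a placeholder:

#include <linux/sched.h>

static void map_guest_range_example(unsigned long nr_pages)
{
        unsigned long gfn;

        for (gfn = 0; gfn < nr_pages; gfn++) {
                /* ... map one guest frame into the IOMMU page tables ... */

                /* yield periodically so non-preemptible kernels do not soft-lockup */
                cond_resched();
        }
}
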
Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 17b73eeac8a4..7dbced309ddb 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) gfn += page_size >> PAGE_SHIFT; - + cond_resched(); } return 0; @@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm, kvm_unpin_pages(kvm, pfn, unmap_pages); gfn += unmap_pages; + + cond_resched(); } } -- cgit v1.2.3 From f0feed10aa3ef6b6f6254bf9a66abd58c0011d90 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 28 Jan 2015 07:44:21 +0100 Subject: x86/xen: cleanup arch/x86/xen/setup.c Remove extern declarations in arch/x86/xen/setup.c which are either not used or redundant. Move needed other extern declarations to xen-ops.h Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 10 ---------- arch/x86/xen/xen-ops.h | 6 ++++++ 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 865e56cea7a0..d2520c30c2f2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -32,16 +32,6 @@ #include "p2m.h" #include "mmu.h" -/* These are code, but not functions. Defined in entry.S */ -extern const char xen_hypervisor_callback[]; -extern const char xen_failsafe_callback[]; -#ifdef CONFIG_X86_64 -extern asmlinkage void nmi(void); -#endif -extern void xen_sysenter_target(void); -extern void xen_syscall_target(void); -extern void xen_syscall32_target(void); - /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 5686bd9d58cc..9e195c683549 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -10,6 +10,12 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +void xen_sysenter_target(void); +#ifdef CONFIG_X86_64 +void xen_syscall_target(void); +void xen_syscall32_target(void); +#endif + extern void *xen_initial_gdt; struct trap_info; -- cgit v1.2.3 From 3ba5c867ca504a9fbdadbbd1b36b690c7e2718df Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 28 Jan 2015 07:44:22 +0100 Subject: x86/xen: use correct types for addresses in arch/x86/xen/setup.c In many places in arch/x86/xen/setup.c wrong types are used for physical addresses (u64 or unsigned long long). Use phys_addr_t instead. Use macros already defined instead of open coding them. Correct some other type mismatches. 
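For illustration only (the helper below is hypothetical): phys_addr_t is wide enough for physical addresses even on 32-bit PAE builds, and the existing PFN_PHYS()/PFN_DOWN() macros express the byte/page conversions without open-coded shifts:

#include <linux/mm.h>
#include <linux/pfn.h>

static phys_addr_t clamp_size_example(phys_addr_t size, unsigned long *extra_pages)
{
        if (size > PFN_PHYS(*extra_pages))
                size = PFN_PHYS(*extra_pages);
        *extra_pages -= PFN_DOWN(size);
        return size;
}
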
Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d2520c30c2f2..4dcc60819eda 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -64,7 +64,7 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(u64 start, u64 size) +static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) { int i; @@ -87,10 +87,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); } -static void __init xen_del_extra_mem(u64 start, u64 size) +static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) { int i; - u64 start_r, size_r; + phys_addr_t start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { start_r = xen_extra_mem[i].start; @@ -257,7 +257,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) { struct mmu_update update = { - .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + .ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, .val = pfn }; @@ -535,14 +535,15 @@ static unsigned long __init xen_get_max_pages(void) return min(max_pages, MAX_DOMAIN_PAGES); } -static void xen_align_and_add_e820_region(u64 start, u64 size, int type) +static void xen_align_and_add_e820_region(phys_addr_t start, phys_addr_t size, + int type) { - u64 end = start + size; + phys_addr_t end = start + size; /* Align RAM regions to page boundaries. */ if (type == E820_RAM) { start = PAGE_ALIGN(start); - end &= ~((u64)PAGE_SIZE - 1); + end &= ~((phys_addr_t)PAGE_SIZE - 1); } e820_add_region(start, end - start, type); @@ -567,7 +568,7 @@ char * __init xen_memory_setup(void) static struct e820entry map[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; - unsigned long long mem_end; + phys_addr_t mem_end; int rc; struct xen_memory_map memmap; unsigned long max_pages; @@ -642,16 +643,16 @@ char * __init xen_memory_setup(void) extra_pages); i = 0; while (i < memmap.nr_entries) { - u64 addr = map[i].addr; - u64 size = map[i].size; + phys_addr_t addr = map[i].addr; + phys_addr_t size = map[i].size; u32 type = map[i].type; if (type == E820_RAM) { if (addr < mem_end) { size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, (u64)extra_pages * PAGE_SIZE); - extra_pages -= size / PAGE_SIZE; + size = min(size, PFN_PHYS(extra_pages)); + extra_pages -= PFN_DOWN(size); xen_add_extra_mem(addr, size); xen_max_p2m_pfn = PFN_DOWN(addr + size); } else -- cgit v1.2.3 From a3f5239650a9c08df0473261aedd6f50f7775410 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 28 Jan 2015 07:44:23 +0100 Subject: x86/xen: add some __init and static annotations in arch/x86/xen/setup.c Some more functions in arch/x86/xen/setup.c can be made "__init". xen_ignore_unusable() can be made "static". 
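As a generic illustration (the function below is hypothetical): __init places boot-only code in .init.text so its memory is reclaimed once initialization finishes, and static keeps file-local helpers out of the global symbol table:

#include <linux/init.h>

/* boot-only helper: discarded together with the rest of .init.text */
static int __init example_boot_setup(void)
{
        return 0;
}
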
Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4dcc60819eda..55f388ef481a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -535,8 +535,8 @@ static unsigned long __init xen_get_max_pages(void) return min(max_pages, MAX_DOMAIN_PAGES); } -static void xen_align_and_add_e820_region(phys_addr_t start, phys_addr_t size, - int type) +static void __init xen_align_and_add_e820_region(phys_addr_t start, + phys_addr_t size, int type) { phys_addr_t end = start + size; @@ -549,7 +549,7 @@ static void xen_align_and_add_e820_region(phys_addr_t start, phys_addr_t size, e820_add_region(start, end - start, type); } -void xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) { struct e820entry *entry; unsigned int i; -- cgit v1.2.3 From bf9d834a9bc54477f3745ba0bf926c8917c45680 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 28 Jan 2015 07:44:24 +0100 Subject: x86/xen: add some __init annotations in arch/x86/xen/mmu.c The file arch/x86/xen/mmu.c has some functions that can be annotated with "__init". Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/mmu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5c1f9ace7ae7..6a8bbf43e617 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1489,7 +1489,7 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { struct mmuext_op op; op.cmd = cmd; @@ -1657,7 +1657,7 @@ void __init xen_reserve_top(void) * Like __va(), but returns address in the kernel mapping (which is * all we have until the physical memory mapping has been set up. 
*/ -static void *__ka(phys_addr_t paddr) +static void * __init __ka(phys_addr_t paddr) { #ifdef CONFIG_X86_64 return (void *)(paddr + __START_KERNEL_map); @@ -1667,7 +1667,7 @@ static void *__ka(phys_addr_t paddr) } /* Convert a machine address to physical address */ -static unsigned long m2p(phys_addr_t maddr) +static unsigned long __init m2p(phys_addr_t maddr) { phys_addr_t paddr; @@ -1678,13 +1678,14 @@ static unsigned long m2p(phys_addr_t maddr) } /* Convert a machine address to kernel virtual */ -static void *m2v(phys_addr_t maddr) +static void * __init m2v(phys_addr_t maddr) { return __ka(m2p(maddr)); } /* Set the page permissions on an identity-mapped pages */ -static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) +static void __init set_page_prot_flags(void *addr, pgprot_t prot, + unsigned long flags) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); @@ -1696,7 +1697,7 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } -static void set_page_prot(void *addr, pgprot_t prot) +static void __init set_page_prot(void *addr, pgprot_t prot) { return set_page_prot_flags(addr, prot, UVMF_NONE); } @@ -1769,7 +1770,7 @@ void __init xen_setup_machphys_mapping(void) } #ifdef CONFIG_X86_64 -static void convert_pfn_mfn(void *v) +static void __init convert_pfn_mfn(void *v) { pte_t *pte = v; int i; -- cgit v1.2.3 From 270b79338eb1bd1eb28e62994ffa7b9ecd9975d8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 28 Jan 2015 07:44:25 +0100 Subject: x86/xen: cleanup arch/x86/xen/mmu.c Remove a nested ifdef. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6a8bbf43e617..adca9e2b6553 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1734,10 +1734,8 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; -#ifdef CONFIG_X86_32 if (pfn > max_pfn_mapped) max_pfn_mapped = pfn; -#endif if (!pte_none(pte_page[pteidx])) continue; -- cgit v1.2.3 From 853d0289340026b30f93fd0e768340221d4e605c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 5 Jan 2015 14:13:41 +0000 Subject: xen/grant-table: pre-populate kernel unmap ops for xen_gnttab_unmap_refs() When unmapping grants, instead of converting the kernel map ops to unmap ops on the fly, pre-populate the set of unmap ops. This allows the grant unmap for the kernel mappings to be trivially batched in the future. 
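A rough sketch of the data-flow change, using illustrative stand-in types rather than the gntdev structures: the kernel unmap descriptors are filled in next to the map descriptors, so a later unmap can submit the whole pre-built array at once instead of rebuilding each entry from its map op:

struct map_op_ex   { unsigned long host_addr; unsigned int flags; int handle; };
struct unmap_op_ex { unsigned long host_addr; unsigned int flags; int handle; };

struct grant_map_ex {
        unsigned int        count;
        struct map_op_ex   *kmap_ops;    /* built before the map call */
        struct unmap_op_ex *kunmap_ops;  /* pre-populated counterpart */
};

static void prepare_kernel_ops_example(struct grant_map_ex *m, unsigned int i,
                                       unsigned long addr, unsigned int flags)
{
        m->kmap_ops[i]   = (struct map_op_ex){ .host_addr = addr, .flags = flags, .handle = -1 };
        m->kunmap_ops[i] = (struct unmap_op_ex){ .host_addr = addr, .flags = flags, .handle = -1 };
        /* after the map call succeeds, only the handle needs copying over */
}
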
Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini --- arch/arm/include/asm/xen/page.h | 2 +- arch/arm/xen/p2m.c | 2 +- arch/x86/include/asm/xen/page.h | 2 +- arch/x86/xen/p2m.c | 21 ++++++++++----------- drivers/xen/gntdev.c | 20 ++++++++++++++------ drivers/xen/grant-table.c | 4 ++-- include/xen/grant_table.h | 2 +- 7 files changed, 30 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 68c739b3fdf4..2f7e6ff67d51 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -92,7 +92,7 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct page **pages, unsigned int count); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index 054857776254..cb7a14c5cd69 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -102,7 +102,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { int i; diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 5eea09915a15..e9f52fe2d56a 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -55,7 +55,7 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 70fb5075c901..df40b2888eae 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -816,7 +816,7 @@ static struct page *m2p_find_override(unsigned long mfn) } static int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, + struct gnttab_unmap_grant_ref *kunmap_op, unsigned long mfn) { unsigned long flags; @@ -840,7 +840,7 @@ static int m2p_remove_override(struct page *page, list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - if (kmap_op != NULL) { + if (kunmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; struct gnttab_unmap_and_replace *unmap_op; @@ -855,13 +855,13 @@ static int m2p_remove_override(struct page *page, * issued. In this case handle is going to -1 because * it hasn't been modified yet. */ - if (kmap_op->handle == -1) + if (kunmap_op->handle == -1) xen_mc_flush(); /* * Now if kmap_op->handle is negative it means that the * hypercall actually returned an error. 
*/ - if (kmap_op->handle == GNTST_general_error) { + if (kunmap_op->handle == GNTST_general_error) { pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings", pfn, mfn); put_balloon_scratch_page(); @@ -873,9 +873,9 @@ static int m2p_remove_override(struct page *page, mcs = __xen_mc_entry( sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; - unmap_op->host_addr = kmap_op->host_addr; + unmap_op->host_addr = kunmap_op->host_addr; unmap_op->new_addr = scratch_page_address; - unmap_op->handle = kmap_op->handle; + unmap_op->handle = kunmap_op->handle; MULTI_grant_table_op(mcs.mc, GNTTABOP_unmap_and_replace, unmap_op, 1); @@ -887,7 +887,6 @@ static int m2p_remove_override(struct page *page, xen_mc_issue(PARAVIRT_LAZY_MMU); - kmap_op->host_addr = 0; put_balloon_scratch_page(); } } @@ -912,7 +911,7 @@ static int m2p_remove_override(struct page *page, } int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { int i, ret = 0; @@ -921,7 +920,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - if (kmap_ops && + if (kunmap_ops && !in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); @@ -942,8 +941,8 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, ClearPagePrivate(pages[i]); set_phys_to_machine(pfn, pages[i]->index); - if (kmap_ops) - ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); + if (kunmap_ops) + ret = m2p_remove_override(pages[i], &kunmap_ops[i], mfn); if (ret) goto out; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 073b4a19a8b0..6444172f2842 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -91,6 +91,7 @@ struct grant_map { struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; struct gnttab_map_grant_ref *kmap_ops; + struct gnttab_unmap_grant_ref *kunmap_ops; struct page **pages; }; @@ -124,6 +125,7 @@ static void gntdev_free_map(struct grant_map *map) kfree(map->map_ops); kfree(map->unmap_ops); kfree(map->kmap_ops); + kfree(map->kunmap_ops); kfree(map); } @@ -140,11 +142,13 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL); add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL); add->kmap_ops = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); + add->kunmap_ops = kcalloc(count, sizeof(add->kunmap_ops[0]), GFP_KERNEL); add->pages = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || NULL == add->kmap_ops || + NULL == add->kunmap_ops || NULL == add->pages) goto err; @@ -155,6 +159,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->map_ops[i].handle = -1; add->unmap_ops[i].handle = -1; add->kmap_ops[i].handle = -1; + add->kunmap_ops[i].handle = -1; } add->index = 0; @@ -280,6 +285,8 @@ static int map_grant_pages(struct grant_map *map) map->flags | GNTMAP_host_map, map->grants[i].ref, map->grants[i].domid); + gnttab_set_unmap_op(&map->kunmap_ops[i], address, + map->flags | GNTMAP_host_map, -1); } } @@ -290,13 +297,14 @@ static int map_grant_pages(struct grant_map *map) return err; for (i = 0; i < map->count; i++) { - if (map->map_ops[i].status) + if (map->map_ops[i].status) { 
err = -EINVAL; - else { - BUG_ON(map->map_ops[i].handle == -1); - map->unmap_ops[i].handle = map->map_ops[i].handle; - pr_debug("map handle=%d\n", map->map_ops[i].handle); + continue; } + + map->unmap_ops[i].handle = map->map_ops[i].handle; + if (use_ptemod) + map->kunmap_ops[i].handle = map->kmap_ops[i].handle; } return err; } @@ -316,7 +324,7 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) } err = gnttab_unmap_refs(map->unmap_ops + offset, - use_ptemod ? map->kmap_ops + offset : NULL, map->pages + offset, + use_ptemod ? map->kunmap_ops + offset : NULL, map->pages + offset, pages); if (err) return err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7786291ba229..999d7abdbcec 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -738,7 +738,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, EXPORT_SYMBOL_GPL(gnttab_map_refs); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { int ret; @@ -747,7 +747,7 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, if (ret) return ret; - return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); + return clear_foreign_p2m_mapping(unmap_ops, kunmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 3387465b9caa..7235d8f35459 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -167,7 +167,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kunmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); /* Perform a batch of grant map/copy operations. Retry every batch slot -- cgit v1.2.3 From 0bb599fd30108883b00c7d4a226eeb49111e6932 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 5 Jan 2015 17:06:01 +0000 Subject: xen: remove scratch frames for ballooned pages and m2p override The scratch frame mappings for ballooned pages and the m2p override are broken. Remove them in preparation for replacing them with simpler mechanisms that works. The scratch pages did not ensure that the page was not in use. In particular, the foreign page could still be in use by hardware. If the guest reused the frame the hardware could read or write that frame. The m2p override did not handle the same frame being granted by two different grant references. Trying an M2P override lookup in this case is impossible. With the m2p override removed, the grant map/unmap for the kernel mappings (for x86 PV) can be easily batched in set_foreign_p2m_mapping() and clear_foreign_p2m_mapping(). 
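As a simplified sketch of the lookup that remains once the override table is gone (the function pointers stand in for the real m2p/p2m accessors): a foreign mfn that does not map back to itself simply yields "no pfn" instead of being searched for in an override list:

static unsigned long mfn_to_pfn_example(unsigned long mfn,
                                        unsigned long (*m2p)(unsigned long),
                                        unsigned long (*p2m)(unsigned long))
{
        unsigned long pfn = m2p(mfn);

        /* foreign frame: the p2m does not point back at this mfn */
        if (p2m(pfn) != mfn)
                pfn = ~0UL;

        return pfn;
}
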
Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini --- arch/x86/include/asm/xen/page.h | 18 +-- arch/x86/xen/p2m.c | 254 ++-------------------------------------- drivers/xen/balloon.c | 86 +------------- 3 files changed, 14 insertions(+), 344 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e9f52fe2d56a..358dcd338915 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -57,7 +57,6 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); -extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); /* * Helper functions to write or read unsigned long values to/from @@ -154,21 +153,12 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) != mfn) { - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. - * - * m2p_find_override_pfn returns ~0 if it doesn't find anything. - */ - pfn = m2p_find_override_pfn(mfn, ~0); - } + if (__pfn_to_mfn(pfn) != mfn) + pfn = ~0; /* - * pfn is ~0 if there are no entries in the m2p for mfn or if the - * entry doesn't map back to the mfn and m2p_override doesn't have a - * valid entry for it. + * pfn is ~0 if there are no entries in the m2p for mfn or the + * entry doesn't map back to the mfn. */ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn)) pfn = mfn; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index df40b2888eae..c9bc53f64359 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -84,8 +84,6 @@ #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) -static void __init m2p_override_init(void); - unsigned long *xen_p2m_addr __read_mostly; EXPORT_SYMBOL_GPL(xen_p2m_addr); unsigned long xen_p2m_size __read_mostly; @@ -402,8 +400,6 @@ void __init xen_vmalloc_p2m_tree(void) xen_p2m_size = xen_max_p2m_pfn; xen_inv_extra_mem(); - - m2p_override_init(); } unsigned long get_phys_to_machine(unsigned long pfn) @@ -652,100 +648,21 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } -#define M2P_OVERRIDE_HASH_SHIFT 10 -#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) - -static struct list_head *m2p_overrides; -static DEFINE_SPINLOCK(m2p_override_lock); - -static void __init m2p_override_init(void) -{ - unsigned i; - - m2p_overrides = alloc_bootmem_align( - sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, - sizeof(unsigned long)); - - for (i = 0; i < M2P_OVERRIDE_HASH; i++) - INIT_LIST_HEAD(&m2p_overrides[i]); -} - -static unsigned long mfn_hash(unsigned long mfn) -{ - return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); -} - -/* Add an MFN override for a particular page */ -static int m2p_add_override(unsigned long mfn, struct page *page, - struct gnttab_map_grant_ref *kmap_op) -{ - unsigned long flags; - unsigned long pfn; - unsigned long uninitialized_var(address); - unsigned level; - pte_t *ptep = NULL; - - pfn = page_to_pfn(page); - if (!PageHighMem(page)) { - address = (unsigned long)__va(pfn << PAGE_SHIFT); - ptep = lookup_address(address, &level); - if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_add_override: pfn %lx not mapped", pfn)) - return -EINVAL; - } - - if (kmap_op != NULL) { - if 
(!PageHighMem(page)) { - struct multicall_space mcs = - xen_mc_entry(sizeof(*kmap_op)); - - MULTI_grant_table_op(mcs.mc, - GNTTABOP_map_grant_ref, kmap_op, 1); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } - } - spin_lock_irqsave(&m2p_override_lock, flags); - list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); - spin_unlock_irqrestore(&m2p_override_lock, flags); - - /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in - * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other - * pfn so that the following mfn_to_pfn(mfn) calls will return the - * pfn from the m2p_override (the backend pfn) instead. - * We need to do this because the pages shared by the frontend - * (xen-blkfront) can be already locked (lock_page, called by - * do_read_cache_page); when the userspace backend tries to use them - * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so - * do_blockdev_direct_IO is going to try to lock the same pages - * again resulting in a deadlock. - * As a side effect get_user_pages_fast might not be safe on the - * frontend pages while they are being shared with the backend, - * because mfn_to_pfn (that ends up being called by GUPF) will - * return the backend pfn rather than the frontend pfn. */ - pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) == mfn) - set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); - - return 0; -} - int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { int i, ret = 0; - bool lazy = false; pte_t *pte; if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - if (kmap_ops && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; + if (kmap_ops) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + kmap_ops, count); + if (ret) + goto out; } for (i = 0; i < count; i++) { @@ -773,160 +690,22 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, ret = -ENOMEM; goto out; } - - if (kmap_ops) { - ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); - if (ret) - goto out; - } } out: - if (lazy) - arch_leave_lazy_mmu_mode(); - return ret; } EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); -static struct page *m2p_find_override(unsigned long mfn) -{ - unsigned long flags; - struct list_head *bucket; - struct page *p, *ret; - - if (unlikely(!m2p_overrides)) - return NULL; - - ret = NULL; - bucket = &m2p_overrides[mfn_hash(mfn)]; - - spin_lock_irqsave(&m2p_override_lock, flags); - - list_for_each_entry(p, bucket, lru) { - if (page_private(p) == mfn) { - ret = p; - break; - } - } - - spin_unlock_irqrestore(&m2p_override_lock, flags); - - return ret; -} - -static int m2p_remove_override(struct page *page, - struct gnttab_unmap_grant_ref *kunmap_op, - unsigned long mfn) -{ - unsigned long flags; - unsigned long pfn; - unsigned long uninitialized_var(address); - unsigned level; - pte_t *ptep = NULL; - - pfn = page_to_pfn(page); - - if (!PageHighMem(page)) { - address = (unsigned long)__va(pfn << PAGE_SHIFT); - ptep = lookup_address(address, &level); - - if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_remove_override: pfn %lx not mapped", pfn)) - return -EINVAL; - } - - spin_lock_irqsave(&m2p_override_lock, flags); - list_del(&page->lru); - spin_unlock_irqrestore(&m2p_override_lock, flags); - - if (kunmap_op != NULL) { - if (!PageHighMem(page)) { - struct multicall_space mcs; - struct gnttab_unmap_and_replace *unmap_op; - struct page *scratch_page = 
get_balloon_scratch_page(); - unsigned long scratch_page_address = (unsigned long) - __va(page_to_pfn(scratch_page) << PAGE_SHIFT); - - /* - * It might be that we queued all the m2p grant table - * hypercalls in a multicall, then m2p_remove_override - * get called before the multicall has actually been - * issued. In this case handle is going to -1 because - * it hasn't been modified yet. - */ - if (kunmap_op->handle == -1) - xen_mc_flush(); - /* - * Now if kmap_op->handle is negative it means that the - * hypercall actually returned an error. - */ - if (kunmap_op->handle == GNTST_general_error) { - pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings", - pfn, mfn); - put_balloon_scratch_page(); - return -1; - } - - xen_mc_batch(); - - mcs = __xen_mc_entry( - sizeof(struct gnttab_unmap_and_replace)); - unmap_op = mcs.args; - unmap_op->host_addr = kunmap_op->host_addr; - unmap_op->new_addr = scratch_page_address; - unmap_op->handle = kunmap_op->handle; - - MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_and_replace, unmap_op, 1); - - mcs = __xen_mc_entry(0); - MULTI_update_va_mapping(mcs.mc, scratch_page_address, - pfn_pte(page_to_pfn(scratch_page), - PAGE_KERNEL_RO), 0); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - put_balloon_scratch_page(); - } - } - - /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present - * somewhere in this domain, even before being added to the - * m2p_override (see comment above in m2p_add_override). - * If there are no other entries in the m2p_override corresponding - * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for - * the original pfn (the one shared by the frontend): the backend - * cannot do any IO on this page anymore because it has been - * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of - * the original pfn causes mfn_to_pfn(mfn) to return the frontend - * pfn again. 
*/ - mfn &= ~FOREIGN_FRAME_BIT; - pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) && - m2p_find_override(mfn) == NULL) - set_phys_to_machine(pfn, mfn); - - return 0; -} - int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { int i, ret = 0; - bool lazy = false; if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - if (kunmap_ops && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - for (i = 0; i < count; i++) { unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); @@ -940,32 +719,15 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, WARN_ON(!PagePrivate(pages[i])); ClearPagePrivate(pages[i]); set_phys_to_machine(pfn, pages[i]->index); - - if (kunmap_ops) - ret = m2p_remove_override(pages[i], &kunmap_ops[i], mfn); - if (ret) - goto out; } - + if (kunmap_ops) + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + kunmap_ops, count); out: - if (lazy) - arch_leave_lazy_mmu_mode(); return ret; } EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); -unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) -{ - struct page *p = m2p_find_override(mfn); - unsigned long ret = pfn; - - if (p) - ret = page_to_pfn(p); - - return ret; -} -EXPORT_SYMBOL_GPL(m2p_find_override_pfn); - #ifdef CONFIG_XEN_DEBUG_FS #include #include "debugfs.h" diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 3860d02729dc..0b52d92cb2e5 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -92,7 +92,6 @@ EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; -static DEFINE_PER_CPU(struct page *, balloon_scratch_page); /* List of ballooned pages, threaded through the mem_map array. */ @@ -423,22 +422,12 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) page = pfn_to_page(pfn); #ifdef CONFIG_XEN_HAVE_PVMMU - /* - * Ballooned out frames are effectively replaced with - * a scratch frame. Ensure direct mappings and the - * p2m are consistent. - */ if (!xen_feature(XENFEAT_auto_translated_physmap)) { if (!PageHighMem(page)) { - struct page *scratch_page = get_balloon_scratch_page(); - ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(page_to_pfn(scratch_page), - PAGE_KERNEL_RO), 0); + __pte_ma(0), 0); BUG_ON(ret); - - put_balloon_scratch_page(); } __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -500,18 +489,6 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); } -struct page *get_balloon_scratch_page(void) -{ - struct page *ret = get_cpu_var(balloon_scratch_page); - BUG_ON(ret == NULL); - return ret; -} - -void put_balloon_scratch_page(void) -{ - put_cpu_var(balloon_scratch_page); -} - /* Resets the Xen limit, sets new target, and kicks off processing. 
*/ void balloon_set_new_target(unsigned long target) { @@ -605,61 +582,13 @@ static void __init balloon_add_region(unsigned long start_pfn, } } -static int alloc_balloon_scratch_page(int cpu) -{ - if (per_cpu(balloon_scratch_page, cpu) != NULL) - return 0; - - per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); - if (per_cpu(balloon_scratch_page, cpu) == NULL) { - pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); - return -ENOMEM; - } - - return 0; -} - - -static int balloon_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - switch (action) { - case CPU_UP_PREPARE: - if (alloc_balloon_scratch_page(cpu)) - return NOTIFY_BAD; - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block balloon_cpu_notifier = { - .notifier_call = balloon_cpu_notify, -}; - static int __init balloon_init(void) { - int i, cpu; + int i; if (!xen_domain()) return -ENODEV; - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - register_cpu_notifier(&balloon_cpu_notifier); - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (alloc_balloon_scratch_page(cpu)) { - put_online_cpus(); - unregister_cpu_notifier(&balloon_cpu_notifier); - return -ENOMEM; - } - } - put_online_cpus(); - } - pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() @@ -696,15 +625,4 @@ static int __init balloon_init(void) subsys_initcall(balloon_init); -static int __init balloon_clear(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(balloon_scratch_page, cpu) = NULL; - - return 0; -} -early_initcall(balloon_clear); - MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 0ae65f49af64d68f0daca37b83383115cae5e690 Mon Sep 17 00:00:00 2001 From: Jennifer Herbert Date: Wed, 24 Dec 2014 14:03:16 +0000 Subject: x86/xen: require ballooned pages for grant maps Ballooned pages are always used for grant maps which means the original frame does not need to be saved in page->index nor restored after the grant unmap. This allows the workaround in netback for the conflicting use of the (unionized) page->index and page->pfmemalloc to be removed. 
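The invariant this relies on is checked directly on the map path; condensed from the hunk below (error handling elided):

	pfn = page_to_pfn(pages[i]);
	/* A ballooned page has no machine frame of its own, so there is
	 * nothing to save in page->index and nothing to restore on unmap. */
	WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
	set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));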
Signed-off-by: Jennifer Herbert Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 5 +++-- drivers/net/xen-netback/netback.c | 6 ------ 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index c9bc53f64359..a8691cb08420 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -682,9 +682,10 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, pfn = page_to_pfn(pages[i]); WARN_ON(PagePrivate(pages[i])); + WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned"); + SetPagePrivate(pages[i]); set_page_private(pages[i], mfn); - pages[i]->index = pfn_to_mfn(pfn); if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { ret = -ENOMEM; @@ -718,7 +719,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, set_page_private(pages[i], INVALID_P2M_ENTRY); WARN_ON(!PagePrivate(pages[i])); ClearPagePrivate(pages[i]); - set_phys_to_machine(pfn, pages[i]->index); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } if (kunmap_ops) ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 908e65e9b821..64413189ad06 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1241,12 +1241,6 @@ static void xenvif_fill_frags(struct xenvif_queue *queue, struct sk_buff *skb) /* Take an extra reference to offset network stack's put_page */ get_page(queue->mmap_pages[pending_idx]); } - /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc - * overlaps with "index", and "mapping" is not set. I think mapping - * should be set. If delivered to local stack, it would drop this - * skb in sk_filter unless the socket has the right to use it. - */ - skb->pfmemalloc = false; } static int xenvif_get_extras(struct xenvif_queue *queue, -- cgit v1.2.3 From 8da7633f168b5428e2cfb7342408b2c44088f5df Mon Sep 17 00:00:00 2001 From: Jennifer Herbert Date: Wed, 24 Dec 2014 14:17:06 +0000 Subject: xen: mark grant mapped pages as foreign Use the "foreign" page flag to mark pages that have a grant map. Use page->private to store information of the grant (the granting domain and the grant reference). 
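The xen_page_foreign() helper added to include/xen/grant_table.h below lets users of granted pages recover that information. A purely illustrative, hypothetical caller might look like:

	/* Hypothetical example: report who granted a mapped page, if anyone. */
	static void report_grant_owner(struct page *page)
	{
		struct xen_page_foreign *foreign = xen_page_foreign(page);

		if (foreign)
			pr_info("page granted by dom%u, gref %u\n",
				foreign->domid, foreign->gref);
		else
			pr_info("page is not a foreign grant mapping\n");
	}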
Signed-off-by: Jennifer Herbert Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 7 ------- drivers/xen/grant-table.c | 43 +++++++++++++++++++++++++++++++++++++++++-- include/xen/grant_table.h | 20 ++++++++++++++++++++ 3 files changed, 61 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a8691cb08420..f18fd1d411f6 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -681,12 +681,8 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, } pfn = page_to_pfn(pages[i]); - WARN_ON(PagePrivate(pages[i])); WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned"); - SetPagePrivate(pages[i]); - set_page_private(pages[i], mfn); - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { ret = -ENOMEM; goto out; @@ -716,9 +712,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, goto out; } - set_page_private(pages[i], INVALID_P2M_ENTRY); - WARN_ON(!PagePrivate(pages[i])); - ClearPagePrivate(pages[i]); set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } if (kunmap_ops) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b4f93c490f83..89dcca448bb6 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -679,12 +679,27 @@ EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); */ int gnttab_alloc_pages(int nr_pages, struct page **pages) { + int i; int ret; ret = alloc_xenballooned_pages(nr_pages, pages, false); if (ret < 0) return ret; + for (i = 0; i < nr_pages; i++) { +#if BITS_PER_LONG < 64 + struct xen_page_foreign *foreign; + + foreign = kzalloc(sizeof(*foreign), GFP_KERNEL); + if (!foreign) { + gnttab_free_pages(nr_pages, pages); + return -ENOMEM; + } + set_page_private(pages[i], (unsigned long)foreign); +#endif + SetPagePrivate(pages[i]); + } + return 0; } EXPORT_SYMBOL(gnttab_alloc_pages); @@ -696,6 +711,16 @@ EXPORT_SYMBOL(gnttab_alloc_pages); */ void gnttab_free_pages(int nr_pages, struct page **pages) { + int i; + + for (i = 0; i < nr_pages; i++) { + if (PagePrivate(pages[i])) { +#if BITS_PER_LONG < 64 + kfree((void *)page_private(pages[i])); +#endif + ClearPagePrivate(pages[i]); + } + } free_xenballooned_pages(nr_pages, pages); } EXPORT_SYMBOL(gnttab_free_pages); @@ -756,12 +781,22 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (ret) return ret; - /* Retry eagain maps */ - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { + /* Retry eagain maps */ if (map_ops[i].status == GNTST_eagain) gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, &map_ops[i].status, __func__); + if (map_ops[i].status == GNTST_okay) { + struct xen_page_foreign *foreign; + + SetPageForeign(pages[i]); + foreign = xen_page_foreign(pages[i]); + foreign->domid = map_ops[i].dom; + foreign->gref = map_ops[i].ref; + } + } + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -770,12 +805,16 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { + unsigned int i; int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; + for (i = 0; i < count; i++) + ClearPageForeign(pages[i]); + return clear_foreign_p2m_mapping(unmap_ops, kunmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 949803e20872..d3bef563e8da 100644 --- 
a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -45,6 +45,8 @@ #include #include +#include +#include #define GNTTAB_RESERVED_XENSTORE 1 @@ -185,4 +187,22 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count); void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count); + +struct xen_page_foreign { + domid_t domid; + grant_ref_t gref; +}; + +static inline struct xen_page_foreign *xen_page_foreign(struct page *page) +{ + if (!PageForeign(page)) + return NULL; +#if BITS_PER_LONG < 64 + return (struct xen_page_foreign *)page->private; +#else + BUILD_BUG_ON(sizeof(struct xen_page_foreign) > BITS_PER_LONG); + return (struct xen_page_foreign *)&page->private; +#endif +} + #endif /* __ASM_GNTTAB_H__ */ -- cgit v1.2.3 From 6a878e5085fe97bd1e222b7883a1b815fcbbe4ed Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Wed, 28 Jan 2015 09:52:17 +0800 Subject: PCI: Fail MSI-X mappings if there's no space assigned to MSI-X BAR Unlike MSI, which is configured via registers in the MSI capability in Configuration Space, MSI-X is configured via tables in Memory Space. These MSI-X tables are mapped by a device BAR, and if no Memory Space has been assigned to the BAR, MSI-X cannot be used. Fail MSI-X setup if no space has been assigned for the BAR. Previously, we ioremapped the MSI-X table even if the resource hadn't been assigned. In this case, the resource address is undefined (and is often zero), which may lead to warnings or oopses in this path: pci_enable_msix msix_capability_init msix_map_region ioremap_nocache The PCI core sets resource flags to zero when it can't assign space for the resource (see reset_resource()). There are also some cases where it sets the IORESOURCE_UNSET flag, e.g., pci_reassigndev_resource_alignment(), pci_assign_resource(), etc. So we must check for both cases. 
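Both call sites (xen_initdom_setup_msi_irqs() and msix_map_region()) add the same guard; in condensed form:

	bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
	flags = pci_resource_flags(dev, bir);

	/* flags == 0 means the core reset the resource because it could not
	 * assign space; IORESOURCE_UNSET covers the explicitly-unset cases.
	 * Either way the BAR has no usable address, so fail the setup. */
	if (!flags || (flags & IORESOURCE_UNSET))
		return -EINVAL;		/* msix_map_region() returns NULL instead */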
[bhelgaas: changelog] Reported-by: Zhang Jukuo Tested-by: Zhang Jukuo Signed-off-by: Yijing Wang Signed-off-by: Bjorn Helgaas --- arch/x86/pci/xen.c | 4 ++++ drivers/pci/msi.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index c489ef2c1a39..34fc4189ebf0 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -298,12 +298,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) map_irq.entry_nr = nvec; } else if (type == PCI_CAP_ID_MSIX) { int pos; + unsigned long flags; u32 table_offset, bir; pos = dev->msix_cap; pci_read_config_dword(dev, pos + PCI_MSIX_TABLE, &table_offset); bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR); + flags = pci_resource_flags(dev, bir); + if (!flags || (flags & IORESOURCE_UNSET)) + return -EINVAL; map_irq.table_base = pci_resource_start(dev, bir); map_irq.entry_nr = msidesc->msi_attrib.entry_nr; diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index fd60806d3fd0..c3e7dfcf9ff5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -694,11 +694,16 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) { resource_size_t phys_addr; u32 table_offset; + unsigned long flags; u8 bir; pci_read_config_dword(dev, dev->msix_cap + PCI_MSIX_TABLE, &table_offset); bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR); + flags = pci_resource_flags(dev, bir); + if (!flags || (flags & IORESOURCE_UNSET)) + return NULL; + table_offset &= PCI_MSIX_TABLE_OFFSET; phys_addr = pci_resource_start(dev, bir) + table_offset; -- cgit v1.2.3 From 3b0f1d01e501792d8d89ab4371bc9e8cd2a10032 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:23 +0800 Subject: KVM: Rename kvm_arch_mmu_write_protect_pt_masked to be more generic for log dirty We don't have to write protect guest memory for dirty logging if architecture supports hardware dirty logging, such as PML on VMX, so rename it to be more generic. Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/arm/kvm/mmu.c | 18 ++++++++++++++++-- arch/x86/kvm/mmu.c | 21 +++++++++++++++++++-- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 +- 4 files changed, 37 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 74aeabaa3c4d..6034697ede3f 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1081,7 +1081,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) } /** - * kvm_arch_mmu_write_protect_pt_masked() - write protect dirty pages + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages * @kvm: The KVM pointer * @slot: The memory slot associated with mask * @gfn_offset: The gfn offset in memory slot @@ -1091,7 +1091,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) * Walks bits set in mask write protects the associated pte's. Caller must * acquire kvm_mmu_lock. */ -void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1102,6 +1102,20 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, stage2_wp_range(kvm, start, end); } +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. 
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ed9f795e4f0..b18e65ce3683 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1216,7 +1216,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, } /** - * kvm_arch_mmu_write_protect_pt_masked - write protect selected PT level pages + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance * @slot: slot to protect * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1225,7 +1225,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. */ -void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1241,6 +1241,23 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * PT level pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d6719522f1f..32d057571bf6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -615,7 +615,7 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log, bool *is_dirty); -void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a8490f084483..0c281760a1c5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1059,7 +1059,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, dirty_bitmap_buffer[i] = mask; offset = i * BITS_PER_LONG; - kvm_arch_mmu_write_protect_pt_masked(kvm, memslot, offset, + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } -- cgit v1.2.3 From f4b4b1808690c37c7c703d43789c1988c5e7fdeb Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:24 +0800 Subject: KVM: MMU: Add mmu help functions to support PML This patch adds new mmu layer functions to clear/set D-bit for memory slot, and to write protect superpages for memory slot. In case of PML, CPU logs the dirty GPA automatically to PML buffer when CPU updates D-bit from 0 to 1, therefore we don't have to write protect 4K pages, instead, we only need to clear D-bit in order to log that GPA. 
For superpages, we still write protect it and let page fault code to handle dirty page logging, as we still need to split superpage to 4K pages in PML. As PML is always enabled during guest's lifetime, to eliminate unnecessary PML GPA logging, we set D-bit manually for the slot with dirty logging disabled. Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 9 ++ arch/x86/kvm/mmu.c | 195 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 843bea0e70fd..4f6369b6f7d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -835,6 +835,15 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b18e65ce3683..c438224cca34 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1215,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, return flush; } +static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + + spte &= ~shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_clear_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + +static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + + spte |= shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_set_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -1241,6 +1295,32 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * @kvm: kvm instance + * @slot: slot to clear D-bit + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should clear D-bit + * + * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. 
+ */ +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmapp); + + /* clear the first set bit */ + mask &= mask - 1; + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); + /** * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected * PT level pages. @@ -4368,6 +4448,121 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); } +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + unsigned long *rmapp; + unsigned long last_index, index; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_clear_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* + * It's also safe to flush TLBs out of mmu lock here as currently this + * function is only used for dirty logging, in which case flushing TLB + * out of mmu lock also guarantees no dirty pages will be lost in + * dirty_bitmap. + */ + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); + +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_write_protect(kvm, rmapp, + false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); + + /* see kvm_mmu_slot_remove_write_access */ + lockdep_assert_held(&kvm->slots_lock); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); + +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_set_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* see kvm_mmu_slot_leaf_clear_dirty */ + if (flush) + 
kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); + #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { -- cgit v1.2.3 From 9b51a63024bd759f97a12f50907b8af23b065b36 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:25 +0800 Subject: KVM: MMU: Explicitly set D-bit for writable spte. This patch avoids unnecessary dirty GPA logging to PML buffer in EPT violation path by setting D-bit manually prior to the occurrence of the write from guest. We only set D-bit manually in set_spte, and leave fast_page_fault path unchanged, as fast_page_fault is very unlikely to happen in case of PML. For the hva <-> pa change case, the spte is updated to either read-only (host pte is read-only) or be dropped (host pte is writeable), and both cases will be handled by above changes, therefore no change is necessary. Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c438224cca34..b7192236dcba 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2597,8 +2597,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (pte_access & ACC_WRITE_MASK) + if (pte_access & ACC_WRITE_MASK) { mark_page_dirty(vcpu->kvm, gfn); + spte |= shadow_dirty_mask; + } set_pte: if (mmu_spte_update(sptep, spte)) @@ -2914,6 +2916,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, */ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + /* + * Theoretically we could also set dirty bit (and flush TLB) here in + * order to eliminate unnecessary PML logging. See comments in + * set_spte. But fast_page_fault is very unlikely to happen with PML + * enabled, so we do not do this. This might result in the same GPA + * to be logged in PML buffer again when the write really happens, and + * eventually to be called by mark_page_dirty twice. But it's also no + * harm. This also avoids the TLB flush needed after setting dirty bit + * so non-PML cases won't be impacted. + * + * Compare with set_spte where instead shadow_dirty_mask is set. + */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) mark_page_dirty(vcpu->kvm, gfn); -- cgit v1.2.3 From 1c91cad42366ce0799ca17e7ad6995418741d012 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:26 +0800 Subject: KVM: x86: Change parameter of kvm_mmu_slot_remove_write_access This patch changes the second parameter of kvm_mmu_slot_remove_write_access from 'slot id' to 'struct kvm_memory_slot *' to align with kvm_x86_ops dirty logging hooks, which will be introduced in further patch. Better way is to change second parameter of kvm_arch_commit_memory_region from 'struct kvm_userspace_memory_region *' to 'struct kvm_memory_slot * new', but it requires changes on other non-x86 ARCH too, so avoid it now. 
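The interface change itself is simply:

	/* before */
	void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);

	/* after */
	void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
					      struct kvm_memory_slot *memslot);

and the caller now resolves the slot with id_to_memslot() before making the call.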
Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu.c | 5 ++--- arch/x86/kvm/x86.c | 10 +++++++--- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f6369b6f7d2..67a98d793bf2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -834,7 +834,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b7192236dcba..ec62ecd2d513 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4408,14 +4408,13 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; bool flush = false; - memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; spin_lock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e10e3f7f516..3a7fcff81af6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7538,7 +7538,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { - + struct kvm_memory_slot *new; int nr_mmu_pages = 0; if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { @@ -7557,6 +7557,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + + /* It's OK to get 'new' slot here as it has already been installed */ + new = id_to_memslot(kvm->memslots, mem->slot); + /* * Write protect all pages for dirty logging. * @@ -7566,8 +7570,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * * See the comments in fast_page_fault(). */ - if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_slot_remove_write_access(kvm, mem->slot); + if ((change != KVM_MR_DELETE) && (new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v1.2.3 From 88178fd4f7187bbe290c5d373fd44aabec891934 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:27 +0800 Subject: KVM: x86: Add new dirty logging kvm_x86_ops for PML This patch adds new kvm_x86_ops dirty logging hooks to enable/disable dirty logging for particular memory slot, and to flush potentially logged dirty GPAs before reporting slot->dirty_bitmap to userspace. kvm x86 common code calls these hooks when they are available so PML logic can be hidden to VMX specific. SVM won't be impacted as these hooks remain NULL there. 
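The common code uses the usual optional-hook pattern, falling back to write protection when an arch does not provide the hook; condensed from the kvm_arch_mmu_enable_log_dirty_pt_masked() hunk below:

	if (kvm_x86_ops->enable_log_dirty_pt_masked)
		kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot,
							gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);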
Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 25 +++++++++++++++ arch/x86/kvm/mmu.c | 6 +++- arch/x86/kvm/x86.c | 71 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 67a98d793bf2..57916ecb9b92 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -802,6 +802,31 @@ struct kvm_x86_ops { int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* + * Arch-specific dirty logging hooks. These hooks are only supposed to + * be valid if the specific arch has hardware-accelerated dirty logging + * mechanism. Currently only for PML on VMX. + * + * - slot_enable_log_dirty: + * called when enabling log dirty mode for the slot. + * - slot_disable_log_dirty: + * called when disabling log dirty mode for the slot. + * also called when slot is created with log dirty disabled. + * - flush_log_dirty: + * called before reporting dirty_bitmap to userspace. + * - enable_log_dirty_pt_masked: + * called when reenabling log dirty for the GFNs in the mask after + * corresponding bits are cleared in slot->dirty_bitmap. + */ + void (*slot_enable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*slot_disable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*flush_log_dirty)(struct kvm *kvm); + void (*enable_log_dirty_pt_masked)(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t offset, unsigned long mask); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ec62ecd2d513..cee759299a35 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1335,7 +1335,11 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); + if (kvm_x86_ops->enable_log_dirty_pt_masked) + kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, + mask); + else + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } static bool rmap_write_protect(struct kvm *kvm, u64 gfn) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a7fcff81af6..442ee7d90946 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3780,6 +3780,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) mutex_lock(&kvm->slots_lock); + /* + * Flush potentially hardware-cached dirty pages to dirty_bitmap. + */ + if (kvm_x86_ops->flush_log_dirty) + kvm_x86_ops->flush_log_dirty(kvm); + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); /* @@ -7533,6 +7539,56 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } +static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Still write protect RO slot */ + if (new->flags & KVM_MEM_READONLY) { + kvm_mmu_slot_remove_write_access(kvm, new); + return; + } + + /* + * Call kvm_x86_ops dirty logging hooks when they are valid. 
+ * + * kvm_x86_ops->slot_disable_log_dirty is called when: + * + * - KVM_MR_CREATE with dirty logging is disabled + * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag + * + * The reason is, in case of PML, we need to set D-bit for any slots + * with dirty logging disabled in order to eliminate unnecessary GPA + * logging in PML buffer (and potential PML buffer full VMEXT). This + * guarantees leaving PML enabled during guest's lifetime won't have + * any additonal overhead from PML when guest is running with dirty + * logging disabled for memory slots. + * + * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot + * to dirty logging mode. + * + * If kvm_x86_ops dirty logging hooks are invalid, use write protect. + * + * In case of write protect: + * + * Write protect all pages for dirty logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We can not create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault(). + */ + if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (kvm_x86_ops->slot_enable_log_dirty) + kvm_x86_ops->slot_enable_log_dirty(kvm, new); + else + kvm_mmu_slot_remove_write_access(kvm, new); + } else { + if (kvm_x86_ops->slot_disable_log_dirty) + kvm_x86_ops->slot_disable_log_dirty(kvm, new); + } +} + void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, @@ -7562,16 +7618,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, new = id_to_memslot(kvm->memslots, mem->slot); /* - * Write protect all pages for dirty logging. + * Set up write protection and/or dirty logging for the new slot. * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. - * - * See the comments in fast_page_fault(). + * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have + * been zapped so no dirty logging staff is needed for old slot. For + * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the + * new and it's also covered when dealing with the new slot. */ - if ((change != KVM_MR_DELETE) && (new->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_slot_remove_write_access(kvm, new); + if (change != KVM_MR_DELETE) + kvm_mmu_slot_apply_flags(kvm, new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v1.2.3 From 843e4330573cc5261ae260ce0b83dc570d8cdc05 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:28 +0800 Subject: KVM: VMX: Add PML support in VMX This patch adds PML support in VMX. A new module parameter 'enable_pml' is added to allow user to enable/disable it manually. 
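The heart of the feature is draining the PML buffer on VM exit and marking each logged GPA dirty; condensed from vmx_flush_pml_buffer() below:

	pml_idx = vmcs_read16(GUEST_PML_INDEX);
	if (pml_idx == (PML_ENTITY_NUM - 1))
		return;				/* buffer is empty */

	/* The PML index points at the next available entry. */
	if (pml_idx >= PML_ENTITY_NUM)
		pml_idx = 0;			/* buffer is full */
	else
		pml_idx++;

	pml_buf = page_address(vmx->pml_pg);
	for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
		mark_page_dirty(kvm, pml_buf[pml_idx] >> PAGE_SHIFT);

	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);	/* reset index */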
Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 4 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/trace.h | 18 ++++ arch/x86/kvm/vmx.c | 195 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 1 + 5 files changed, 218 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 45afaee9555c..da772edd19ab 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -121,6 +122,7 @@ enum vmcs_field { GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -140,6 +142,8 @@ enum vmcs_field { VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, TSC_OFFSET = 0x00002010, TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index ff2b8e28883e..c5f1a1deb91a 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -73,6 +73,7 @@ #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 587149bd6f76..7c7bc8bef21f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc, #endif /* CONFIG_X86_64 */ +/* + * Tracepoint for PML full VMEXIT. + */ +TRACE_EVENT(kvm_pml_full, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %d: PML full", __entry->vcpu_id) +); + TRACE_EVENT(kvm_ple_window, TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), TP_ARGS(grow, vcpu_id, new, old), diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c987374d92c1..de5ce82b2436 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -101,6 +101,9 @@ module_param(nested, bool, S_IRUGO); static u64 __read_mostly host_xss; +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -516,6 +519,10 @@ struct vcpu_vmx { /* Dynamic PLE window. 
*/ int ple_window; bool ple_window_dirty; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; }; enum segment_cache_field { @@ -1068,6 +1075,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) SECONDARY_EXEC_SHADOW_VMCS; } +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -2924,7 +2936,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4355,6 +4368,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* PML is enabled/disabled in creating/destorying vcpu */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + return exec_control; } @@ -5942,6 +5958,20 @@ static __init int hardware_setup(void) update_ple_window_actual_max(); + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + return alloc_kvm_area(); out7: @@ -6971,6 +7001,31 @@ static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) return pi_test_pir(vector, &vmx->pi_desc); } +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7019,6 +7074,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_invvpid, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, }; static const int kvm_vmx_max_exit_handlers = @@ -7325,6 +7381,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static int vmx_enable_pml(struct vcpu_vmx *vmx) +{ + struct page *pml_pg; + u32 exec_control; + + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!pml_pg) + return -ENOMEM; + + vmx->pml_pg = pml_pg; + + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + + return 0; +} + +static void vmx_disable_pml(struct vcpu_vmx *vmx) +{ + u32 exec_control; + + ASSERT(vmx->pml_pg); + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +} + +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and it's obvious that only + * vcpus running in guest are possible to have unflushed GPAs in PML + * buffer. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7335,6 +7474,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. 
+ */ + if (enable_pml) + vmx_flush_pml_buffer(vmx); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -7981,6 +8130,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_pml) + vmx_disable_pml(vmx); free_vpid(vmx); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8051,6 +8202,18 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + err = vmx_enable_pml(vmx); + if (err) + goto free_vmcs; + } + return &vmx->vcpu; free_vmcs: @@ -9492,6 +9655,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -9601,6 +9789,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_nested_events = vmx_check_nested_events, .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 442ee7d90946..1373e04e1f19 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7880,3 +7880,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -- cgit v1.2.3 From 52c233a4405eef4f4dbbe2984e308871866db007 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 29 Jan 2015 22:48:48 +0100 Subject: KVM: x86: return bool from kvm_apic_match*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And don't export the internal ones while at it. 
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.h | 2 +- arch/x86/kvm/lapic.c | 8 ++++---- arch/x86/kvm/lapic.h | 2 -- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3c9195535ffc..c2e36d934af4 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a688fbffb34e..62f624656549 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -578,18 +578,18 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) { return dest == (apic_x2apic_mode(apic) ? X2APIC_BROADCAST : APIC_BROADCAST); } -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) { return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); } -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) +static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { int result = 0; u32 logical_id; @@ -623,7 +623,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) return result; } -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { int result = 0; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7054437944cd..c1ef25c89508 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -58,8 +58,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); -- cgit v1.2.3 From 9368b5676265304702419eb090deffc9faf9b08f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 29 Jan 2015 22:48:49 +0100 Subject: KVM: x86: cleanup kvm_apic_match_*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The majority of this patch turns result = 0; if (CODE) result = 1; return result; into return CODE; because we return bool now. 
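That is, instead of accumulating into an int, each branch now returns its condition directly, for example:

	/* before */
	result = 0;
	if (logical_id & mda)
		result = 1;
	return result;

	/* after */
	return (logical_id & mda) != 0;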
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 46 ++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 62f624656549..caf5a8d82c18 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -591,42 +591,34 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { - int result = 0; u32 logical_id; if (kvm_apic_broadcast(apic, mda)) - return 1; + return true; - if (apic_x2apic_mode(apic)) { - logical_id = kvm_apic_get_reg(apic, APIC_LDR); - return logical_id & mda; - } + logical_id = kvm_apic_get_reg(apic, APIC_LDR); - logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); + if (apic_x2apic_mode(apic)) + return (logical_id & mda) != 0; + + logical_id = GET_APIC_LOGICAL_ID(logical_id); switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: - if (logical_id & mda) - result = 1; - break; + return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: - if (((logical_id >> 4) == (mda >> 0x4)) - && (logical_id & mda & 0xf)) - result = 1; - break; + return ((logical_id >> 4) == (mda >> 4)) + && (logical_id & mda & 0xf) != 0; default: apic_debug("Bad DFR vcpu %d: %08x\n", apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); - break; + return false; } - - return result; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { - int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " @@ -638,27 +630,21 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, case APIC_DEST_NOSHORT: if (dest_mode == 0) /* Physical mode. */ - result = kvm_apic_match_physical_addr(target, dest); + return kvm_apic_match_physical_addr(target, dest); else /* Logical mode. */ - result = kvm_apic_match_logical_addr(target, dest); - break; + return kvm_apic_match_logical_addr(target, dest); case APIC_DEST_SELF: - result = (target == source); - break; + return target == source; case APIC_DEST_ALLINC: - result = 1; - break; + return true; case APIC_DEST_ALLBUT: - result = (target != source); - break; + return target != source; default: apic_debug("kvm: apic: Bad dest shorthand value %x\n", short_hand); - break; + return false; } - - return result; } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, -- cgit v1.2.3 From 3697f302ab37000c109e2ab9f1763187853efb72 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 29 Jan 2015 22:48:50 +0100 Subject: KVM: x86: replace 0 with APIC_DEST_PHYSICAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make the code self-documenting. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index caf5a8d82c18..99c536597332 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -628,11 +628,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) - /* Physical mode. */ + if (dest_mode == APIC_DEST_PHYSICAL) return kvm_apic_match_physical_addr(target, dest); else - /* Logical mode. 
*/ return kvm_apic_match_logical_addr(target, dest); case APIC_DEST_SELF: return target == source; @@ -677,7 +675,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, ret = true; - if (irq->dest_mode == 0) { /* physical mode */ + if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) goto out; -- cgit v1.2.3 From 8a395363e2f9f52ec44a1cd892881e8ee1a53269 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 29 Jan 2015 22:48:51 +0100 Subject: KVM: x86: fix x2apic logical address matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We cannot hit the bug now, but future patches will expose this path. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 99c536597332..555956c3c473 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -599,7 +599,8 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) logical_id = kvm_apic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) - return (logical_id & mda) != 0; + return ((logical_id >> 16) == (mda >> 16)) + && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); -- cgit v1.2.3 From ad15a296471d40703be647b11ddc56df0f1c0cdd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 30 Jan 2015 16:18:49 +0100 Subject: kvm: vmx: fix oops with explicit flexpriority=0 option A function pointer was not NULLed, causing kvm_vcpu_reload_apic_access_page to go down the wrong path and OOPS when doing put_page(NULL). This did not happen on old processors, only when setting the module option explicitly. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index de5ce82b2436..81152a070b1f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5879,16 +5879,16 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) { + if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) kvm_x86_ops->set_apic_access_page_addr = NULL; - } if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; -- cgit v1.2.3 From f9339860384aaa104d85f59b75921941b39a45fc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 2 Feb 2015 15:26:08 -0200 Subject: KVM: x86: fix lapic_timer_int_injected with APIC-v With APICv, LAPIC timer interrupt is always delivered via IRR: apic_find_highest_irr syncs PIR to IRR. 
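The check therefore selects which bitmap to test as follows (this is what the hunk below converges on):

	void *bitmap = apic->regs + APIC_ISR;

	if (kvm_x86_ops->deliver_posted_interrupt)
		bitmap = apic->regs + APIC_IRR;	/* APICv: the timer vector stays in IRR */

	if (apic_test_vector(vec, bitmap))
		return true;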
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 555956c3c473..3481e9eee8f4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1086,13 +1086,13 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; + void *bitmap = apic->regs + APIC_ISR; - if (kvm_x86_ops->test_posted_interrupt) - return kvm_x86_ops->test_posted_interrupt(vcpu, vec); - else { - if (apic_test_vector(vec, apic->regs + APIC_ISR)) - return true; - } + if (kvm_x86_ops->deliver_posted_interrupt) + bitmap = apic->regs + APIC_IRR; + + if (apic_test_vector(vec, bitmap)) + return true; } return false; } -- cgit v1.2.3 From 2e6d015799d523dcce11c7d1465e6feb7b69fab1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 2 Feb 2015 15:26:09 -0200 Subject: KVM: x86: revert "add method to test PIR bitmap vector" Revert 7c6a98dfa1ba9dc64a62e73624ecea9995736bbd, given that testing PIR is not necessary anymore. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx.c | 14 -------------- 2 files changed, 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 57916ecb9b92..9dbc7435cbc2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -767,7 +767,6 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); - bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81152a070b1f..15b78936c101 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -438,11 +438,6 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } -static int pi_test_pir(int vector, struct pi_desc *pi_desc) -{ - return test_bit(vector, (unsigned long *)pi_desc->pir); -} - struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -5908,7 +5903,6 @@ static __init int hardware_setup(void) kvm_x86_ops->hwapic_irr_update = NULL; kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->test_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } @@ -6994,13 +6988,6 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; } -static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return pi_test_pir(vector, &vmx->pi_desc); -} - static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -9753,7 +9740,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .test_posted_interrupt = vmx_test_pir, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, -- cgit v1.2.3 From 874e52086f9f1b9f9bdfbf98cca8506b110b63ba Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Jan 2015 15:17:22 
+0200 Subject: x86, mrst: remove Moorestown specific serial drivers Intel Moorestown platform support was removed few years ago. This is a follow up which removes Moorestown specific code for the serial devices. It includes mrst_max3110 and earlyprintk bits. This was used on SFI (Medfield, Clovertrail) based platforms as well, though new ones use normal serial interface for the console service. Signed-off-by: Andy Shevchenko Acked-by: David Cohen Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-mid.h | 3 - arch/x86/kernel/early_printk.c | 5 - arch/x86/platform/intel-mid/device_libs/Makefile | 2 - .../intel-mid/device_libs/platform_max3111.c | 35 - .../platform/intel-mid/early_printk_intel_mid.c | 220 +---- drivers/tty/serial/Kconfig | 10 - drivers/tty/serial/Makefile | 1 - drivers/tty/serial/mrst_max3110.c | 909 --------------------- drivers/tty/serial/mrst_max3110.h | 61 -- 9 files changed, 4 insertions(+), 1242 deletions(-) delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_max3111.c delete mode 100644 drivers/tty/serial/mrst_max3110.c delete mode 100644 drivers/tty/serial/mrst_max3110.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index e34e097b6f9d..705d35708a50 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern struct console early_mrst_console; -extern void mrst_early_console_init(void); - extern struct console early_hsu_console; extern void hsu_early_console_init(const char *); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 01d1c187c9f9..de814dfb8aee 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -226,11 +226,6 @@ static int __init setup_early_printk(char *buf) early_console_register(&xenboot_console, keep); #endif #ifdef CONFIG_EARLY_PRINTK_INTEL_MID - if (!strncmp(buf, "mrst", 4)) { - mrst_early_console_init(); - early_console_register(&early_mrst_console, keep); - } - if (!strncmp(buf, "hsu", 3)) { hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index af9307f2cc28..91ec9f8704bf 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -16,8 +16,6 @@ obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o -# SPI Devices -obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c deleted file mode 100644 index afd1df94e0e5..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * platform_max3111.c: max3111 platform data initilization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy - * - * This program is free software; you can redistribute it and/or - * modify it under the 
terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include -#include -#include - -static void __init *max3111_platform_data(void *info) -{ - struct spi_board_info *spi_info = info; - int intr = get_gpio_by_name("max3111_int"); - - spi_info->mode = SPI_MODE_0; - if (intr == -1) - return NULL; - spi_info->irq = intr + INTEL_MID_IRQ_OFFSET; - return NULL; -} - -static const struct devs_id max3111_dev_id __initconst = { - .name = "spi_max3111", - .type = SFI_DEV_TYPE_SPI, - .get_platform_data = &max3111_platform_data, -}; - -sfi_device(max3111_dev_id); diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c index e0bd082a80e0..4e720829ab90 100644 --- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c +++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c @@ -10,15 +10,13 @@ */ /* - * This file implements two early consoles named mrst and hsu. - * mrst is based on Maxim3110 spi-uart device, it exists in both - * Moorestown and Medfield platforms, while hsu is based on a High - * Speed UART device which only exists in the Medfield platform + * This file implements early console named hsu. + * hsu is based on a High Speed UART device which only exists in the Medfield + * platform */ #include #include -#include #include #include #include @@ -28,216 +26,6 @@ #include #include -#define MRST_SPI_TIMEOUT 0x200000 -#define MRST_REGBASE_SPI0 0xff128000 -#define MRST_REGBASE_SPI1 0xff128400 -#define MRST_CLK_SPI0_REG 0xff11d86c - -/* Bit fields in CTRLR0 */ -#define SPI_DFS_OFFSET 0 - -#define SPI_FRF_OFFSET 4 -#define SPI_FRF_SPI 0x0 -#define SPI_FRF_SSP 0x1 -#define SPI_FRF_MICROWIRE 0x2 -#define SPI_FRF_RESV 0x3 - -#define SPI_MODE_OFFSET 6 -#define SPI_SCPH_OFFSET 6 -#define SPI_SCOL_OFFSET 7 -#define SPI_TMOD_OFFSET 8 -#define SPI_TMOD_TR 0x0 /* xmit & recv */ -#define SPI_TMOD_TO 0x1 /* xmit only */ -#define SPI_TMOD_RO 0x2 /* recv only */ -#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ - -#define SPI_SLVOE_OFFSET 10 -#define SPI_SRL_OFFSET 11 -#define SPI_CFS_OFFSET 12 - -/* Bit fields in SR, 7 bits */ -#define SR_MASK 0x7f /* cover 7 bits */ -#define SR_BUSY (1 << 0) -#define SR_TF_NOT_FULL (1 << 1) -#define SR_TF_EMPT (1 << 2) -#define SR_RF_NOT_EMPT (1 << 3) -#define SR_RF_FULL (1 << 4) -#define SR_TX_ERR (1 << 5) -#define SR_DCOL (1 << 6) - -struct dw_spi_reg { - u32 ctrl0; - u32 ctrl1; - u32 ssienr; - u32 mwcr; - u32 ser; - u32 baudr; - u32 txfltr; - u32 rxfltr; - u32 txflr; - u32 rxflr; - u32 sr; - u32 imr; - u32 isr; - u32 risr; - u32 txoicr; - u32 rxoicr; - u32 rxuicr; - u32 msticr; - u32 icr; - u32 dmacr; - u32 dmatdlr; - u32 dmardlr; - u32 idr; - u32 version; - - /* Currently operates as 32 bits, though only the low 16 bits matter */ - u32 dr; -} __packed; - -#define dw_readl(dw, name) __raw_readl(&(dw)->name) -#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) - -/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ -static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; - -static u32 *pclk_spi0; -/* Always contains an accessible address, start with 0 */ -static struct dw_spi_reg *pspi; - -static struct kmsg_dumper dw_dumper; -static int dumper_registered; - -static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) -{ - static char line[1024]; - size_t len; - - /* When run to this, we'd better re-init the HW */ - mrst_early_console_init(); - - while 
(kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) - early_mrst_console.write(&early_mrst_console, line, len); -} - -/* Set the ratio rate to 115200, 8n1, IRQ disabled */ -static void max3110_write_config(void) -{ - u16 config; - - config = 0xc001; - dw_writel(pspi, dr, config); -} - -/* Translate char to a eligible word and send to max3110 */ -static void max3110_write_data(char c) -{ - u16 data; - - data = 0x8000 | c; - dw_writel(pspi, dr, data); -} - -void mrst_early_console_init(void) -{ - u32 ctrlr0 = 0; - u32 spi0_cdiv; - u32 freq; /* Freqency info only need be searched once */ - - /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ - pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - MRST_CLK_SPI0_REG); - spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; - freq = 100000000 / (spi0_cdiv + 1); - - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL) - mrst_spi_paddr = MRST_REGBASE_SPI1; - - pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - mrst_spi_paddr); - - /* Disable SPI controller */ - dw_writel(pspi, ssienr, 0); - - /* Set control param, 8 bits, transmit only mode */ - ctrlr0 = dw_readl(pspi, ctrl0); - - ctrlr0 &= 0xfcc0; - ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) - | (SPI_TMOD_TO << SPI_TMOD_OFFSET); - dw_writel(pspi, ctrl0, ctrlr0); - - /* - * Change the spi0 clk to comply with 115200 bps, use 100000 to - * calculate the clk dividor to make the clock a little slower - * than real baud rate. - */ - dw_writel(pspi, baudr, freq/100000); - - /* Disable all INT for early phase */ - dw_writel(pspi, imr, 0x0); - - /* Set the cs to spi-uart */ - dw_writel(pspi, ser, 0x2); - - /* Enable the HW, the last step for HW init */ - dw_writel(pspi, ssienr, 0x1); - - /* Set the default configuration */ - max3110_write_config(); - - /* Register the kmsg dumper */ - if (!dumper_registered) { - dw_dumper.dump = dw_kmsg_dump; - kmsg_dump_register(&dw_dumper); - dumper_registered = 1; - } -} - -/* Slave select should be called in the read/write function */ -static void early_mrst_spi_putc(char c) -{ - unsigned int timeout; - u32 sr; - - timeout = MRST_SPI_TIMEOUT; - /* Early putc needs to make sure the TX FIFO is not full */ - while (--timeout) { - sr = dw_readl(pspi, sr); - if (!(sr & SR_TF_NOT_FULL)) - cpu_relax(); - else - break; - } - - if (!timeout) - pr_warn("MRST earlycon: timed out\n"); - else - max3110_write_data(c); -} - -/* Early SPI only uses polling mode */ -static void early_mrst_spi_write(struct console *con, const char *str, - unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_mrst_spi_putc('\r'); - early_mrst_spi_putc(*str); - str++; - } -} - -struct console early_mrst_console = { - .name = "earlymrst", - .write = early_mrst_spi_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* * Following is the early console based on Medfield HSU (High * Speed UART) device. 
@@ -259,7 +47,7 @@ void hsu_early_console_init(const char *s) port = clamp_val(port, 0, 2); paddr = HSU_PORT_BASE + port * 0x80; - phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); + phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); /* Disable FIFO */ writeb(0x0, phsu + UART_FCR); diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 96ec6cfd74e8..35677f75a1d8 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -482,16 +482,6 @@ config SERIAL_SA1100_CONSOLE your boot loader (lilo or loadlin) about how to pass options to the kernel at boot time.) -config SERIAL_MRST_MAX3110 - tristate "SPI UART driver for Max3110" - depends on SPI_DW_PCI - select SERIAL_CORE - select SERIAL_CORE_CONSOLE - help - This is the UART protocol driver for the MAX3110 device on - the Intel Moorestown platform. On other systems use the max3100 - driver. - config SERIAL_MFD_HSU tristate "Medfield High Speed UART support" depends on PCI diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 770a19bb7fcb..a1c1085ef75c 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -77,7 +77,6 @@ obj-$(CONFIG_SERIAL_TIMBERDALE) += timbuart.o obj-$(CONFIG_SERIAL_GRLIB_GAISLER_APBUART) += apbuart.o obj-$(CONFIG_SERIAL_ALTERA_JTAGUART) += altera_jtaguart.o obj-$(CONFIG_SERIAL_VT8500) += vt8500_serial.o -obj-$(CONFIG_SERIAL_MRST_MAX3110) += mrst_max3110.o obj-$(CONFIG_SERIAL_MFD_HSU) += mfd.o obj-$(CONFIG_SERIAL_IFX6X60) += ifx6x60.o obj-$(CONFIG_SERIAL_PCH_UART) += pch_uart.o diff --git a/drivers/tty/serial/mrst_max3110.c b/drivers/tty/serial/mrst_max3110.c deleted file mode 100644 index 77239d5e620d..000000000000 --- a/drivers/tty/serial/mrst_max3110.c +++ /dev/null @@ -1,909 +0,0 @@ -/* - * mrst_max3110.c - spi uart protocol driver for Maxim 3110 - * - * Copyright (c) 2008-2010, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -/* - * Note: - * 1. From Max3110 spec, the Rx FIFO has 8 words, while the Tx FIFO only has - * 1 word. If SPI master controller doesn't support sclk frequency change, - * then the char need be sent out one by one with some delay - * - * 2. 
Currently only RX available interrupt is used, no need for waiting TXE - * interrupt for a low speed UART device - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#ifdef CONFIG_MAGIC_SYSRQ -#define SUPPORT_SYSRQ -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mrst_max3110.h" - -#define UART_TX_NEEDED 1 -#define CON_TX_NEEDED 2 -#define BIT_IRQ_PENDING 3 - -struct uart_max3110 { - struct uart_port port; - struct spi_device *spi; - char name[SPI_NAME_SIZE]; - - wait_queue_head_t wq; - struct task_struct *main_thread; - struct task_struct *read_thread; - struct mutex thread_mutex; - struct mutex io_mutex; - - u32 baud; - u16 cur_conf; - u8 clock; - u8 parity, word_7bits; - u16 irq; - - unsigned long uart_flags; - - /* console related */ - struct circ_buf con_xmit; -}; - -/* global data structure, may need be removed */ -static struct uart_max3110 *pmax; - -static int receive_chars(struct uart_max3110 *max, - unsigned short *str, int len); -static int max3110_read_multi(struct uart_max3110 *max); -static void max3110_con_receive(struct uart_max3110 *max); - -static int max3110_write_then_read(struct uart_max3110 *max, - const void *txbuf, void *rxbuf, unsigned len, int always_fast) -{ - struct spi_device *spi = max->spi; - struct spi_message message; - struct spi_transfer x; - int ret; - - mutex_lock(&max->io_mutex); - spi_message_init(&message); - memset(&x, 0, sizeof x); - x.len = len; - x.tx_buf = txbuf; - x.rx_buf = rxbuf; - spi_message_add_tail(&x, &message); - - if (always_fast) - x.speed_hz = spi->max_speed_hz; - else if (max->baud) - x.speed_hz = max->baud; - - /* Do the i/o */ - ret = spi_sync(spi, &message); - mutex_unlock(&max->io_mutex); - return ret; -} - -/* Write a 16b word to the device */ -static int max3110_out(struct uart_max3110 *max, const u16 out) -{ - void *buf; - u16 *obuf, *ibuf; - int ret; - - buf = kzalloc(8, GFP_KERNEL | GFP_DMA); - if (!buf) - return -ENOMEM; - - obuf = buf; - ibuf = buf + 4; - *obuf = out; - ret = max3110_write_then_read(max, obuf, ibuf, 2, 1); - if (ret) { - pr_warn("%s: get err msg %d when sending 0x%x\n", - __func__, ret, out); - goto exit; - } - - receive_chars(max, ibuf, 1); - -exit: - kfree(buf); - return ret; -} - -/* - * This is usually used to read data from SPIC RX FIFO, which doesn't - * need any delay like flushing character out. - * - * Return how many valide bytes are read back - */ -static int max3110_read_multi(struct uart_max3110 *max) -{ - void *buf; - u16 *obuf, *ibuf; - int ret, blen; - - blen = M3110_RX_FIFO_DEPTH * sizeof(u16); - buf = kzalloc(blen * 2, GFP_KERNEL | GFP_DMA); - if (!buf) - return 0; - - /* tx/rx always have the same length */ - obuf = buf; - ibuf = buf + blen; - - if (max3110_write_then_read(max, obuf, ibuf, blen, 1)) { - kfree(buf); - return 0; - } - - ret = receive_chars(max, ibuf, M3110_RX_FIFO_DEPTH); - - kfree(buf); - return ret; -} - -static void serial_m3110_con_putchar(struct uart_port *port, int ch) -{ - struct uart_max3110 *max = - container_of(port, struct uart_max3110, port); - struct circ_buf *xmit = &max->con_xmit; - - if (uart_circ_chars_free(xmit)) { - xmit->buf[xmit->head] = (char)ch; - xmit->head = (xmit->head + 1) & (PAGE_SIZE - 1); - } -} - -/* - * Print a string to the serial port trying not to disturb - * any possible real use of the port... - * - * The console_lock must be held when we get here. 
- */ -static void serial_m3110_con_write(struct console *co, - const char *s, unsigned int count) -{ - if (!pmax) - return; - - uart_console_write(&pmax->port, s, count, serial_m3110_con_putchar); - - if (!test_and_set_bit(CON_TX_NEEDED, &pmax->uart_flags)) - wake_up(&pmax->wq); -} - -static int __init -serial_m3110_con_setup(struct console *co, char *options) -{ - struct uart_max3110 *max = pmax; - int baud = 115200; - int bits = 8; - int parity = 'n'; - int flow = 'n'; - - pr_info("setting up console\n"); - - if (co->index == -1) - co->index = 0; - - if (!max) { - pr_err("pmax is NULL, return\n"); - return -ENODEV; - } - - if (options) - uart_parse_options(options, &baud, &parity, &bits, &flow); - - return uart_set_options(&max->port, co, baud, parity, bits, flow); -} - -static struct tty_driver *serial_m3110_con_device(struct console *co, - int *index) -{ - struct uart_driver *p = co->data; - *index = co->index; - return p->tty_driver; -} - -static struct uart_driver serial_m3110_reg; -static struct console serial_m3110_console = { - .name = "ttyS", - .write = serial_m3110_con_write, - .device = serial_m3110_con_device, - .setup = serial_m3110_con_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &serial_m3110_reg, -}; - -static unsigned int serial_m3110_tx_empty(struct uart_port *port) -{ - return 1; -} - -static void serial_m3110_stop_tx(struct uart_port *port) -{ - return; -} - -/* stop_rx will be called in spin_lock env */ -static void serial_m3110_stop_rx(struct uart_port *port) -{ - return; -} - -#define WORDS_PER_XFER 128 -static void send_circ_buf(struct uart_max3110 *max, - struct circ_buf *xmit) -{ - void *buf; - u16 *obuf, *ibuf; - int i, len, blen, dma_size, left, ret = 0; - - - dma_size = WORDS_PER_XFER * sizeof(u16) * 2; - buf = kzalloc(dma_size, GFP_KERNEL | GFP_DMA); - if (!buf) - return; - obuf = buf; - ibuf = buf + dma_size/2; - - while (!uart_circ_empty(xmit)) { - left = uart_circ_chars_pending(xmit); - while (left) { - len = min(left, WORDS_PER_XFER); - blen = len * sizeof(u16); - memset(ibuf, 0, blen); - - for (i = 0; i < len; i++) { - obuf[i] = (u8)xmit->buf[xmit->tail] | WD_TAG; - xmit->tail = (xmit->tail + 1) & - (UART_XMIT_SIZE - 1); - } - - /* Fail to send msg to console is not very critical */ - - ret = max3110_write_then_read(max, obuf, ibuf, blen, 0); - if (ret) - pr_warn("%s: get err msg %d\n", __func__, ret); - - receive_chars(max, ibuf, len); - - max->port.icount.tx += len; - left -= len; - } - } - - kfree(buf); -} - -static void transmit_char(struct uart_max3110 *max) -{ - struct uart_port *port = &max->port; - struct circ_buf *xmit = &port->state->xmit; - - if (uart_circ_empty(xmit) || uart_tx_stopped(port)) - return; - - send_circ_buf(max, xmit); - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(port); - - if (uart_circ_empty(xmit)) - serial_m3110_stop_tx(port); -} - -/* - * This will be called by uart_write() and tty_write, can't - * go to sleep - */ -static void serial_m3110_start_tx(struct uart_port *port) -{ - struct uart_max3110 *max = - container_of(port, struct uart_max3110, port); - - if (!test_and_set_bit(UART_TX_NEEDED, &max->uart_flags)) - wake_up(&max->wq); -} - -static int -receive_chars(struct uart_max3110 *max, unsigned short *str, int len) -{ - struct uart_port *port = &max->port; - struct tty_port *tport; - char buf[M3110_RX_FIFO_DEPTH]; - int r, w, usable; - - /* If uart is not opened, just return */ - if (!port->state) - return 0; - - tport = &port->state->port; - - for (r = 0, w = 0; r < len; r++) { 
- if (str[r] & MAX3110_BREAK && - uart_handle_break(port)) - continue; - - if (str[r] & MAX3110_READ_DATA_AVAILABLE) { - if (uart_handle_sysrq_char(port, str[r] & 0xff)) - continue; - - buf[w++] = str[r] & 0xff; - } - } - - if (!w) - return 0; - - for (r = 0; w; r += usable, w -= usable) { - usable = tty_buffer_request_room(tport, w); - if (usable) { - tty_insert_flip_string(tport, buf + r, usable); - port->icount.rx += usable; - } - } - tty_flip_buffer_push(tport); - - return r; -} - -/* - * This routine will be used in read_thread or RX IRQ handling, - * it will first do one round buffer read(8 words), if there is some - * valid RX data, will try to read 5 more rounds till all data - * is read out. - * - * Use stack space as data buffer to save some system load, and chose - * 504 Btyes as a threadhold to do a bulk push to upper tty layer when - * receiving bulk data, a much bigger buffer may cause stack overflow - */ -static void max3110_con_receive(struct uart_max3110 *max) -{ - int loop = 1, num; - - do { - num = max3110_read_multi(max); - - if (num) { - loop = 5; - } - } while (--loop); -} - -static int max3110_main_thread(void *_max) -{ - struct uart_max3110 *max = _max; - wait_queue_head_t *wq = &max->wq; - int ret = 0; - struct circ_buf *xmit = &max->con_xmit; - - pr_info("start main thread\n"); - - do { - wait_event_interruptible(*wq, - max->uart_flags || kthread_should_stop()); - - mutex_lock(&max->thread_mutex); - - if (test_and_clear_bit(BIT_IRQ_PENDING, &max->uart_flags)) - max3110_con_receive(max); - - /* first handle console output */ - if (test_and_clear_bit(CON_TX_NEEDED, &max->uart_flags)) - send_circ_buf(max, xmit); - - /* handle uart output */ - if (test_and_clear_bit(UART_TX_NEEDED, &max->uart_flags)) - transmit_char(max); - - mutex_unlock(&max->thread_mutex); - - } while (!kthread_should_stop()); - - return ret; -} - -static irqreturn_t serial_m3110_irq(int irq, void *dev_id) -{ - struct uart_max3110 *max = dev_id; - - /* max3110's irq is a falling edge, not level triggered, - * so no need to disable the irq */ - - if (!test_and_set_bit(BIT_IRQ_PENDING, &max->uart_flags)) - wake_up(&max->wq); - - return IRQ_HANDLED; -} - -/* if don't use RX IRQ, then need a thread to polling read */ -static int max3110_read_thread(void *_max) -{ - struct uart_max3110 *max = _max; - - pr_info("start read thread\n"); - do { - /* - * If can't acquire the mutex, it means the main thread - * is running which will also perform the rx job - */ - if (mutex_trylock(&max->thread_mutex)) { - max3110_con_receive(max); - mutex_unlock(&max->thread_mutex); - } - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 20); - } while (!kthread_should_stop()); - - return 0; -} - -static int serial_m3110_startup(struct uart_port *port) -{ - struct uart_max3110 *max = - container_of(port, struct uart_max3110, port); - u16 config = 0; - int ret = 0; - - if (port->line != 0) { - pr_err("uart port startup failed\n"); - return -1; - } - - /* Disable all IRQ and config it to 115200, 8n1 */ - config = WC_TAG | WC_FIFO_ENABLE - | WC_1_STOPBITS - | WC_8BIT_WORD - | WC_BAUD_DR2; - - /* as we use thread to handle tx/rx, need set low latency */ - port->state->port.low_latency = 1; - - if (max->irq) { - /* Enable RX IRQ only */ - config |= WC_RXA_IRQ_ENABLE; - } else { - /* If IRQ is disabled, start a read thread for input data */ - max->read_thread = - kthread_run(max3110_read_thread, max, "max3110_read"); - if (IS_ERR(max->read_thread)) { - ret = PTR_ERR(max->read_thread); - max->read_thread = NULL; - 
pr_err("Can't create read thread!\n"); - return ret; - } - } - - ret = max3110_out(max, config); - if (ret) { - if (max->read_thread) - kthread_stop(max->read_thread); - max->read_thread = NULL; - return ret; - } - - max->cur_conf = config; - return 0; -} - -static void serial_m3110_shutdown(struct uart_port *port) -{ - struct uart_max3110 *max = - container_of(port, struct uart_max3110, port); - u16 config; - - if (max->read_thread) { - kthread_stop(max->read_thread); - max->read_thread = NULL; - } - - /* Disable interrupts from this port */ - config = WC_TAG | WC_SW_SHDI; - max3110_out(max, config); -} - -static void serial_m3110_release_port(struct uart_port *port) -{ -} - -static int serial_m3110_request_port(struct uart_port *port) -{ - return 0; -} - -static void serial_m3110_config_port(struct uart_port *port, int flags) -{ - port->type = PORT_MAX3100; -} - -static int -serial_m3110_verify_port(struct uart_port *port, struct serial_struct *ser) -{ - /* we don't want the core code to modify any port params */ - return -EINVAL; -} - - -static const char *serial_m3110_type(struct uart_port *port) -{ - struct uart_max3110 *max = - container_of(port, struct uart_max3110, port); - return max->name; -} - -static void -serial_m3110_set_termios(struct uart_port *port, struct ktermios *termios, - struct ktermios *old) -{ - struct uart_max3110 *max = - container_of(port, struct uart_max3110, port); - unsigned char cval; - unsigned int baud, parity = 0; - int clk_div = -1; - u16 new_conf = max->cur_conf; - - switch (termios->c_cflag & CSIZE) { - case CS7: - cval = UART_LCR_WLEN7; - new_conf |= WC_7BIT_WORD; - break; - default: - /* We only support CS7 & CS8 */ - termios->c_cflag &= ~CSIZE; - termios->c_cflag |= CS8; - case CS8: - cval = UART_LCR_WLEN8; - new_conf |= WC_8BIT_WORD; - break; - } - - baud = uart_get_baud_rate(port, termios, old, 0, 230400); - - /* First calc the div for 1.8MHZ clock case */ - switch (baud) { - case 300: - clk_div = WC_BAUD_DR384; - break; - case 600: - clk_div = WC_BAUD_DR192; - break; - case 1200: - clk_div = WC_BAUD_DR96; - break; - case 2400: - clk_div = WC_BAUD_DR48; - break; - case 4800: - clk_div = WC_BAUD_DR24; - break; - case 9600: - clk_div = WC_BAUD_DR12; - break; - case 19200: - clk_div = WC_BAUD_DR6; - break; - case 38400: - clk_div = WC_BAUD_DR3; - break; - case 57600: - clk_div = WC_BAUD_DR2; - break; - case 115200: - clk_div = WC_BAUD_DR1; - break; - case 230400: - if (max->clock & MAX3110_HIGH_CLK) - break; - default: - /* Pick the previous baud rate */ - baud = max->baud; - clk_div = max->cur_conf & WC_BAUD_DIV_MASK; - tty_termios_encode_baud_rate(termios, baud, baud); - } - - if (max->clock & MAX3110_HIGH_CLK) { - clk_div += 1; - /* High clk version max3110 doesn't support B300 */ - if (baud == 300) { - baud = 600; - clk_div = WC_BAUD_DR384; - } - if (baud == 230400) - clk_div = WC_BAUD_DR1; - tty_termios_encode_baud_rate(termios, baud, baud); - } - - new_conf = (new_conf & ~WC_BAUD_DIV_MASK) | clk_div; - - if (unlikely(termios->c_cflag & CMSPAR)) - termios->c_cflag &= ~CMSPAR; - - if (termios->c_cflag & CSTOPB) - new_conf |= WC_2_STOPBITS; - else - new_conf &= ~WC_2_STOPBITS; - - if (termios->c_cflag & PARENB) { - new_conf |= WC_PARITY_ENABLE; - parity |= UART_LCR_PARITY; - } else - new_conf &= ~WC_PARITY_ENABLE; - - if (!(termios->c_cflag & PARODD)) - parity |= UART_LCR_EPAR; - max->parity = parity; - - uart_update_timeout(port, termios->c_cflag, baud); - - new_conf |= WC_TAG; - if (new_conf != max->cur_conf) { - if (!max3110_out(max, new_conf)) 
{ - max->cur_conf = new_conf; - max->baud = baud; - } - } -} - -/* Don't handle hw handshaking */ -static unsigned int serial_m3110_get_mctrl(struct uart_port *port) -{ - return TIOCM_DSR | TIOCM_CAR | TIOCM_DSR; -} - -static void serial_m3110_set_mctrl(struct uart_port *port, unsigned int mctrl) -{ -} - -static void serial_m3110_break_ctl(struct uart_port *port, int break_state) -{ -} - -static void serial_m3110_pm(struct uart_port *port, unsigned int state, - unsigned int oldstate) -{ -} - -static struct uart_ops serial_m3110_ops = { - .tx_empty = serial_m3110_tx_empty, - .set_mctrl = serial_m3110_set_mctrl, - .get_mctrl = serial_m3110_get_mctrl, - .stop_tx = serial_m3110_stop_tx, - .start_tx = serial_m3110_start_tx, - .stop_rx = serial_m3110_stop_rx, - .break_ctl = serial_m3110_break_ctl, - .startup = serial_m3110_startup, - .shutdown = serial_m3110_shutdown, - .set_termios = serial_m3110_set_termios, - .pm = serial_m3110_pm, - .type = serial_m3110_type, - .release_port = serial_m3110_release_port, - .request_port = serial_m3110_request_port, - .config_port = serial_m3110_config_port, - .verify_port = serial_m3110_verify_port, -}; - -static struct uart_driver serial_m3110_reg = { - .owner = THIS_MODULE, - .driver_name = "MRST serial", - .dev_name = "ttyS", - .major = TTY_MAJOR, - .minor = 64, - .nr = 1, - .cons = &serial_m3110_console, -}; - -#ifdef CONFIG_PM_SLEEP -static int serial_m3110_suspend(struct device *dev) -{ - struct spi_device *spi = to_spi_device(dev); - struct uart_max3110 *max = spi_get_drvdata(spi); - - if (max->irq > 0) - disable_irq(max->irq); - uart_suspend_port(&serial_m3110_reg, &max->port); - max3110_out(max, max->cur_conf | WC_SW_SHDI); - return 0; -} - -static int serial_m3110_resume(struct device *dev) -{ - struct spi_device *spi = to_spi_device(dev); - struct uart_max3110 *max = spi_get_drvdata(spi); - - max3110_out(max, max->cur_conf); - uart_resume_port(&serial_m3110_reg, &max->port); - if (max->irq > 0) - enable_irq(max->irq); - return 0; -} - -static SIMPLE_DEV_PM_OPS(serial_m3110_pm_ops, serial_m3110_suspend, - serial_m3110_resume); -#define SERIAL_M3110_PM_OPS (&serial_m3110_pm_ops) - -#else -#define SERIAL_M3110_PM_OPS NULL -#endif - -static int serial_m3110_probe(struct spi_device *spi) -{ - struct uart_max3110 *max; - void *buffer; - u16 res; - int ret = 0; - - max = kzalloc(sizeof(*max), GFP_KERNEL); - if (!max) - return -ENOMEM; - - /* Set spi info */ - spi->bits_per_word = 16; - max->clock = MAX3110_HIGH_CLK; - - spi_setup(spi); - - max->port.type = PORT_MAX3100; - max->port.fifosize = 2; /* Only have 16b buffer */ - max->port.ops = &serial_m3110_ops; - max->port.line = 0; - max->port.dev = &spi->dev; - max->port.uartclk = 115200; - - max->spi = spi; - strcpy(max->name, spi->modalias); - max->irq = (u16)spi->irq; - - mutex_init(&max->thread_mutex); - mutex_init(&max->io_mutex); - - max->word_7bits = 0; - max->parity = 0; - max->baud = 0; - - max->cur_conf = 0; - max->uart_flags = 0; - - /* Check if reading configuration register returns something sane */ - - res = RC_TAG; - ret = max3110_write_then_read(max, (u8 *)&res, (u8 *)&res, 2, 0); - if (ret < 0 || res == 0 || res == 0xffff) { - dev_dbg(&spi->dev, "MAX3111 deemed not present (conf reg %04x)", - res); - ret = -ENODEV; - goto err_get_page; - } - - buffer = (void *)__get_free_page(GFP_KERNEL); - if (!buffer) { - ret = -ENOMEM; - goto err_get_page; - } - max->con_xmit.buf = buffer; - max->con_xmit.head = 0; - max->con_xmit.tail = 0; - - init_waitqueue_head(&max->wq); - - max->main_thread = 
kthread_run(max3110_main_thread, - max, "max3110_main"); - if (IS_ERR(max->main_thread)) { - ret = PTR_ERR(max->main_thread); - goto err_kthread; - } - - if (max->irq) { - ret = request_irq(max->irq, serial_m3110_irq, - IRQ_TYPE_EDGE_FALLING, "max3110", max); - if (ret) { - max->irq = 0; - dev_warn(&spi->dev, - "unable to allocate IRQ, will use polling method\n"); - } - } - - spi_set_drvdata(spi, max); - pmax = max; - - /* Give membase a psudo value to pass serial_core's check */ - max->port.membase = (unsigned char __iomem *)0xff110000; - uart_add_one_port(&serial_m3110_reg, &max->port); - - return 0; - -err_kthread: - free_page((unsigned long)buffer); -err_get_page: - kfree(max); - return ret; -} - -static int serial_m3110_remove(struct spi_device *dev) -{ - struct uart_max3110 *max = spi_get_drvdata(dev); - - if (!max) - return 0; - - uart_remove_one_port(&serial_m3110_reg, &max->port); - - free_page((unsigned long)max->con_xmit.buf); - - if (max->irq) - free_irq(max->irq, max); - - if (max->main_thread) - kthread_stop(max->main_thread); - - kfree(max); - return 0; -} - -static struct spi_driver uart_max3110_driver = { - .driver = { - .name = "spi_max3111", - .owner = THIS_MODULE, - .pm = SERIAL_M3110_PM_OPS, - }, - .probe = serial_m3110_probe, - .remove = serial_m3110_remove, -}; - -static int __init serial_m3110_init(void) -{ - int ret = 0; - - ret = uart_register_driver(&serial_m3110_reg); - if (ret) - return ret; - - ret = spi_register_driver(&uart_max3110_driver); - if (ret) - uart_unregister_driver(&serial_m3110_reg); - - return ret; -} - -static void __exit serial_m3110_exit(void) -{ - spi_unregister_driver(&uart_max3110_driver); - uart_unregister_driver(&serial_m3110_reg); -} - -module_init(serial_m3110_init); -module_exit(serial_m3110_exit); - -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("spi:max3110-uart"); diff --git a/drivers/tty/serial/mrst_max3110.h b/drivers/tty/serial/mrst_max3110.h deleted file mode 100644 index 35af0739513b..000000000000 --- a/drivers/tty/serial/mrst_max3110.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _MRST_MAX3110_H -#define _MRST_MAX3110_H - -#define MAX3110_HIGH_CLK 0x1 /* 3.6864 MHZ */ -#define MAX3110_LOW_CLK 0x0 /* 1.8432 MHZ */ - -/* status bits for all 4 MAX3110 operate modes */ -#define MAX3110_READ_DATA_AVAILABLE (1 << 15) -#define MAX3110_WRITE_BUF_EMPTY (1 << 14) -#define MAX3110_BREAK (1 << 10) - -#define WC_TAG (3 << 14) -#define RC_TAG (1 << 14) -#define WD_TAG (2 << 14) -#define RD_TAG (0 << 14) - -/* bits def for write configuration */ -#define WC_FIFO_ENABLE_MASK (1 << 13) -#define WC_FIFO_ENABLE (0 << 13) - -#define WC_SW_SHDI (1 << 12) - -#define WC_IRQ_MASK (0xF << 8) -#define WC_TXE_IRQ_ENABLE (1 << 11) /* TX empty irq */ -#define WC_RXA_IRQ_ENABLE (1 << 10) /* RX available irq */ -#define WC_PAR_HIGH_IRQ_ENABLE (1 << 9) -#define WC_REC_ACT_IRQ_ENABLE (1 << 8) - -#define WC_IRDA_ENABLE (1 << 7) - -#define WC_STOPBITS_MASK (1 << 6) -#define WC_2_STOPBITS (1 << 6) -#define WC_1_STOPBITS (0 << 6) - -#define WC_PARITY_ENABLE_MASK (1 << 5) -#define WC_PARITY_ENABLE (1 << 5) - -#define WC_WORDLEN_MASK (1 << 4) -#define WC_7BIT_WORD (1 << 4) -#define WC_8BIT_WORD (0 << 4) - -#define WC_BAUD_DIV_MASK (0xF) -#define WC_BAUD_DR1 (0x0) -#define WC_BAUD_DR2 (0x1) -#define WC_BAUD_DR4 (0x2) -#define WC_BAUD_DR8 (0x3) -#define WC_BAUD_DR16 (0x4) -#define WC_BAUD_DR32 (0x5) -#define WC_BAUD_DR64 (0x6) -#define WC_BAUD_DR128 (0x7) -#define WC_BAUD_DR3 (0x8) -#define WC_BAUD_DR6 (0x9) -#define WC_BAUD_DR12 (0xA) -#define WC_BAUD_DR24 (0xB) -#define 
WC_BAUD_DR48 (0xC) -#define WC_BAUD_DR96 (0xD) -#define WC_BAUD_DR192 (0xE) -#define WC_BAUD_DR384 (0xF) - -#define M3110_RX_FIFO_DEPTH 8 -#endif -- cgit v1.2.3 From ea9e9d8029020d438b0717ffddf65140fda16051 Mon Sep 17 00:00:00 2001 From: "Stuart R. Anderson" Date: Tue, 20 Jan 2015 12:07:57 +0000 Subject: Specify PCI based UART for earlyprintk Add support for specifying PCI based UARTs for earlyprintk using a syntax like "earlyprintk=pciserial,00:18.1,115200", where 00:18.1 is the BDF of a UART device. [Slightly tidied from Stuart's original patch] Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early_printk.c | 182 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 166 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index de814dfb8aee..a62536a1be88 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -76,7 +77,7 @@ static struct console early_vga_console = { /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ -static int early_serial_base = 0x3f8; /* ttyS0 */ +static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define XMTRDY 0x20 @@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + +static unsigned int io_serial_in(unsigned long addr, int offset) +{ + return inb(addr + offset); +} + +static void io_serial_out(unsigned long addr, int offset, int value) +{ + outb(value, addr + offset); +} + +static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; +static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; + static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - outb(ch, early_serial_base + TXR); + serial_out(early_serial_base, TXR, ch); return timeout ? 
0 : -1; } @@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n) } } +static __init void early_serial_hw_init(unsigned divisor) +{ + unsigned char c; + + serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ + serial_out(early_serial_base, IER, 0); /* no interrupt */ + serial_out(early_serial_base, FCR, 0); /* no fifo */ + serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + + c = serial_in(early_serial_base, LCR); + serial_out(early_serial_base, LCR, c | DLAB); + serial_out(early_serial_base, DLL, divisor & 0xff); + serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); + serial_out(early_serial_base, LCR, c & ~DLAB); +} + #define DEFAULT_BAUD 9600 static __init void early_serial_init(char *s) { - unsigned char c; unsigned divisor; - unsigned baud = DEFAULT_BAUD; + unsigned long baud = DEFAULT_BAUD; char *e; if (*s == ',') @@ -145,24 +188,124 @@ static __init void early_serial_init(char *s) s++; } - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + if (*s) { + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* These will always be IO based ports */ + serial_in = io_serial_in; + serial_out = io_serial_out; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} + +#ifdef CONFIG_PCI +/* + * early_pci_serial_init() + * + * This function is invoked when the early_printk param starts with "pciserial" + * The rest of the param should be ",B:D.F,baud" where B, D & F describe the + * location of a PCI device that must be a UART device. + */ +static __init void early_pci_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + u8 bus, slot, func; + uint32_t classcode, bar0; + uint16_t cmdreg; + char *e; + + + /* + * First, part the param to get the BDF values + */ + if (*s == ',') + ++s; + + if (*s == 0) + return; + + bus = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != ':') + return; + ++s; + slot = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != '.') + return; + ++s; + func = (u8)simple_strtoul(s, &e, 16); + s = e; + /* A baud might be following */ + if (*s == ',') + s++; + + /* + * Second, find the device from the BDF + */ + cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); + classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + + /* + * Verify it is a UART type device + */ + if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && + (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || + (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ + return; + + /* + * Determine if it is IO or memory mapped + */ + if (bar0 & 0x01) { + /* it is IO mapped */ + serial_in = io_serial_in; + serial_out = io_serial_out; + early_serial_base = bar0&0xfffffffc; + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_IO); + } else { + /* It is memory mapped - assume 32-bit alignment */ + serial_in = mem32_serial_in; + serial_out = mem32_serial_out; + /* WARNING! 
assuming the address is always in the first 4G */ + early_serial_base = + (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_MEMORY); + } + + /* + * Lastly, initalize the hardware + */ if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) + if (strcmp(s, "nocfg") == 0) + /* Sometimes, we want to leave the UART alone + * and assume the BIOS has set it up correctly. + * "nocfg" tells us this is the case, and we + * should do no more setup. + */ + return; + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) baud = DEFAULT_BAUD; } + /* Convert from baud to divisor value */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + + /* Set up the HW */ + early_serial_hw_init(divisor); } +#endif static struct console early_serial_console = { .name = "earlyser", @@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf) early_serial_init(buf + 4); early_console_register(&early_serial_console, keep); } +#ifdef CONFIG_PCI + if (!strncmp(buf, "pciserial", 9)) { + early_pci_serial_init(buf + 9); + early_console_register(&early_serial_console, keep); + buf += 9; /* Keep from match the above "serial" */ + } +#endif if (!strncmp(buf, "vga", 3) && boot_params.screen_info.orig_video_isVGA == 1) { max_xpos = boot_params.screen_info.orig_video_cols; -- cgit v1.2.3 From 3af18d9c5fe95a6b377dca7b9ff9c6d3ab7f0969 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Tue, 3 Feb 2015 23:49:31 +0800 Subject: KVM: nVMX: Prepare for using hardware MSR bitmap Currently, if L1 enables MSR_BITMAP, we will emulate this feature, all of L2's msr access is intercepted by L0. Features like "virtualize x2apic mode" require that the MSR bitmap is enabled, or the hardware will exit and for example not virtualize the x2apic MSRs. In order to let L1 use these features, we need to build a merged bitmap that only not cause a VMEXIT if 1) L1 requires that 2) the bit is not required by the processor for APIC virtualization. For now the guests are still run with MSR bitmap disabled, but this patch already introduces nested_vmx_merge_msr_bitmap for future use. 
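The merge rule this prepares for, roughly (implemented by nested_vmx_disable_intercept_for_msr() in a later patch of this series; the helper introduced here still returns false):

	/* clear the L2 read-intercept bit only when L1 also leaves it clear */
	if (!test_bit(msr, msr_bitmap_l1 + 0x000 / f))
		__clear_bit(msr, msr_bitmap_nested + 0x000 / f);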
Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 15b78936c101..6d1d26f017c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -800,6 +800,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -5823,13 +5824,21 @@ static __init int hardware_setup(void) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out4; + + if (nested) { + vmx_msr_bitmap_nested = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_nested) + goto out5; + } + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out5; + goto out6; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out6; + goto out7; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -5845,10 +5854,12 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + if (nested) + memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out7; + goto out8; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -5968,10 +5979,13 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out7: +out8: free_page((unsigned long)vmx_vmwrite_bitmap); -out6: +out7: free_page((unsigned long)vmx_vmread_bitmap); +out6: + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: @@ -5998,6 +6012,8 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -8455,6 +8471,38 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + u64 addr; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { + WARN_ON(1); + return -EINVAL; + } + maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || + ((addr + PAGE_SIZE) >> maxphyaddr)) + return -EINVAL; + + return 0; +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. 
+ */ +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + return false; +} + static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field, @@ -8787,11 +8835,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } + if (cpu_has_vmx_msr_bitmap() && + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); + } else + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + /* - * Merging of IO and MSR bitmaps not currently supported. + * Merging of IO bitmap not currently supported. * Rather, exit every time. */ - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; @@ -8942,15 +8996,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !PAGE_ALIGNED(vmcs12->msr_bitmap)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - /*TODO: Also verify bits beyond physical address width are 0*/ + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -9506,6 +9558,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); -- cgit v1.2.3 From f2b93280edee5c7e95eecba48707a4e4a19b17c8 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Tue, 3 Feb 2015 23:56:03 +0800 Subject: KVM: nVMX: Enable nested virtualize x2apic mode When L2 is using x2apic, we can use virtualize x2apic mode to gain higher performance, especially in apicv case. This patch also introduces nested_vmx_check_apicv_controls for the nested apicv patches. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6d1d26f017c1..25984e7e8a4d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1115,6 +1115,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2402,6 +2407,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_XSAVES; @@ -4163,6 +4169,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, } } +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. 
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) { + WARN_ON(1); + return; + } + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) @@ -8500,7 +8552,59 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - return false; + struct page *page; + unsigned long *msr_bitmap; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!page) { + WARN_ON(1); + return false; + } + msr_bitmap = (unsigned long *)kmap(page); + if (!msr_bitmap) { + nested_release_page_clean(page); + WARN_ON(1); + return false; + } + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + /* TPR is allowed */ + nested_vmx_disable_intercept_for_msr(msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); + } else + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); + kunmap(page); + nested_release_page_clean(page); + + return true; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. 
*/ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; } static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, @@ -8796,7 +8900,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); - } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); @@ -9007,6 +9112,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; -- cgit v1.2.3 From b9c237bb1db61f107b5d7cee5008e4a6b96ff800 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Tue, 3 Feb 2015 23:56:30 +0800 Subject: KVM: nVMX: Make nested control MSRs per-cpu To enable nested apicv support, we need per-cpu vmx control MSRs: 1. If in-kernel irqchip is enabled, we can enable nested posted interrupt, we should set posted intr bit in the nested_vmx_pinbased_ctls_high. 2. If in-kernel irqchip is disabled, we can not enable nested posted interrupt, the posted intr bit in the nested_vmx_pinbased_ctls_high will be cleared. Since there would be different settings about in-kernel irqchip between VMs, different nested control MSRs are needed. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 215 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 129 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 25984e7e8a4d..a17bbb862f91 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -411,6 +411,23 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; }; #define POSTED_INTR_ON 0 @@ -2292,20 +2309,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). - * TODO: allow these variables to be modified (downgraded) by module options - * or other means. 
*/ -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; -static u32 nested_vmx_true_procbased_ctls_low; -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; -static u32 nested_vmx_true_exit_ctls_low; -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; -static u32 nested_vmx_true_entry_ctls_low; -static u32 nested_vmx_misc_low, nested_vmx_misc_high; -static u32 nested_vmx_ept_caps; -static __init void nested_vmx_setup_ctls_msrs(void) +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -2324,57 +2329,71 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); + vmx->nested.nested_vmx_pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); - nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); + vmx->nested.nested_vmx_exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_exit_ctls_high &= + vmx->nested.nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (vmx_mpx_supported()) - nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. 
*/ - nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + vmx->nested.nested_vmx_true_exit_ctls_low = + vmx->nested.nested_vmx_exit_ctls_low & ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_entry_ctls_high &= + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); + vmx->nested.nested_vmx_entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | - VM_ENTRY_LOAD_IA32_EFER); + vmx->nested.nested_vmx_entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (vmx_mpx_supported()) - nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + vmx->nested.nested_vmx_true_entry_ctls_low = + vmx->nested.nested_vmx_entry_ctls_low & ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_procbased_ctls_high &= + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); + vmx->nested.nested_vmx_procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2394,18 +2413,21 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. 
*/ - nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_true_procbased_ctls_low = + vmx->nested.nested_vmx_procbased_ctls_low & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); - nested_vmx_secondary_ctls_low = 0; - nested_vmx_secondary_ctls_high &= + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); + vmx->nested.nested_vmx_secondary_ctls_low = 0; + vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | @@ -2413,27 +2435,31 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST; - nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; - nested_vmx_ept_caps &= vmx_capability.ept; + vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific * for single context invalidation. Hence, only advertise * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else - nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_ept_caps = 0; /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + rdmsr(MSR_IA32_VMX_MISC, + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + vmx->nested.nested_vmx_misc_low |= + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - nested_vmx_misc_high = 0; + vmx->nested.nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2452,6 +2478,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high) /* Returns 0 on success, non-0 otherwise. 
*/ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + switch (msr_index) { case MSR_IA32_VMX_BASIC: /* @@ -2466,36 +2494,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, - nested_vmx_pinbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr(nested_vmx_misc_low, - nested_vmx_misc_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -2520,12 +2556,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, - nested_vmx_secondary_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps; break; default: return 1; @@ -5045,11 +5082,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long always_on = VMXON_CR0_ALWAYSON; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) always_on &= ~(X86_CR0_PE | X86_CR0_PG); @@ -5074,7 +5112,7 @@ static int 
handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vmcs12, val)) + if (!nested_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5969,9 +6007,6 @@ static __init int hardware_setup(void) kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } - if (nested) - nested_vmx_setup_ctls_msrs(); - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -6989,6 +7024,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type; gva_t gva; @@ -6997,8 +7033,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -7014,7 +7051,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, @@ -8254,6 +8291,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + if (nested) + nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; @@ -8412,7 +8452,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); kvm_init_shadow_ept_mmu(vcpu, - nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + to_vmx(vcpu)->nested.nested_vmx_ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -9123,18 +9164,20 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high) || + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high) || + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high)) + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -9147,7 
+9190,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || + if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); -- cgit v1.2.3 From 82f0dd4b2749ece2e48509cbd768e9bcf44d2124 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Tue, 3 Feb 2015 23:57:18 +0800 Subject: KVM: nVMX: Enable nested apic register virtualization We can reduce apic register virtualization cost with this feature, it is also a requirement for virtual interrupt delivery and posted interrupt processing. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a17bbb862f91..ea9b2e938ed1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1137,6 +1137,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2430,6 +2435,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_XSAVES; @@ -7434,6 +7440,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + case EXIT_REASON_APIC_WRITE: + /* apic_write should exit unconditionally. */ + return 1; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is @@ -8593,6 +8602,7 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + int msr; struct page *page; unsigned long *msr_bitmap; @@ -8612,16 +8622,35 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, } if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) + for (msr = 0x800; msr <= 0x8ff; msr++) + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + msr, MSR_TYPE_R); /* TPR is allowed */ nested_vmx_disable_intercept_for_msr(msr_bitmap, vmx_msr_bitmap_nested, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); - } else + } else { + /* + * Enable reading intercept of all the x2apic + * MSRs. We should not rely on vmcs12 to do any + * optimizations here, it may have been modified + * by L1. 
+ */ + for (msr = 0x800; msr <= 0x8ff; msr++) + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + msr, + MSR_TYPE_R); + __vmx_enable_intercept_for_msr( vmx_msr_bitmap_nested, APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_R | MSR_TYPE_W); + MSR_TYPE_W); + } kunmap(page); nested_release_page_clean(page); @@ -8631,14 +8660,16 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12)) return 0; /* * If virtualize x2apic mode is enabled, * virtualize apic access must be disabled. */ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return -EINVAL; /* tpr shadow is needed by all apicv features. */ -- cgit v1.2.3 From 608406e290ca31d8f217cb765ee50152b41a7c9c Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Tue, 3 Feb 2015 23:57:51 +0800 Subject: KVM: nVMX: Enable nested virtual interrupt delivery With virtual interrupt delivery, the hardware lets KVM use a more efficient mechanism for interrupt injection. This is an important feature for nested VMX, because it reduces vmexits substantially and they are much more expensive with nested virtualization. This is especially important for throughput-bound scenarios. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ea9b2e938ed1..955eff21f14a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -219,6 +219,10 @@ struct __packed vmcs12 { u64 virtual_apic_page_addr; u64 apic_access_addr; u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; @@ -341,6 +345,7 @@ struct __packed vmcs12 { u16 guest_gs_selector; u16 guest_ldtr_selector; u16 guest_tr_selector; + u16 guest_intr_status; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -626,6 +631,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_GS_SELECTOR, guest_gs_selector), FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -643,6 +649,10 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), @@ -1142,6 +1152,11 @@ static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); } +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return 
nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2436,6 +2451,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_XSAVES; @@ -7441,7 +7457,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_APIC_WRITE: - /* apic_write should exit unconditionally. */ + case EXIT_REASON_EOI_INDUCED: + /* apic_write and eoi_induced should exit unconditionally. */ return 1; case EXIT_REASON_EPT_VIOLATION: /* @@ -8633,6 +8650,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, vmx_msr_bitmap_nested, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { + /* EOI and self-IPI are allowed */ + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } } else { /* * Enable reading intercept of all the x2apic @@ -8650,6 +8680,14 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, vmx_msr_bitmap_nested, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -8661,7 +8699,8 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !nested_cpu_has_apic_reg_virt(vmcs12)) + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12)) return 0; /* @@ -8672,6 +8711,14 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return -EINVAL; + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + /* tpr shadow is needed by all apicv features. 
*/ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return -EINVAL; @@ -8979,6 +9026,19 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_vcpu_reload_apic_access_page(vcpu); } + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + vmcs_write64(EOI_EXIT_BITMAP0, + vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, + vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, + vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, + vmcs12->eoi_exit_bitmap3); + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9550,6 +9610,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); -- cgit v1.2.3 From 705699a139948a671cd66b915e8095c95fdf44d9 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Tue, 3 Feb 2015 23:58:17 +0800 Subject: KVM: nVMX: Enable nested posted interrupt processing If a vcpu has an interrupt in vmx non-root mode, injecting that interrupt requires a vmexit. With posted interrupt processing, the vmexit is not needed, and interrupts are fully taken care of by hardware. In nested vmx, this feature avoids many more vmexits than non-nested vmx does. When L1 asks L0 to deliver L1's posted interrupt vector, and the target VCPU is in non-root mode, we use a physical IPI to deliver POSTED_INTR_NV to the target vCPU. Using POSTED_INTR_NV avoids unexpected interrupts if a concurrent vmexit happens and L1's vector is different from L0's. The IPI triggers posted interrupt processing in the target physical CPU. If the target vCPU was not in guest mode, the posted interrupt delivery is completed on the next entry to L2.
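A minimal, self-contained sketch of the PIR-to-IRR merge at the heart of this scheme (the patch implements it as __kvm_apic_update_irr() in the lapic.c hunk below; the sketch uses C11 atomics in place of the kernel's xchg() and illustrative names):

#include <stdint.h>
#include <stdatomic.h>

#define DEMO_APIC_IRR	0x200	/* IRR base offset within the APIC register page */

/*
 * Illustration only, not kernel code: drain the 256-bit posted-interrupt
 * request register (eight 32-bit words) and OR each non-zero word into
 * the matching IRR register of the virtual APIC page.  APIC registers
 * are spaced 0x10 bytes apart, hence the i * 0x10 stride.
 */
static void demo_pir_to_irr(_Atomic uint32_t pir[8], uint8_t *apic_regs)
{
	for (int i = 0; i <= 7; i++) {
		/* atomically claim the bits published by the sender */
		uint32_t pir_val = atomic_exchange(&pir[i], 0);

		if (pir_val)
			*(uint32_t *)(apic_regs + DEMO_APIC_IRR + i * 0x10) |= pir_val;
	}
}

Once the pending bits are merged, vmx_complete_nested_posted_interrupt() below raises the RVI byte of GUEST_INTR_STATUS if the highest pending vector exceeds it, so the interrupt is delivered to L2 on the next entry.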
Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 ++++- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 161 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3481e9eee8f4..86609c15726f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -325,17 +325,24 @@ static u8 count_vectors(void *bitmap) return count; } -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +void __kvm_apic_update_irr(u32 *pir, void *regs) { u32 i, pir_val; - struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i <= 7; i++) { pir_val = xchg(&pir[i], 0); if (pir_val) - *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; } } +EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); + +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + __kvm_apic_update_irr(pir, apic->regs); +} EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline void apic_set_irr(int vec, struct kvm_lapic *apic) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c1ef25c89508..0bc6c656625b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,6 +57,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 955eff21f14a..6e112472b0b3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -218,6 +218,7 @@ struct __packed vmcs12 { u64 tsc_offset; u64 virtual_apic_page_addr; u64 apic_access_addr; + u64 posted_intr_desc_addr; u64 ept_pointer; u64 eoi_exit_bitmap0; u64 eoi_exit_bitmap1; @@ -337,6 +338,7 @@ struct __packed vmcs12 { u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; + u16 posted_intr_nv; u16 guest_es_selector; u16 guest_cs_selector; u16 guest_ss_selector; @@ -409,6 +411,10 @@ struct nested_vmx { */ struct page *apic_access_page; struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -623,6 +629,7 @@ static int max_shadow_read_write_fields = static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), FIELD(GUEST_SS_SELECTOR, guest_ss_selector), @@ -648,6 +655,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(TSC_OFFSET, tsc_offset), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), FIELD64(EPT_POINTER, ept_pointer), FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), @@ -800,6 +808,7 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); +static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_set_tss_addr(struct kvm *kvm, 
unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1157,6 +1166,11 @@ static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2360,6 +2374,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_POSTED_INTR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, @@ -4312,6 +4329,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (vmx->nested.pi_desc && + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return 0; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) + return 0; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { + WARN_ON(1); + return -ENOMEM; + } + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + return 0; +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* the PIR and ON have been set by L1. */ + if (vcpu->mode == IN_GUEST_MODE) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + return -1; +} /* * Send interrupt to vcpu via posted interrupt way. * 1. 
If target vcpu is running(non-root mode), send posted interrupt @@ -4324,6 +4399,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) struct vcpu_vmx *vmx = to_vmx(vcpu); int r; + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return; + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; @@ -6585,6 +6664,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); vmcs_write64(VMCS_LINK_POINTER, -1ull); } + vmx->nested.posted_intr_nv = -1; kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -6613,6 +6693,12 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } nested_free_all_saved_vmcss(vmx); } @@ -8320,6 +8406,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (nested) nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; @@ -8565,6 +8652,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, return false; } + if (nested_cpu_has_posted_intr(vmcs12)) { + if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) + return false; + + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + } + vmx->nested.pi_desc_page = + nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); + if (!vmx->nested.pi_desc_page) + return false; + + vmx->nested.pi_desc = + (struct pi_desc *)kmap(vmx->nested.pi_desc_page); + if (!vmx->nested.pi_desc) { + nested_release_page_clean(vmx->nested.pi_desc_page); + return false; + } + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + return true; } @@ -8700,7 +8812,8 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, { if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && !nested_cpu_has_apic_reg_virt(vmcs12) && - !nested_cpu_has_vid(vmcs12)) + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) return 0; /* @@ -8719,6 +8832,17 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, !nested_exit_on_intr(vcpu)) return -EINVAL; + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + vmcs12->posted_intr_nv & 0xff00)) + return -EINVAL; + /* tpr shadow is needed by all apicv features. */ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return -EINVAL; @@ -8961,8 +9085,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | - PIN_BASED_POSTED_INTR); + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + if (nested_cpu_has_posted_intr(vmcs12)) { + /* + * Note that we use L0's vector here and in + * vmx_deliver_nested_posted_interrupt. 
+ */ + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } else + exec_control &= ~PIN_BASED_POSTED_INTR; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -9498,9 +9637,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; } - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -9878,6 +10018,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } /* * We are now running in L2, mmu_notifier will force to reload the -- cgit v1.2.3 From 75aaf4c3e6a4ed48207230cf133a02258ca5abd5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jan 2015 08:29:50 +0000 Subject: x86/raid6: correctly check for assembler capabilities Just like for AVX2 (which simply needs an #if -> #ifdef conversion), SSSE3 assembler support should be checked for before using it. Signed-off-by: Jan Beulich Cc: Jim Kukunas Acked-by: Thomas Gleixner Signed-off-by: NeilBrown --- arch/x86/Makefile | 1 + lib/raid6/algos.c | 2 +- lib/raid6/recov_avx2.c | 2 +- lib/raid6/recov_ssse3.c | 6 ++++++ 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 920e6160c535..5ba2d9ce82dc 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) +asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 7d0e5cd7b570..dbef2314901e 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -89,10 +89,10 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif +#ifdef CONFIG_AS_SSSE3 &raid6_recov_ssse3, #endif &raid6_recov_intx1, diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index e1eea433a493..53fe3d7bdfb3 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -8,7 +8,7 @@ * of the License. */ -#if CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX2 #include #include "x86.h" diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index a9168328f03b..cda33e56a5e3 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -7,6 +7,8 @@ * of the License. 
*/ +#ifdef CONFIG_AS_SSSE3 + #include #include "x86.h" @@ -330,3 +332,7 @@ const struct raid6_recov_calls raid6_recov_ssse3 = { #endif .priority = 1, }; + +#else +#warning "your version of binutils lacks SSSE3 support" +#endif -- cgit v1.2.3 From 12cf89b550d13eb7cb86ef182bd6c04345a33a1f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Feb 2015 16:45:18 -0600 Subject: livepatch: rename config to CONFIG_LIVEPATCH Rename CONFIG_LIVE_PATCHING to CONFIG_LIVEPATCH to make the naming of the config and the code more consistent. Signed-off-by: Josh Poimboeuf Reviewed-by: Jingoo Han Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/Makefile | 2 +- include/linux/livepatch.h | 4 ++-- kernel/livepatch/Kconfig | 6 +++--- kernel/livepatch/Makefile | 2 +- samples/Kconfig | 4 ++-- samples/livepatch/Makefile | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29b095231276..11970b076862 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,7 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVE_PATCHING + select HAVE_LIVEPATCH ### Arch settings config X86 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 26e58134c8cb..a455a53d789a 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -24,7 +24,7 @@ #include #include -#ifdef CONFIG_LIVE_PATCHING +#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,7 +40,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } #else -#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#error Live patching support is disabled; check CONFIG_LIVEPATCH #endif #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 316b34e74c15..732223496968 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,7 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index f14c6fb262b4..95023fd8b00d 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -24,7 +24,7 @@ #include #include -#if IS_ENABLED(CONFIG_LIVE_PATCHING) +#if IS_ENABLED(CONFIG_LIVEPATCH) #include @@ -128,6 +128,6 @@ extern int klp_unregister_patch(struct klp_patch *); extern int klp_enable_patch(struct klp_patch *); extern int klp_disable_patch(struct klp_patch *); -#endif /* CONFIG_LIVE_PATCHING */ +#endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 347ee2221137..045022557936 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,15 +1,15 @@ -config HAVE_LIVE_PATCHING +config HAVE_LIVEPATCH bool help Arch supports kernel live patching -config LIVE_PATCHING +config LIVEPATCH bool "Kernel Live Patching" depends on DYNAMIC_FTRACE_WITH_REGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL - depends on HAVE_LIVE_PATCHING + depends on HAVE_LIVEPATCH help Say 
Y here if you want to support kernel live patching. This option has no runtime impact until a kernel "patch" diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 7c1f00861428..e8780c0901d9 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o diff --git a/samples/Kconfig b/samples/Kconfig index 0aed20df5f0b..224ebb46bed5 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -63,9 +63,9 @@ config SAMPLE_RPMSG_CLIENT to communicate with an AMP-configured remote processor over the rpmsg bus. -config SAMPLE_LIVE_PATCHING +config SAMPLE_LIVEPATCH tristate "Build live patching sample -- loadable modules only" - depends on LIVE_PATCHING && m + depends on LIVEPATCH && m help Builds a sample live patch that replaces the procfs handler for /proc/cmdline to print "this has been live patched". diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 7f1cdc131a02..10319d7ea0b1 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1 @@ -obj-$(CONFIG_SAMPLE_LIVE_PATCHING) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o -- cgit v1.2.3 From 375074cc736ab1d89a708c0a8d7baa4a70d5d476 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:07 -0700 Subject: x86: Clean up cr4 manipulation CR4 manipulation was split, seemingly at random, between direct (write_cr4) and using a helper (set/clear_in_cr4). Unfortunately, the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code, which only a small subset of users actually wanted. This patch replaces all cr4 access in functions that don't leave cr4 exactly the way they found it with new helpers cr4_set_bits, cr4_clear_bits, and cr4_set_bits_and_update_boot. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Linus Torvalds Link: http://lkml.kernel.org/r/495a10bdc9e67016b8fd3945700d46cfd5c12c2f.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 33 -------------------------------- arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/virtext.h | 3 ++- arch/x86/kernel/cpu/common.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- arch/x86/kernel/cpu/mcheck/p5.c | 3 ++- arch/x86/kernel/cpu/mcheck/winchip.c | 3 ++- arch/x86/kernel/cpu/perf_event.c | 7 ++++--- arch/x86/kernel/i387.c | 3 ++- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/xsave.c | 3 ++- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/mm/init.c | 4 ++-- arch/x86/xen/enlighten.c | 4 ++-- drivers/lguest/x86/core.c | 5 +++-- 15 files changed, 70 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a092a0cce0b7..ec1c93588cef 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -extern unsigned long mmu_cr4_features; -extern u32 *trampoline_cr4_features; - -static inline void set_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features |= mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); -} - -static inline void clear_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features &= ~mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); -} - typedef struct { unsigned long seg; } mm_segment_t; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 04905bfc508b..fc0c4bc356ce 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,6 +15,43 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. This should only be used + * during boot on the boot cpu. + */ +extern unsigned long mmu_cr4_features; +extern u32 *trampoline_cr4_features; + +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + static inline void __native_flush_tlb(void) { native_write_cr3(native_read_cr3()); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 5da71c27cc59..f41e19ca717b 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -19,6 +19,7 @@ #include #include +#include /* * VMX functions: @@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void) static inline void cpu_vmxoff(void) { asm volatile (ASM_VMX_VMXOFF : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static inline int cpu_vmx_enabled(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c6049650c093..9d8fc49f0922 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -278,7 +278,7 @@ __setup("nosmep", setup_disable_smep); static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) - set_in_cr4(X86_CR4_SMEP); + cr4_set_bits(X86_CR4_SMEP); } static __init int setup_disable_smap(char *arg) @@ -298,9 +298,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_SMAP)) { #ifdef CONFIG_X86_SMAP - set_in_cr4(X86_CR4_SMAP); + cr4_set_bits(X86_CR4_SMAP); #else - clear_in_cr4(X86_CR4_SMAP); + cr4_clear_bits(X86_CR4_SMAP); #endif } } @@ -1312,7 +1312,7 @@ void cpu_init(void) pr_debug("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, @@ -1393,7 +1393,7 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) - 
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); switch_to_new_gdt(cpu); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d23179900755..15ad3ed1a3cd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -1449,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void) bitmap_fill(all_banks, MAX_NR_BANKS); machine_check_poll(MCP_UC | m_fl, &all_banks); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index ec2663a708e4..737b0ad4e61a 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -65,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) "Intel old style machine check architecture supported.\n"); /* Enable MCE: */ - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index bd5d46a32210..44f138296fbe 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -36,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 143e5f5dc855..6b5acd5f4a34 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1328,7 +1329,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_STARTING: if (x86_pmu.attr_rdpmc) - set_in_cr4(X86_CR4_PCE); + cr4_set_bits(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1834,9 +1835,9 @@ static void change_rdpmc(void *info) bool enable = !!(unsigned long)info; if (enable) - set_in_cr4(X86_CR4_PCE); + cr4_set_bits(X86_CR4_PCE); else - clear_in_cr4(X86_CR4_PCE); + cr4_clear_bits(X86_CR4_PCE); } static ssize_t set_attr_rdpmc(struct device *cdev, diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index a9a4229f6161..87727b03196d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -180,7 +181,7 @@ void fpu_init(void) if (cpu_has_xmm) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) - set_in_cr4(cr4_mask); + cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127ddaa2d5a..046e2d620bbe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * per-CPU TSS segments. 
Threads are completely 'soft' on Linux, @@ -141,7 +142,7 @@ void flush_thread(void) static void hard_disable_TSC(void) { - write_cr4(read_cr4() | X86_CR4_TSD); + cr4_set_bits(X86_CR4_TSD); } void disable_TSC(void) @@ -158,7 +159,7 @@ void disable_TSC(void) static void hard_enable_TSC(void) { - write_cr4(read_cr4() & ~X86_CR4_TSD); + cr4_clear_bits(X86_CR4_TSD); } static void enable_TSC(void) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 0de1fae2bdf0..34f66e58a896 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -12,6 +12,7 @@ #include #include #include +#include #include /* @@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { - set_in_cr4(X86_CR4_OSXSAVE); + cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d884838..db77537013d1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2812,7 +2812,7 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + cr4_set_bits(X86_CR4_VMXE); if (vmm_exclusive) { kvm_cpu_vmxon(phys_addr); @@ -2849,7 +2849,7 @@ static void hardware_disable(void) vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6a3ff1..d4eddbd92c28 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -173,11 +173,11 @@ static void __init probe_page_size_mask(void) /* Enable PSE if available */ if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); + cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 78a881b7fc41..bd8b8459c3d0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1494,10 +1494,10 @@ static void xen_pvh_set_cr_flags(int cpu) * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. */ if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + cr4_set_bits_and_update_boot(X86_CR4_PSE); if (cpu_has_pge) - set_in_cr4(X86_CR4_PGE); + cr4_set_bits_and_update_boot(X86_CR4_PGE); } /* diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 922a1acbf652..6adfd7ba4c97 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "../lg.h" static int cpu_had_pge; @@ -452,9 +453,9 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) static void adjust_pge(void *on) { if (on) - write_cr4(read_cr4() | X86_CR4_PGE); + cr4_set_bits(X86_CR4_PGE); else - write_cr4(read_cr4() & ~X86_CR4_PGE); + cr4_clear_bits(X86_CR4_PGE); } /*H:020 -- cgit v1.2.3 From 1e02ce4cccdcb9688386e5b8d2c9fa4660b45389 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:08 -0700 Subject: x86: Store a per-cpu shadow copy of CR4 Context switches and TLB flushes can change individual bits of CR4. CR4 reads take several cycles, so store a shadow copy of CR4 in a per-cpu variable. To avoid wasting a cache line, I added the CR4 shadow to cpu_tlbstate, which is already touched in switch_mm. 
The heaviest users of the cr4 shadow will be switch_mm and __switch_to_xtra, and __switch_to_xtra is called shortly after switch_mm during context switch, so the cacheline is likely to be hot. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/3a54dd3353fffbf84804398e00dfdc5b7c1afd7d.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 6 ++--- arch/x86/include/asm/special_insns.h | 6 ++--- arch/x86/include/asm/tlbflush.h | 52 +++++++++++++++++++++++++++--------- arch/x86/include/asm/virtext.h | 2 +- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/cpu/common.c | 7 +++++ arch/x86/kernel/cpu/mtrr/cyrix.c | 6 ++--- arch/x86/kernel/cpu/mtrr/generic.c | 6 ++--- arch/x86/kernel/head32.c | 1 + arch/x86/kernel/head64.c | 2 ++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 6 ++--- arch/x86/mm/fault.c | 2 +- arch/x86/mm/init.c | 9 +++++++ arch/x86/mm/tlb.c | 3 --- arch/x86/power/cpu.c | 11 +++----- arch/x86/realmode/init.c | 2 +- 20 files changed, 85 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 32444ae939ca..965c47d254aa 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_cpu_ops.write_cr4, x); } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index e820c080a4e9..6a4b00fafb00 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x) native_write_cr3(x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return native_read_cr4(); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return native_read_cr4_safe(); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fc0c4bc356ce..cd791948b286 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,14 +15,37 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +struct tlb_state { +#ifdef CONFIG_SMP + struct mm_struct *active_mm; + int state; +#endif + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by + * disabling interrupts when modifying either one. + */ + unsigned long cr4; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +/* Initialize cr4 shadow for this CPU. 
*/ +static inline void cr4_init_shadow(void) +{ + this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); +} + /* Set in this cpu's CR4. */ static inline void cr4_set_bits(unsigned long mask) { unsigned long cr4; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 | mask) != cr4) { + cr4 |= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } } /* Clear in this cpu's CR4. */ @@ -30,9 +53,18 @@ static inline void cr4_clear_bits(unsigned long mask) { unsigned long cr4; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 & ~mask) != cr4) { + cr4 &= ~mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } +} + +/* Read the CR4 shadow. */ +static inline unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); } /* @@ -61,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void) { unsigned long cr4; - cr4 = native_read_cr4(); + cr4 = this_cpu_read(cpu_tlbstate.cr4); /* clear PGE */ native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ @@ -221,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -struct tlb_state { - struct mm_struct *active_mm; - int state; -}; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); - static inline void reset_lazy_tlbstate(void) { this_cpu_write(cpu_tlbstate.state, 0); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index f41e19ca717b..cce9ee68e335 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -46,7 +46,7 @@ static inline void cpu_vmxoff(void) static inline int cpu_vmx_enabled(void) { - return read_cr4() & X86_CR4_VMXE; + return __read_cr4() & X86_CR4_VMXE; } /** Disable VMX if it is enabled on the current CPU diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 31368207837c..d1daead5fcdd 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void) header->pmode_cr0 = read_cr0(); if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = __read_cr4(); header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9d8fc49f0922..07f2fc3c13a4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1293,6 +1294,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + /* * Load microcode on this cpu if a valid microcode is available. * This is early microcode loading procedure. 
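(Editorial aside, not part of the patch: a minimal sketch of how a caller is expected to use the shadowed helpers added in the tlbflush.h hunk above. enable_my_feature() is a hypothetical function; cr4_set_bits() and cr4_read_shadow() are the helpers from this series, and x86 kernel context is assumed.)

#include <linux/bug.h>
#include <asm/processor-flags.h>
#include <asm/tlbflush.h>

static void enable_my_feature(void)
{
	/*
	 * Reads the per-cpu shadow and only touches the real CR4 when the
	 * bit is not already set, so repeated calls stay cheap.
	 */
	cr4_set_bits(X86_CR4_PCE);

	/* No CR4 access at all: just a per-cpu load of the shadow. */
	WARN_ON(!(cr4_read_shadow() & X86_CR4_PCE));
}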
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9e451b0876b5..f8c81ba0b465 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -138,8 +138,8 @@ static void prepare_set(void) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* @@ -171,7 +171,7 @@ static void post_set(void) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); } static void cyrix_set_arr(unsigned int reg, unsigned long base, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0e25a1bc5ab5..7d74f7b3c6ba 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ @@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6c1b9836995..2911ef3a9f1c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { + cr4_init_shadow(); sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index eda1a865641e..3b241f0ca005 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -155,6 +155,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + cr4_init_shadow(); + /* Kill off the identity-map trampoline */ reset_early_page_tables(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8f3ebfe710d0..603c4f99cb5a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4_safe(); + cr4 = __read_cr4_safe(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a2c02913af3..67fcc43577d2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab4734e5411d..04e6c62f1a93 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1178,7 +1178,7 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = read_cr4(); + mmu_cr4_features = __read_cr4(); if (trampoline_cr4_features) 
*trampoline_cr4_features = mmu_cr4_features; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41dd0387cccb..496a54839968 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1583,7 +1583,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (cr4 & X86_CR4_VMXE) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index db77537013d1..8dca6ccbb9ce 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2785,7 +2785,7 @@ static int hardware_enable(void) u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old, test_bits; - if (read_cr4() & X86_CR4_VMXE) + if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); @@ -4255,7 +4255,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ - cr4 = read_cr4(); + cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->host_state.vmcs_host_cr4 = cr4; @@ -7784,7 +7784,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - cr4 = read_cr4(); + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); vmx->host_state.vmcs_host_cr4 = cr4; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e3ff27a5b634..ede025fb46f1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(nx_warning, from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && (pgd_flags(*pgd) & _PAGE_USER) && - (read_cr4() & X86_CR4_SMEP)) + (__read_cr4() & X86_CR4_SMEP)) printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d4eddbd92c28..a74aa0fd1853 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -713,6 +713,15 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +#ifdef CONFIG_SMP + .active_mm = &init_mm, + .state = 0, +#endif + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ +}; +EXPORT_SYMBOL_GPL(cpu_tlbstate); + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ee61c36d64f8..3250f2371aea 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,9 +14,6 @@ #include #include -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) - = { &init_mm, 0, }; - /* * Smarter SMP flushing macros. * c/o Linus Torvalds. 
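(Editorial note on the .cr4 = ~0UL initializer in the arch/x86/mm/init.c hunk above; this is an inference from the code, not text from the commit. A sketch of the failure mode it is meant to catch, assuming the tlbflush.h helpers from this patch:)

/* Hypothetical caller that runs before cr4_init_shadow() on this CPU. */
static void broken_early_caller(void)
{
	/*
	 * The shadow still holds the ~0UL initializer, so cr4_clear_bits()
	 * computes ~0UL & ~X86_CR4_TSD, pushes it into hardware via
	 * __write_cr4(), and the reserved CR4 bits raise an immediate #GP:
	 * a missed cr4_init_shadow() call fails loudly at boot instead of
	 * silently running with a stale shadow.
	 */
	cr4_clear_bits(X86_CR4_TSD);
}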
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6ec7910f59bf..3e32ed5648a0 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -105,11 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); -#ifdef CONFIG_X86_32 - ctxt->cr4 = read_cr4_safe(); -#else -/* CONFIG_X86_64 */ - ctxt->cr4 = read_cr4(); + ctxt->cr4 = __read_cr4_safe(); +#ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); #endif ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, @@ -175,12 +172,12 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) /* cr4 was introduced in the Pentium CPU */ #ifdef CONFIG_X86_32 if (ctxt->cr4) - write_cr4(ctxt->cr4); + __write_cr4(ctxt->cr4); #else /* CONFIG X86_64 */ wrmsrl(MSR_EFER, ctxt->efer); write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); + __write_cr4(ctxt->cr4); #endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index bad628a620c4..0b7a63d98440 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -81,7 +81,7 @@ void __init setup_real_mode(void) trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = read_cr4(); + *trampoline_cr4_features = __read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; -- cgit v1.2.3 From 22c4bd9fa921c2b1b3f2420d7b9dabbe982f3059 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:09 -0700 Subject: x86: Add a comment clarifying LDT context switching The code is correct, but only for a rather subtle reason. This confused me for quite a while when I read switch_mm, so clarify the code to avoid confusing other people, too. TBH, I wouldn't be surprised if this code was only correct by accident. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/0db86397f968996fb772c443c251415b0b430ddd.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4b75d591eb5e..52c18359f1dc 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -55,12 +55,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* * Load the LDT, if the LDT is different. * - * It's possible leave_mm(prev) has been called. If so, - * then prev->context.ldt could be out of sync with the - * LDT descriptor or the LDT register. This can only happen - * if prev->context.ldt is non-null, since we never free - * an LDT. But LDTs can't be shared across mms, so - * prev->context.ldt won't be equal to next->context.ldt. + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never free an LDT while the mm still exists. 
That + * means that next->context.ldt != prev->context.ldt, + * because mms never share an LDT. */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); -- cgit v1.2.3 From c1317ec2b906442930318d9d6e51425c5a69e9cb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:11 -0700 Subject: perf: Pass the event to arch_perf_update_userpage() Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Linus Torvalds Link: http://lkml.kernel.org/r/0fea9a7fac3c1eea86cb0a5954184e74f4213666.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 ++- kernel/events/core.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6b5acd5f4a34..73e84a348de1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1915,7 +1915,8 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; diff --git a/kernel/events/core.c b/kernel/events/core.c index cc1487145d33..13209a90b751 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4101,7 +4101,8 @@ unlock: rcu_read_unlock(); } -void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage( + struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } @@ -4151,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - arch_perf_update_userpage(userpg, now); + arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; -- cgit v1.2.3 From 7911d3f7af14a614617e38245fedf98a724e46a9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:12 -0700 Subject: perf/x86: Only allow rdpmc if a perf_event is mapped We currently allow any process to use rdpmc. This significantly weakens the protection offered by PR_TSC_DISABLED, and it could be helpful to users attempting to exploit timing attacks. Since we can't enable access to individual counters, use a very coarse heuristic to limit access to rdpmc: allow access only when a perf_event is mmapped. This protects seccomp sandboxes. There is plenty of room to further tighten these restrictions. For example, this allows rdpmc for any x86_pmu event, but it's only useful for self-monitoring tasks. As a side effect, cap_user_rdpmc will now be false for AMD uncore events. This isn't a real regression, since .event_idx is disabled for these events anyway for the time being. Whenever that gets re-added, the cap_user_rdpmc code can be adjusted or refactored accordingly.
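(Editorial aside, not part of the patch: a minimal user-space sketch of the self-monitoring case described above. The program mmaps its own perf_event, which is what now turns on CR4.PCE for its mm, and then issues rdpmc with the index published in the mmap page. Error handling is minimal, a robust reader would also follow the pc->lock/pc->offset protocol, and the event choice is just an example.)

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	uint32_t lo, hi, idx;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.exclude_kernel = 1;

	/* Count ourselves on any CPU: pid == 0, cpu == -1. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	/* Mapping the event is what grants this mm rdpmc access. */
	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED)
		return 1;

	if (!pc->cap_user_rdpmc || !pc->index)
		return 1;	/* kernel did not grant rdpmc for this event */

	idx = pc->index - 1;	/* index is the hardware counter number + 1 */
	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
	printf("raw counter: %llu\n",
	       (unsigned long long)(((uint64_t)hi << 32) | lo));

	close(fd);
	return 0;
}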
Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Linus Torvalds Link: http://lkml.kernel.org/r/a2bdb3cf3a1d70c26980d7c6dddfbaa69f3182bf.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu.h | 2 ++ arch/x86/include/asm/mmu_context.h | 16 +++++++++++ arch/x86/kernel/cpu/perf_event.c | 57 +++++++++++++++++++++++++------------- arch/x86/kernel/cpu/perf_event.h | 2 ++ 4 files changed, 58 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 876e74e8eec7..09b9620a73b4 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,6 +19,8 @@ typedef struct { struct mutex lock; void __user *vdso; + + atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 52c18359f1dc..89c1fece224e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -18,6 +18,18 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, } #endif /* !CONFIG_PARAVIRT */ +#ifdef CONFIG_PERF_EVENTS +static inline void load_mm_cr4(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.perf_rdpmc_allowed)) + cr4_set_bits(X86_CR4_PCE); + else + cr4_clear_bits(X86_CR4_PCE); +} +#else +static inline void load_mm_cr4(struct mm_struct *mm) {} +#endif + /* * Used for LDT copy/destruction. */ @@ -52,6 +64,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); + /* Load per-mm CR4 state */ + load_mm_cr4(next); + /* * Load the LDT, if the LDT is different. 
* @@ -87,6 +102,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); load_LDT_nolock(&next->context); } } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 73e84a348de1..bec5cff7dc80 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1328,8 +1329,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - if (x86_pmu.attr_rdpmc) - cr4_set_bits(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1805,14 +1804,44 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } + if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + return err; } +static void refresh_pce(void *ignored) +{ + if (current->mm) + load_mm_cr4(current->mm); +} + +static void x86_pmu_event_mapped(struct perf_event *event) +{ + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static void x86_pmu_event_unmapped(struct perf_event *event) +{ + if (!current->mm) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; - if (!x86_pmu.attr_rdpmc) + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { @@ -1830,16 +1859,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev, return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } -static void change_rdpmc(void *info) -{ - bool enable = !!(unsigned long)info; - - if (enable) - cr4_set_bits(X86_CR4_PCE); - else - cr4_clear_bits(X86_CR4_PCE); -} - static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) @@ -1854,11 +1873,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; - if (!!val != !!x86_pmu.attr_rdpmc) { - x86_pmu.attr_rdpmc = !!val; - on_each_cpu(change_rdpmc, (void *)val, 1); - } - + x86_pmu.attr_rdpmc = !!val; return count; } @@ -1901,6 +1916,9 @@ static struct pmu pmu = { .event_init = x86_pmu_event_init, + .event_mapped = x86_pmu_event_mapped, + .event_unmapped = x86_pmu_event_unmapped, + .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, @@ -1922,7 +1940,8 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_rdpmc = + !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; if (!sched_clock_stable()) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4e6cdb0ddc70..df525d2be1e8 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -71,6 +71,8 @@ struct event_constraint { #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define 
PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ + struct amd_nb { int nb_id; /* NorthBridge id */ -- cgit v1.2.3 From a66734297f78707ce39d756b656bfae861d53f62 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:13 -0700 Subject: perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks While perfmon2 is a sufficiently evil library (it pokes MSRs directly) that breaking it is fair game, it's still useful, so we might as well try to support it. This allows users to write 2 to /sys/devices/cpu/rdpmc to disable all rdpmc protection so that hack like perfmon2 can continue to work. At some point, if perf_event becomes fast enough to replace perfmon2, then this can go. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Linus Torvalds Link: http://lkml.kernel.org/r/caac3c1c707dcca48ecbc35f4def21495856f479.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 5 ++++- arch/x86/kernel/cpu/perf_event.c | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 89c1fece224e..883f6b933fa4 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -19,9 +19,12 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, #endif /* !CONFIG_PARAVIRT */ #ifdef CONFIG_PERF_EVENTS +extern struct static_key rdpmc_always_available; + static inline void load_mm_cr4(struct mm_struct *mm) { - if (atomic_read(&mm->context.perf_rdpmc_allowed)) + if (static_key_true(&rdpmc_always_available) || + atomic_read(&mm->context.perf_rdpmc_allowed)) cr4_set_bits(X86_CR4_PCE); else cr4_clear_bits(X86_CR4_PCE); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bec5cff7dc80..b71a7f86d68a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -45,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -1870,10 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (ret) return ret; + if (val > 2) + return -EINVAL; + if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; - x86_pmu.attr_rdpmc = !!val; + if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { + /* + * Changing into or out of always available, aka + * perf-event-bypassing mode. This path is extremely slow, + * but only root can trigger it, so it's okay. + */ + if (val == 2) + static_key_slow_inc(&rdpmc_always_available); + else + static_key_slow_dec(&rdpmc_always_available); + on_each_cpu(refresh_pce, NULL, 1); + } + + x86_pmu.attr_rdpmc = val; + return count; } -- cgit v1.2.3 From 1c2b364b225a5a93dbd1f317bd000d2fec2694be Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Thu, 5 Feb 2015 17:22:26 +0800 Subject: kvm: remove KVM_MMIO_SIZE After f78146b0f923, "KVM: Fix page-crossing MMIO", and 87da7e66a405, "KVM: x86: fix vcpu->mmio_fragments overflow", actually KVM_MMIO_SIZE is gone. 
Signed-off-by: Tiejun Chen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- include/linux/kvm_host.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9dbc7435cbc2..848947ac6ade 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,8 +38,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_MMIO_SIZE 16 - #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32d057571bf6..8a82838034f1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,10 +33,6 @@ #include -#ifndef KVM_MMIO_SIZE -#define KVM_MMIO_SIZE 8 -#endif - /* * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used * in kvm, other bits are visible for userspace which are defined in -- cgit v1.2.3 From 14d76b68f2819a1d0b50236a7e9e9f2ea69869d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:44 +0800 Subject: PCI: Use common resource list management code instead of private implementation Use common resource list management data structure and interfaces instead of private implementation. Signed-off-by: Jiang Liu Acked-by: Will Deacon Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- arch/arm/kernel/bios32.c | 5 ++--- arch/x86/pci/bus_numa.c | 4 ++-- drivers/pci/bus.c | 18 ++++++------------ drivers/pci/host-bridge.c | 8 ++++---- drivers/pci/host/pci-host-generic.c | 4 ++-- drivers/pci/host/pci-xgene.c | 4 ++-- drivers/pci/host/pcie-xilinx.c | 4 ++-- drivers/pci/probe.c | 10 +++++----- include/linux/pci.h | 9 ++------- 9 files changed, 27 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index a4effd6d8f2f..016991792b0b 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -422,17 +422,16 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) static int pcibios_init_resources(int busnr, struct pci_sys_data *sys) { int ret; - struct pci_host_bridge_window *window; + struct resource_entry *window; if (list_empty(&sys->resources)) { pci_add_resource_offset(&sys->resources, &iomem_resource, sys->mem_offset); } - list_for_each_entry(window, &sys->resources, list) { + resource_list_for_each_entry(window, &sys->resources) if (resource_type(window->res) == IORESOURCE_IO) return 0; - } sys->io_res.start = (busnr * SZ_64K) ? : pcibios_min_io; sys->io_res.end = (busnr + 1) * SZ_64K - 1; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index f3a2cfc14125..7bcf06a7cd12 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -31,7 +31,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) { struct pci_root_info *info = x86_find_pci_root_info(bus); struct pci_root_res *root_res; - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; if (!info) @@ -41,7 +41,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) bus); /* already added by acpi ? 
*/ - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 8fb16188cd82..90fa3a78fb7c 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -20,17 +20,16 @@ void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset) { - struct pci_host_bridge_window *window; + struct resource_entry *entry; - window = kzalloc(sizeof(struct pci_host_bridge_window), GFP_KERNEL); - if (!window) { + entry = resource_list_create_entry(res, 0); + if (!entry) { printk(KERN_ERR "PCI: can't add host bridge window %pR\n", res); return; } - window->res = res; - window->offset = offset; - list_add_tail(&window->list, resources); + entry->offset = offset; + resource_list_add_tail(entry, resources); } EXPORT_SYMBOL(pci_add_resource_offset); @@ -42,12 +41,7 @@ EXPORT_SYMBOL(pci_add_resource); void pci_free_resource_list(struct list_head *resources) { - struct pci_host_bridge_window *window, *tmp; - - list_for_each_entry_safe(window, tmp, resources, list) { - list_del(&window->list); - kfree(window); - } + resource_list_free(resources); } EXPORT_SYMBOL(pci_free_resource_list); diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index 0e5f3c95af5b..39b2dbe585aa 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -35,10 +35,10 @@ void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) { struct pci_host_bridge *bridge = find_pci_host_bridge(bus); - struct pci_host_bridge_window *window; + struct resource_entry *window; resource_size_t offset = 0; - list_for_each_entry(window, &bridge->windows, list) { + resource_list_for_each_entry(window, &bridge->windows) { if (resource_contains(window->res, res)) { offset = window->offset; break; @@ -60,10 +60,10 @@ void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, struct pci_bus_region *region) { struct pci_host_bridge *bridge = find_pci_host_bridge(bus); - struct pci_host_bridge_window *window; + struct resource_entry *window; resource_size_t offset = 0; - list_for_each_entry(window, &bridge->windows, list) { + resource_list_for_each_entry(window, &bridge->windows) { struct pci_bus_region bus_region; if (resource_type(res) != resource_type(window->res)) diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c index 6eb1aa75bd37..aee3c620ecf9 100644 --- a/drivers/pci/host/pci-host-generic.c +++ b/drivers/pci/host/pci-host-generic.c @@ -149,14 +149,14 @@ static int gen_pci_parse_request_of_pci_ranges(struct gen_pci *pci) struct device *dev = pci->host.dev.parent; struct device_node *np = dev->of_node; resource_size_t iobase; - struct pci_host_bridge_window *win; + struct resource_entry *win; err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pci->resources, &iobase); if (err) return err; - list_for_each_entry(win, &pci->resources, list) { + resource_list_for_each_entry(win, &pci->resources) { struct resource *parent, *res = win->res; switch (resource_type(res)) { diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c index b1d0596457c5..a704257bab7f 100644 --- a/drivers/pci/host/pci-xgene.c +++ b/drivers/pci/host/pci-xgene.c @@ -401,11 +401,11 @@ static int xgene_pcie_map_ranges(struct xgene_pcie_port *port, struct list_head *res, resource_size_t io_base) { - struct pci_host_bridge_window *window; + struct resource_entry *window; 
struct device *dev = port->dev; int ret; - list_for_each_entry(window, res, list) { + resource_list_for_each_entry(window, res) { struct resource *res = window->res; u64 restype = resource_type(res); diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c index ef3ebaf9a738..601261df7663 100644 --- a/drivers/pci/host/pcie-xilinx.c +++ b/drivers/pci/host/pcie-xilinx.c @@ -737,7 +737,7 @@ static int xilinx_pcie_parse_and_add_res(struct xilinx_pcie_port *port) resource_size_t offset; struct of_pci_range_parser parser; struct of_pci_range range; - struct pci_host_bridge_window *win; + struct resource_entry *win; int err = 0, mem_resno = 0; /* Get the ranges */ @@ -807,7 +807,7 @@ static int xilinx_pcie_parse_and_add_res(struct xilinx_pcie_port *port) free_resources: release_child_resources(&iomem_resource); - list_for_each_entry(win, &port->resources, list) + resource_list_for_each_entry(win, &port->resources) devm_kfree(dev, win->res); pci_free_resource_list(&port->resources); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 23212f8ae09b..8d2f400e96cb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1895,7 +1895,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, int error; struct pci_host_bridge *bridge; struct pci_bus *b, *b2; - struct pci_host_bridge_window *window, *n; + struct resource_entry *window, *n; struct resource *res; resource_size_t offset; char bus_addr[64]; @@ -1959,8 +1959,8 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, printk(KERN_INFO "PCI host bridge to bus %s\n", dev_name(&b->dev)); /* Add initial resources to the bus */ - list_for_each_entry_safe(window, n, resources, list) { - list_move_tail(&window->list, &bridge->windows); + resource_list_for_each_entry_safe(window, n, resources) { + list_move_tail(&window->node, &bridge->windows); res = window->res; offset = window->offset; if (res->flags & IORESOURCE_BUS) @@ -2060,12 +2060,12 @@ void pci_bus_release_busn_res(struct pci_bus *b) struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, struct list_head *resources) { - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; struct pci_bus *b; int max; - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/include/linux/pci.h b/include/linux/pci.h index 9603094ed59b..faa60fa26314 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -397,16 +398,10 @@ static inline int pci_channel_offline(struct pci_dev *pdev) return (pdev->error_state != pci_channel_io_normal); } -struct pci_host_bridge_window { - struct list_head list; - struct resource *res; /* host bridge aperture (CPU address) */ - resource_size_t offset; /* bus address + offset = CPU address */ -}; - struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* root bus */ - struct list_head windows; /* pci_host_bridge_windows */ + struct list_head windows; /* resource_entry */ void (*release_fn)(struct pci_host_bridge *); void *release_data; }; -- cgit v1.2.3 From 812dbd9994f122629db73205a7f7f46b430a6360 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:45 +0800 Subject: x86/PCI: Fix the range check for IO resources The range check in setup_res() checks the IO range against iomem_resource. That's just wrong. 
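(Editorial sketch of the one-line fix that follows, condensed from setup_resource(); variable names match the hunk below. The point: ioport_resource.end is 0xffff on x86, so an I/O window has to be clipped against the I/O root resource; comparing it against iomem_resource.end, which covers the whole physical address space, never clips anything.)

	if (addr.resource_type == ACPI_IO_RANGE)
		res_end = (u64)ioport_resource.end;	/* I/O port space, 0-0xffff */
	else
		res_end = (u64)iomem_resource.end;	/* MMIO, full physical address space */
	end = min(end, res_end);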
Reworked based on Thomas original patch. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- arch/x86/pci/acpi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index bb98afd0591e..1d725d99963d 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -283,7 +283,7 @@ static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - u64 start, orig_end, end; + u64 start, orig_end, end, res_end; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -293,8 +293,10 @@ static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) flags = IORESOURCE_MEM; if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) flags |= IORESOURCE_PREFETCH; + res_end = (u64)iomem_resource.end; } else if (addr.resource_type == ACPI_IO_RANGE) { flags = IORESOURCE_IO; + res_end = (u64)ioport_resource.end; } else return AE_OK; @@ -302,7 +304,7 @@ static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) orig_end = end = addr.address.maximum + addr.address.translation_offset; /* Exclude non-addressable range or non-addressable portion of range */ - end = min(end, (u64)iomem_resource.end); + end = min(end, res_end); if (end <= start) { dev_info(&info->bridge->dev, "host bridge window [%#llx-%#llx] " -- cgit v1.2.3 From 593669c2ac0fe18baee04a3cd5539a148aa48574 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:46 +0800 Subject: x86/PCI/ACPI: Use common ACPI resource interfaces to simplify implementation Use common ACPI resource discovery interfaces to simplify PCI host bridge resource enumeration. Signed-off-by: Jiang Liu Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. 
Wysocki --- arch/x86/pci/acpi.c | 295 ++++++++++++++++------------------------------------ 1 file changed, 91 insertions(+), 204 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1d725d99963d..6ac273832f28 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -10,9 +10,6 @@ struct pci_root_info { struct acpi_device *bridge; char name[16]; - unsigned int res_num; - struct resource *res; - resource_size_t *res_offset; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; @@ -218,132 +215,41 @@ static void teardown_mcfg_map(struct pci_root_info *info) } #endif -static acpi_status resource_to_addr(struct acpi_resource *resource, - struct acpi_resource_address64 *addr) -{ - acpi_status status; - struct acpi_resource_memory24 *memory24; - struct acpi_resource_memory32 *memory32; - struct acpi_resource_fixed_memory32 *fixed_memory32; - - memset(addr, 0, sizeof(*addr)); - switch (resource->type) { - case ACPI_RESOURCE_TYPE_MEMORY24: - memory24 = &resource->data.memory24; - addr->resource_type = ACPI_MEMORY_RANGE; - addr->address.minimum = memory24->minimum; - addr->address.address_length = memory24->address_length; - addr->address.maximum = addr->address.minimum + addr->address.address_length - 1; - return AE_OK; - case ACPI_RESOURCE_TYPE_MEMORY32: - memory32 = &resource->data.memory32; - addr->resource_type = ACPI_MEMORY_RANGE; - addr->address.minimum = memory32->minimum; - addr->address.address_length = memory32->address_length; - addr->address.maximum = addr->address.minimum + addr->address.address_length - 1; - return AE_OK; - case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: - fixed_memory32 = &resource->data.fixed_memory32; - addr->resource_type = ACPI_MEMORY_RANGE; - addr->address.minimum = fixed_memory32->address; - addr->address.address_length = fixed_memory32->address_length; - addr->address.maximum = addr->address.minimum + addr->address.address_length - 1; - return AE_OK; - case ACPI_RESOURCE_TYPE_ADDRESS16: - case ACPI_RESOURCE_TYPE_ADDRESS32: - case ACPI_RESOURCE_TYPE_ADDRESS64: - status = acpi_resource_to_address64(resource, addr); - if (ACPI_SUCCESS(status) && - (addr->resource_type == ACPI_MEMORY_RANGE || - addr->resource_type == ACPI_IO_RANGE) && - addr->address.address_length > 0) { - return AE_OK; - } - break; - } - return AE_ERROR; -} - -static acpi_status count_resource(struct acpi_resource *acpi_res, void *data) +static void validate_resources(struct device *dev, struct list_head *crs_res, + unsigned long type) { - struct pci_root_info *info = data; - struct acpi_resource_address64 addr; - acpi_status status; - - status = resource_to_addr(acpi_res, &addr); - if (ACPI_SUCCESS(status)) - info->res_num++; - return AE_OK; -} - -static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) -{ - struct pci_root_info *info = data; - struct resource *res; - struct acpi_resource_address64 addr; - acpi_status status; - unsigned long flags; - u64 start, orig_end, end, res_end; - - status = resource_to_addr(acpi_res, &addr); - if (!ACPI_SUCCESS(status)) - return AE_OK; - - if (addr.resource_type == ACPI_MEMORY_RANGE) { - flags = IORESOURCE_MEM; - if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) - flags |= IORESOURCE_PREFETCH; - res_end = (u64)iomem_resource.end; - } else if (addr.resource_type == ACPI_IO_RANGE) { - flags = IORESOURCE_IO; - res_end = (u64)ioport_resource.end; - } else - return AE_OK; - - start = addr.address.minimum + addr.address.translation_offset; - orig_end = end = addr.address.maximum + 
addr.address.translation_offset; - - /* Exclude non-addressable range or non-addressable portion of range */ - end = min(end, res_end); - if (end <= start) { - dev_info(&info->bridge->dev, - "host bridge window [%#llx-%#llx] " - "(ignored, not CPU addressable)\n", start, orig_end); - return AE_OK; - } else if (orig_end != end) { - dev_info(&info->bridge->dev, - "host bridge window [%#llx-%#llx] " - "([%#llx-%#llx] ignored, not CPU addressable)\n", - start, orig_end, end + 1, orig_end); - } + LIST_HEAD(list); + struct resource *res1, *res2, *root = NULL; + struct resource_entry *tmp, *entry, *entry2; - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - info->res_offset[info->res_num] = addr.address.translation_offset; - info->res_num++; + BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0); + root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource; - if (!pci_use_crs) - dev_printk(KERN_DEBUG, &info->bridge->dev, - "host bridge window %pR (ignored)\n", res); + list_splice_init(crs_res, &list); + resource_list_for_each_entry_safe(entry, tmp, &list) { + bool free = false; + resource_size_t end; - return AE_OK; -} - -static void coalesce_windows(struct pci_root_info *info, unsigned long type) -{ - int i, j; - struct resource *res1, *res2; - - for (i = 0; i < info->res_num; i++) { - res1 = &info->res[i]; + res1 = entry->res; if (!(res1->flags & type)) - continue; + goto next; + + /* Exclude non-addressable range or non-addressable portion */ + end = min(res1->end, root->end); + if (end <= res1->start) { + dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n", + res1); + free = true; + goto next; + } else if (res1->end != end) { + dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n", + res1, (unsigned long long)end + 1, + (unsigned long long)res1->end); + res1->end = end; + } - for (j = i + 1; j < info->res_num; j++) { - res2 = &info->res[j]; + resource_list_for_each_entry(entry2, crs_res) { + res2 = entry2->res; if (!(res2->flags & type)) continue; @@ -355,118 +261,92 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type) if (resource_overlaps(res1, res2)) { res2->start = min(res1->start, res2->start); res2->end = max(res1->end, res2->end); - dev_info(&info->bridge->dev, - "host bridge window expanded to %pR; %pR ignored\n", + dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n", res2, res1); - res1->flags = 0; + free = true; + goto next; } } + +next: + resource_list_del(entry); + if (free) + resource_list_free_entry(entry); + else + resource_list_add_tail(entry, crs_res); } } static void add_resources(struct pci_root_info *info, - struct list_head *resources) + struct list_head *resources, + struct list_head *crs_res) { - int i; - struct resource *res, *root, *conflict; - - coalesce_windows(info, IORESOURCE_MEM); - coalesce_windows(info, IORESOURCE_IO); + struct resource_entry *entry, *tmp; + struct resource *res, *conflict, *root = NULL; - for (i = 0; i < info->res_num; i++) { - res = &info->res[i]; + validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM); + validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO); + resource_list_for_each_entry_safe(entry, tmp, crs_res) { + res = entry->res; if (res->flags & IORESOURCE_MEM) root = &iomem_resource; else if (res->flags & IORESOURCE_IO) root = &ioport_resource; else - continue; + BUG_ON(res); conflict = insert_resource_conflict(root, res); - if (conflict) + 
if (conflict) { dev_info(&info->bridge->dev, "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); - else - pci_add_resource_offset(resources, res, - info->res_offset[i]); + resource_list_destroy_entry(entry); + } } -} -static void free_pci_root_info_res(struct pci_root_info *info) -{ - kfree(info->res); - info->res = NULL; - kfree(info->res_offset); - info->res_offset = NULL; - info->res_num = 0; + list_splice_tail(crs_res, resources); } -static void __release_pci_root_info(struct pci_root_info *info) +static void release_pci_root_info(struct pci_host_bridge *bridge) { - int i; struct resource *res; + struct resource_entry *entry; + struct pci_root_info *info = bridge->release_data; - for (i = 0; i < info->res_num; i++) { - res = &info->res[i]; - - if (!res->parent) - continue; - - if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) - continue; - - release_resource(res); + resource_list_for_each_entry(entry, &bridge->windows) { + res = entry->res; + if (res->parent && + (res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) + release_resource(res); } - free_pci_root_info_res(info); - teardown_mcfg_map(info); - kfree(info); } -static void release_pci_root_info(struct pci_host_bridge *bridge) -{ - struct pci_root_info *info = bridge->release_data; - - __release_pci_root_info(info); -} - static void probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, - int busnum, int domain) + int busnum, int domain, + struct list_head *list) { - size_t size; + int ret; + struct resource_entry *entry; sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); info->bridge = device; - - info->res_num = 0; - acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, - info); - if (!info->res_num) - return; - - size = sizeof(*info->res) * info->res_num; - info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node); - if (!info->res) { - info->res_num = 0; - return; - } - - size = sizeof(*info->res_offset) * info->res_num; - info->res_num = 0; - info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node); - if (!info->res_offset) { - kfree(info->res); - info->res = NULL; - return; - } - - acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, - info); + ret = acpi_dev_get_resources(device, list, + acpi_dev_filter_resource_type_cb, + (void *)(IORESOURCE_IO | IORESOURCE_MEM)); + if (ret < 0) + dev_warn(&device->dev, + "failed to parse _CRS method, error code %d\n", ret); + else if (ret == 0) + dev_dbg(&device->dev, + "no IO and memory resources present in _CRS\n"); + else + resource_list_for_each_entry(entry, list) + entry->res->name = info->name; } struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) @@ -475,6 +355,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; + struct resource_entry *res_entry; + LIST_HEAD(crs_res); LIST_HEAD(resources); struct pci_bus *bus; struct pci_sysdata *sd; @@ -522,18 +404,22 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(info); } else { - probe_pci_root_info(info, device, busnum, domain); - /* insert busn res at first */ pci_add_resource(&resources, &root->secondary); + /* * _CRS with no apertures is normal, so only fall back to * defaults or native bridge info if we're ignoring _CRS. 
*/ - if (pci_use_crs) - add_resources(info, &resources); - else { - free_pci_root_info_res(info); + probe_pci_root_info(info, device, busnum, domain, &crs_res); + if (pci_use_crs) { + add_resources(info, &resources, &crs_res); + } else { + resource_list_for_each_entry(res_entry, &crs_res) + dev_printk(KERN_DEBUG, &device->dev, + "host bridge window %pR (ignored)\n", + res_entry->res); + resource_list_free(&crs_res); x86_pci_root_bus_resources(busnum, &resources); } @@ -548,8 +434,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) to_pci_host_bridge(bus->bridge), release_pci_root_info, info); } else { - pci_free_resource_list(&resources); - __release_pci_root_info(info); + resource_list_free(&resources); + teardown_mcfg_map(info); + kfree(info); } } -- cgit v1.2.3 From b4b55cda587442477a3a9f0669e26bba4b7800c0 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:47 +0800 Subject: x86/PCI: Refine the way to release PCI IRQ resources Some PCI device drivers assume that pci_dev->irq won't change after calling pci_disable_device() and pci_enable_device() during suspend and resume. Commit c03b3b0738a5 ("x86, irq, mpparse: Release IOAPIC pin when PCI device is disabled") frees PCI IRQ resources when pci_disable_device() is called and reallocates IRQ resources when pci_enable_device() is called again. This breaks the above assumption. So commit 3eec595235c1 ("x86, irq, PCI: Keep IRQ assignment for PCI devices during suspend/hibernation") and 9eabc99a635a ("x86, irq, PCI: Keep IRQ assignment for runtime power management") fix the issue by avoiding freeing/reallocating IRQ resources during PCI device suspend/resume. They achieve this by checking dev.power.is_prepared and dev.power.runtime_status. The PM maintainer, Rafael, then pointed out that it's really an ugly fix which leaks PM internal state information to the IRQ subsystem. Recently David Vrabel also reported a regression in the pciback driver caused by commit cffe0a2b5a34 ("x86, irq: Keep balance of IOAPIC pin reference count"). Please refer to: http://lkml.org/lkml/2015/1/14/546 So this patch refines the way to release PCI IRQ resources. Instead of releasing PCI IRQ resources in pci_disable_device()/ pcibios_disable_device(), we now release them at the driver unbinding notification BUS_NOTIFY_UNBOUND_DRIVER. In other words, we only release PCI IRQ resources when there's no driver bound to the PCI device, and it keeps the assumption that pci_dev->irq won't change through multiple invocations of pci_enable_device()/pci_disable_device(). Signed-off-by: Jiang Liu Signed-off-by: Rafael J.
Wysocki --- arch/x86/include/asm/pci_x86.h | 2 -- arch/x86/pci/common.c | 34 ++++++++++++++++++++++++++++------ arch/x86/pci/intel_mid_pci.c | 4 ++-- arch/x86/pci/irq.c | 15 +-------------- drivers/acpi/pci_irq.c | 9 +-------- 5 files changed, 32 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8d3c3d..fa1195dae425 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7b20bccf3648..ff1f0afa5ed1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -497,6 +497,31 @@ void __init pcibios_set_cache_line_size(void) } } +/* + * Some device drivers assume dev->irq won't change after calling + * pci_disable_device(). So delay releasing of IRQ resource to driver + * unbinding time. Otherwise it will break PM subsystem and drivers + * like xen-pciback etc. + */ +static int pci_irq_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct pci_dev *dev = to_pci_dev(data); + + if (action != BUS_NOTIFY_UNBOUND_DRIVER) + return NOTIFY_DONE; + + if (pcibios_disable_irq) + pcibios_disable_irq(dev); + + return NOTIFY_OK; +} + +static struct notifier_block pci_irq_nb = { + .notifier_call = pci_irq_notifier, + .priority = INT_MIN, +}; + int __init pcibios_init(void) { if (!raw_pci_ops) { @@ -509,6 +534,9 @@ int __init pcibios_init(void) if (pci_bf_sort >= pci_force_bf) pci_sort_breadthfirst(); + + bus_register_notifier(&pci_bus_type, &pci_irq_nb); + return 0; } @@ -667,12 +695,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return 0; } -void pcibios_disable_device (struct pci_dev *dev) -{ - if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) - pcibios_disable_irq(dev); -} - int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 44b9271580b5..95c2471f6819 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && - dev->irq > 0) { + if (dev->irq_managed && dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; + dev->irq = 0; } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5dc6ca5e1741..e71b3dbd87b8 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1256,22 +1256,9 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } -bool mp_should_keep_irq(struct device *dev) -{ - if (dev->power.is_prepared) - return true; -#ifdef CONFIG_PM - if (dev->power.runtime_status == RPM_SUSPENDING) - return true; -#endif - - return false; -} - static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && - dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); dev->irq = 0; dev->irq_managed = 0; diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index b1def411c0b8..e7f718d6918a 100644 --- 
a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -485,14 +485,6 @@ void acpi_pci_irq_disable(struct pci_dev *dev) if (!pin || !dev->irq_managed || dev->irq <= 0) return; - /* Keep IOAPIC pin configuration when suspending */ - if (dev->dev.power.is_prepared) - return; -#ifdef CONFIG_PM - if (dev->dev.power.runtime_status == RPM_SUSPENDING) - return; -#endif - entry = acpi_pci_irq_lookup(dev, pin); if (!entry) return; @@ -513,5 +505,6 @@ void acpi_pci_irq_disable(struct pci_dev *dev) if (gsi >= 0) { acpi_unregister_gsi(gsi); dev->irq_managed = 0; + dev->irq = 0; } } -- cgit v1.2.3 From 2fad93083e288a7d24bf7284c87f0d80a5a2c599 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Thu, 5 Feb 2015 17:33:14 +0800 Subject: ACPI / table: remove duplicate NULL check for the handler of acpi_table_parse() In acpi_table_parse(), pointer of the table to pass to handler() is checked before handler() called, so remove all the duplicate NULL check in the handler function. CC: Tony Luck CC: Thomas Gleixner Signed-off-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- arch/ia64/kernel/acpi.c | 6 ------ arch/x86/kernel/acpi/boot.c | 16 ++-------------- drivers/acpi/numa.c | 12 ++---------- 3 files changed, 4 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index e795cb848154..2c4498919d3c 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -380,9 +380,6 @@ static void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) static int __init acpi_parse_madt(struct acpi_table_header *table) { - if (!table) - return -EINVAL; - acpi_madt = (struct acpi_table_madt *)table; acpi_madt_rev = acpi_madt->header.revision; @@ -645,9 +642,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) struct acpi_table_header *fadt_header; struct acpi_table_fadt *fadt; - if (!table) - return -EINVAL; - fadt_header = (struct acpi_table_header *)table; if (fadt_header->revision != 3) return -ENODEV; /* Only deal with ACPI 2.0 FADT */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b9e30daa0881..14d3652ece8d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -843,13 +843,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) static int __init acpi_parse_sbf(struct acpi_table_header *table) { - struct acpi_table_boot *sb; - - sb = (struct acpi_table_boot *)table; - if (!sb) { - printk(KERN_WARNING PREFIX "Unable to map SBF\n"); - return -ENODEV; - } + struct acpi_table_boot *sb = (struct acpi_table_boot *)table; sbf_port = sb->cmos_index; /* Save CMOS port */ @@ -863,13 +857,7 @@ static struct resource *hpet_res __initdata; static int __init acpi_parse_hpet(struct acpi_table_header *table) { - struct acpi_table_hpet *hpet_tbl; - - hpet_tbl = (struct acpi_table_hpet *)table; - if (!hpet_tbl) { - printk(KERN_WARNING PREFIX "Unable to map HPET\n"); - return -ENODEV; - } + struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { printk(KERN_WARNING PREFIX "HPET timers must be located in " diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 24b5476449a1..1333cbdc3ea2 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -177,12 +177,7 @@ static int __init slit_valid(struct acpi_table_slit *slit) static int __init acpi_parse_slit(struct acpi_table_header *table) { - struct acpi_table_slit *slit; - - if (!table) - return -EINVAL; - - slit = (struct acpi_table_slit *)table; + struct 
acpi_table_slit *slit = (struct acpi_table_slit *)table; if (!slit_valid(slit)) { printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); @@ -260,11 +255,8 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header, static int __init acpi_parse_srat(struct acpi_table_header *table) { - struct acpi_table_srat *srat; - if (!table) - return -EINVAL; + struct acpi_table_srat *srat = (struct acpi_table_srat *)table; - srat = (struct acpi_table_srat *)table; acpi_srat_revision = srat->header.revision; /* Real work done in acpi_table_parse_srat below. */ -- cgit v1.2.3 From f7819512996361280b86259222456fcf15aad926 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Feb 2015 18:20:58 +0100 Subject: kvm: add halt_poll_ns module parameter This patch introduces a new module parameter for the KVM module; when it is present, KVM attempts a bit of polling on every HLT before scheduling itself out via kvm_vcpu_block. This parameter helps a lot for latency-bound workloads---in particular I tested it with O_DSYNC writes with a battery-backed disk in the host. In this case, writes are fast (because the data doesn't have to go all the way to the platters) but they cannot be merged by either the host or the guest. KVM's performance here is usually around 30% of bare metal, or 50% if you use cache=directsync or cache=writethrough (these parameters avoid that the guest sends pointless flush requests, and at the same time they are not slow because of the battery-backed cache). The bad performance happens because on every halt the host CPU decides to halt itself too. When the interrupt comes, the vCPU thread is then migrated to a new physical CPU, and in general the latency is horrible because the vCPU thread has to be scheduled back in. With this patch performance reaches 60-65% of bare metal and, more important, 99% of what you get if you use idle=poll in the guest. This means that the tunable gets rid of this particular bottleneck, and more work can be done to improve performance in the kernel or QEMU. Of course there is some price to pay; every time an otherwise idle vCPUs is interrupted by an interrupt, it will poll unnecessarily and thus impose a little load on the host. The above results were obtained with a mostly random value of the parameter (500000), and the load was around 1.5-2.5% CPU usage on one of the host's core for each idle guest vCPU. The patch also adds a new stat, /sys/kernel/debug/kvm/halt_successful_poll, that can be used to tune the parameter. It counts how many HLT instructions received an interrupt during the polling period; each successful poll avoids that Linux schedules the VCPU thread out and back in, and may also avoid a likely trip to C1 and back for the physical CPU. While the VM is idle, a Linux 4 VCPU VM halts around 10 times per second. Of these halts, almost all are failed polls. During the benchmark, instead, basically all halts end within the polling period, except a more or less constant stream of 50 per second coming from vCPUs that are not running the benchmark. The wasted time is thus very low. Things may be slightly different for Windows VMs, which have a ~10 ms timer tick. The effect is also visible on Marcelo's recently-introduced latency test for the TSC deadline timer. Though of course a non-RT kernel has awful latency bounds, the latency of the timer is around 8000-10000 clock cycles compared to 20000-120000 without setting halt_poll_ns. For the TSC deadline timer, thus, the effect is both a smaller average latency and a smaller variance. 
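Reduced to a sketch, the new behaviour is a bounded busy-wait in front of the existing sleep path. This is illustrative only; the authoritative code is the kvm_vcpu_block() change in the virt/kvm/kvm_main.c hunk below, and kvm_vcpu_check_block() and halt_poll_ns are the names the patch introduces:

	if (halt_poll_ns) {
		ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);

		do {
			/* An interrupt (or pending timer/signal) ended the halt during the poll. */
			if (kvm_vcpu_check_block(vcpu) < 0) {
				++vcpu->stat.halt_successful_poll;
				return;
			}
		} while (single_task_running() && ktime_before(ktime_get(), stop));
	}
	/* Nothing arrived within halt_poll_ns: fall back to the normal wait loop. */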
Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_host.h | 1 + arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/mips.c | 1 + arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s.c | 1 + arch/powerpc/kvm/booke.c | 1 + arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 1 + include/trace/events/kvm.h | 19 +++++++++++++++ virt/kvm/kvm_main.c | 48 +++++++++++++++++++++++++++++++------ 13 files changed, 71 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index bde494654bcc..6a79314bc1df 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -148,6 +148,7 @@ struct kvm_vm_stat { }; struct kvm_vcpu_stat { + u32 halt_successful_poll; u32 halt_wakeup; }; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2c49aa4ac818..8efde89613f2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -165,6 +165,7 @@ struct kvm_vm_stat { }; struct kvm_vcpu_stat { + u32 halt_successful_poll; u32 halt_wakeup; }; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f2c249796ea8..ac4fc716062b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -120,6 +120,7 @@ struct kvm_vcpu_stat { u32 resvd_inst_exits; u32 break_inst_exits; u32 flush_dcache_exits; + u32 halt_successful_poll; u32 halt_wakeup; }; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e97b90784031..c9eccf5df912 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU }, { "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU }, { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU }, {NULL} }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7efd666a3fa7..8ef05121d3cd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -107,6 +107,7 @@ struct kvm_vcpu_stat { u32 emulated_inst_exits; u32 dec_exits; u32 ext_intr_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 dbell_exits; u32 gdbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 888bf466d8c6..cfbcdc654201 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -52,6 +52,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, { "sp_storage", VCPU_STAT(sp_storage) }, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 9b55dec2d6cc..6c1316a15a27 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -62,6 +62,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "inst_emu", VCPU_STAT(emulated_inst_exits) }, { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { 
"doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d1ecc7fd0579..f79058e3fd98 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -196,6 +196,7 @@ struct kvm_vcpu_stat { u32 exit_stop_request; u32 exit_validity; u32 exit_instruction; + u32 halt_successful_poll; u32 halt_wakeup; u32 instruction_lctl; u32 instruction_lctlg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b2371c0fd1f8..1dbab2340a66 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -51,6 +51,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_instruction", VCPU_STAT(exit_instruction) }, { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 848947ac6ade..a236e39cc385 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -655,6 +655,7 @@ struct kvm_vcpu_stat { u32 irq_window_exits; u32 nmi_window_exits; u32 halt_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1373e04e1f19..bd7a70be41b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -145,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_window", VCPU_STAT(irq_window_exits) }, { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 6edf1f2028cd..6bfe7eec1c2c 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -37,6 +37,25 @@ TRACE_EVENT(kvm_userspace_exit, __entry->errno < 0 ? -__entry->errno : __entry->reason) ); +TRACE_EVENT(kvm_vcpu_wakeup, + TP_PROTO(__u64 ns, bool waited), + TP_ARGS(ns, waited), + + TP_STRUCT__entry( + __field( __u64, ns ) + __field( bool, waited ) + ), + + TP_fast_assign( + __entry->ns = ns; + __entry->waited = waited; + ), + + TP_printk("%s time %lld ns", + __entry->waited ? 
"wait" : "poll", + __entry->ns) +); + #if defined(CONFIG_HAVE_KVM_IRQFD) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0c281760a1c5..32449e0e9aa8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -66,6 +66,9 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +unsigned int halt_poll_ns = 0; +module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR); + /* * Ordering of locks: * @@ -1813,29 +1816,60 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(mark_page_dirty); +static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_vcpu_runnable(vcpu)) { + kvm_make_request(KVM_REQ_UNHALT, vcpu); + return -EINTR; + } + if (kvm_cpu_has_pending_timer(vcpu)) + return -EINTR; + if (signal_pending(current)) + return -EINTR; + + return 0; +} + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ void kvm_vcpu_block(struct kvm_vcpu *vcpu) { + ktime_t start, cur; DEFINE_WAIT(wait); + bool waited = false; + + start = cur = ktime_get(); + if (halt_poll_ns) { + ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns); + do { + /* + * This sets KVM_REQ_UNHALT if an interrupt + * arrives. + */ + if (kvm_vcpu_check_block(vcpu) < 0) { + ++vcpu->stat.halt_successful_poll; + goto out; + } + cur = ktime_get(); + } while (single_task_running() && ktime_before(cur, stop)); + } for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if (kvm_arch_vcpu_runnable(vcpu)) { - kvm_make_request(KVM_REQ_UNHALT, vcpu); - break; - } - if (kvm_cpu_has_pending_timer(vcpu)) - break; - if (signal_pending(current)) + if (kvm_vcpu_check_block(vcpu) < 0) break; + waited = true; schedule(); } finish_wait(&vcpu->wq, &wait); + cur = ktime_get(); + +out: + trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited); } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -- cgit v1.2.3 From 92082a8886f30a1c492a31ac4b5a4966bb64b1a6 Mon Sep 17 00:00:00 2001 From: Ken Xue Date: Fri, 6 Feb 2015 08:27:51 +0800 Subject: ACPI: add AMD ACPI2Platform device support for x86 system This new feature is to interpret AMD specific ACPI device to platform device such as I2C, UART, GPIO found on AMD CZ and later chipsets. It based on example intel LPSS. Now, it can support AMD I2C, UART and GPIO. Signed-off-by: Ken Xue Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- arch/x86/Kconfig | 11 ++++ drivers/acpi/Makefile | 2 +- drivers/acpi/acpi_apd.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/internal.h | 2 + drivers/acpi/scan.c | 1 + 5 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 drivers/acpi/acpi_apd.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0dc9d0144a27..3e15cee34aee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -496,6 +496,17 @@ config X86_INTEL_LPSS things like clock tree (common clock framework) and pincontrol which are needed by the LPSS peripheral drivers. +config X86_AMD_PLATFORM_DEVICE + bool "AMD ACPI2Platform devices support" + depends on ACPI + select COMMON_CLK + select PINCTRL + ---help--- + Select to interpret AMD specific ACPI device to platform device + such as I2C, UART, GPIO found on AMD Carrizo and later chipsets. + I2C and UART depend on COMMON_CLK to set clock. GPIO driver is + implemented under PINCTRL subsystem. 
+ config IOSF_MBI tristate "Intel SoC IOSF Sideband support for SoC platforms" depends on PCI diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index f74317cc1ca9..0071141b6bbc 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -40,7 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o acpi-y += ec.o acpi-$(CONFIG_ACPI_DOCK) += dock.o acpi-y += pci_root.o pci_link.o pci_irq.o -acpi-y += acpi_lpss.o +acpi-y += acpi_lpss.o acpi_apd.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o acpi-y += int340x_thermal.o diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c new file mode 100644 index 000000000000..3984ea96e5f7 --- /dev/null +++ b/drivers/acpi/acpi_apd.c @@ -0,0 +1,150 @@ +/* + * AMD ACPI support for ACPI2platform device. + * + * Copyright (c) 2014,2015 AMD Corporation. + * Authors: Ken Xue + * Wu, Jeff + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +ACPI_MODULE_NAME("acpi_apd"); +struct apd_private_data; + +/** + * ACPI_APD_SYSFS : add device attributes in sysfs + * ACPI_APD_PM : attach power domain to device + */ +#define ACPI_APD_SYSFS BIT(0) +#define ACPI_APD_PM BIT(1) + +/** + * struct apd_device_desc - a descriptor for apd device + * @flags: device flags like %ACPI_APD_SYSFS, %ACPI_APD_PM + * @fixed_clk_rate: fixed rate input clock source for acpi device; + * 0 means no fixed rate input clock source + * @setup: a hook routine to set device resource during create platform device + * + * Device description defined as acpi_device_id.driver_data + */ +struct apd_device_desc { + unsigned int flags; + unsigned int fixed_clk_rate; + int (*setup)(struct apd_private_data *pdata); +}; + +struct apd_private_data { + struct clk *clk; + struct acpi_device *adev; + const struct apd_device_desc *dev_desc; +}; + +#ifdef CONFIG_X86_AMD_PLATFORM_DEVICE +#define APD_ADDR(desc) ((unsigned long)&desc) + +static int acpi_apd_setup(struct apd_private_data *pdata) +{ + const struct apd_device_desc *dev_desc = pdata->dev_desc; + struct clk *clk = ERR_PTR(-ENODEV); + + if (dev_desc->fixed_clk_rate) { + clk = clk_register_fixed_rate(&pdata->adev->dev, + dev_name(&pdata->adev->dev), + NULL, CLK_IS_ROOT, + dev_desc->fixed_clk_rate); + clk_register_clkdev(clk, NULL, dev_name(&pdata->adev->dev)); + pdata->clk = clk; + } + + return 0; +} + +static struct apd_device_desc cz_i2c_desc = { + .setup = acpi_apd_setup, + .fixed_clk_rate = 133000000, +}; + +static struct apd_device_desc cz_uart_desc = { + .setup = acpi_apd_setup, + .fixed_clk_rate = 48000000, +}; + +#else + +#define APD_ADDR(desc) (0UL) + +#endif /* CONFIG_X86_AMD_PLATFORM_DEVICE */ + +/** +* Create platform device during acpi scan attach handle. +* Return value > 0 on success of creating device. +*/ +static int acpi_apd_create_device(struct acpi_device *adev, + const struct acpi_device_id *id) +{ + const struct apd_device_desc *dev_desc = (void *)id->driver_data; + struct apd_private_data *pdata; + struct platform_device *pdev; + int ret; + + if (!dev_desc) { + pdev = acpi_create_platform_device(adev); + return IS_ERR_OR_NULL(pdev) ? 
PTR_ERR(pdev) : 1; + } + + pdata = kzalloc(sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + pdata->adev = adev; + pdata->dev_desc = dev_desc; + + if (dev_desc->setup) { + ret = dev_desc->setup(pdata); + if (ret) + goto err_out; + } + + adev->driver_data = pdata; + pdev = acpi_create_platform_device(adev); + if (!IS_ERR_OR_NULL(pdev)) + return 1; + + ret = PTR_ERR(pdev); + adev->driver_data = NULL; + + err_out: + kfree(pdata); + return ret; +} + +static const struct acpi_device_id acpi_apd_device_ids[] = { + /* Generic apd devices */ + { "AMD0010", APD_ADDR(cz_i2c_desc) }, + { "AMD0020", APD_ADDR(cz_uart_desc) }, + { "AMD0030", }, + { } +}; + +static struct acpi_scan_handler apd_handler = { + .ids = acpi_apd_device_ids, + .attach = acpi_apd_create_device, +}; + +void __init acpi_apd_init(void) +{ + acpi_scan_add_handler(&apd_handler); +} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 163e82f536fa..c24ae9d7fdcb 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -68,6 +68,8 @@ static inline void acpi_debugfs_init(void) { return; } #endif void acpi_lpss_init(void); +void acpi_apd_init(void); + acpi_status acpi_hotplug_schedule(struct acpi_device *adev, u32 src); bool acpi_queue_hotplug_work(struct work_struct *work); void acpi_device_hotplug(struct acpi_device *adev, u32 src); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index dc4d8960684a..bbca7830e18a 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2544,6 +2544,7 @@ int __init acpi_scan_init(void) acpi_pci_link_init(); acpi_processor_init(); acpi_lpss_init(); + acpi_apd_init(); acpi_cmos_rtc_init(); acpi_container_init(); acpi_memory_hotplug_init(); -- cgit v1.2.3 From d44e1212230a68f9dccd1a95b5c8ca5217c62094 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 9 Feb 2015 10:02:05 +0100 Subject: KVM: x86: emulate: correct page fault error code for NoWrite instructions NoWrite instructions (e.g. cmp or test) never set the "write access" bit in the error code, even if one of the operands is treated as a destination. Fixes: c205fb7d7d4f81e46fc577b707ceb9e356af1456 Cc: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 81dcf7964701..a943bf0c06d0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4954,7 +4954,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); if (rc != X86EMUL_CONTINUE) { - if (rc == X86EMUL_PROPAGATE_FAULT && + if (!(ctxt->d & NoWrite) && + rc == X86EMUL_PROPAGATE_FAULT && ctxt->exception.vector == PF_VECTOR) ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; -- cgit v1.2.3 From a2413d8b2982fb6823cdb66704938a9a345d0a7d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 9 Feb 2015 09:36:53 -0800 Subject: x86/mce: Fix regression. All error records should report via /dev/mcelog I'm getting complaints from validation teams that have updated their Linux kernels from ancient versions to current. They don't see the error logs they expect. I tell the to unload any EDAC drivers[1], and things start working again. 
The problem is that we short-circuit the logging process if any function on the decoder chain claims to have dealt with the problem: ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); if (ret == NOTIFY_STOP) return; The logic we used when we added this code was that we did not want to confuse users with double reports of the same error. But it turns out users are not confused - they are upset that they don't see a log where their tools used to find a log. I could also get into a long description of how the consumer of this log does more than just decode model specific details of the error. It keeps counts, tracks thresholds, takes actions and runs scripts that can alert administrators to problems. [1] We've recently compounded the problem because the acpi_extlog driver also registers for this notifier and also returns NOTIFY_STOP. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d2c611699cd9..f439c429c133 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -150,14 +150,11 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; - int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); - if (ret == NOTIFY_STOP) - return; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); mce->finished = 0; wmb(); -- cgit v1.2.3 From 3a9a941d0b9361eac81fb763a89fb465f70b1c28 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 15 Aug 2014 02:39:52 -0400 Subject: tools/power turbostat: decode MSR_*_PERF_LIMIT_REASONS The Processor generation code-named Haswell added MSR_{CORE | GFX | RING}_PERF_LIMIT_REASONS to explain when and how the processor limits frequency. turbostat -v will now decode these bits. Each MSR has an "Active" set of bits which describe current conditions, and a "Logged" set of bits, which describe what has happened since last cleared. Turbostat currently doesn't clear the log bits. 
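Each of these MSRs keeps the "Active" reasons in its low 16 bits and the corresponding "Logged" reasons 16 bits higher, so decoding one reason amounts to testing the same bit in both halves. A minimal sketch, using turbostat's existing get_msr() helper (the full decode, with all the bit names, is in print_perf_limit() in the turbostat.c hunk below):

	unsigned long long msr;

	get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
	if (msr & (1 << 0))	/* Active: PROCHOT is limiting frequency right now */
		fprintf(stderr, "cpu%d: PROCHOT active\n", cpu);
	if (msr & (1 << 16))	/* Logged: PROCHOT has limited since the log was last cleared */
		fprintf(stderr, "cpu%d: PROCHOT logged\n", cpu);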
Signed-off-by: Len Brown --- arch/x86/include/uapi/asm/msr-index.h | 4 ++ tools/power/x86/turbostat/turbostat.c | 126 ++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c8aa65d56027..f721330541cb 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -152,6 +152,10 @@ #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 +#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 +#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 +#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6f29fc11fde6..58913096d7b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -83,6 +83,9 @@ unsigned int tcc_activation_temp; unsigned int tcc_activation_temp_override; double rapl_power_units, rapl_energy_units, rapl_time_units; double rapl_joule_counter_range; +unsigned int do_core_perf_limit_reasons; +unsigned int do_gfx_perf_limit_reasons; +unsigned int do_ring_perf_limit_reasons; #define RAPL_PKG (1 << 0) /* 0x610 MSR_PKG_POWER_LIMIT */ @@ -1178,6 +1181,7 @@ print_nhm_turbo_ratio_limits: if (ratio) fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 1 active cores\n", ratio, bclk, ratio * bclk); + } void free_all_buffers(void) @@ -1594,6 +1598,103 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } +/* + * print_perf_limit() + */ +int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + int cpu; + + cpu = t->cpu_id; + + /* per-package */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (do_core_perf_limit_reasons) { + get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr); + fprintf(stderr, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); + fprintf(stderr, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + (msr & 1 << 0) ? "PROCHOT, " : "", + (msr & 1 << 1) ? "ThermStatus, " : "", + (msr & 1 << 2) ? "bit2, " : "", + (msr & 1 << 4) ? "Graphics, " : "", + (msr & 1 << 5) ? "Auto-HWP, " : "", + (msr & 1 << 6) ? "VR-Therm, " : "", + (msr & 1 << 8) ? "Amps, " : "", + (msr & 1 << 9) ? "CorePwr, " : "", + (msr & 1 << 10) ? "PkgPwrL1, " : "", + (msr & 1 << 11) ? "PkgPwrL2, " : "", + (msr & 1 << 12) ? "MultiCoreTurbo, " : "", + (msr & 1 << 13) ? "Transitions, " : "", + (msr & 1 << 14) ? "bit14, " : "", + (msr & 1 << 15) ? "bit15, " : ""); + fprintf(stderr, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", + (msr & 1 << 16) ? "PROCHOT, " : "", + (msr & 1 << 17) ? "ThermStatus, " : "", + (msr & 1 << 18) ? "bit18, " : "", + (msr & 1 << 20) ? "Graphics, " : "", + (msr & 1 << 21) ? "Auto-HWP, " : "", + (msr & 1 << 22) ? "VR-Therm, " : "", + (msr & 1 << 24) ? "Amps, " : "", + (msr & 1 << 25) ? "CorePwr, " : "", + (msr & 1 << 26) ? "PkgPwrL1, " : "", + (msr & 1 << 27) ? "PkgPwrL2, " : "", + (msr & 1 << 28) ? "MultiCoreTurbo, " : "", + (msr & 1 << 29) ? "Transitions, " : "", + (msr & 1 << 30) ? "bit30, " : "", + (msr & 1 << 31) ? 
"bit31, " : ""); + + } + if (do_gfx_perf_limit_reasons) { + get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr); + fprintf(stderr, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); + fprintf(stderr, " (Active: %s%s%s%s%s%s%s%s)", + (msr & 1 << 0) ? "PROCHOT, " : "", + (msr & 1 << 1) ? "ThermStatus, " : "", + (msr & 1 << 4) ? "Graphics, " : "", + (msr & 1 << 6) ? "VR-Therm, " : "", + (msr & 1 << 8) ? "Amps, " : "", + (msr & 1 << 9) ? "GFXPwr, " : "", + (msr & 1 << 10) ? "PkgPwrL1, " : "", + (msr & 1 << 11) ? "PkgPwrL2, " : ""); + fprintf(stderr, " (Logged: %s%s%s%s%s%s%s%s)\n", + (msr & 1 << 16) ? "PROCHOT, " : "", + (msr & 1 << 17) ? "ThermStatus, " : "", + (msr & 1 << 20) ? "Graphics, " : "", + (msr & 1 << 22) ? "VR-Therm, " : "", + (msr & 1 << 24) ? "Amps, " : "", + (msr & 1 << 25) ? "GFXPwr, " : "", + (msr & 1 << 26) ? "PkgPwrL1, " : "", + (msr & 1 << 27) ? "PkgPwrL2, " : ""); + } + if (do_ring_perf_limit_reasons) { + get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); + fprintf(stderr, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); + fprintf(stderr, " (Active: %s%s%s%s%s%s)", + (msr & 1 << 0) ? "PROCHOT, " : "", + (msr & 1 << 1) ? "ThermStatus, " : "", + (msr & 1 << 6) ? "VR-Therm, " : "", + (msr & 1 << 8) ? "Amps, " : "", + (msr & 1 << 10) ? "PkgPwrL1, " : "", + (msr & 1 << 11) ? "PkgPwrL2, " : ""); + fprintf(stderr, " (Logged: %s%s%s%s%s%s)\n", + (msr & 1 << 16) ? "PROCHOT, " : "", + (msr & 1 << 17) ? "ThermStatus, " : "", + (msr & 1 << 22) ? "VR-Therm, " : "", + (msr & 1 << 24) ? "Amps, " : "", + (msr & 1 << 26) ? "PkgPwrL1, " : "", + (msr & 1 << 27) ? "PkgPwrL2, " : ""); + } + return 0; +} + #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ @@ -1683,6 +1784,27 @@ void rapl_probe(unsigned int family, unsigned int model) return; } +void perf_limit_reasons_probe(family, model) +{ + if (!genuine_intel) + return; + + if (family != 6) + return; + + switch (model) { + case 0x3C: /* HSW */ + case 0x45: /* HSW */ + case 0x46: /* HSW */ + do_gfx_perf_limit_reasons = 1; + case 0x3F: /* HSX */ + do_core_perf_limit_reasons = 1; + do_ring_perf_limit_reasons = 1; + default: + return; + } +} + int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -2104,6 +2226,7 @@ void check_cpuid() do_nehalem_turbo_ratio_limit = has_nehalem_turbo_ratio_limit(family, model); do_ivt_turbo_ratio_limit = has_ivt_turbo_ratio_limit(family, model); rapl_probe(family, model); + perf_limit_reasons_probe(family, model); return; } @@ -2341,6 +2464,9 @@ void turbostat_init() if (verbose) for_all_cpus(print_epb, ODD_COUNTERS); + if (verbose) + for_all_cpus(print_perf_limit, ODD_COUNTERS); + if (verbose) for_all_cpus(print_rapl, ODD_COUNTERS); -- cgit v1.2.3 From dab2087defcb4afb2a5574d268b87257ba0f6b22 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 9 Feb 2015 22:44:07 +0100 Subject: KVM: x86: fix build with !CONFIG_SMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit isn't included directly and without CONFIG_SMP, an option that automagically pulls it can't be enabled. 
Reported-by: Jim Davis Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e112472b0b3..3f73bfad0349 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "trace.h" -- cgit v1.2.3 From ece84b390ab0ceada9c771749455f3594c36e3df Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:08:19 -0800 Subject: hugetlb, x86: register 1G page size if we can allocate them at runtime After commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") we can allocate 1G pages at runtime if CMA is enabled. Let's register 1G pages into hugetlb even if the user hasn't requested them explicitly at boot time with hugepagesz=1G. Signed-off-by: Kirill A. Shutemov Reviewed-by: Luiz Capitulino Cc: Naoya Horiguchi Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8b977ebf9388..bca0aa3a003f 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -178,4 +178,15 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_CMA +static __init int gigantic_pages_init(void) +{ + /* With CMA we can allocate gigantic pages at runtime */ + if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif #endif -- cgit v1.2.3 From 0a191362058391878cc2a4d4ccddcd8223eb4f79 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:22 -0800 Subject: x86: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 38 +---------------------------------- arch/x86/include/asm/pgtable-3level.h | 12 ----------- arch/x86/include/asm/pgtable.h | 20 ------------------ arch/x86/include/asm/pgtable_64.h | 6 +----- arch/x86/include/asm/pgtable_types.h | 3 --- 5 files changed, 2 insertions(+), 77 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 206a87fdd22d..fd74a11959de 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range. 
- */ -#define PTE_FILE_MAX_BITS 29 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + - _PAGE_FILE, - }; -} - /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 81bb91b49a88..cdaa58c9b39e 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -/* - * Bits 0, 6 and 7 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - * - * For soft-dirty tracking 11 bit is taken from - * the low part of pte as well. 
- */ -#define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) \ - ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) -#define PTE_FILE_MAX_BITS 32 - /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8a5454acc99..0fe03f834fb1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte) return pte_flags(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) -{ - return pte_flags(pte) & _PAGE_FILE; -} - static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; @@ -329,21 +324,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SOFT_DIRTY; -} - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4572b2f30237..e227970f983e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* PUD - Level3 access */ /* PMD - Level 2 access */ -#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ - _PAGE_FILE }) -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. */ @@ -145,7 +141,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #ifdef CONFIG_NUMA_BALANCING /* Automatic NUMA balancing needs to be distinguishable from swap entries */ #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 25bcd4a89517..5185a4f599ec 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -38,8 +38,6 @@ /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -/* - set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -114,7 +112,6 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ -- cgit v1.2.3 From d1c29465b8a52d8fc5a59aac92c6b206b69fe631 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: don't disable iospace. This no longer speeds up boot (IDE got better, I guess), but it does stop us probing for a PCI bus. 
Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index c1c1544b8485..47ec7f201d27 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1399,14 +1399,6 @@ __init void lguest_init(void) /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* - * The IDE code spends about 3 seconds probing for disks: if we reserve - * all the I/O ports up front it can't get them and so doesn't probe. - * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. - */ - paravirt_disable_iospace(); - /* * This is messy CPU setup stuff which the native boot code does before * start_kernel, so we have to do, too: -- cgit v1.2.3 From ee72576c143d8e9081ae1fe8644122454dd323c5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: disable ACPI explicitly. Once we add PCI, it starts trying to manage our interrupts. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 47ec7f201d27..aa6e3b4ce29c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1428,6 +1429,9 @@ __init void lguest_init(void) /* Register our very early console. */ virtio_cons_early_init(early_put_chars); + /* Don't let ACPI try to control our PCI interrupts. */ + disable_acpi(); + /* * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart -- cgit v1.2.3 From e1b83e27881cf3153ce420aea853797fed29a9ea Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: Override pcibios_enable_irq/pcibios_disable_irq to our stupid PIC This lets us deliver interrupts for our emulated PCI devices using our dumb PIC, and not emulate an 8259 and PCI irq mapping tables or whatever. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index aa6e3b4ce29c..2943ab931671 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ #include #include /* for struct machine_ops */ #include +#include /*G:010 * Welcome to the Guest! @@ -832,6 +834,24 @@ static struct irq_chip lguest_irq_controller = { .irq_unmask = enable_lguest_irq, }; +static int lguest_enable_irq(struct pci_dev *dev) +{ + u8 line = 0; + + /* We literally use the PCI interrupt line as the irq number. */ + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); + irq_set_chip_and_handler_name(line, &lguest_irq_controller, + handle_level_irq, "level"); + dev->irq = line; + return 0; +} + +/* We don't do hotplug PCI, so this shouldn't be called. */ +static void lguest_disable_irq(struct pci_dev *dev) +{ + WARN_ON(1); +} + /* * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the @@ -1432,6 +1452,10 @@ __init void lguest_init(void) /* Don't let ACPI try to control our PCI interrupts. 
*/ disable_acpi(); + /* We control them ourselves, by overriding these two hooks. */ + pcibios_enable_irq = lguest_enable_irq; + pcibios_disable_irq = lguest_disable_irq; + /* * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart -- cgit v1.2.3 From a561adfaecc9eb6fb66941b450458801f3f60ca0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:26:01 +1030 Subject: lguest: use the PCI console device's emerg_wr for early boot messages. This involves manually checking the console device (which is always in slot 1 of bus 0) and using the window in VIRTIO_PCI_CAP_PCI_CFG to program it (as we can't map the BAR yet). We could in fact do this much earlier, but we wait for the first write from the virtio_cons_early_init() facility. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 146 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 134 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2943ab931671..531b844cb48d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,7 @@ #include /* for struct machine_ops */ #include #include +#include /*G:010 * Welcome to the Guest! @@ -1202,25 +1204,145 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } +/* Offset within PCI config space of BAR access capability. */ +static int console_cfg_offset = 0; +static int console_access_cap; + +/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ +static void set_cfg_window(u32 cfg_offset, u32 off) +{ + write_pci_config_byte(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, bar), + 0); + write_pci_config(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, length), + 4); + write_pci_config(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, offset), + off); +} + +static u32 read_bar_via_cfg(u32 cfg_offset, u32 off) +{ + set_cfg_window(cfg_offset, off); + return read_pci_config(0, 1, 0, + cfg_offset + sizeof(struct virtio_pci_cap)); +} + +static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) +{ + set_cfg_window(cfg_offset, off); + write_pci_config(0, 1, 0, + cfg_offset + sizeof(struct virtio_pci_cap), val); +} + +static void probe_pci_console(void) +{ + u8 cap, common_cap = 0, device_cap = 0; + /* Offsets within BAR0 */ + u32 common_offset, device_offset; + + /* Avoid recursive printk into here. */ + console_cfg_offset = -1; + + if (!early_pci_allowed()) { + printk(KERN_ERR "lguest: early PCI access not allowed!\n"); + return; + } + + /* We expect a console PCI device at BUS0, slot 1. 
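The window trick itself is short; roughly (here cap stands for the config-space offset of the VIRTIO_PCI_CAP_PCI_CFG capability, and the real helpers are set_cfg_window(), read_bar_via_cfg() and write_bar_via_cfg() in the boot.c hunk below):

	/* Aim the capability's window at BAR0 + off ... */
	write_pci_config_byte(0, 1, 0, cap + offsetof(struct virtio_pci_cap, bar), 0);
	write_pci_config(0, 1, 0, cap + offsetof(struct virtio_pci_cap, length), 4);
	write_pci_config(0, 1, 0, cap + offsetof(struct virtio_pci_cap, offset), off);
	/* ... and the dword that follows the capability header acts as that window. */
	val = read_pci_config(0, 1, 0, cap + sizeof(struct virtio_pci_cap));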
*/ + if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { + printk(KERN_ERR "lguest: PCI device is %#x!\n", + read_pci_config(0, 1, 0, 0)); + return; + } + + /* Find the capabilities we need (must be in bar0) */ + cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); + while (cap) { + u8 vndr = read_pci_config_byte(0, 1, 0, cap); + if (vndr == PCI_CAP_ID_VNDR) { + u8 type, bar; + u32 offset; + + type = read_pci_config_byte(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, cfg_type)); + bar = read_pci_config_byte(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, bar)); + offset = read_pci_config(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, offset)); + + switch (type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + if (bar == 0) { + common_cap = cap; + common_offset = offset; + } + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + if (bar == 0) { + device_cap = cap; + device_offset = offset; + } + break; + case VIRTIO_PCI_CAP_PCI_CFG: + console_access_cap = cap; + break; + } + } + cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); + } + if (!common_cap || !device_cap || !console_access_cap) { + printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", + common_cap, device_cap, console_access_cap); + return; + } + + +#define write_common_config(reg, val) \ + write_bar_via_cfg(console_access_cap, \ + common_offset+offsetof(struct virtio_pci_common_cfg,reg),\ + val) + +#define read_common_config(reg) \ + read_bar_via_cfg(console_access_cap, \ + common_offset+offsetof(struct virtio_pci_common_cfg,reg)) + + /* Check features: they must offer EMERG_WRITE */ + write_common_config(device_feature_select, 0); + + if (!(read_common_config(device_feature) + & (1 << VIRTIO_CONSOLE_F_EMERG_WRITE))) { + printk(KERN_ERR "lguest: console missing EMERG_WRITE\n"); + return; + } + + console_cfg_offset = device_offset; +} + /* * We will eventually use the virtio console device to produce console output, - * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. + * but before that is set up we use the virtio PCI console's backdoor mmio + * access and the "emergency" write facility (which is legal even before the + * device is configured). */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { - char scratch[17]; - unsigned int len = count; + /* If we couldn't find PCI console, forget it. */ + if (console_cfg_offset < 0) + return count; - /* We use a nul-terminated string, so we make a copy. Icky, huh? */ - if (len > sizeof(scratch) - 1) - len = sizeof(scratch) - 1; - scratch[len] = '\0'; - memcpy(scratch, buf, len); - hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0); + if (unlikely(!console_cfg_offset)) { + probe_pci_console(); + if (console_cfg_offset < 0) + return count; + } - /* This routine returns the number of bytes actually written. */ - return len; + write_bar_via_cfg(console_access_cap, + console_cfg_offset + + offsetof(struct virtio_console_config, emerg_wr), + buf[0]); + return 1; } /* -- cgit v1.2.3 From d9bab50aa46ce46dd4537d455eb13b200cdac516 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:28:01 +1030 Subject: lguest: remove NOTIFY call and eventfd facility. Disappointing, as this was kind of neat (especially getting to use RCU to manage the address -> eventfd mapping). But now the devices are PCI handled in userspace, we get rid of both the NOTIFY hypercall and the interface to connect an eventfd. 
Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 - drivers/lguest/core.c | 20 +--- drivers/lguest/hypercalls.c | 4 - drivers/lguest/lg.h | 12 --- drivers/lguest/lguest_user.c | 186 +----------------------------------- include/linux/lguest_launcher.h | 2 +- 6 files changed, 10 insertions(+), 215 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d33877..ef01fef3eebc 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 9159dbc583f6..7dc93aa004c8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -225,22 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* - * It's possible the Guest did a NOTIFY hypercall to the - * Launcher. - */ + /* Do we have to tell the Launcher about a trap? */ if (cpu->pending.trap) { - /* - * Does it just needs to write to a registered - * eventfd (ie. the appropriate virtqueue thread)? - */ - if (!send_notify_to_eventfd(cpu)) { - /* OK, we tell the main Launcher. */ - if (copy_to_user(user, &cpu->pending, - sizeof(cpu->pending))) - return -EFAULT; - return sizeof(cpu->pending); - } + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) + return -EFAULT; + return sizeof(cpu->pending); } /* diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 5dd1fb8a6610..1219af493c0f 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -117,10 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* Similarly, this sets the halted flag for run_guest(). */ cpu->halted = 1; break; - case LHCALL_NOTIFY: - cpu->pending.trap = LGUEST_TRAP_ENTRY; - cpu->pending.addr = args->arg1; - break; default: /* It should be an architecture-specific hypercall. */ if (lguest_arch_do_hcall(cpu, args)) diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index eb81abc05995..307e8b39e7d1 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -81,16 +81,6 @@ struct lg_cpu { struct lg_cpu_arch arch; }; -struct lg_eventfd { - unsigned long addr; - struct eventfd_ctx *event; -}; - -struct lg_eventfd_map { - unsigned int num; - struct lg_eventfd map[]; -}; - /* The private info the thread maintains about the guest. */ struct lguest { struct lguest_data __user *lguest_data; @@ -117,8 +107,6 @@ struct lguest { unsigned int stack_pages; u32 tsc_khz; - struct lg_eventfd_map *eventfds; - /* Dead? */ const char *dead; }; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index c8b0e8575b44..c4c6113eb9a6 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -2,182 +2,20 @@ * launcher controls and communicates with the Guest. For example, * the first write will tell us the Guest's memory layout and entry * point. A read will run the Guest until something happens, such as - * a signal or the Guest doing a NOTIFY out to the Launcher. There is - * also a way for the Launcher to attach eventfds to particular NOTIFY - * values instead of returning from the read() call. + * a signal or the Guest accessing a device. 
:*/ #include #include #include #include -#include #include #include #include #include "lg.h" -/*L:056 - * Before we move on, let's jump ahead and look at what the kernel does when - * it needs to look up the eventfds. That will complete our picture of how we - * use RCU. - * - * The notification value is in cpu->pending_notify: we return true if it went - * to an eventfd. - */ -bool send_notify_to_eventfd(struct lg_cpu *cpu) -{ - unsigned int i; - struct lg_eventfd_map *map; - - /* We only connect LHCALL_NOTIFY to event fds, not other traps. */ - if (cpu->pending.trap != LGUEST_TRAP_ENTRY) - return false; - - /* - * This "rcu_read_lock()" helps track when someone is still looking at - * the (RCU-using) eventfds array. It's not actually a lock at all; - * indeed it's a noop in many configurations. (You didn't expect me to - * explain all the RCU secrets here, did you?) - */ - rcu_read_lock(); - /* - * rcu_dereference is the counter-side of rcu_assign_pointer(); it - * makes sure we don't access the memory pointed to by - * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, - * but Alpha allows this! Paul McKenney points out that a really - * aggressive compiler could have the same effect: - * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html - * - * So play safe, use rcu_dereference to get the rcu-protected pointer: - */ - map = rcu_dereference(cpu->lg->eventfds); - /* - * Simple array search: even if they add an eventfd while we do this, - * we'll continue to use the old array and just won't see the new one. - */ - for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending.addr) { - eventfd_signal(map->map[i].event, 1); - cpu->pending.trap = 0; - break; - } - } - /* We're done with the rcu-protected variable cpu->lg->eventfds. */ - rcu_read_unlock(); - - /* If we cleared the notification, it's because we found a match. */ - return cpu->pending.trap == 0; -} - -/*L:055 - * One of the more tricksy tricks in the Linux Kernel is a technique called - * Read Copy Update. Since one point of lguest is to teach lguest journeyers - * about kernel coding, I use it here. (In case you're curious, other purposes - * include learning about virtualization and instilling a deep appreciation for - * simplicity and puppies). - * - * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we - * add new eventfds without ever blocking readers from accessing the array. - * The current Launcher only does this during boot, so that never happens. But - * Read Copy Update is cool, and adding a lock risks damaging even more puppies - * than this code does. - * - * We allocate a brand new one-larger array, copy the old one and add our new - * element. Then we make the lg eventfd pointer point to the new array. - * That's the easy part: now we need to free the old one, but we need to make - * sure no slow CPU somewhere is still looking at it. That's what - * synchronize_rcu does for us: waits until every CPU has indicated that it has - * moved on to know it's no longer using the old one. - * - * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. - */ -static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) -{ - struct lg_eventfd_map *new, *old = lg->eventfds; - - /* - * We don't allow notifications on value 0 anyway (pending_notify of - * 0 means "nothing pending"). - */ - if (!addr) - return -EINVAL; - - /* - * Replace the old array with the new one, carefully: others can - * be accessing it at the same time. 
- */ - new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), - GFP_KERNEL); - if (!new) - return -ENOMEM; - - /* First make identical copy. */ - memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); - new->num = old->num; - - /* Now append new entry. */ - new->map[new->num].addr = addr; - new->map[new->num].event = eventfd_ctx_fdget(fd); - if (IS_ERR(new->map[new->num].event)) { - int err = PTR_ERR(new->map[new->num].event); - kfree(new); - return err; - } - new->num++; - - /* - * Now put new one in place: rcu_assign_pointer() is a fancy way of - * doing "lg->eventfds = new", but it uses memory barriers to make - * absolutely sure that the contents of "new" written above is nailed - * down before we actually do the assignment. - * - * We have to think about these kinds of things when we're operating on - * live data without locks. - */ - rcu_assign_pointer(lg->eventfds, new); - - /* - * We're not in a big hurry. Wait until no one's looking at old - * version, then free it. - */ - synchronize_rcu(); - kfree(old); - - return 0; -} - /*L:052 - * Receiving notifications from the Guest is usually done by attaching a - * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will - * become readable when the Guest does an LHCALL_NOTIFY with that value. - * - * This is really convenient for processing each virtqueue in a separate - * thread. - */ -static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) -{ - unsigned long addr, fd; - int err; - - if (get_user(addr, input) != 0) - return -EFAULT; - input++; - if (get_user(fd, input) != 0) - return -EFAULT; - - /* - * Just make sure two callers don't add eventfds at once. We really - * only need to lock against callers adding to the same Guest, so using - * the Big Lguest Lock is overkill. But this is setup, not a fast path. - */ - mutex_lock(&lguest_lock); - err = add_eventfd(lg, addr, fd); - mutex_unlock(&lguest_lock); - - return err; -} - -/* The Launcher can get the registers, and also set some of them. */ + The Launcher can get the registers, and also set some of them. +*/ static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long which; @@ -409,13 +247,6 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } - lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); - if (!lg->eventfds) { - err = -ENOMEM; - goto free_lg; - } - lg->eventfds->num = 0; - /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -424,7 +255,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto free_eventfds; + goto free_lg; /* * Initialize the Guest's shadow page tables. 
This allocates @@ -445,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -free_eventfds: - kfree(lg->eventfds); free_lg: kfree(lg); unlock: @@ -499,8 +328,6 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_EVENTFD: - return attach_eventfd(lg, input); case LHREQ_GETREG: return getreg_setup(cpu, input); case LHREQ_SETREG: @@ -551,11 +378,6 @@ static int close(struct inode *inode, struct file *file) mmput(lg->cpus[i].mm); } - /* Release any eventfds they registered. */ - for (i = 0; i < lg->eventfds->num; i++) - eventfd_ctx_put(lg->eventfds->map[i].event); - kfree(lg->eventfds); - /* * If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 677cde735d4b..acd5b12565cc 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -23,7 +23,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* + address, fd. */ + LHREQ_EVENTFD, /* No longer used. */ LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ LHREQ_SETREG, /* + offset within struct pt_regs, value. */ LHREQ_TRAP, /* + trap number to deliver to guest. */ -- cgit v1.2.3 From 61f77eda9bbf0d2e922197ed2dcf88638a639ce5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:15 -0800 Subject: mm/hugetlb: reduce arch dependent code around follow_huge_* Currently we have many duplicates in definitions around follow_huge_addr(), follow_huge_pmd(), and follow_huge_pud(), so this patch tries to remove the m. The basic idea is to put the default implementation for these functions in mm/hugetlb.c as weak symbols (regardless of CONFIG_ARCH_WANT_GENERAL_HUGETL B), and to implement arch-specific code only when the arch needs it. For follow_huge_addr(), only powerpc and ia64 have their own implementation, and in all other architectures this function just returns ERR_PTR(-EINVAL). So this patch sets returning ERR_PTR(-EINVAL) as default. As for follow_huge_(pmd|pud)(), if (pmd|pud)_huge() is implemented to always return 0 in your architecture (like in ia64 or sparc,) it's never called (the callsite is optimized away) no matter how implemented it is. So in such architectures, we don't need arch-specific implementation. In some architecture (like mips, s390 and tile,) their current arch-specific follow_huge_(pmd|pud)() are effectively identical with the common code, so this patch lets these architecture use the common code. One exception is metag, where pmd_huge() could return non-zero but it expects follow_huge_pmd() to always return NULL. This means that we need arch-specific implementation which returns NULL. This behavior looks strange to me (because non-zero pmd_huge() implies that the architecture supports PMD-based hugepage, so follow_huge_pmd() can/should return some relevant value,) but that's beyond this cleanup patch, so let's keep it. Justification of non-trivial changes: - in s390, follow_huge_pmd() checks !MACHINE_HAS_HPAGE at first, and this patch removes the check. 
This is OK because we can assume MACHINE_HAS_HPAGE is true when follow_huge_pmd() can be called (note that pmd_huge() has the same check and always returns 0 for !MACHINE_HAS_HPAGE.) - in s390 and mips, we use HPAGE_MASK instead of PMD_MASK as done in common code. This patch forces these archs use PMD_MASK, but it's OK because they are identical in both archs. In s390, both of HPAGE_SHIFT and PMD_SHIFT are 20. In mips, HPAGE_SHIFT is defined as (PAGE_SHIFT + PAGE_SHIFT - 3) and PMD_SHIFT is define as (PAGE_SHIFT + PAGE_SHIFT + PTE_ORDER - 3), but PTE_ORDER is always 0, so these are identical. Signed-off-by: Naoya Horiguchi Acked-by: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/hugetlbpage.c | 6 ------ arch/arm64/mm/hugetlbpage.c | 6 ------ arch/ia64/mm/hugetlbpage.c | 6 ------ arch/metag/mm/hugetlbpage.c | 6 ------ arch/mips/mm/hugetlbpage.c | 18 ------------------ arch/powerpc/mm/hugetlbpage.c | 8 ++++++++ arch/s390/mm/hugetlbpage.c | 20 -------------------- arch/sh/mm/hugetlbpage.c | 12 ------------ arch/sparc/mm/hugetlbpage.c | 12 ------------ arch/tile/mm/hugetlbpage.c | 28 ---------------------------- arch/x86/mm/hugetlbpage.c | 12 ------------ mm/hugetlb.c | 30 +++++++++++++++--------------- 12 files changed, 23 insertions(+), 141 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 66781bf34077..c72412415093 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -36,12 +36,6 @@ * of type casting from pmd_t * to pte_t *. */ -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pud_huge(pud_t pud) { return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 023747bf4dd7..2de9d2e59d96 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -38,12 +38,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) } #endif -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 76069c18ee42..52b7604b5215 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,12 +114,6 @@ int pud_huge(pud_t pud) return 0; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) -{ - return NULL; -} - void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 3c32075d2945..7ca80ac42ed5 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -94,12 +94,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return pmd_page_shift(pmd) > PAGE_SHIFT; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 4ec8ee10d371..06e0f421b41b 100644 --- a/arch/mips/mm/hugetlbpage.c +++ 
b/arch/mips/mm/hugetlbpage.c @@ -68,12 +68,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return (pmd_val(pmd) & _PAGE_HUGE) != 0; @@ -83,15 +77,3 @@ int pud_huge(pud_t pud) { return (pud_val(pud) & _PAGE_HUGE) != 0; } - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; -} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5ff4e07d920a..cf0464f4284f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -714,6 +714,14 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 3c80d2e38f03..210ffede0153 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -192,12 +192,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { if (!MACHINE_HAS_HPAGE) @@ -210,17 +204,3 @@ int pud_huge(pud_t pud) { return 0; } - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmdp, int write) -{ - struct page *page; - - if (!MACHINE_HAS_HPAGE) - return NULL; - - page = pmd_page(*pmdp); - if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; -} diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index d7762349ea48..534bc978af8a 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -67,12 +67,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return 0; @@ -82,9 +76,3 @@ int pud_huge(pud_t pud) { return 0; } - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d329537739c6..4242eab12e10 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -215,12 +215,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return 0; @@ -230,9 +224,3 @@ int pud_huge(pud_t pud) { return 0; } - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 3270e0019266..8416240c322c 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -150,12 +150,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return NULL; } -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long 
address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE); @@ -166,28 +160,6 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); - return page; -} - -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; -} - int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index bca0aa3a003f..f48423f10141 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -52,20 +52,8 @@ int pud_huge(pud_t pud) return 0; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} #else -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_PSE); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be0e5d0db5ec..f533d336e569 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3660,7 +3660,20 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -struct page * +#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + +/* + * These functions are overwritable if your architecture needs its own + * behavior. + */ +struct page * __weak +follow_huge_addr(struct mm_struct *mm, unsigned long address, + int write) +{ + return ERR_PTR(-EINVAL); +} + +struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { @@ -3672,7 +3685,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return page; } -struct page * +struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write) { @@ -3684,19 +3697,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return page; } -#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ - -/* Can be overriden by architectures */ -struct page * __weak -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} - -#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ - #ifdef CONFIG_MEMORY_FAILURE /* Should be called in hugetlb_lock */ -- cgit v1.2.3 From cbef8478bee55775ac312a574aad48af7bb9cf9f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:19 -0800 Subject: mm/hugetlb: pmd_huge() returns true for non-present hugepage Migrating hugepages and hwpoisoned hugepages are considered as non-present hugepages, and they are referenced via migration entries and hwpoison entries in their page table slots. This behavior causes race condition because pmd_huge() doesn't tell non-huge pages from migrating/hwpoisoned hugepages. follow_page_mask() is one example where the kernel would call follow_page_pte() for such hugepage while this function is supposed to handle only normal pages. To avoid this, this patch makes pmd_huge() return true when pmd_none() is true *and* pmd_present() is false. 
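Concretely, in the x86 implementation below, any slot that is neither empty nor a regular present table entry now counts as hugetlb-related; the resulting classification (shown here for illustration only) is:

	entry                                             pmd_huge()
	empty (pmd_none)                                      0
	normal pmd pointing to a pte page                     0
	present leaf hugepage (_PAGE_PSE set)                 1
	non-present (migration/hwpoison) hugetlb entry        1
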
We don't have to worry about mixing up non-present pmd entry with normal pmd (pointing to leaf level pte entry) because pmd_present() is true in normal pmd. The same race condition could happen in (x86-specific) gup_pmd_range(), where this patch simply adds pmd_present() check instead of pmd_huge(). This is because gup_pmd_range() is fast path. If we have non-present hugepage in this function, we will go into gup_huge_pmd(), then return 0 at flag mask check, and finally fall back to the slow path. Fixes: 290408d4a2 ("hugetlb: hugepage migration core") Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 2 +- arch/x86/mm/hugetlbpage.c | 8 +++++++- mm/hugetlb.c | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d7547824e763..224b14235e96 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -172,7 +172,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, */ if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; - if (unlikely(pmd_large(pmd))) { + if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f48423f10141..42982b26e32b 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -54,9 +54,15 @@ int pud_huge(pud_t pud) #else +/* + * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ int pmd_huge(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_PSE); + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } int pud_huge(pud_t pud) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f533d336e569..d96b8bfa748f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3679,6 +3679,8 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { struct page *page; + if (!pmd_present(*pmd)) + return NULL; page = pte_page(*(pte_t *)pmd); if (page) page += ((address & ~PMD_MASK) >> PAGE_SHIFT); -- cgit v1.2.3 From d016bf7ece53b2b947bfd769e0842fd2feb7556b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:41 -0800 Subject: mm: make FIRST_USER_ADDRESS unsigned long on all archs LKP has triggered a compiler warning after my recent patch "mm: account pmd page tables to the process": mm/mmap.c: In function 'exit_mmap': >> mm/mmap.c:2857:2: warning: right shift count >= width of type [enabled by default] The code: > 2857 WARN_ON(mm_nr_pmds(mm) > 2858 round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); In this, on tile, we have FIRST_USER_ADDRESS defined as 0. round_up() has the same type -- int. PUD_SHIFT. I think the best way to fix it is to define FIRST_USER_ADDRESS as unsigned long. On every arch for consistency. Signed-off-by: Kirill A. 
Shutemov Reported-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 2 +- arch/arc/include/asm/pgtable.h | 2 +- arch/arm/include/asm/pgtable-nommu.h | 2 +- arch/arm64/include/asm/pgtable.h | 2 +- arch/avr32/include/asm/pgtable.h | 2 +- arch/cris/include/asm/pgtable.h | 2 +- arch/frv/include/asm/pgtable.h | 2 +- arch/hexagon/include/asm/pgtable.h | 2 +- arch/ia64/include/asm/pgtable.h | 2 +- arch/m32r/include/asm/pgtable.h | 2 +- arch/m68k/include/asm/pgtable_mm.h | 2 +- arch/microblaze/include/asm/pgtable.h | 2 +- arch/mips/include/asm/pgtable-32.h | 2 +- arch/mn10300/include/asm/pgtable.h | 2 +- arch/nios2/include/asm/pgtable.h | 2 +- arch/openrisc/include/asm/pgtable.h | 2 +- arch/parisc/include/asm/pgtable.h | 2 +- arch/powerpc/include/asm/pgtable-ppc32.h | 2 +- arch/powerpc/include/asm/pgtable-ppc64.h | 2 +- arch/s390/include/asm/pgtable.h | 2 +- arch/score/include/asm/pgtable.h | 2 +- arch/sh/include/asm/pgtable.h | 2 +- arch/sparc/include/asm/pgtable_32.h | 2 +- arch/sparc/include/asm/pgtable_64.h | 2 +- arch/tile/include/asm/pgtable.h | 2 +- arch/um/include/asm/pgtable-2level.h | 2 +- arch/um/include/asm/pgtable-3level.h | 2 +- arch/x86/include/asm/pgtable_types.h | 2 +- arch/xtensa/include/asm/pgtable.h | 2 +- 29 files changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index fce22cf88ee9..a9a119592372 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -45,7 +45,7 @@ struct vm_area_struct; #define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) #define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* Number of pointers that fit on a page: this will go away. */ #define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3)) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index bdc8ccaf390d..ffed3b2cf313 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -211,7 +211,7 @@ * No special requirements for lowest virtual address we permit any user space * mapping to be mapped at. 
*/ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /**************************************************************** diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index c35e53ee6663..add094d09e3e 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -85,7 +85,7 @@ extern unsigned int kobjsize(const void *objp); #define VMALLOC_START 0UL #define VMALLOC_END 0xffffffffUL -#define FIRST_USER_ADDRESS (0) +#define FIRST_USER_ADDRESS 0UL #include diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4c445057169d..3e4d3c43632a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -45,7 +45,7 @@ #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ extern void __pte_error(const char *file, int line, unsigned long val); diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h index ac7a817e2126..35800664076e 100644 --- a/arch/avr32/include/asm/pgtable.h +++ b/arch/avr32/include/asm/pgtable.h @@ -30,7 +30,7 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index e824257971c4..ceefc314d64d 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h @@ -67,7 +67,7 @@ extern void paging_init(void); */ #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* zero page used for uninitialized stuff */ #ifndef __ASSEMBLY__ diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index c49699d5902d..93bcf2abd1a1 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -140,7 +140,7 @@ extern unsigned long empty_zero_page; #define PTRS_PER_PTE 4096 #define USER_PGDS_IN_LAST_PML4 (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 6e35e71d2aea..49eab8136ec3 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -171,7 +171,7 @@ extern unsigned long _dflt_cache_att; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ /* Seems to be zero even in architectures where the zero page is firewalled? 
*/ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_special(pte) 0 #define pte_mkspecial(pte) (pte) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 2f07bb3dda91..7b6f8801df57 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -127,7 +127,7 @@ #define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT #define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT) #define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* * All the normal masks have the "page accessed" bits on, as any time diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h index 050f7a686e3d..8c1fb902a9ce 100644 --- a/arch/m32r/include/asm/pgtable.h +++ b/arch/m32r/include/asm/pgtable.h @@ -53,7 +53,7 @@ extern unsigned long empty_zero_page[1024]; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ /* Just any arbitrary offset to the start of the vmalloc VM area: the diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index 9f5abbda1ea7..28a145bfbb71 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -66,7 +66,7 @@ #define PTRS_PER_PGD 128 #endif #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* Virtual address region for use by kernel_map() */ #ifdef CONFIG_SUN3 diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index c6b6af4ca2a0..e53b8532353c 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -72,7 +72,7 @@ extern int mem_init_done; #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL extern unsigned long va_to_phys(unsigned long address); extern pte_t *va_to_pte(unsigned long address); diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 16aa9f23e17b..a6be006b6f75 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -57,7 +57,7 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, #define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START MAP_BASE diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h index 629181ae111e..afab728ab65e 100644 --- a/arch/mn10300/include/asm/pgtable.h +++ b/arch/mn10300/include/asm/pgtable.h @@ -65,7 +65,7 @@ extern void paging_init(void); #define PGDIR_MASK (~(PGDIR_SIZE - 1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 7b292e3a3138..a213e8c9aad0 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -24,7 +24,7 @@ #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE #define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1) diff --git a/arch/openrisc/include/asm/pgtable.h 
b/arch/openrisc/include/asm/pgtable.h index 18994ccb1185..69c7df0e1420 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -77,7 +77,7 @@ extern void paging_init(void); */ #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* * Kernels own virtual memory area. diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 1d49a4a7749b..8c966b2270aa 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -134,7 +134,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); * pgd entries used up by user/kernel: */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* NB: The tlb miss handlers make certain assumptions about the order */ /* of the following bits, so be careful (One example, bits 25-31 */ diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index 234e07c47803..e48e3292e713 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -45,7 +45,7 @@ extern int icache_44x_need_flush; #define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index b9dcc936e2d1..d46532ccc386 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -12,7 +12,7 @@ #endif #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* * Size of EA range mapped by our pagetables. 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ffb1d8ce97ae..aabcd3f62d3b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -99,7 +99,7 @@ extern unsigned long zero_page_mask; #endif /* CONFIG_64BIT */ #define PTRS_PER_PGD 2048 -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h index 5170ffdea643..0553e5cd5985 100644 --- a/arch/score/include/asm/pgtable.h +++ b/arch/score/include/asm/pgtable.h @@ -27,7 +27,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; #define PTRS_PER_PTE 1024 #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START (0xc0000000UL) diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index cf434c64408d..89c513a982fc 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -62,7 +62,7 @@ static inline unsigned long long neff_sign_extend(unsigned long val) /* Entries per level */ #define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE)) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define PHYS_ADDR_MASK29 0x1fffffff #define PHYS_ADDR_MASK32 0xffffffff diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 9912eb0b499a..f06b36a00a3b 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -44,7 +44,7 @@ unsigned long __init bootmem_init(unsigned long *pages_avail); #define PTRS_PER_PMD SRMMU_PTRS_PER_PMD #define PTRS_PER_PGD SRMMU_PTRS_PER_PGD #define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define PTE_SIZE (PTRS_PER_PTE*4) #define PAGE_NONE SRMMU_PAGE_NONE diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 2ac7873ad6fd..dc165ebdf05a 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -93,7 +93,7 @@ bool kern_addr_valid(unsigned long addr); #define PTRS_PER_PGD (1UL << PGDIR_BITS) /* Kernel has a separate 44bit address space. 
*/ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \ diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index bc75b6ef2e79..95a4f19d16c5 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -67,7 +67,7 @@ extern void pgtable_cache_init(void); extern void paging_init(void); extern void set_page_homes(void); -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_PRESENT HV_PTE_PRESENT #define _PAGE_HUGE_PAGE HV_PTE_PAGE diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index 7afe86035fa7..cfbe59752469 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -23,7 +23,7 @@ #define PTRS_PER_PTE 1024 #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) #define PTRS_PER_PGD 1024 -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 344c559c0a17..2b4274e7c095 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -41,7 +41,7 @@ #endif #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 5185a4f599ec..3e0230c94cff 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -4,7 +4,7 @@ #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 01b80dce9d65..a5e929a10c20 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -57,7 +57,7 @@ #define PTRS_PER_PGD 1024 #define PGD_ORDER 0 #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) /* -- cgit v1.2.3 From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that unprivileged process can allocate significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. 
#include #include #include #include #include #include #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by account PMD tables to the process the same way we account PTE. The main place where PMD tables is accounted is __pmd_alloc() and free_pmd_range(). But there're few corner cases: - HugeTLB can share PMD page tables. The patch handles by accounting the table to all processes who share it. - x86 PAE pre-allocates few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity check on exit(2). Accounting only happens on configuration where PMD page table's level is present (PMD is not folded). As with nr_ptes we use per-mm counter. The counter value is used to calculate baseline for badness score by oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++------ arch/x86/mm/pgtable.c | 14 +++++++++----- fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm.h | 24 ++++++++++++++++++++++++ include/linux/mm_types.h | 3 ++- kernel/fork.c | 3 +++ mm/debug.c | 3 ++- mm/hugetlb.c | 8 ++++++-- mm/memory.c | 15 +++++++++------ mm/mmap.c | 4 +++- mm/oom_kill.c | 9 +++++---- 11 files changed, 75 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 4415aa915681..e9c706e4627a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -555,12 +555,12 @@ this is causing problems for your system/application. oom_dump_tasks -Enables a system-wide task dump (excluding kernel threads) to be -produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, -oom_score_adj score, and name. This is helpful to determine why the -OOM killer was invoked, to identify the rogue task that caused it, -and to determine why the OOM killer chose the task it did to kill. +Enables a system-wide task dump (excluding kernel threads) to be produced +when the kernel performs an OOM-killing and includes such information as +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. 
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..7b22adaad4f1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6396f88c6687..e6e0abeb5d12 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm.h b/include/linux/mm.h index c6bf813a6b3d..644990b83cda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, { return 0; } + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} +static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} + #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_pmds); +} + +static inline void 
mm_inc_nr_pmds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_pmds); +} + +static inline void mm_dec_nr_pmds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_pmds); +} #endif int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 20ff2105b564..199a03aab8dc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -363,7 +363,8 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ - atomic_long_t nr_ptes; /* Page table pages */ + atomic_long_t nr_ptes; /* PTE page table pages */ + atomic_long_t nr_pmds; /* PMD page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..c99098c52641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_set(&mm->nr_pmds, 0); +#endif mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd28d6ba5e5d..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { + mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); spin_lock(ptl); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + } else { put_page(virt_to_page(spte)); + mm_inc_nr_pmds(mm); + } spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } diff --git a/mm/memory.c b/mm/memory.c index d63849b5188f..bbe6a73a899d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } 
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) /* Another has populated it */ - pmd_free(mm, new); - else + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); pud_populate(mm, pud, new); -#else - if (pgd_present(*pud)) /* Another has populated it */ + } else /* Another has populated it */ pmd_free(mm, new); - else +#else + if (!pgd_present(*pud)) { + mm_inc_nr_pmds(mm); pgd_populate(mm, pud, new); + } else /* Another has populated it */ + pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 14d84666e8ba..6a7d36d133fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT); + WARN_ON(mm_nr_pmds(mm) > + round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b8df76ee2be3..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + - get_mm_counter(p->mm, MM_SWAPENTS); + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); task_unlock(p); /* @@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), + mm_nr_pmds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From a7b780750e1a1c7833812681e1f8fa30bbb06802 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:23 -0800 Subject: mm: gup: use get_user_pages_unlocked within get_user_pages_fast This allows the get_user_pages_fast slow path to release the mmap_sem before blocking. Signed-off-by: Andrea Arcangeli Reviewed-by: Kirill A. 
Shutemov Cc: Andres Lagar-Cavilla Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 8 +++----- arch/s390/mm/gup.c | 6 ++---- arch/sh/mm/gup.c | 6 ++---- arch/sparc/mm/gup.c | 6 ++---- arch/x86/mm/gup.c | 7 +++---- mm/gup.c | 6 ++---- mm/util.c | 10 ++-------- 7 files changed, 16 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 70795a67a276..349995d19c7f 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -301,11 +301,9 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, - write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 639fce464008..5c586c78ca8d 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -235,10 +235,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - nr_pages - nr, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + nr_pages - nr, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) ret = (ret < 0) ? nr : ret + nr; diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 37458f38b220..e15f52a17b6c 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -257,10 +257,8 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index ae6ce383d4df..2e5c4fc2daa9 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -249,10 +249,8 @@ slow: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 224b14235e96..89df70e0caa6 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -388,10 +388,9 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/gup.c b/mm/gup.c index dad5875fb766..c2da1163986a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1243,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - nr_pages - nr, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = 
get_user_pages_unlocked(current, mm, start, + nr_pages - nr, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/util.c b/mm/util.c index fec39d4509a9..f3ef639c4857 100644 --- a/mm/util.c +++ b/mm/util.c @@ -240,14 +240,8 @@ int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; - int ret; - - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, nr_pages, - write, 0, pages, NULL); - up_read(&mm->mmap_sem); - - return ret; + return get_user_pages_unlocked(current, mm, start, nr_pages, + write, 0, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); -- cgit v1.2.3 From e7bb4b6d1609cce391af1e7bc6f31d14f1a3a890 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:19 -0800 Subject: mm: add p[te|md] protnone helpers for use by NUMA balancing This is a preparatory patch that introduces protnone helpers for automatic NUMA balancing. Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar K.V Tested-by: Sasha Levin Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable.h | 16 ++++++++++++++++ arch/x86/include/asm/pgtable.h | 16 ++++++++++++++++ include/asm-generic/pgtable.h | 20 ++++++++++++++++++++ 3 files changed, 52 insertions(+) (limited to 'arch/x86') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 7e77f2ca5132..1146006d3477 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -40,6 +40,22 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h . On powerpc, this will only + * work for user pages and always return true for kernel pages. + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & + (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pte_protnone(pmd_pte(pmd)); +} + static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_NUMA_MASK; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0fe03f834fb1..f519b0b529dd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -483,6 +483,22 @@ static inline int pmd_present(pmd_t pmd) _PAGE_NUMA); } +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. 
See the + * comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PROTNONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_PROTNONE; +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 129de9204d18..067922c06c29 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -673,6 +673,26 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +#ifndef CONFIG_NUMA_BALANCING +/* + * Technically a PTE can be PROTNONE even when not doing NUMA balancing but + * the only case the kernel cares is for NUMA balancing and is only ever set + * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked + * _PAGE_PROTNONE so by by default, implement the helper as "always no". It + * is the responsibility of the caller to distinguish between PROT_NONE + * protections and NUMA hinting fault protections. + */ +static inline int pte_protnone(pte_t pte) +{ + return 0; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + #ifdef CONFIG_NUMA_BALANCING /* * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that -- cgit v1.2.3 From 8a0516ed8b90c95ffa1363b420caa37418149f21 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:22 -0800 Subject: mm: convert p[te|md]_numa users to p[te|md]_protnone_numa Convert existing users of pte_numa and friends to the new helper. Note that the kernel is broken after this patch is applied until the other page table modifiers are also altered. This patch layout is to make review easier. Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar Acked-by: Benjamin Herrenschmidt Tested-by: Sasha Levin Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- arch/powerpc/mm/fault.c | 5 ----- arch/powerpc/mm/pgtable.c | 11 ++++++++--- arch/powerpc/mm/pgtable_64.c | 3 ++- arch/x86/mm/gup.c | 4 ++-- include/uapi/linux/mempolicy.h | 2 +- mm/gup.c | 10 +++++----- mm/huge_memory.c | 16 ++++++++-------- mm/memory.c | 4 ++-- mm/mprotect.c | 38 ++++++++++--------------------------- mm/pgtable-generic.c | 2 +- 11 files changed, 40 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 510bdfbc4073..625407e4d3b0 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -212,7 +212,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); - if (pte_present(pte) && !pte_numa(pte)) { + if (pte_present(pte) && !pte_protnone(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 6154b0a2b063..f38327b95f76 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -398,8 +398,6 @@ good_area: * processors use the same I/D cache coherency mechanism * as embedded. 
*/ - if (error_code & DSISR_PROTFAULT) - goto bad_area; #endif /* CONFIG_PPC_STD_MMU */ /* @@ -423,9 +421,6 @@ good_area: flags |= FAULT_FLAG_WRITE; /* a read */ } else { - /* protection fault */ - if (error_code & 0x08000000) - goto bad_area; if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index c90e602677c9..83dfcb55ffef 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -172,9 +172,14 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { -#ifdef CONFIG_DEBUG_VM - WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); -#endif + /* + * When handling numa faults, we already have the pte marked + * _PAGE_PRESENT, but we can be sure that it is not in hpte. + * Hence we can use set_pte_at for them. + */ + VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); + /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called. diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 4fe5f64cc179..91bb8836825a 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -718,7 +718,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT); + WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); assert_spin_locked(&mm->page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 89df70e0caa6..81bf3d2af3eb 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_numa(pte)) { + if (pte_protnone(pte)) { pte_unmap(ptep); return 0; } @@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. 
*/ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 0d11c3dcd3a1..9cd8b21dddbe 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -67,7 +67,7 @@ enum mpol_rebind_step { #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ -#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ +#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/gup.c b/mm/gup.c index c2da1163986a..51bf0b06ca7b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -64,7 +64,7 @@ retry: migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_numa(pte)) + if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) { pte_unmap_unlock(ptep, ptl); @@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { @@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, /* * Similar to the PMD case below, NUMA hinting must take slow - * path + * path using the pte_protnone check. */ if (!pte_present(pte) || pte_special(pte) || - pte_numa(pte) || (write && !pte_write(pte))) + pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. */ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, write, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c6921362c5fc..915941c45169 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) goto out; page = pmd_page(*pmd); @@ -1342,7 +1342,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Migrate the THP to the requested node, returns with page unlocked - * and pmd_numa cleared. + * and access rights restored. */ spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1357,7 +1357,7 @@ clear_pmdnuma: BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); + VM_BUG_ON(pmd_protnone(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1483,7 +1483,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, ret = 1; if (!prot_numa) { entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_numa(entry)) + if (pmd_protnone(entry)) entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; @@ -1499,7 +1499,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * local vs remote hits on the zero page. 
*/ if (!is_huge_zero_page(page) && - !pmd_numa(*pmd)) { + !pmd_protnone(*pmd)) { pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } @@ -1767,9 +1767,9 @@ static int __split_huge_page_map(struct page *page, pte_t *pte, entry; BUG_ON(PageCompound(page+i)); /* - * Note that pmd_numa is not transferred deliberately - * to avoid any possibility that pte_numa leaks to - * a PROT_NONE VMA by accident. + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/memory.c b/mm/memory.c index bbe6a73a899d..92e6a6299e86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3124,7 +3124,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } - if (pte_numa(entry)) + if (pte_protnone(entry)) return do_numa_page(mm, vma, address, entry, pte, pmd); ptl = pte_lockptr(mm, pmd); @@ -3202,7 +3202,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_trans_splitting(orig_pmd)) return 0; - if (pmd_numa(orig_pmd)) + if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); diff --git a/mm/mprotect.c b/mm/mprotect.c index 33121662f08b..44ffa698484d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,36 +75,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool updated = false; - if (!prot_numa) { - ptent = ptep_modify_prot_start(mm, addr, pte); - if (pte_numa(ptent)) - ptent = pte_mknonnuma(ptent); - ptent = pte_modify(ptent, newprot); - /* - * Avoid taking write faults for pages we - * know to be dirty. - */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) - ptent = pte_mkwrite(ptent); - ptep_modify_prot_commit(mm, addr, pte, ptent); - updated = true; - } else { - struct page *page; - - page = vm_normal_page(vma, addr, oldpte); - if (page && !PageKsm(page)) { - if (!pte_numa(oldpte)) { - ptep_set_numa(mm, addr, pte); - updated = true; - } - } + ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + + /* Avoid taking write faults for known dirty pages */ + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) { + ptent = pte_mkwrite(ptent); } - if (updated) - pages++; + ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..4b8ad760dde3 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -193,7 +193,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; - if (pmd_numa(entry)) + if (pmd_protnone(entry)) entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); -- cgit v1.2.3 From 21d9ee3eda7792c45880b2f11bff8e95c9a061fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:32 -0800 Subject: mm: remove remaining references to NUMA hinting bits and helpers This patch removes the NUMA PTE bits and associated helpers. As a side-effect it increases the maximum possible swap space on x86-64. 
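To make the end state of this series concrete for a caller, here is a minimal before/after sketch. It is illustrative only, written for this log rather than taken from any hunk; the call shown is the handle_pte_fault() case from the previous patch, and after this patch only the pte_protnone() form exists because pte_numa() and _PAGE_NUMA are removed.

    /* before the series: NUMA hinting fault detected via a dedicated software bit */
    if (pte_numa(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);

    /* after the series: a NUMA hinting fault is simply a PROT_NONE pte */
    if (pte_protnone(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);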
One potential source of problems is races between the marking of PTEs PROT_NONE, NUMA hinting faults and migration. It must be guaranteed that a PTE being protected is not faulted in parallel, seen as a pte_none and corrupting memory. The base case is safe but transhuge had problems in the past due to a different migration mechanism and a dependence on the page lock to serialise migrations, so it warrants a closer look.

    task_work hinting update                parallel fault
    ------------------------                --------------
    change_pmd_range
      change_huge_pmd
        __pmd_trans_huge_lock
          pmdp_get_and_clear
                                            __handle_mm_fault
                                              pmd_none
                                              do_huge_pmd_anonymous_page
                                              read? pmd_lock blocks until hinting
                                                complete, fail !pmd_none test
                                              write? __do_huge_pmd_anonymous_page
                                                acquires pmd_lock, checks pmd_none
          pmd_modify
          set_pmd_at

    task_work hinting update                parallel migration
    ------------------------                ------------------
    change_pmd_range
      change_huge_pmd
        __pmd_trans_huge_lock
          pmdp_get_and_clear
                                            __handle_mm_fault
                                              do_huge_pmd_numa_page
                                                migrate_misplaced_transhuge_page
                                                  pmd_lock waits for updates to complete,
                                                  recheck pmd_same
          pmd_modify
          set_pmd_at

Both of those are safe and the case where a transhuge page is inserted during a protection update is unchanged. The case where two processes try migrating at the same time is unchanged by this series so should still be ok. I could not find a case where we are accidentally depending on the PTE not being cleared and flushed. If one is missed, it'll manifest as corruption problems that start triggering shortly after this series is merged and only happen when NUMA balancing is enabled. Signed-off-by: Mel Gorman Tested-by: Sasha Levin Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Mark Brown Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable.h | 54 +----------- arch/powerpc/include/asm/pte-common.h | 5 -- arch/powerpc/include/asm/pte-hash64.h | 6 -- arch/x86/include/asm/pgtable.h | 22 +---- arch/x86/include/asm/pgtable_64.h | 5 -- arch/x86/include/asm/pgtable_types.h | 41 +-------- include/asm-generic/pgtable.h | 155 ---------------------------------- include/linux/swapops.h | 2 +- 8 files changed, 7 insertions(+), 283 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 1146006d3477..79fee2eb8d56 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -55,64 +55,12 @@ static inline int pmd_protnone(pmd_t pmd) { return pte_protnone(pmd_pte(pmd)); } - -static inline int pte_present(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t pte) -{ - return pte_val(pte) & (_PAGE_PRESENT); -} - -#define ptep_set_numa ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); - return; -} - -#define pmdp_set_numa pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); - return; -} - -/* - * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist - * which was inherited
from x86. For the purposes of powerpc pte_basic_t and - * pmd_t are equivalent - */ -#define pteval_t pte_basic_t -#define pmdval_t pmd_t -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_NUMA_MASK; -} - -# else +#endif /* CONFIG_NUMA_BALANCING */ static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_PRESENT; } -#endif /* CONFIG_NUMA_BALANCING */ /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 2aef9b7a0eb2..c5a755ef7011 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -104,11 +104,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \ _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) -#ifdef CONFIG_NUMA_BALANCING -/* Mask of bits that distinguish present and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) -#endif - /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 2505d8eab15c..55aea0caf95e 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -27,12 +27,6 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ -/* - * Used for tracking numa faults - */ -#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */ - - /* No separate kernel read-only */ #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f519b0b529dd..34d42a7d5595 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -442,13 +442,6 @@ static inline int pte_same(pte_t a, pte_t b) } static inline int pte_present(pte_t a) -{ - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; @@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). 
*/ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING @@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e227970f983e..2ee781114d34 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS 5 -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 3e0230c94cff..8c7c10802e9c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -27,14 +27,6 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. - */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -75,21 +67,6 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif -/* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - /* * Tracking soft dirty bit when a page goes to a swap is tricky. 
* We need a bit which can be stored in pte _and_ not conflict @@ -122,8 +99,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* * The cache modes defined here are used to translate between pure SW usage @@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_flags(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_NUMA_MASK; -} -#endif /* CONFIG_NUMA_BALANCING */ - #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 067922c06c29..4d46085c1b90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -244,10 +244,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif -#ifndef pte_present_nonuma -#define pte_present_nonuma(pte) pte_present(pte) -#endif - #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -693,157 +689,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_NUMA_BALANCING -/* - * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that - * is protected for PROT_NONE and a NUMA hinting fault entry. If the - * architecture defines __PAGE_PROTNONE then it should take that into account - * but those that do not can rely on the fact that the NUMA hinting scanner - * skips inaccessible VMAs. - * - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page - * fault triggers on those regions if pte/pmd_numa returns true - * (because _PAGE_PRESENT is not set). - */ -#ifndef pte_numa -static inline int pte_numa(pte_t pte) -{ - return ptenuma_flags(pte) == _PAGE_NUMA; -} -#endif - -#ifndef pmd_numa -static inline int pmd_numa(pmd_t pmd) -{ - return pmdnuma_flags(pmd) == _PAGE_NUMA; -} -#endif - -/* - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically - * because they're called by the NUMA hinting minor page fault. If we - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler - * would be forced to set it later while filling the TLB after we - * return to userland. That would trigger a second write to memory - * that we optimize away by setting _PAGE_ACCESSED here. 
- */ -#ifndef pte_mknonnuma -static inline pte_t pte_mknonnuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - return __pte(val); -} -#endif - -#ifndef pmd_mknonnuma -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - - return __pmd(val); -} -#endif - -#ifndef pte_mknuma -static inline pte_t pte_mknuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - VM_BUG_ON(!(val & _PAGE_PRESENT)); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pte(val); -} -#endif - -#ifndef ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_t ptent = *ptep; - - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, ptep, ptent); - return; -} -#endif - -#ifndef pmd_mknuma -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pmd(val); -} -#endif - -#ifndef pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = *pmdp; - - pmd = pmd_mknuma(pmd); - set_pmd_at(mm, addr, pmdp, pmd); - return; -} -#endif -#else -static inline int pmd_numa(pmd_t pmd) -{ - return 0; -} - -static inline int pte_numa(pte_t pte) -{ - return 0; -} - -static inline pte_t pte_mknonnuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - return pmd; -} - -static inline pte_t pte_mknuma(pte_t pte) -{ - return pte; -} - -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - return; -} - - -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - return pmd; -} - -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - return ; -} -#endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 831a3168ab35..cedf3d3c373f 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte); + return !pte_none(pte) && !pte_present(pte); } #endif -- cgit v1.2.3 From c819f37e7e174d68cd013abf33725b4e07ced023 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:38 -0800 Subject: x86: mm: restore original pte_special check Commit b38af4721f59 ("x86,mm: fix pte_special versus pte_numa") adjusted the pte_special check to take into account that a special pte had SPECIAL and neither PRESENT nor PROTNONE. Now that NUMA hinting PTEs are no longer modifying _PAGE_PRESENT it should be safe to restore the original pte_special behaviour. 
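As a side-by-side summary of the change below (restating the hunk in isolation; pte_flags() and the _PAGE_* flags are the existing x86 definitions, and the rationale for the interim form is the removed comment noting that _PAGE_BIT_NUMA aliased _PAGE_BIT_SPECIAL):

    /* interim check from b38af4721f59: SPECIAL plus (PRESENT or PROTNONE),
     * needed while a NUMA hinting pte could carry the aliased SPECIAL bit */
    return (pte_flags(pte) & _PAGE_SPECIAL) &&
           (pte_flags(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE));

    /* restored check: _PAGE_SPECIAL alone is unambiguous now that _PAGE_NUMA is gone */
    return pte_flags(pte) & _PAGE_SPECIAL;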
Signed-off-by: Mel Gorman Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 34d42a7d5595..67fc3d2b0aab 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -132,13 +132,7 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - /* - * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. - * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == - * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. - */ - return (pte_flags(pte) & _PAGE_SPECIAL) && - (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); + return pte_flags(pte) & _PAGE_SPECIAL; } static inline unsigned long pte_pfn(pte_t pte) -- cgit v1.2.3 From f56141e3e2d9aabf7e6b89680ab572c2cdbb2a24 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Feb 2015 15:01:14 -0800 Subject: all arches, signal: move restart_block to struct task_struct If an attacker can cause a controlled kernel stack overflow, overwriting the restart block is a very juicy exploit target. This is because the restart_block is held in the same memory allocation as the kernel stack. Moving the restart block to struct task_struct prevents this exploit by making the restart_block harder to locate. Note that there are other fields in thread_info that are also easy targets, at least on some architectures. It's also a decent simplification, since the restart code is more or less identical on all architectures. [james.hogan@imgtec.com: metag: align thread_info::supervisor_stack] Signed-off-by: Andy Lutomirski Cc: Thomas Gleixner Cc: Al Viro Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: David Miller Acked-by: Richard Weinberger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Steven Miao Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Richard Kuo Cc: "Luck, Tony" Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman (powerpc) Tested-by: Michael Ellerman (powerpc) Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: Chris Metcalf Cc: Guan Xuetao Cc: Chris Zankel Cc: Max Filippov Cc: Oleg Nesterov Cc: Guenter Roeck Signed-off-by: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/thread_info.h | 5 ----- arch/alpha/kernel/signal.c | 2 +- arch/arc/include/asm/thread_info.h | 4 ---- arch/arc/kernel/signal.c | 2 +- arch/arm/include/asm/thread_info.h | 4 ---- arch/arm/kernel/signal.c | 4 ++-- arch/arm64/include/asm/thread_info.h | 4 ---- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kernel/signal32.c | 4 ++-- arch/avr32/include/asm/thread_info.h | 4 ---- arch/avr32/kernel/asm-offsets.c | 1 - arch/avr32/kernel/signal.c | 2 +- arch/blackfin/include/asm/thread_info.h | 4 ---- arch/blackfin/kernel/signal.c | 2 +- arch/c6x/include/asm/thread_info.h | 4 ---- arch/c6x/kernel/signal.c | 2 +- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/cris/include/asm/thread_info.h | 4 ---- arch/frv/include/asm/thread_info.h | 4 ---- arch/frv/kernel/asm-offsets.c | 1 - arch/frv/kernel/signal.c | 2 +- arch/hexagon/include/asm/thread_info.h | 4 ---- arch/hexagon/kernel/signal.c | 2 +- arch/ia64/include/asm/thread_info.h | 4 ---- arch/ia64/kernel/signal.c | 2 +- arch/m32r/include/asm/thread_info.h | 5 ----- arch/m32r/kernel/signal.c | 2 +- arch/m68k/include/asm/thread_info.h | 4 ---- arch/m68k/kernel/signal.c | 4 ++-- arch/metag/include/asm/thread_info.h | 6 +----- arch/metag/kernel/signal.c | 2 +- arch/microblaze/include/asm/thread_info.h | 4 ---- arch/microblaze/kernel/signal.c | 2 +- arch/mips/include/asm/thread_info.h | 4 ---- arch/mips/kernel/asm-offsets.c | 1 - arch/mips/kernel/signal.c | 2 +- arch/mips/kernel/signal32.c | 2 +- arch/mn10300/include/asm/thread_info.h | 4 ---- arch/mn10300/kernel/asm-offsets.c | 1 - arch/mn10300/kernel/signal.c | 2 +- arch/openrisc/include/asm/thread_info.h | 4 ---- arch/openrisc/kernel/signal.c | 2 +- arch/parisc/include/asm/thread_info.h | 4 ---- arch/parisc/kernel/signal.c | 2 +- arch/powerpc/include/asm/thread_info.h | 4 ---- arch/powerpc/kernel/signal_32.c | 4 ++-- arch/powerpc/kernel/signal_64.c | 2 +- arch/s390/include/asm/thread_info.h | 4 ---- arch/s390/kernel/compat_signal.c | 2 +- arch/s390/kernel/signal.c | 2 +- arch/score/include/asm/thread_info.h | 4 ---- arch/score/kernel/asm-offsets.c | 1 - arch/score/kernel/signal.c | 2 +- arch/sh/include/asm/thread_info.h | 4 ---- arch/sh/kernel/asm-offsets.c | 1 - arch/sh/kernel/signal_32.c | 4 ++-- arch/sh/kernel/signal_64.c | 4 ++-- arch/sparc/include/asm/thread_info_32.h | 6 ------ arch/sparc/include/asm/thread_info_64.h | 12 +++--------- arch/sparc/kernel/signal32.c | 4 ++-- arch/sparc/kernel/signal_32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- arch/sparc/kernel/traps_64.c | 2 -- arch/tile/include/asm/thread_info.h | 4 ---- arch/tile/kernel/signal.c | 2 +- arch/um/include/asm/thread_info.h | 4 ---- arch/unicore32/include/asm/thread_info.h | 4 ---- arch/unicore32/kernel/signal.c | 2 +- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/thread_info.h | 4 ---- arch/x86/kernel/signal.c | 2 +- arch/x86/um/signal.c | 2 +- arch/xtensa/include/asm/thread_info.h | 5 ----- arch/xtensa/kernel/signal.c | 2 +- fs/select.c | 2 +- include/linux/init_task.h | 3 +++ include/linux/sched.h | 2 ++ kernel/compat.c | 5 ++--- kernel/futex.c | 2 +- 
kernel/signal.c | 2 +- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 3 +-- 84 files changed, 62 insertions(+), 194 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 48bbea6898b3..d5b98ab514bb 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -27,8 +27,6 @@ struct thread_info { int bpt_nsaved; unsigned long bpt_addr[2]; /* breakpoint handling */ unsigned int bpt_insn[2]; - - struct restart_block restart_block; }; /* @@ -40,9 +38,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 6cec2881acbf..8dbfb15f1745 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) struct switch_stack *sw = (struct switch_stack *)regs - 1; long i, err = __get_user(regs->pc, &sc->sc_pc); - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sw->r26 = (unsigned long) ret_from_sys_call; diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 02bc5ec0fb2e..1163a1838ac1 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -46,7 +46,6 @@ struct thread_info { struct exec_domain *exec_domain;/* execution domain */ __u32 cpu; /* current CPU */ unsigned long thr_ptr; /* TLS ptr */ - struct restart_block restart_block; }; /* @@ -62,9 +61,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index cb3142a2d40b..114234e83caa 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn) struct pt_regs *regs = current_pt_regs(); /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Since we stacked the signal on a word boundary, * then 'sp' should be word aligned here. 
If it's diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index d890e41f5520..72812a1f3d1c 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -68,7 +68,6 @@ struct thread_info { #ifdef CONFIG_ARM_THUMBEE unsigned long thumbee_state; /* ThumbEE Handler Base register */ #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -81,9 +80,6 @@ struct thread_info { .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ domain_val(DOMAIN_IO, DOMAIN_CLIENT), \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 8aa6f1b87c9e..023ac905e4c3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs) struct sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 459bf8e53208..702e1e6a0d80 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -48,7 +48,6 @@ struct thread_info { mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - struct restart_block restart_block; int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; @@ -60,9 +59,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 6fa792137eda..660ccf9f7524 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index e299de396e9b..c20a300e2213 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -347,7 +347,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) struct compat_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -381,7 +381,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) struct compat_rt_sigframe __user 
*frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index a978f3fe7c25..d56afa99a514 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -30,7 +30,6 @@ struct thread_info { saved by debug handler when setting up trampoline */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -41,9 +40,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c index d6a8193a1d2f..e41c84516e5d 100644 --- a/arch/avr32/kernel/asm-offsets.c +++ b/arch/avr32/kernel/asm-offsets.c @@ -18,7 +18,6 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_rar_saved, thread_info, rar_saved); OFFSET(TI_rsr_saved, thread_info, rsr_saved); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(TSK_active_mm, task_struct, active_mm); BLANK(); diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index d309fbcc3bd6..8f1c63b9b983 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)regs->sp; pr_debug("SIG return: frame = %p\n", frame); diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 55f473bdad36..57c3a8bd583d 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* address limit */ - struct restart_block restart_block; #ifndef CONFIG_SMP struct l1_scratch_task_info l1_task_info; #endif @@ -58,9 +57,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index ef275571d885..f2a8b5493bd3 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x) diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h index d4e9ef87076d..584e253f3217 100644 --- a/arch/c6x/include/asm/thread_info.h +++ b/arch/c6x/include/asm/thread_info.h @@ -45,7 +45,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 = preemptable, <0 = BUG */ mm_segment_t addr_limit; /* 
thread address space */ - struct restart_block restart_block; }; /* @@ -61,9 +60,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index fe68226f6c4d..3c4bb5a5c382 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a dword boundary, diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 9b32d338838b..74d7ba35120d 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore the regs from &sc->regs (same as sc, since regs is first) * (sc is already checked for VERIFY_READ since the sigframe was diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 78ce3b1c9bcb..870e3e069318 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the registers from &sc->regs. 
sc is already checked diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 55dede18c032..7286db5ed90e 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -38,7 +38,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -56,9 +55,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index af29e17c0181..6b917f1c2955 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -41,7 +41,6 @@ struct thread_info { * 0-0xBFFFFFFF for user-thead * 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -65,9 +64,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c index 9de96843a278..446e89d500cc 100644 --- a/arch/frv/kernel/asm-offsets.c +++ b/arch/frv/kernel/asm-offsets.c @@ -40,7 +40,6 @@ void foo(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); BLANK(); /* offsets into register file storage */ diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index dc3d59de0870..336713ab4745 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8) unsigned long tbr, psr; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; tbr = user->i.tbr; psr = user->i.psr; diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index a59dad3b3695..bacd3d6030c5 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -56,7 +56,6 @@ struct thread_info { * used for syscalls somehow; * seems to have a function pointer and four arguments */ - struct restart_block restart_block; /* Points to the current pt_regs frame */ struct pt_regs *regs; /* @@ -83,9 +82,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .sp = 0, \ .regs = NULL, \ } diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index eadd70e47e7e..b039a624c170 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void) sigset_t blocked; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)pt_psp(regs); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 5b17418b4223..c16f21a068ff 100644 --- 
a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -27,7 +27,6 @@ struct thread_info { __u32 status; /* Thread synchronous flags */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE __u64 ac_stamp; __u64 ac_leave; @@ -46,9 +45,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #ifndef ASM_OFFSETS_C diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 6d92170be457..b3a124da71e5 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) long err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore scratch that always needs gets updated during signal delivery: */ err = __get_user(flags, &sc->sc_flags); diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 00171703402f..32422d0211c3 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thread 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -49,7 +48,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -68,9 +66,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 95408b8f130a..7736c6660a15 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define COPY(x) err |= __get_user(regs->x, &sc->sc_##x) COPY(r4); diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 21a4784ca5a1..c54256e69e64 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ - struct restart_block restart_block; }; #endif /* __ASSEMBLY__ */ @@ -41,9 +40,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_stack (init_thread_union.stack) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 967a8b7e1527..d7179281e74a 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u int err = 0; /* Always make any pending 
restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) @@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 47711336119e..afb3ca4776d1 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -35,9 +35,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; - u8 supervisor_stack[0]; + u8 supervisor_stack[0] __aligned(8); }; #else /* !__ASSEMBLY__ */ @@ -74,9 +73,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c index 0d100d5c1407..ce49d429c74a 100644 --- a/arch/metag/kernel/signal.c +++ b/arch/metag/kernel/signal.c @@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL, &sc->regs); diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 8c9d36591a03..b699fbd7de4a 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -71,7 +71,6 @@ struct thread_info { __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; struct cpu_context cpu_context; }; @@ -87,9 +86,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 235706055b7f..a1cbaf90e2ea 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) int rval; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index e4440f92b366..9e1295f874f0 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { * 0x7fffffff for user-thead * 0xffffffff for kernel-thread */ - struct restart_block restart_block; struct pt_regs *regs; long syscall; /* syscall number */ }; @@ -50,9 +49,6 @@ struct 
thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index b1d84bd4efb3..3b2dfdb4865f 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -98,7 +98,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 545bf11bd2ed..6a28c792d862 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index d69179c0d49d..19a7705f2a01 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs, int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); err |= __get_user(regs->hi, &sc->sc_mdhi); diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index bf280eaccd36..c1c374f0ec12 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -50,7 +50,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -80,9 +79,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c index 47b3bb0c04ff..d780670cbaf3 100644 --- a/arch/mn10300/kernel/asm-offsets.c +++ b/arch/mn10300/kernel/asm-offsets.c @@ -28,7 +28,6 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(REG_D0, pt_regs, d0); diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index a6c0858592c3..8609845f12c5 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (is_using_fpu(current)) fpu_kill_state(current); diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index d797acc901e4..875f0845a707 100644 --- 
a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { 0-0x7FFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; /* saved context data */ @@ -79,9 +78,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .ksp = 0, \ } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 7d1b8235bf90..4112175bf803 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the regs from &sc->regs. diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index a84611835549..fb13e3865563 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -14,7 +14,6 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -25,9 +24,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 012d4fa63d97..9b910a0251b8 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) sigframe_size = PARISC_RT_SIGFRAME_SIZE32; #endif - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Unwind the user stack to get the rt_sigframe structure. 
*/ frame = (struct rt_sigframe __user *) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index e8abc83e699f..72489799cf02 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,7 +43,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ /* low level flags - has atomic operations done on it */ @@ -59,9 +58,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .flags = 0, \ } diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index b171001698ff..d3a831ac0f92 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1231,7 +1231,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, int tm_restore = 0; #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; rt_sf = (struct rt_sigframe __user *) (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); @@ -1504,7 +1504,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); sc = &sf->sctx; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 2cb0c94cafa5..c7c24d2e2bdb 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) goto badframe; diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 4d62fd5b56e5..ef1df718642d 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -39,7 +39,6 @@ struct thread_info { unsigned long sys_call_table; /* System call table address */ unsigned int cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned int system_call; __u64 user_timer; __u64 system_timer; @@ -56,9 +55,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 34d5fa7b01b5..bc1df12dd4f8 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -209,7 +209,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) int i; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) return -EFAULT; diff --git 
a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6a2ac257d98f..b3ae6f70c6d6 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) _sigregs user_sregs; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) return -EFAULT; diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h index 656b7ada9326..33864fa2a8d4 100644 --- a/arch/score/include/asm/thread_info.h +++ b/arch/score/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { * 0-0xFFFFFFFF for kernel-thread */ mm_segment_t addr_limit; - struct restart_block restart_block; struct pt_regs *regs; }; @@ -58,9 +57,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c index 57788f44c6fb..b4d5214a7a7e 100644 --- a/arch/score/kernel/asm-offsets.c +++ b/arch/score/kernel/asm-offsets.c @@ -106,7 +106,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE); DEFINE(KERNEL_STACK_MASK, THREAD_MASK); diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 1651807774ad..e381c8c4ff65 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs) int sig; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *) regs->regs[0]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index ad27ffa65e2e..657c03919627 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -33,7 +33,6 @@ struct thread_info { __u32 cpu; int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long previous_sp; /* sp of previous stack in case of nested IRQ stacks */ __u8 supervisor_stack[0]; @@ -63,9 +62,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index 08a2be775b6c..542225fedb11 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -25,7 +25,6 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block)); DEFINE(TI_SIZE, sizeof(struct thread_info)); #ifdef CONFIG_HIBERNATION diff --git a/arch/sh/kernel/signal_32.c 
b/arch/sh/kernel/signal_32.c index 2f002b24fb92..0b34f2a704fe 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 897abe7b871e..71993c6a7d94 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 025c98446b1e..fd7bd0a440ca 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -47,8 +47,6 @@ struct thread_info { struct reg_window32 reg_window[NSWINS]; /* align for ldd! 
*/ unsigned long rwbuf_stkptrs[NSWINS]; unsigned long w_saved; - - struct restart_block restart_block; }; /* @@ -62,9 +60,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) @@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TI_REG_WINDOW 0x30 #define TI_RWIN_SPTRS 0x230 #define TI_W_SAVED 0x250 -/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */ /* * thread information flag bit numbers diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 798f0279a4b5..ff455164732a 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -58,8 +58,6 @@ struct thread_info { unsigned long gsr[7]; unsigned long xfsr[7]; - struct restart_block restart_block; - struct pt_regs *kern_una_regs; unsigned int kern_una_insn; @@ -92,10 +90,9 @@ struct thread_info { #define TI_RWIN_SPTRS 0x000003c8 #define TI_GSR 0x00000400 #define TI_XFSR 0x00000438 -#define TI_RESTART_BLOCK 0x00000470 -#define TI_KUNA_REGS 0x000004a0 -#define TI_KUNA_INSN 0x000004a8 -#define TI_FPREGS 0x000004c0 +#define TI_KUNA_REGS 0x00000470 +#define TI_KUNA_INSN 0x00000478 +#define TI_FPREGS 0x00000480 /* We embed this in the uppermost byte of thread_info->flags */ #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ @@ -124,9 +121,6 @@ struct thread_info { .current_ds = ASI_P, \ .exec_domain = &default_exec_domain, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 62deba7be1a9..4eed773a7735 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -150,7 +150,7 @@ void do_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); @@ -235,7 +235,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9ee72fc8e0e4..52aa5e4ce5e7 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -70,7 +70,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 1a6999868031..d88beff47bab 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -254,7 +254,7 @@ void do_rt_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack (); sf = (struct rt_signal_frame 
__user *) diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 981a769b9558..a27651e866e7 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2730,8 +2730,6 @@ void __init trap_init(void) TI_NEW_CHILD != offsetof(struct thread_info, new_child) || TI_CURRENT_DS != offsetof(struct thread_info, current_ds) || - TI_RESTART_BLOCK != offsetof(struct thread_info, - restart_block) || TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) || TI_KUNA_INSN != offsetof(struct thread_info, diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 48e4fd0f38e4..96c14c1430d8 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -36,7 +36,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space (KERNEL_DS or USER_DS) */ - struct restart_block restart_block; struct single_step_state *step_state; /* single step state (if non-zero) */ int align_ctl; /* controls unaligned access */ @@ -57,9 +56,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .step_state = NULL, \ .align_ctl = 0, \ } diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index bb0a9ce7ae23..8a524e332c1a 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -48,7 +48,7 @@ int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Enforce that sigcontext is like pt_regs, and doesn't mess diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 1c5b2a83046a..e04114c4fcd9 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -22,7 +22,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user 0-0xFFFFFFFF for kernel */ - struct restart_block restart_block; struct thread_info *real_thread; /* Points to non-IRQ stack */ }; @@ -34,9 +33,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .real_thread = NULL, \ } diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h index af36d8eabdf1..63e2839dfeb8 100644 --- a/arch/unicore32/include/asm/thread_info.h +++ b/arch/unicore32/include/asm/thread_info.h @@ -79,7 +79,6 @@ struct thread_info { #ifdef CONFIG_UNICORE_FPU_F64 struct fp_state fpstate __attribute__((aligned(8))); #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -89,9 +88,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 7c8fb7018dc6..d329f85766cc 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; 
/* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9e181aaba97..d0165c9a2932 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, u32 tmp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e82e95abc92b..1d4e4f279a32 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +44,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2a33c8f68319..e5042463c1bc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 79d824551c1a..0c8c32bfd792 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs, int err, pid; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = copy_from_user(&sc, from, sizeof(sc)); if (err) diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 470153e8547c..a9b5d3ba196c 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -51,7 +51,6 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long cpenable; @@ -72,7 +71,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -90,9 +88,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 4612321c73cc..3d733ba16f28 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, int ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (regs->depc > 64) 
panic("rt_sigreturn in double exception!\n"); diff --git a/fs/select.c b/fs/select.c index 467bb1cb3ea5..f684c750e08a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, if (ret == -EINTR) { struct restart_block *restart_block; - restart_block = ¤t_thread_info()->restart_block; + restart_block = ¤t->restart_block; restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3037fc085e8e..d3d43ecf148c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -193,6 +193,9 @@ extern struct task_group root_task_group; .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ .se = { \ .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ }, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8db31ef98d2f..22ee0d5d7f8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1370,6 +1370,8 @@ struct task_struct { unsigned long atomic_flags; /* Flags needing atomic access. */ + struct restart_block restart_block; + pid_t pid; pid_t tgid; diff --git a/kernel/compat.c b/kernel/compat.c index ebb3c369d03d..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, * core implementation decides to return random nonsense. */ if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart - = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; @@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = compat_clock_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; } diff --git a/kernel/futex.c b/kernel/futex.c index 4eeb63de7e54..2a5e3830e953 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2217,7 +2217,7 @@ retry: if (!abs_time) goto out; - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; diff --git a/kernel/signal.c b/kernel/signal.c index 16a305295256..33a52759cc0e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); */ SYSCALL_DEFINE0(restart_syscall) { - struct restart_block *restart = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; return restart->fn(restart); } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a7077d3ae52f..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, goto out; } - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3f5e183c3d97..bee0c1f78091 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1583,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, goto out; } - restart = 
&current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..0075da74abf0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { - struct restart_block *restart_block = - &current_thread_info()->restart_block; + struct restart_block *restart_block = &current->restart_block; struct itimerspec it; int error; -- cgit v1.2.3 From 02f1f2170d2831b3233e91091c60a66622f29e82 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:01:31 -0800 Subject: kernel.h: remove ancient __FUNCTION__ hack __FUNCTION__ hasn't been treated as a string literal since gcc 3.4, so this only helps people who only test-compile using 3.3 (compiler-gcc3.h barks at anything older than that). Besides, there are almost no occurrences of __FUNCTION__ left in the tree. [akpm@linux-foundation.org: convert remaining __FUNCTION__ references] Signed-off-by: Rasmus Villemoes Cc: Michal Nazarewicz Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/rtc.c | 4 ++-- arch/x86/platform/intel-mid/intel_mid_vrtc.c | 2 +- drivers/acpi/acpica/utdebug.c | 4 ++-- drivers/block/xen-blkfront.c | 2 +- include/acpi/acoutput.h | 6 +++--- include/linux/kernel.h | 3 --- 7 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 319bcb9372fe..3acbff4716b0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line) #define hpet_print_config() \ do { \ if (hpet_verbose) \ - _hpet_print_config(__FUNCTION__, __LINE__); \ + _hpet_print_config(__func__, __LINE__); \ } while (0) /* diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index fe3dbfe0c4a5..cd9685235df9 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now) retval = set_rtc_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", - __FUNCTION__, retval); + __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %lx to RTC failed\n", - __FUNCTION__, nowtime); + __func__, nowtime); retval = -EINVAL; } return retval; diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 4762cff7facd..32947ba0f62d 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now) spin_unlock_irqrestore(&rtc_lock, flags); } else { pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __FUNCTION__, now->tv_sec); + __func__, now->tv_sec); retval = -EINVAL; } return retval; diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c index 57078e3ea9b7..4f3f888d33bb 100644 --- a/drivers/acpi/acpica/utdebug.c +++ b/drivers/acpi/acpica/utdebug.c @@ -111,8 +111,8 @@ void acpi_ut_track_stack_ptr(void) * RETURN: Updated pointer to the function name * * DESCRIPTION: Remove the "Acpi" prefix from the
function name, if present. - * This allows compiler macros such as __FUNCTION__ to be used - * with no change to the debug output. + * This allows compiler macros such as __func__ to be used with no + * change to the debug output. * ******************************************************************************/ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2236c6f31608..d2cae5fc211a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1391,7 +1391,7 @@ static int blkfront_probe(struct xenbus_device *dev, if (major != XENVBD_MAJOR) { printk(KERN_INFO "%s: HVM does not support vbd %d as xen block device\n", - __FUNCTION__, vdevice); + __func__, vdevice); return -ENODEV; } } diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index 9318a87ee39a..a8f344363e77 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -240,7 +240,7 @@ /* * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, * define it now. This is the case where there the compiler does not support - * a __FUNCTION__ macro or equivalent. + * a __func__ macro or equivalent. */ #ifndef ACPI_GET_FUNCTION_NAME #define ACPI_GET_FUNCTION_NAME _acpi_function_name @@ -249,12 +249,12 @@ * The Name parameter should be the procedure name as a quoted string. * The function name is also used by the function exit macros below. * Note: (const char) is used to be compatible with the debug interfaces - * and macros such as __FUNCTION__. + * and macros such as __func__. */ #define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; #else -/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ +/* Compiler supports __func__ (or equivalent) -- Ignore this macro */ #define ACPI_FUNCTION_NAME(name) #endif /* ACPI_GET_FUNCTION_NAME */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e42e7dc34c68..d6d630d31ef3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -800,9 +800,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) -/* Trap pasters of __FUNCTION__ at compile-time */ -#define __FUNCTION__ (__func__) - /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From 55c2d7884e9a97c2f2d46d5818f783bf3dcc5314 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Feb 2015 17:13:43 +1030 Subject: lguest: don't look in console features to find emerg_wr. The 1.0 spec clearly states that you must set the ACKNOWLEDGE and DRIVER status bits before accessing the feature bits. This is a problem for the early console code, which doesn't really want to acknowledge the device (the spec specifically excepts writing to the console's emerg_wr from the usual ordering constrains). Instead, we check that the *size* of the device configuration is sufficient to hold emerg_wr: at worst (if the device doesn't support the VIRTIO_CONSOLE_F_EMERG_WRITE feature), it will ignore the writes. 
Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 57 +++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 531b844cb48d..ac4453d8520e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1222,15 +1222,13 @@ static void set_cfg_window(u32 cfg_offset, u32 off) off); } -static u32 read_bar_via_cfg(u32 cfg_offset, u32 off) -{ - set_cfg_window(cfg_offset, off); - return read_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap)); -} - static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) { + /* + * We could set this up once, then leave it; nothing else in the * + * kernel should touch these registers. But if it went wrong, that + * would be a horrible bug to find. + */ set_cfg_window(cfg_offset, off); write_pci_config(0, 1, 0, cfg_offset + sizeof(struct virtio_pci_cap), val); @@ -1239,8 +1237,9 @@ static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) static void probe_pci_console(void) { u8 cap, common_cap = 0, device_cap = 0; - /* Offsets within BAR0 */ - u32 common_offset, device_offset; + /* Offset within BAR0 */ + u32 device_offset; + u32 device_len; /* Avoid recursive printk into here. */ console_cfg_offset = -1; @@ -1263,7 +1262,7 @@ static void probe_pci_console(void) u8 vndr = read_pci_config_byte(0, 1, 0, cap); if (vndr == PCI_CAP_ID_VNDR) { u8 type, bar; - u32 offset; + u32 offset, length; type = read_pci_config_byte(0, 1, 0, cap + offsetof(struct virtio_pci_cap, cfg_type)); @@ -1271,18 +1270,15 @@ static void probe_pci_console(void) cap + offsetof(struct virtio_pci_cap, bar)); offset = read_pci_config(0, 1, 0, cap + offsetof(struct virtio_pci_cap, offset)); + length = read_pci_config(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, length)); switch (type) { - case VIRTIO_PCI_CAP_COMMON_CFG: - if (bar == 0) { - common_cap = cap; - common_offset = offset; - } - break; case VIRTIO_PCI_CAP_DEVICE_CFG: if (bar == 0) { device_cap = cap; device_offset = offset; + device_len = length; } break; case VIRTIO_PCI_CAP_PCI_CFG: @@ -1292,32 +1288,27 @@ static void probe_pci_console(void) } cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); } - if (!common_cap || !device_cap || !console_access_cap) { + if (!device_cap || !console_access_cap) { printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", common_cap, device_cap, console_access_cap); return; } - -#define write_common_config(reg, val) \ - write_bar_via_cfg(console_access_cap, \ - common_offset+offsetof(struct virtio_pci_common_cfg,reg),\ - val) - -#define read_common_config(reg) \ - read_bar_via_cfg(console_access_cap, \ - common_offset+offsetof(struct virtio_pci_common_cfg,reg)) - - /* Check features: they must offer EMERG_WRITE */ - write_common_config(device_feature_select, 0); - - if (!(read_common_config(device_feature) - & (1 << VIRTIO_CONSOLE_F_EMERG_WRITE))) { - printk(KERN_ERR "lguest: console missing EMERG_WRITE\n"); + /* + * Note that we can't check features, until we've set the DRIVER + * status bit. We don't want to do that until we have a real driver, + * so we just check that the device-specific config has room for + * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE + * it should ignore the access. 
+ */ + if (device_len < (offsetof(struct virtio_console_config, emerg_wr) + + sizeof(u32))) { + printk(KERN_ERR "lguest: console missing emerg_wr field\n"); return; } console_cfg_offset = device_offset; + printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); } /* -- cgit v1.2.3 From 96738c69a7fcdbf0d7c9df0c8a27660011e82a7b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 13 Jan 2015 15:25:00 +0000 Subject: x86/efi: Avoid triple faults during EFI mixed mode calls Andy pointed out that if an NMI or MCE is received while we're in the middle of an EFI mixed mode call a triple fault will occur. This can happen, for example, when issuing an EFI mixed mode call while running perf. The reason for the triple fault is that we execute the mixed mode call in 32-bit mode with paging disabled but with 64-bit kernel IDT handlers installed throughout the call. At Andy's suggestion, stop playing the games we currently do at runtime, such as disabling paging and installing a 32-bit GDT for __KERNEL_CS. We can simply switch to the __KERNEL32_CS descriptor before invoking firmware services, and run in compatibility mode. This way, if an NMI/MCE does occur the kernel IDT handler will execute correctly, since it'll jump to __KERNEL_CS automatically. However, this change is only possible post-ExitBootServices(). Before then the firmware "owns" the machine and expects for its 32-bit IDT handlers to be left intact to service interrupts, etc. So, we now need to distinguish between early boot and runtime invocations of EFI services. During early boot, we need to restore the GDT that the firmware expects to be present. We can only jump to the __KERNEL32_CS code segment for mixed mode calls after ExitBootServices() has been invoked. A liberal sprinkling of comments in the thunking code should make the differences in early and late environments more apparent. 
Reported-by: Andy Lutomirski Tested-by: Borislav Petkov Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/efi_stub_64.S | 25 ---- arch/x86/boot/compressed/efi_thunk_64.S | 196 ++++++++++++++++++++++++++++++++ arch/x86/platform/efi/efi_stub_64.S | 161 -------------------------- arch/x86/platform/efi/efi_thunk_64.S | 121 +++++++++++++++++--- 5 files changed, 301 insertions(+), 203 deletions(-) create mode 100644 arch/x86/boot/compressed/efi_thunk_64.S (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ad754b4411f7..8bd44e8ee6e2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -49,6 +49,7 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o $(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S index 7ff3632806b1..99494dff2113 100644 --- a/arch/x86/boot/compressed/efi_stub_64.S +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -3,28 +3,3 @@ #include #include "../../platform/efi/efi_stub_64.S" - -#ifdef CONFIG_EFI_MIXED - .code64 - .text -ENTRY(efi64_thunk) - push %rbp - push %rbx - - subq $16, %rsp - leaq efi_exit32(%rip), %rax - movl %eax, 8(%rsp) - leaq efi_gdt64(%rip), %rax - movl %eax, 4(%rsp) - movl %eax, 2(%rax) /* Fixup the gdt base address */ - leaq efi32_boot_gdt(%rip), %rax - movl %eax, (%rsp) - - call __efi64_thunk - - addq $16, %rsp - pop %rbx - pop %rbp - ret -ENDPROC(efi64_thunk) -#endif /* CONFIG_EFI_MIXED */ diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S new file mode 100644 index 000000000000..630384a4c14a --- /dev/null +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * since the firmware's 32-bit IDT is still currently installed and it + * needs to be able to service interrupts. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identify + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include +#include +#include +#include +#include + + .code64 + .text +ENTRY(efi64_thunk) + push %rbp + push %rbx + + subq $8, %rsp + leaq efi_exit32(%rip), %rax + movl %eax, 4(%rsp) + leaq efi_gdt64(%rip), %rax + movl %eax, (%rsp) + movl %eax, 2(%rax) /* Fixup the gdt base address */ + + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + /* + * Convert x86-64 ABI params to i386 ABI + */ + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + sgdt save_gdt(%rip) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* + * Switch to gdt with 32-bit segments. This is the firmware GDT + * that was installed when the kernel started executing. This + * pointer was saved at the EFI stub entry point in head_64.S. 
+ */ + leaq efi32_boot_gdt(%rip), %rax + lgdt (%rax) + + pushq $__KERNEL_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + lgdt save_gdt(%rip) + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + addq $8, %rsp + pop %rbx + pop %rbp + ret +ENDPROC(efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Reload pgtables */ + movl %cr3, %eax + movl %eax, %cr3 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + /* + * Some firmware will return with interrupts enabled. Be sure to + * disable them before we switch GDTs. + */ + cli + + movl 56(%esp), %eax + movl %eax, 2(%eax) + lgdtl (%eax) + + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl %cr3, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + xorl %eax, %eax + lldt %ax + + movl 60(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + lret +ENDPROC(efi_enter32) + + .data + .balign 8 + .global efi32_boot_gdt +efi32_boot_gdt: .word 0 + .quad 0 + +save_gdt: .word 0 + .quad 0 +func_rt_ptr: .quad 0 + + .global efi_gdt64 +efi_gdt64: + .word efi_gdt64_end - efi_gdt64 + .long 0 /* Filled out by user */ + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x0080890000000000 /* TS descriptor */ + .quad 0x0000000000000000 /* TS continued */ +efi_gdt64_end: diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 5fcda7272550..86d0f9e08dd9 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -91,167 +91,6 @@ ENTRY(efi_call) ret ENDPROC(efi_call) -#ifdef CONFIG_EFI_MIXED - -/* - * We run this function from the 1:1 mapping. - * - * This function must be invoked with a 1:1 mapped stack. - */ -ENTRY(__efi64_thunk) - movl %ds, %eax - push %rax - movl %es, %eax - push %rax - movl %ss, %eax - push %rax - - subq $32, %rsp - movl %esi, 0x0(%rsp) - movl %edx, 0x4(%rsp) - movl %ecx, 0x8(%rsp) - movq %r8, %rsi - movl %esi, 0xc(%rsp) - movq %r9, %rsi - movl %esi, 0x10(%rsp) - - sgdt save_gdt(%rip) - - leaq 1f(%rip), %rbx - movq %rbx, func_rt_ptr(%rip) - - /* Switch to gdt with 32-bit segments */ - movl 64(%rsp), %eax - lgdt (%rax) - - leaq efi_enter32(%rip), %rax - pushq $__KERNEL_CS - pushq %rax - lretq - -1: addq $32, %rsp - - lgdt save_gdt(%rip) - - pop %rbx - movl %ebx, %ss - pop %rbx - movl %ebx, %es - pop %rbx - movl %ebx, %ds - - /* - * Convert 32-bit status code into 64-bit. 
- */ - test %rax, %rax - jz 1f - movl %eax, %ecx - andl $0x0fffffff, %ecx - andl $0xf0000000, %eax - shl $32, %rax - or %rcx, %rax -1: - ret -ENDPROC(__efi64_thunk) - -ENTRY(efi_exit32) - movq func_rt_ptr(%rip), %rax - push %rax - mov %rdi, %rax - ret -ENDPROC(efi_exit32) - - .code32 -/* - * EFI service pointer must be in %edi. - * - * The stack should represent the 32-bit calling convention. - */ -ENTRY(efi_enter32) - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss - - /* Reload pgtables */ - movl %cr3, %eax - movl %eax, %cr3 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Disable long mode via EFER */ - movl $MSR_EFER, %ecx - rdmsr - btrl $_EFER_LME, %eax - wrmsr - - call *%edi - - /* We must preserve return value */ - movl %eax, %edi - - /* - * Some firmware will return with interrupts enabled. Be sure to - * disable them before we switch GDTs. - */ - cli - - movl 68(%esp), %eax - movl %eax, 2(%eax) - lgdtl (%eax) - - movl %cr4, %eax - btsl $(X86_CR4_PAE_BIT), %eax - movl %eax, %cr4 - - movl %cr3, %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - xorl %eax, %eax - lldt %ax - - movl 72(%esp), %eax - pushl $__KERNEL_CS - pushl %eax - - /* Enable paging */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - lret -ENDPROC(efi_enter32) - - .data - .balign 8 - .global efi32_boot_gdt -efi32_boot_gdt: .word 0 - .quad 0 - -save_gdt: .word 0 - .quad 0 -func_rt_ptr: .quad 0 - - .global efi_gdt64 -efi_gdt64: - .word efi_gdt64_end - efi_gdt64 - .long 0 /* Filled out by user */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -efi_gdt64_end: -#endif /* CONFIG_EFI_MIXED */ - .data ENTRY(efi_scratch) .fill 3,8,0 diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 8806fa73e6e6..ff85d28c50f2 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -1,9 +1,26 @@ /* * Copyright (C) 2014 Intel Corporation; author Matt Fleming + * + * Support for invoking 32-bit EFI runtime services from a 64-bit + * kernel. + * + * The below thunking functions are only used after ExitBootServices() + * has been called. This simplifies things considerably as compared with + * the early EFI thunking because we can leave all the kernel state + * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime + * services from __KERNEL32_CS. This means we can continue to service + * interrupts across an EFI mixed mode call. + * + * We do however, need to handle the fact that we're running in a full + * 64-bit virtual address space. Things like the stack and instruction + * addresses need to be accessible by the 32-bit firmware, so we rely on + * using the identity mappings in the EFI page table to access the stack + * and kernel text (see efi_setup_page_tables()). 
*/ #include #include +#include .text .code64 @@ -33,14 +50,6 @@ ENTRY(efi64_thunk) leaq efi_exit32(%rip), %rbx subq %rax, %rbx movl %ebx, 8(%rsp) - leaq efi_gdt64(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, 4(%rsp) - leaq efi_gdt32(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, (%rsp) leaq __efi64_thunk(%rip), %rbx subq %rax, %rbx @@ -52,14 +61,92 @@ ENTRY(efi64_thunk) retq ENDPROC(efi64_thunk) - .data -efi_gdt32: - .word efi_gdt32_end - efi_gdt32 - .long 0 /* Filled out above */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00cf9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf93000000ffff /* __KERNEL_DS */ -efi_gdt32_end: +/* + * We run this function from the 1:1 mapping. + * + * This function must be invoked with a 1:1 mapped stack. + */ +ENTRY(__efi64_thunk) + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* Switch to 32-bit descriptor */ + pushq $__KERNEL32_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + ret +ENDPROC(__efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + movl 72(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + lret +ENDPROC(efi_enter32) + + .data + .balign 8 +func_rt_ptr: .quad 0 efi_saved_sp: .quad 0 -- cgit v1.2.3 From 8329aa9fff3fca84009e6a444d8d160193643bac Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 13 Feb 2015 10:26:18 -0800 Subject: Revert "x86/apic: Only disable CPU x2apic mode when necessary" This reverts commit 5fcee53ce705d49c766f8a302c7e93bdfc33c124. It causes the suspend to fail on at least the Chromebook Pixel, possibly other platforms too. Joerg Roedel points out that the logic should probably have been if (max_physical_apicid > 255 || !(IS_ENABLED(CONFIG_HYPERVISOR_GUEST) && hypervisor_x2apic_available())) { instead, but since the code is not in any fast-path, so we can just live without that optimization and just revert to the original code. 
Acked-by: Joerg Roedel Acked-by: Jiang Liu Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b665d241efad..ad3639ae1b9b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1580,8 +1580,7 @@ static __init void try_to_enable_x2apic(int remap_mode) * under KVM */ if (max_physical_apicid > 255 || - (IS_ENABLED(CONFIG_HYPERVISOR_GUEST) && - !hypervisor_x2apic_available())) { + !hypervisor_x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; -- cgit v1.2.3 From bf58b4879c33b3475a33740562ebf6583f531d4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:37:12 -0800 Subject: x86: use %*pb[l] to print bitmaps including cpumasks and nodemasks printk and friends can now format bitmaps using '%*pb[l]'. cpumask and nodemask also provide cpumask_pr_args() and nodemask_pr_args() respectively which can be used to generate the two printf arguments necessary to format the specified cpu/nodemask. * Unnecessary buffer size calculation and condition on the lenght removed from intel_cacheinfo.c::show_shared_cpu_map_func(). * uv_nmi_nr_cpus_pr() got overly smart and implemented "..." abbreviation if the output stretched over the predefined 1024 byte buffer. Replaced with plain printk. Signed-off-by: Tejun Heo Cc: Mike Travis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/intel_cacheinfo.c | 26 ++++++++++++-------------- arch/x86/mm/numa.c | 6 ++---- arch/x86/platform/uv/uv_nmi.c | 25 +++++++------------------ 3 files changed, 21 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c7035073dfc1..659643376dbf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, int type, char *buf) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; - int n = 0; - - if (len > 1) { - const struct cpumask *mask; - - mask = to_cpumask(this_leaf->shared_cpu_map); - n = type ? 
- cpulist_scnprintf(buf, len-2, mask) : - cpumask_scnprintf(buf, len-2, mask); - buf[n++] = '\n'; - buf[n] = '\0'; - } - return n; + const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); + int ret; + + if (type) + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", + cpumask_pr_args(mask)); + else + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", + cpumask_pr_args(mask)); + buf[ret++] = '\n'; + buf[ret] = '\0'; + return ret; } static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1a883705a12a..cd4785bbacb9 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -794,7 +794,6 @@ int early_cpu_to_node(int cpu) void debug_cpumask_set_cpu(int cpu, int node, bool enable) { struct cpumask *mask; - char buf[64]; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ @@ -812,10 +811,9 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) else cpumask_clear_cpu(cpu, mask); - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); + cpu, node, cpumask_pr_args(mask)); return; } diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c6b146e67116..7488cafab955 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -273,20 +273,6 @@ static inline void uv_clear_nmi(int cpu) } } -/* Print non-responding cpus */ -static void uv_nmi_nr_cpus_pr(char *fmt) -{ - static char cpu_list[1024]; - int len = sizeof(cpu_list); - int c = cpumask_weight(uv_nmi_cpu_mask); - int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask); - - if (n >= len-1) - strcpy(&cpu_list[len - 6], "...\n"); - - printk(fmt, c, cpu_list); -} - /* Ping non-responding cpus attemping to force them into the NMI handler */ static void uv_nmi_nr_cpus_ping(void) { @@ -371,16 +357,19 @@ static void uv_nmi_wait(int master) break; /* if not all made it in, send IPI NMI to them */ - uv_nmi_nr_cpus_pr(KERN_ALERT - "UV: Sending NMI IPI to %d non-responding CPUs: %s\n"); + pr_alert("UV: Sending NMI IPI to %d non-responding CPUs: %*pbl\n", + cpumask_weight(uv_nmi_cpu_mask), + cpumask_pr_args(uv_nmi_cpu_mask)); + uv_nmi_nr_cpus_ping(); /* if all cpus are in, then done */ if (!uv_nmi_wait_cpus(0)) break; - uv_nmi_nr_cpus_pr(KERN_ALERT - "UV: %d CPUs not in NMI loop: %s\n"); + pr_alert("UV: %d CPUs not in NMI loop: %*pbl\n", + cpumask_weight(uv_nmi_cpu_mask), + cpumask_pr_args(uv_nmi_cpu_mask)); } while (0); pr_alert("UV: %d of %d CPUs in NMI\n", -- cgit v1.2.3 From ef7f0d6a6ca8c9e4b27d78895af86c2fbfaeedb2 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:25 -0800 Subject: x86_64: add KASan support This patch adds arch specific code for kernel address sanitizer. 16TB of virtual addressed used for shadow memory. It's located in range [ffffec0000000000 - fffffc0000000000] between vmemmap and %esp fixup stacks. At early stage we map whole shadow region with zero page. Latter, after pages mapped to direct mapping address range we unmap zero pages from corresponding shadow (see kasan_map_shadow()) and allocate and map a real shadow memory reusing vmemmap_populate() function. Also replace __pa with __pa_nodebug before shadow initialized. 
With CONFIG_DEBUG_VIRTUAL=y, __pa makes an external function call (__phys_addr); __phys_addr is instrumented, so __asan_load could be called before the shadow area is initialized. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Jim Davis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/x86/x86_64/mm.txt | 2 + arch/x86/Kconfig | 1 + arch/x86/boot/Makefile | 2 + arch/x86/boot/compressed/Makefile | 2 + arch/x86/include/asm/kasan.h | 31 ++++++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/dumpstack.c | 5 +- arch/x86/kernel/head64.c | 9 +- arch/x86/kernel/head_64.S | 30 ++++++ arch/x86/kernel/setup.c | 3 + arch/x86/mm/Makefile | 3 + arch/x86/mm/kasan_init_64.c | 199 ++++++++++++++++++++++++++++++++++++++ arch/x86/realmode/Makefile | 2 +- arch/x86/realmode/rm/Makefile | 1 + arch/x86/vdso/Makefile | 1 + lib/Kconfig.kasan | 1 + 16 files changed, 290 insertions(+), 4 deletions(-) create mode 100644 arch/x86/include/asm/kasan.h create mode 100644 arch/x86/mm/kasan_init_64.c (limited to 'arch/x86') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 052ee643a32e..05712ac83e38 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... +ffffec0000000000 - fffffc0000000000 (=44 bits) kasan shadow memory (16TB) +... unused hole ... ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 33ce9a344e38..eb1cf898ed3c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select HAVE_CMPXCHG_LOCAL select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 3db07f30636f..57bbf2fb21f6 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -14,6 +14,8 @@ # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. # The number is the same as you would ordinarily press at bootup.
+KASAN_SANITIZE := n + SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ad754b4411f7..843feb3eb20b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -16,6 +16,8 @@ # (see scripts/Makefile.lib size_append) # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all +KASAN_SANITIZE := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h new file mode 100644 index 000000000000..8b22422fbad8 --- /dev/null +++ b/arch/x86/include/asm/kasan.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_KASAN_H +#define _ASM_X86_KASAN_H + +/* + * Compiler uses shadow offset assuming that addresses start + * from 0. Kernel addresses don't start from 0, so shadow + * for kernel really starts from compiler's shadow offset + + * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT + */ +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ + (0xffff800000000000ULL >> 3)) +/* 47 bits for kernel address -> (47 - 3) bits for shadow */ +#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3))) + +#ifndef __ASSEMBLY__ + +extern pte_t kasan_zero_pte[]; +extern pte_t kasan_zero_pmd[]; +extern pte_t kasan_zero_pud[]; + +#ifdef CONFIG_KASAN +void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_init(void); +#else +static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_init(void) { } +#endif + +#endif + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 732223496968..b13b70634124 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,6 +16,8 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif +KASAN_SANITIZE_head$(BITS).o := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o entry_$(BITS).o diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b74ebc7c4402..cf3df1d8d039 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err) printk("SMP "); #endif #ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); + printk("DEBUG_PAGEALLOC "); +#endif +#ifdef CONFIG_KASAN + printk("KASAN"); #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index eda1a865641e..efcddfaf05f9 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Manage page tables very early on. @@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void) next_early_pgt = 0; - write_cr3(__pa(early_level4_pgt)); + write_cr3(__pa_nodebug(early_level4_pgt)); } /* Create a new PMD entry */ @@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address) pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? 
*/ - if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) return -1; again: @@ -158,6 +159,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* Kill off the identity-map trampoline */ reset_early_page_tables(); + kasan_map_early_shadow(early_level4_pgt); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); @@ -179,6 +182,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; + kasan_map_early_shadow(init_level4_pgt); + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a468c0a65c42..6fd514d9f69a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -514,8 +514,38 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#ifdef CONFIG_KASAN +#define FILL(VAL, COUNT) \ + .rept (COUNT) ; \ + .quad (VAL) ; \ + .endr + +NEXT_PAGE(kasan_zero_pte) + FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pmd) + FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pud) + FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) + +#undef FILL +#endif + + #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE + +#ifdef CONFIG_KASAN +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). + */ +NEXT_PAGE(kasan_zero_page) + .skip PAGE_SIZE +#endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4648adadd7d..27d200929864 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -1174,6 +1175,8 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_init(); + kasan_init(); + if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ mmu_cr4_features = read_cr4(); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index ecfdc46a024a..c4cc74006c61 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,6 +20,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o obj-$(CONFIG_KMEMCHECK) += kmemcheck/ +KASAN_SANITIZE_kasan_init_$(BITS).o := n +obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c new file mode 100644 index 000000000000..3e4d9a1a39fa --- /dev/null +++ b/arch/x86/mm/kasan_init_64.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern struct range pfn_mapped[E820_X_MAX]; + +extern unsigned char kasan_zero_page[PAGE_SIZE]; + +static int __init map_range(struct range *range) +{ + unsigned long start; + unsigned long end; + + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + + /* + * end + 1 here is intentional. 
We check several shadow bytes in advance + * to slightly speed up fastpath. In some rare cases we could cross + * boundary of mapped shadow, so we just map some more here. + */ + return vmemmap_populate(start, end + 1, NUMA_NO_NODE); +} + +static void __init clear_pgds(unsigned long start, + unsigned long end) +{ + for (; start < end; start += PGDIR_SIZE) + pgd_clear(pgd_offset_k(start)); +} + +void __init kasan_map_early_shadow(pgd_t *pgd) +{ + int i; + unsigned long start = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + + for (i = pgd_index(start); start < end; i++) { + pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) + | _KERNPG_TABLE); + start += PGDIR_SIZE; + } +} + +static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + while (addr + PAGE_SIZE <= end) { + WARN_ON(!pte_none(*pte)); + set_pte(pte, __pte(__pa_nodebug(kasan_zero_page) + | __PAGE_KERNEL_RO)); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } + return 0; +} + +static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + int ret = 0; + pmd_t *pmd = pmd_offset(pud, addr); + + while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { + WARN_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) + | __PAGE_KERNEL_RO)); + addr += PMD_SIZE; + pmd = pmd_offset(pud, addr); + } + if (addr < end) { + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pte_populate(pmd, addr, end); + } + return ret; +} + + +static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + int ret = 0; + pud_t *pud = pud_offset(pgd, addr); + + while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { + WARN_ON(!pud_none(*pud)); + set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) + | __PAGE_KERNEL_RO)); + addr += PUD_SIZE; + pud = pud_offset(pgd, addr); + } + + if (addr < end) { + if (pud_none(*pud)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pmd_populate(pud, addr, end); + } + return ret; +} + +static int __init zero_pgd_populate(unsigned long addr, unsigned long end) +{ + int ret = 0; + pgd_t *pgd = pgd_offset_k(addr); + + while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { + WARN_ON(!pgd_none(*pgd)); + set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) + | __PAGE_KERNEL_RO)); + addr += PGDIR_SIZE; + pgd = pgd_offset_k(addr); + } + + if (addr < end) { + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pud_populate(pgd, addr, end); + } + return ret; +} + + +static void __init populate_zero_shadow(const void *start, const void *end) +{ + if (zero_pgd_populate((unsigned long)start, (unsigned long)end)) + panic("kasan: unable to map zero shadow!"); +} + + +#ifdef CONFIG_KASAN_INLINE +static int kasan_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (val == DIE_GPF) { + pr_emerg("CONFIG_KASAN_INLINE enabled"); + pr_emerg("GPF could be caused by NULL-ptr deref or user memory access"); + } + return NOTIFY_OK; +} + +static struct notifier_block kasan_die_notifier = { + .notifier_call = kasan_die_handler, +}; +#endif + +void __init kasan_init(void) +{ 
+ int i; + +#ifdef CONFIG_KASAN_INLINE + register_die_notifier(&kasan_die_notifier); +#endif + + memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); + load_cr3(early_level4_pgt); + + clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + + populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_X_MAX; i++) { + if (pfn_mapped[i].end == 0) + break; + + if (map_range(&pfn_mapped[i])) + panic("kasan: unable to allocate shadow!"); + } + + populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + (void *)KASAN_SHADOW_END); + + memset(kasan_zero_page, 0, PAGE_SIZE); + + load_cr3(init_level4_pgt); +} diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index 94f7fbe97b08..e02c2c6c56a5 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -6,7 +6,7 @@ # for more details. # # - +KASAN_SANITIZE := n subdir- := rm obj-y += init.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 7c0d7be176a5..2730d775ef9a 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -6,6 +6,7 @@ # for more details. # # +KASAN_SANITIZE := n always := realmode.bin realmode.relocs diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 09297c8e1fcd..7b9be9822724 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -3,6 +3,7 @@ # KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index e5b3fbe5560f..0052b1b9aadd 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -15,6 +15,7 @@ config KASAN config KASAN_SHADOW_OFFSET hex + default 0xdffffc0000000000 if X86_64 choice prompt "Instrumentation type" -- cgit v1.2.3 From 393f203f5fd54421fddb1e2a263f64d3876eeadb Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:56 -0800 Subject: x86_64: kasan: add interceptors for memset/memmove/memcpy functions Recently instrumentation of builtin functions calls was removed from GCC 5.0. To check the memory accessed by such functions, userspace asan always uses interceptors for them. So now we should do this as well. This patch declares memset/memmove/memcpy as weak symbols. In mm/kasan/kasan.c we have our own implementation of those functions which checks memory before accessing it. Default memset/memmove/memcpy now now always have aliases with '__' prefix. For files that built without kasan instrumentation (e.g. mm/slub.c) original mem* replaced (via #define) with prefixed variants, cause we don't want to check memory accesses there. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/boot/compressed/eboot.c | 3 +-- arch/x86/boot/compressed/misc.h | 1 + arch/x86/include/asm/string_64.h | 18 +++++++++++++++++- arch/x86/kernel/x8664_ksyms_64.c | 10 ++++++++-- arch/x86/lib/memcpy_64.S | 6 ++++-- arch/x86/lib/memmove_64.S | 4 ++++ arch/x86/lib/memset_64.S | 10 ++++++---- drivers/firmware/efi/libstub/efistub.h | 4 ++++ mm/kasan/kasan.c | 29 +++++++++++++++++++++++++++++ 9 files changed, 74 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 92b9a5f2aed6..ef17683484e9 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -13,8 +13,7 @@ #include #include -#undef memcpy /* Use memcpy from misc.c */ - +#include "../string.h" #include "eboot.h" static efi_system_table_t *sys_table; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 24e3e569a13c..04477d68403f 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -7,6 +7,7 @@ * we just keep it from happening */ #undef CONFIG_PARAVIRT +#undef CONFIG_KASAN #ifdef CONFIG_X86_32 #define _ASM_X86_DESC_H 1 #endif diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 19e2c468fc2c..e4661196994e 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +extern void *__memcpy(void *to, const void *from, size_t len); + #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 extern void *memcpy(void *to, const void *from, size_t len); #else -extern void *__memcpy(void *to, const void *from, size_t len); #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ @@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); +void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); +void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); @@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) + +/* + * For files that not instrumented (e.g. mm/slub.c) we + * should use not instrumented version of mem* functions. 
+ */ + +#undef memcpy +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 040681928e9d..37d8fa4438f0 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial); #undef memset #undef memmove +extern void *__memset(void *, int, __kernel_size_t); +extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *__memmove(void *, const void *, __kernel_size_t); extern void *memset(void *, int, __kernel_size_t); extern void *memcpy(void *, const void *, __kernel_size_t); -extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *memmove(void *, const void *, __kernel_size_t); + +EXPORT_SYMBOL(__memset); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(__memmove); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); #ifndef CONFIG_DEBUG_VIRTUAL diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 56313a326188..89b53c9968e7 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -53,6 +53,8 @@ .Lmemcpy_e_e: .previous +.weak memcpy + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -199,8 +201,8 @@ ENDPROC(__memcpy) * only outcome... */ .section .altinstructions, "a" - altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c - altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 65268a6104f4..9c4b530575da 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -24,7 +24,10 @@ * Output: * rax: dest */ +.weak memmove + ENTRY(memmove) +ENTRY(__memmove) CFI_STARTPROC /* Handle more 32 bytes in loop */ @@ -220,4 +223,5 @@ ENTRY(memmove) .Lmemmove_end_forward-.Lmemmove_begin_forward, \ .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs .previous +ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2dcb3808cbda..6f44935c6a60 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -56,6 +56,8 @@ .Lmemset_e_e: .previous +.weak memset + ENTRY(memset) ENTRY(__memset) CFI_STARTPROC @@ -147,8 +149,8 @@ ENDPROC(__memset) * feature to implement the right patch order. 
*/ .section .altinstructions,"a" - altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-memset,.Lmemset_e-.Lmemset_c - altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e + altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-__memset,.Lmemset_e-.Lmemset_c + altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e .previous diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2be10984a67a..47437b16b186 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -5,6 +5,10 @@ /* error code which can't be mistaken for valid address */ #define EFI_ERROR (~0UL) +#undef memcpy +#undef memset +#undef memmove + void efi_char16_printk(efi_system_table_t *, efi_char16_t *); efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image, diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index dc83f070edb6..799c52b9826c 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -255,6 +255,35 @@ static __always_inline void check_memory_region(unsigned long addr, kasan_report(addr, size, write, _RET_IP_); } +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + __asan_storeN((unsigned long)addr, len); + + return __memset(addr, c, len); +} + +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + __asan_loadN((unsigned long)src, len); + __asan_storeN((unsigned long)dest, len); + + return __memmove(dest, src, len); +} + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + __asan_loadN((unsigned long)src, len); + __asan_storeN((unsigned long)dest, len); + + return __memcpy(dest, src, len); +} + void kasan_alloc_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) -- cgit v1.2.3 From c420f167db8c799d69fe43a801c58a7f02e9d57c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:59 -0800 Subject: kasan: enable stack instrumentation Stack instrumentation allows to detect out of bounds memory accesses for variables allocated on stack. Compiler adds redzones around every variable on stack and poisons redzones in function's prologue. Such approach significantly increases stack usage, so all in-kernel stacks size were doubled. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/page_64_types.h | 12 +++++++++--- arch/x86/kernel/Makefile | 2 ++ arch/x86/mm/kasan_init_64.c | 11 +++++++++-- include/linux/init_task.h | 8 ++++++++ mm/kasan/kasan.h | 9 +++++++++ mm/kasan/report.c | 6 ++++++ scripts/Makefile.kasan | 1 + 7 files changed, 44 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 75450b2c7be4..4edd53b79a81 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,17 +1,23 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 2 +#ifdef CONFIG_KASAN +#define KASAN_STACK_ORDER 1 +#else +#define KASAN_STACK_ORDER 0 +#endif + +#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) -#define IRQ_STACK_ORDER 2 +#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) #define DOUBLEFAULT_STACK 1 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b13b70634124..cdb1b70ddad0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,8 @@ CFLAGS_REMOVE_early_printk.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 3e4d9a1a39fa..53508708b7aa 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -189,11 +189,18 @@ void __init kasan_init(void) if (map_range(&pfn_mapped[i])) panic("kasan: unable to allocate shadow!"); } - populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - (void *)KASAN_SHADOW_END); + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + NUMA_NO_NODE); + + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR), + (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); load_cr3(init_level4_pgt); + init_task.kasan_depth = 0; } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d3d43ecf148c..696d22312b31 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -175,6 +175,13 @@ extern struct task_group root_task_group; # define INIT_NUMA_BALANCING(tsk) #endif +#ifdef CONFIG_KASAN +# define INIT_KASAN(tsk) \ + .kasan_depth = 1, +#else +# define INIT_KASAN(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -250,6 +257,7 @@ extern struct task_group root_task_group; INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ + INIT_KASAN(tsk) \ } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 5b052ab40cf9..1fcc1d81a9cf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,6 +12,15 @@ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +/* + * Stack redzone shadow values + * (Those are compiler's ABI, don't change them) + */ +#define KASAN_STACK_LEFT 0xF1 +#define KASAN_STACK_MID 0xF2 +#define KASAN_STACK_RIGHT 0xF3 +#define KASAN_STACK_PARTIAL 0xF4 + struct kasan_access_info { const void *access_addr; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 2760edb4d0a8..866732ef3db3 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -64,6 +64,12 @@ static void print_error_description(struct kasan_access_info *info) case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: bug_type = "out of bounds access"; break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "out of bounds on stack"; + break; } pr_err("BUG: KASan: %s in %pS at addr %p\n", diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 7acd6faa0335..2163b8cc446e 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -9,6 +9,7 @@ CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ -fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \ + --param asan-stack=1 \ --param asan-instrumentation-with-call-threshold=$(call_threshold)) ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) -- cgit v1.2.3 From cb9e3c292d0115499c660028ad35ac5501d722b5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:07 -0800 Subject: mm: vmalloc: pass additional vm_flags to __vmalloc_node_range() For instrumenting global variables KASan will shadow memory backing memory for modules. So on module loading we will need to allocate memory for shadow and map it at address in shadow that corresponds to the address allocated in module_alloc(). __vmalloc_node_range() could be used for this purpose, except it puts a guard hole after allocated area. Guard hole in shadow memory should be a problem because at some future point we might need to have a shadow memory at address occupied by guard hole. So we could fail to allocate shadow for module_alloc(). Now we have VM_NO_GUARD flag disabling guard page, so we need to pass into __vmalloc_node_range(). Add new parameter 'vm_flags' to __vmalloc_node_range() function. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 2 +- arch/arm64/kernel/module.c | 4 ++-- arch/mips/kernel/module.c | 2 +- arch/parisc/kernel/module.c | 2 +- arch/s390/kernel/module.c | 2 +- arch/sparc/kernel/module.c | 2 +- arch/unicore32/kernel/module.c | 2 +- arch/x86/kernel/module.c | 2 +- include/linux/vmalloc.h | 4 +++- mm/vmalloc.c | 10 ++++++---- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index bea7db9e5b80..2e11961f65ae 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -41,7 +41,7 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 9b6f71db2709..67bf4107f6ef 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -35,8 +35,8 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, - __builtin_return_address(0)); + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, + NUMA_NO_NODE, __builtin_return_address(0)); } enum aarch64_reloc_op { diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 2a52568dbcd6..1833f5171ccd 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -47,7 +47,7 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, - GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 5822e8e200e6..3c63a820fcda 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -219,7 +219,7 @@ void *module_alloc(unsigned long size) * init_data correctly */ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_RWX, NUMA_NO_NODE, + PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 409d152585be..36154a2f1814 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -50,7 +50,7 @@ void *module_alloc(unsigned long size) if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 97655e0fd243..192a617a32f3 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -29,7 +29,7 @@ static void *module_map(unsigned long size) if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #else diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c index dc41f6dfedb6..e191b3448bd3 100644 --- a/arch/unicore32/kernel/module.c +++ b/arch/unicore32/kernel/module.c @@ -25,7 +25,7 @@ void *module_alloc(unsigned long size) { 
return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e69f9882bf95..e830e61aae05 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -88,7 +88,7 @@ void *module_alloc(unsigned long size) return __vmalloc_node_range(size, 1, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC, NUMA_NO_NODE, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1526fe712ca0..7d7acb35603d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -76,7 +76,9 @@ extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller); + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller); + extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2e74e99d4cfe..35b25e1340ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1619,6 +1619,7 @@ fail: * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * @@ -1628,7 +1629,8 @@ fail: */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller) + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { struct vm_struct *area; void *addr; @@ -1638,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, - start, end, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | + vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; @@ -1688,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, node, caller); + gfp_mask, prot, 0, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) -- cgit v1.2.3 From bebf56a1b176c2e1c9efe44e7e6915532cc682cf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:17 -0800 Subject: kasan: enable instrumentation of global variables This feature let us to detect accesses out of bounds of global variables. This will work as for globals in kernel image, so for globals in modules. Currently this won't work for symbols in user-specified sections (e.g. __init, __read_mostly, ...) The idea of this is simple. Compiler increases each global variable by redzone size and add constructors invoking __asan_register_globals() function. Information about global variable (address, size, size with redzone ...) passed to __asan_register_globals() so we could poison variable's redzone. 
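To make the mechanism concrete, this is roughly what the instrumentation amounts to for a single global (hand-written illustration, not real compiler output; the descriptor fields follow struct kasan_global added by this patch, while the object, descriptor and constructor names and the redzone size are invented):

/* The source declares an ordinary global ... */
static int stats[8];					/* 32 bytes */

/* ... the compiler pads it with a redzone and emits a descriptor ... */
static struct kasan_global stats_global_desc = {
	.beg			= stats,
	.size			= sizeof(stats),	/* 32 */
	.size_with_redzone	= sizeof(stats) + 32,	/* example redzone */
	.name			= "stats",
	.module_name		= "example.c",		/* translation unit */
};

/*
 * ... plus a constructor that registers it, so the redzone shadow gets
 * poisoned with KASAN_GLOBAL_REDZONE before any other code runs.
 */
static void __attribute__((constructor)) register_stats_globals(void)
{
	__asan_register_globals(&stats_global_desc, 1);
}

Any access past stats[7] then hits poisoned shadow and is reported; the same scheme covers module globals once module_alloc() cooperates, as described next.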
This patch also forces module_alloc() to return an 8*PAGE_SIZE aligned address, making shadow memory handling (kasan_module_alloc()/kasan_module_free()) simpler. Such alignment guarantees that each shadow page backing the modules' address space corresponds to only one module_alloc() allocation. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kasan.txt | 2 +- arch/x86/kernel/module.c | 12 +++++++++- arch/x86/mm/kasan_init_64.c | 2 +- include/linux/compiler-gcc4.h | 4 ++++ include/linux/compiler-gcc5.h | 2 ++ include/linux/kasan.h | 10 +++++++++ kernel/module.c | 2 ++ lib/Kconfig.kasan | 1 + mm/kasan/kasan.c | 52 +++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 25 +++++++++++++++++++++ mm/kasan/report.c | 22 ++++++++++++++++++ scripts/Makefile.kasan | 2 +- 12 files changed, 132 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt index f0645a8a992f..092fc10961fe 100644 --- a/Documentation/kasan.txt +++ b/Documentation/kasan.txt @@ -9,7 +9,7 @@ a fast and comprehensive solution for finding use-after-free and out-of-bounds bugs. KASan uses compile-time instrumentation for checking every memory access, -therefore you will need a certain version of GCC >= 4.9.2 +therefore you will need a certain version of GCC > 4.9.2 Currently KASan is supported only for x86_64 architecture and requires that the kernel be built with the SLUB allocator.
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e830e61aae05..d1ac80b72c72 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -83,13 +84,22 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + void *p; + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - return __vmalloc_node_range(size, 1, + + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { + vfree(p); + return NULL; + } + + return p; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 53508708b7aa..4860906c6b9f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -196,7 +196,7 @@ void __init kasan_init(void) (unsigned long)kasan_mem_to_shadow(_end), NUMA_NO_NODE); - populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR), + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index d1a558239b1a..769e19864632 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -85,3 +85,7 @@ #define __HAVE_BUILTIN_BSWAP16__ #endif #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#if GCC_VERSION >= 40902 +#define KASAN_ABI_VERSION 3 +#endif diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h index c8c565952548..efee493714eb 100644 --- a/include/linux/compiler-gcc5.h +++ b/include/linux/compiler-gcc5.h @@ -63,3 +63,5 @@ #define __HAVE_BUILTIN_BSWAP64__ #define __HAVE_BUILTIN_BSWAP16__ #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#define KASAN_ABI_VERSION 4 diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d5310eef3e38..72ba725ddf9c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -49,8 +49,15 @@ void kasan_krealloc(const void *object, size_t new_size); void kasan_slab_alloc(struct kmem_cache *s, void *object); void kasan_slab_free(struct kmem_cache *s, void *object); +#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) + +int kasan_module_alloc(void *addr, size_t size); +void kasan_module_free(void *addr); + #else /* CONFIG_KASAN */ +#define MODULE_ALIGN 1 + static inline void kasan_unpoison_shadow(const void *address, size_t size) {} static inline void kasan_enable_current(void) {} @@ -74,6 +81,9 @@ static inline void kasan_krealloc(const void *object, size_t new_size) {} static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_module_free(void *addr) {} + #endif /* CONFIG_KASAN */ #endif /* LINUX_KASAN_H */ diff --git a/kernel/module.c b/kernel/module.c index 82dc1f899e6d..8426ad48362c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1813,6 +1814,7 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); + kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) diff --git a/lib/Kconfig.kasan 
b/lib/Kconfig.kasan index 4d47d874335c..4fecaedc80a2 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -6,6 +6,7 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" depends on SLUB_DEBUG + select CONSTRUCTORS help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 799c52b9826c..78fee632a7ee 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -395,6 +396,57 @@ void kasan_kfree_large(const void *ptr) KASAN_FREE_PAGE); } +int kasan_module_alloc(void *addr, size_t size) +{ + void *ret; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, + PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + return ret ? 0 : -ENOMEM; +} + +void kasan_module_free(void *addr) +{ + vfree(kasan_mem_to_shadow(addr)); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(global->beg, global->size); + + kasan_poison_shadow(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); +} + +void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void __asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + #define DEFINE_ASAN_LOAD_STORE(size) \ void __asan_load##size(unsigned long addr) \ { \ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1fcc1d81a9cf..4986b0acab21 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -11,6 +11,7 @@ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ /* * Stack redzone shadow values @@ -21,6 +22,10 @@ #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 +/* Don't break randconfig/all*config builds */ +#ifndef KASAN_ABI_VERSION +#define KASAN_ABI_VERSION 1 +#endif struct kasan_access_info { const void *access_addr; @@ -30,6 +35,26 @@ struct kasan_access_info { unsigned long ip; }; +/* The layout of struct dictated by compiler */ +struct kasan_source_location { + const char *filename; + int line_no; + int column_no; +}; + +/* The layout of struct dictated by compiler */ +struct kasan_global { + const void *beg; /* Address of the beginning of the global variable. */ + size_t size; /* Size of the global variable. */ + size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ + const void *name; + const void *module_name; /* Name of the module where the global variable is declared. 
*/ + unsigned long has_dynamic_init; /* This needed for C++ */ +#if KASAN_ABI_VERSION >= 4 + struct kasan_source_location *location; +#endif +}; + void kasan_report_error(struct kasan_access_info *info); void kasan_report_user_access(struct kasan_access_info *info); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 866732ef3db3..680ceedf810a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -23,6 +23,8 @@ #include #include +#include + #include "kasan.h" #include "../slab.h" @@ -61,6 +63,7 @@ static void print_error_description(struct kasan_access_info *info) break; case KASAN_PAGE_REDZONE: case KASAN_KMALLOC_REDZONE: + case KASAN_GLOBAL_REDZONE: case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: bug_type = "out of bounds access"; break; @@ -80,6 +83,20 @@ static void print_error_description(struct kasan_access_info *info) info->access_size, current->comm, task_pid_nr(current)); } +static inline bool kernel_or_module_addr(const void *addr) +{ + return (addr >= (void *)_stext && addr < (void *)_end) + || (addr >= (void *)MODULES_VADDR + && addr < (void *)MODULES_END); +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; @@ -107,6 +124,11 @@ static void print_address_description(struct kasan_access_info *info) dump_page(page, "kasan: bad access detected"); } + if (kernel_or_module_addr(addr)) { + if (!init_task_stack_addr(addr)) + pr_err("Address belongs to variable %pS\n", addr); + } + dump_stack(); } diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 2163b8cc446e..631619b2b118 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -9,7 +9,7 @@ CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ -fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \ - --param asan-stack=1 \ + --param asan-stack=1 --param asan-globals=1 \ --param asan-instrumentation-with-call-threshold=$(call_threshold)) ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) -- cgit v1.2.3 From d6abfdb2022368d8c6c4be3f11a06656601a6cc2 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 6 Feb 2015 16:44:11 +0530 Subject: x86/spinlocks/paravirt: Fix memory corruption on unlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paravirt spinlock clears slowpath flag after doing unlock. As explained by Linus currently it does: prev = *lock; add_smp(&lock->tickets.head, TICKET_LOCK_INC); /* add_smp() is a full mb() */ if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) __ticket_unlock_slowpath(lock, prev); which is *exactly* the kind of things you cannot do with spinlocks, because after you've done the "add_smp()" and released the spinlock for the fast-path, you can't access the spinlock any more. Exactly because a fast-path lock might come in, and release the whole data structure. Linus suggested that we should not do any writes to lock after unlock(), and we can move slowpath clearing to fastpath lock. So this patch implements the fix with: 1. Moving slowpath flag to head (Oleg): Unlocked locks don't care about the slowpath flag; therefore we can keep it set after the last unlock, and clear it again on the first (try)lock. -- this removes the write after unlock. 
note that keeping slowpath flag would result in unnecessary kicks. By moving the slowpath flag from the tail to the head ticket we also avoid the need to access both the head and tail tickets on unlock. 2. use xadd to avoid read/write after unlock that checks the need for unlock_kick (Linus): We further avoid the need for a read-after-release by using xadd; the prev head value will include the slowpath flag and indicate if we need to do PV kicking of suspended spinners -- on modern chips xadd isn't (much) more expensive than an add + load. Result: setup: 16core (32 cpu +ht sandy bridge 8GB 16vcpu guest) benchmark overcommit %improve kernbench 1x -0.13 kernbench 2x 0.02 dbench 1x -1.77 dbench 2x -0.63 [Jeremy: Hinted missing TICKET_LOCK_INC for kick] [Oleg: Moved slowpath flag to head, ticket_equals idea] [PeterZ: Added detailed changelog] Suggested-by: Linus Torvalds Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Raghavendra K T Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Jones Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: Dave Jones Cc: David Vrabel Cc: Fernando Luis Vázquez Cao Cc: Konrad Rzeszutek Wilk Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Ulrich Obergfell Cc: Waiman Long Cc: a.ryabinin@samsung.com Cc: dave@stgolabs.net Cc: hpa@zytor.com Cc: jasowang@redhat.com Cc: jeremy@goop.org Cc: paul.gortmaker@windriver.com Cc: riel@redhat.com Cc: tglx@linutronix.de Cc: waiman.long@hp.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20150215173043.GA7471@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 94 ++++++++++++++++++++--------------------- arch/x86/kernel/kvm.c | 13 ++++-- arch/x86/xen/spinlock.c | 13 ++++-- 3 files changed, 64 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f8a2fc..cf87de3fc390 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key); static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) { - set_bit(0, (volatile unsigned long *)&lock->tickets.tail); + set_bit(0, (volatile unsigned long *)&lock->tickets.head); } #else /* !CONFIG_PARAVIRT_SPINLOCKS */ @@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ +static inline int __tickets_equal(__ticket_t one, __ticket_t two) +{ + return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); +} + +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, + __ticket_t head) +{ + if (head & TICKET_SLOWPATH_FLAG) { + arch_spinlock_t old, new; + + old.tickets.head = head; + new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; + old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; + new.tickets.tail = old.tickets.tail; + + /* try to clear slowpath flag when there are no contenders */ + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); + } +} static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { - return lock.tickets.head == lock.tickets.tail; + return __tickets_equal(lock.tickets.head, lock.tickets.tail); } /* @@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) if (likely(inc.head == inc.tail)) goto out; - inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { unsigned count = 
SPIN_THRESHOLD; do { - if (READ_ONCE(lock->tickets.head) == inc.tail) - goto out; + inc.head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(inc.head, inc.tail)) + goto clear_slowpath; cpu_relax(); } while (--count); __ticket_lock_spinning(lock, inc.tail); } -out: barrier(); /* make sure nothing creeps before the lock is taken */ +clear_slowpath: + __ticket_check_and_clear_slowpath(lock, inc.head); +out: + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) @@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) arch_spinlock_t old, new; old.tickets = READ_ONCE(lock->tickets); - if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) + if (!__tickets_equal(old.tickets.head, old.tickets.tail)) return 0; new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); + new.head_tail &= ~TICKET_SLOWPATH_FLAG; /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, - arch_spinlock_t old) -{ - arch_spinlock_t new; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - /* Perform the unlock on the "before" copy */ - old.tickets.head += TICKET_LOCK_INC; - - /* Clear the slowpath flag */ - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); - - /* - * If the lock is uncontended, clear the flag - use cmpxchg in - * case it changes behind our back though. - */ - if (new.tickets.head != new.tickets.tail || - cmpxchg(&lock->head_tail, old.head_tail, - new.head_tail) != old.head_tail) { - /* - * Lock still has someone queued for it, so wake up an - * appropriate waiter. 
- */ - __ticket_unlock_kick(lock, old.tickets.head); - } -} - static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG && - static_key_false(¶virt_ticketlocks_enabled)) { - arch_spinlock_t prev; + static_key_false(¶virt_ticketlocks_enabled)) { + __ticket_t head; - prev = *lock; - add_smp(&lock->tickets.head, TICKET_LOCK_INC); + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - /* add_smp() is a full mb() */ + head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) - __ticket_unlock_slowpath(lock, prev); + if (unlikely(head & TICKET_SLOWPATH_FLAG)) { + head &= ~TICKET_SLOWPATH_FLAG; + __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); + } } else __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } @@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return tmp.tail != tmp.head; + return !__tickets_equal(tmp.tail, tmp.head); } static inline int arch_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; + tmp.head &= ~TICKET_SLOWPATH_FLAG; + return (tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended @@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - __ticket_t head = ACCESS_ONCE(lock->tickets.head); + __ticket_t head = READ_ONCE(lock->tickets.head); for (;;) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); /* * We need to check "unlocked" in a loop, tmp.head == head * can be false positive because of overflow. */ - if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || - tmp.head != head) + if (__tickets_equal(tmp.head, tmp.tail) || + !__tickets_equal(tmp.head, head)) break; cpu_relax(); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 94f643484300..e354cc6446ab 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -609,7 +609,7 @@ static inline void check_zero(void) u8 ret; u8 old; - old = ACCESS_ONCE(zero_stats); + old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) int cpu; u64 start; unsigned long flags; + __ticket_t head; if (in_nmi()) return; @@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking. 
*/ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) add_stats(RELEASED_SLOW, 1); for_each_cpu(cpu, &waiting_cpus) { const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == ticket) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == ticket) { add_stats(RELEASED_SLOW_KICKED, 1); kvm_kick_cpu(cpu); break; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23b45eb9a89c..956374c1edbc 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -41,7 +41,7 @@ static u8 zero_stats; static inline void check_zero(void) { u8 ret; - u8 old = ACCESS_ONCE(zero_stats); + u8 old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -112,6 +112,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting); int cpu = smp_processor_id(); u64 start; + __ticket_t head; unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ @@ -159,11 +160,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -204,8 +209,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); /* Make sure we read lock before want */ - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == next) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == next) { add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; -- cgit v1.2.3 From 1ea76fbadd667b19c4fa4466f3a3b55a505e83d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 16 Feb 2015 10:11:13 +0800 Subject: x86/irq: Fix regression caused by commit b568b8601f05 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b568b8601f05 ("Treat SCI interrupt as normal GSI interrupt") accidently removes support of legacy PIC interrupt when fixing a regression for Xen, which causes a nasty regression on HP/Compaq nc6000 where we fail to register the ACPI interrupt, and thus lose eg. thermal notifications leading a potentially overheated machine. So reintroduce support of legacy PIC based ACPI SCI interrupt. Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: Jiang Liu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Pavel Machek Cc: # 3.19+ Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Rafael J. 
Wysocki Cc: Sander Eikelenboom Cc: linux-pm@vger.kernel.org Link: http://lkml.kernel.org/r/1424052673-22974-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a18fff361c7f..8b5916342902 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { int rc, irq, trigger, polarity; + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + rc = acpi_get_override_irq(gsi, &trigger, &polarity); if (rc == 0) { trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; -- cgit v1.2.3 From d97eb8966c91f2c9d05f0a22eb89ed5b76d966d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 4 Feb 2015 13:33:33 +0100 Subject: x86/irq: Check for valid irq descriptor in check_irq_vectors_for_cpu_disable() When an interrupt is migrated away from a cpu it will stay in its vector_irq array until smp_irq_move_cleanup_interrupt succeeded. The cfg->move_in_progress flag is cleared already when the IPI was sent. When the interrupt is destroyed after migration its 'struct irq_desc' is freed and the vector_irq arrays are cleaned up. But since cfg->move_in_progress is already 0 the references at cpus before the last migration will not be cleared. So this would leave a reference to an already destroyed irq alive. When the cpu is taken down at this point, the check_irq_vectors_for_cpu_disable() function finds a valid irq number in the vector_irq array, but gets NULL for its descriptor and dereferences it, causing a kernel panic. This has been observed on real systems at shutdown. Add a check to check_irq_vectors_for_cpu_disable() for a valid 'struct irq_desc' to prevent this issue. Signed-off-by: Joerg Roedel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Liu Cc: H. Peter Anvin Cc: Jan Beulich Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Yinghai Lu Cc: alnovak@suse.com Cc: joro@8bytes.org Link: http://lkml.kernel.org/r/20150204132754.GA10078@suse.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 705ef8d48e2d..67b1cbe0093a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void) irq = __this_cpu_read(vector_irq[vector]); if (irq >= 0) { desc = irq_to_desc(irq); + if (!desc) + continue; + data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpu_clear(this_cpu, affinity_new); -- cgit v1.2.3 From 097f4e5e839359021c8f0ea273655031e6ed04ff Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:18:50 +0100 Subject: uprobes/x86: Add comment with insn opcodes, mnemonics and why we dont support them After adding these, it's clear we have some awkward choices there. Some valid instructions are prohibited from uprobing while several invalid ones are allowed. Hopefully future edits to the good-opcode tables will fix wrong bits or explain why those bits are not wrong. No actual code changes. 
Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Jim Keniston Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423768732-32194-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 153 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 134 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8b96a947021f..54e36248e9c0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -66,6 +66,49 @@ * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. + * + * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder + * won't report *prefixes* as OPCODE1(insn). + * 0f - 2-byte opcode prefix + * 26,2e,36,3e - es:/cs:/ss:/ds: + * 64 - fs: (marked as "good", why?) + * 65 - gs: (marked as "good", why?) + * 66 - operand-size prefix + * 67 - address-size prefix + * f0 - lock prefix + * f2 - repnz (marked as "good", why?) + * f3 - rep/repz (marked as "good", why?) + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs + * (why we support bound (62) then? it's similar, and similarly unused...) + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * 07,17,1f - pop es/ss/ds + * Normally not used in userspace, but would execute if used. + * Can cause GP or stack exception if tries to load wrong segment descriptor. + * We hesitate to run them under single step since kernel's handling + * of userspace single-stepping (TF flag) is fragile. + * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) + * on the same grounds that they are never used. + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + * + * Opcodes which can be enabled right away: + * 63 - arpl. This insn has no unusual exceptions (it's basically an arith op). + * d6 - salc. Undocumented "sign-extend carry flag to AL" insn */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { @@ -94,7 +137,55 @@ static volatile u32 good_insns_32[256 / 32] = { #define good_insns_32 NULL #endif -/* Good-instruction tables for 64-bit apps */ +/* Good-instruction tables for 64-bit apps. + * + * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder + * won't report *prefixes* as OPCODE1(insn). + * 0f - 2-byte opcode prefix + * 26,2e,36,3e - es:/cs:/ss:/ds: + * 40-4f - rex prefixes + * 64 - fs: (marked as "good", why?) + * 65 - gs: (marked as "good", why?) + * 66 - operand-size prefix + * 67 - address-size prefix + * f0 - lock prefix + * f2 - repnz (marked as "good", why?) 
+ * f3 - rep/repz (marked as "good", why?) + * + * Genuinely invalid opcodes: + * 06,07 - formerly push/pop es + * 0e - formerly push cs + * 16,17 - formerly push/pop ss + * 1e,1f - formerly push/pop ds + * 27,2f,37,3f - formerly daa/das/aaa/aas + * 60,61 - formerly pusha/popa + * 62 - formerly bound. EVEX prefix for AVX512 + * 82 - formerly redundant encoding of Group1 + * 9a - formerly call seg:ofs (marked as "supported"???) + * c4,c5 - formerly les/lds. VEX prefixes for AVX + * ce - formerly into + * d4,d5 - formerly aam/aad + * d6 - formerly undocumented salc + * ea - formerly jmp seg:ofs (marked as "supported"???) + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + */ #if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ @@ -122,7 +213,48 @@ static volatile u32 good_insns_64[256 / 32] = { #define good_insns_64 NULL #endif -/* Using this for both 64-bit and 32-bit apps */ +/* Using this for both 64-bit and 32-bit apps. + * Opcodes we don't support: + * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns + * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. + * Also encodes tons of other system insns if mod=11. + * Some are in fact non-system: xend, xtest, rdtscp, maybe more + * 0f 02 - lar (why? should be safe, it throws no exceptipons) + * 0f 03 - lsl (why? should be safe, it throws no exceptipons) + * 0f 04 - undefined + * 0f 05 - syscall + * 0f 06 - clts (CPL0 insn) + * 0f 07 - sysret + * 0f 08 - invd (CPL0 insn) + * 0f 09 - wbinvd (CPL0 insn) + * 0f 0a - undefined + * 0f 0b - ud2 + * 0f 0c - undefined + * 0f 0d - prefetchFOO (amd prefetch insns) + * 0f 18 - prefetchBAR (intel prefetch insns) + * 0f 24 - mov from test regs (perhaps entire 20-27 area can be disabled (special reg ops)) + * 0f 25 - undefined + * 0f 26 - mov to test regs + * 0f 27 - undefined + * 0f 30 - wrmsr (CPL0 insn) + * 0f 34 - sysenter + * 0f 35 - sysexit + * 0f 36 - undefined + * 0f 37 - getsec + * 0f 38-3f - 3-byte opcodes (why?? all look safe) + * 0f 78 - vmread + * 0f 79 - vmwrite + * 0f 7a - undefined + * 0f 7b - undefined + * 0f 7c - undefined + * 0f 7d - undefined + * 0f a6 - undefined + * 0f a7 - undefined + * 0f b8 - popcnt (why?? it's an ordinary ALU op) + * 0f d0 - undefined + * 0f f0 - lddqu (why?? 
it's an ordinary vector load op) + * 0f ff - undefined + */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ @@ -148,23 +280,6 @@ static volatile u32 good_2byte_insns[256 / 32] = { #undef W /* - * opcodes we'll probably never support: - * - * 6c-6d, e4-e5, ec-ed - in - * 6e-6f, e6-e7, ee-ef - out - * cc, cd - int3, int - * cf - iret - * d6 - illegal instruction - * f1 - int1/icebp - * f4 - hlt - * fa, fb - cli, sti - * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 - * - * invalid opcodes in 64-bit mode: - * - * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 - * 63 - we support this opcode in x86_64 but not in i386. - * * opcodes we may need to refine support for: * * 0f - 2-byte instructions: For many of these instructions, the validity -- cgit v1.2.3 From 67fc809217dc7fd793211585b2a8d7b61715d06b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:18:51 +0100 Subject: uprobes/x86: Fix 1-byte opcode tables This change fixes 1-byte opcode tables so that only insns for which we have real reasons to disallow probing are marked with unset bits. To that end: Set bits for all prefix bytes. Their setting is ignored anyway - we check the bitmap against OPCODE1(insn), not against first byte. Keeping them set to 0 only confuses code reader with "why we don't support that opcode" question. Thus: enable bytes c4,c5 in 64-bit mode (VEX prefixes). Byte 62 (EVEX prefix) is not yet enabled since insn decoder does not support that yet. For 32-bit mode, enable probing of opcodes 63 (arpl) and d6 (salc). They don't require any special handling. For 64-bit mode, disable 9a and ea - these undefined opcodes were mistakenly left enabled. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Jim Keniston Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423768732-32194-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 66 +++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 54e36248e9c0..aa1da96d09ff 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -67,18 +67,6 @@ * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. * - * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder - * won't report *prefixes* as OPCODE1(insn). - * 0f - 2-byte opcode prefix - * 26,2e,36,3e - es:/cs:/ss:/ds: - * 64 - fs: (marked as "good", why?) - * 65 - gs: (marked as "good", why?) - * 66 - operand-size prefix - * 67 - address-size prefix - * f0 - lock prefix - * f2 - repnz (marked as "good", why?) - * f3 - rep/repz (marked as "good", why?) - * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace * e4-e7 - in,out imm. SEGVs if used in userspace @@ -105,31 +93,27 @@ * Not supported since kernel's handling of userspace single-stepping * (TF flag) is fragile. * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad - * - * Opcodes which can be enabled right away: - * 63 - arpl. This insn has no unusual exceptions (it's basically an arith op). - * d6 - salc. 
Undocumented "sign-extend carry flag to AL" insn */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -138,19 +122,6 @@ static volatile u32 good_insns_32[256 / 32] = { #endif /* Good-instruction tables for 64-bit apps. - * - * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder - * won't report *prefixes* as OPCODE1(insn). - * 0f - 2-byte opcode prefix - * 26,2e,36,3e - es:/cs:/ss:/ds: - * 40-4f - rex prefixes - * 64 - fs: (marked as "good", why?) - * 65 - gs: (marked as "good", why?) - * 66 - operand-size prefix - * 67 - address-size prefix - * f0 - lock prefix - * f2 - repnz (marked as "good", why?) - * f3 - rep/repz (marked as "good", why?) * * Genuinely invalid opcodes: * 06,07 - formerly push/pop es @@ -159,14 +130,13 @@ static volatile u32 good_insns_32[256 / 32] = { * 1e,1f - formerly push/pop ds * 27,2f,37,3f - formerly daa/das/aaa/aas * 60,61 - formerly pusha/popa - * 62 - formerly bound. EVEX prefix for AVX512 + * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) * 82 - formerly redundant encoding of Group1 - * 9a - formerly call seg:ofs (marked as "supported"???) - * c4,c5 - formerly les/lds. VEX prefixes for AVX + * 9a - formerly call seg:ofs * ce - formerly into * d4,d5 - formerly aam/aad * d6 - formerly undocumented salc - * ea - formerly jmp seg:ofs (marked as "supported"???) + * ea - formerly jmp seg:ofs * * Opcodes we'll probably never support: * 6c-6f - ins,outs. 
SEGVs if used in userspace @@ -190,22 +160,22 @@ static volatile u32 good_insns_32[256 / 32] = { static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -- cgit v1.2.3 From 5154d4f2adfdabe5aeb247e5b2b6b10fae5f6d50 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:18:52 +0100 Subject: uprobes/x86: Fix 2-byte opcode table Enabled probing of lar, lsl, popcnt, lddqu, prefetch insns. They should be safe to probe, they throw no exceptions. Enabled probing of 3-byte opcodes 0f 38-3f xx - these are vector isns, so should be safe. Enabled probing of many currently undefined 0f xx insns. At the rate new vector instructions are getting added, we don't want to constantly enable more bits. We want to only occasionally *disable* ones which for some reason can't be probed. This includes 0f 24,26 opcodes, which are undefined since Pentium. On 486, they were "mov to/from test register". Explained more fully what 0f 78,79 opcodes are. Explained what 0f ae opcode is. (It's unclear why we don't allow probing it, but let's not change it for now). 
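As background for how these W()-packed tables are consumed: each u32 element of good_insns_32/good_insns_64/good_2byte_insns carries one validity bit per opcode byte, and the first opcode byte reported by the instruction decoder indexes straight into that bitmap. The helper below is only an illustrative sketch of the lookup under that layout assumption (bit N % 32 of element N / 32); the in-tree code goes through the kernel's generic bit-test helpers rather than this exact function.

/*
 * Illustrative sketch only: look up the "probing allowed" bit for an
 * opcode byte in a W()-packed table such as good_insns_64 or
 * good_2byte_insns. Bit (N % 32) of element (N / 32) corresponds to
 * opcode byte N.
 */
static int opcode_is_good(const volatile u32 *table, unsigned char opcode)
{
	return (table[opcode / 32] >> (opcode % 32)) & 1;
}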
Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Jim Keniston Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423768732-32194-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 52 ++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aa1da96d09ff..81f8adb0679e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -189,61 +189,43 @@ static volatile u32 good_insns_64[256 / 32] = { * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. * Also encodes tons of other system insns if mod=11. * Some are in fact non-system: xend, xtest, rdtscp, maybe more - * 0f 02 - lar (why? should be safe, it throws no exceptipons) - * 0f 03 - lsl (why? should be safe, it throws no exceptipons) - * 0f 04 - undefined * 0f 05 - syscall * 0f 06 - clts (CPL0 insn) * 0f 07 - sysret * 0f 08 - invd (CPL0 insn) * 0f 09 - wbinvd (CPL0 insn) - * 0f 0a - undefined * 0f 0b - ud2 - * 0f 0c - undefined - * 0f 0d - prefetchFOO (amd prefetch insns) - * 0f 18 - prefetchBAR (intel prefetch insns) - * 0f 24 - mov from test regs (perhaps entire 20-27 area can be disabled (special reg ops)) - * 0f 25 - undefined - * 0f 26 - mov to test regs - * 0f 27 - undefined - * 0f 30 - wrmsr (CPL0 insn) + * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) * 0f 34 - sysenter * 0f 35 - sysexit - * 0f 36 - undefined * 0f 37 - getsec - * 0f 38-3f - 3-byte opcodes (why?? all look safe) - * 0f 78 - vmread - * 0f 79 - vmwrite - * 0f 7a - undefined - * 0f 7b - undefined - * 0f 7c - undefined - * 0f 7d - undefined - * 0f a6 - undefined - * 0f a7 - undefined - * 0f b8 - popcnt (why?? it's an ordinary ALU op) - * 0f d0 - undefined - * 0f f0 - lddqu (why?? it's an ordinary vector load op) - * 0f ff - undefined + * 0f 78 - vmread (Intel VMX. CPL0 insn) + * 0f 79 - vmwrite (Intel VMX. CPL0 insn) + * Note: with prefixes, these two opcodes are + * extrq/insertq/AVX512 convert vector ops. + * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], + * {rd,wr}{fs,gs}base,{s,l,m}fence. + * Why? They are all user-executable. 
*/ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ - W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ - W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ - W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -- cgit v1.2.3 From b7e37567d080301d38a302bb93ba79d1ca446dca Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 10 Feb 2015 09:34:05 +0800 Subject: kprobes/x86: Mark 2 bytes NOP as boostable Currently, x86 kprobes is unable to boost 2 bytes nop like: nopl 0x0(%rax,%rax,1) which is 0x0f 0x1f 0x44 0x00 0x00. Such nops have exactly 5 bytes to hold a relative jmp instruction. Boosting them should be obviously safe. This patch enable boosting such nops by simply updating twobyte_is_boostable[] array. 
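To make the "exactly 5 bytes to hold a relative jmp" remark concrete: a boosted kprobe executes the copied instruction from its out-of-line slot and then jumps directly back into the original instruction stream, skipping the second single-step trap. The snippet below is a simplified, hypothetical sketch of how such a rel32 jump could be synthesized; the function name and layout are illustrative and are not claimed to match the in-tree kprobes code.

/*
 * Simplified sketch (illustrative names): append a 5-byte "jmp rel32"
 * after the instruction that was copied into the out-of-line slot so that
 * execution resumes at the next original instruction without taking a
 * single-step exception.
 */
static void append_resume_jump(u8 *slot, size_t copied_len, unsigned long resume_addr)
{
	u8 *jmp = slot + copied_len;

	jmp[0] = 0xe9;	/* opcode: jmp rel32 */
	*(s32 *)(jmp + 1) = (s32)(resume_addr - (unsigned long)(jmp + 5));
}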
Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Link: http://lkml.kernel.org/r/1423532045-41049-1-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 98f654d466e5..6a1146ea4d4d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ - W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ -- cgit v1.2.3 From b273c2c2f2d2d13dc0bfa8923d52fbaf8fa56ae8 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Mon, 2 Feb 2015 20:27:11 +0100 Subject: x86/apic: Fix the devicetree build in certain configs Without this patch: LD init/built-in.o arch/x86/built-in.o: In function `dtb_lapic_setup': kernel/devicetree.c:155: undefined reference to `apic_force_enable' Makefile:923: recipe for target 'vmlinux' failed make: *** [vmlinux] Error 1 Signed-off-by: Ricardo Ribalda Delgado Reviewed-by: Maciej W. Rozycki Cc: David Rientjes Cc: Jan Beulich Link: http://lkml.kernel.org/r/1422905231-16067-1-git-send-email-ricardo.ribalda@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 92003f3c8a42..efc3b22d896e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -213,7 +213,15 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else extern int apic_force_enable(unsigned long addr); +#endif extern int apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); -- cgit v1.2.3 From 28a375df16c2b6d01227541f3956568995aa5fda Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 30 Jan 2015 16:29:38 +0000 Subject: x86/intel/quark: Add Isolated Memory Regions for Quark X1000 Intel's Quark X1000 SoC contains a set of registers called Isolated Memory Regions. IMRs are accessed over the IOSF mailbox interface. IMRs are areas carved out of memory that define read/write access rights to the various system agents within the Quark system. For a given agent in the system it is possible to specify if that agent may read or write an area of memory defined by an IMR with a granularity of 1 KiB. Quark_SecureBootPRM_330234_001.pdf section 4.5 details the concept of IMRs quark-x1000-datasheet.pdf section 12.7.4 details the implementation of IMRs in silicon. eSRAM flush, CPU Snoop write-only, CPU SMM Mode, CPU non-SMM mode, RMU and PCIe Virtual Channels (VC0 and VC1) can have individual read/write access masks applied to them for a given memory region in Quark X1000. 
This enables IMRs to treat each memory transaction type listed above on an individual basis and to filter appropriately based on the IMR access mask for the memory region. Quark supports eight IMRs. Since all of the DMA capable SoC components in the X1000 are mapped to VC0 it is possible to define sections of memory as invalid for DMA write operations originating from Ethernet, USB, SD and any other DMA capable south-cluster component on VC0. Similarly it is possible to mark kernel memory as non-SMM mode read/write only or to mark BIOS runtime memory as SMM mode accessible only depending on the particular memory footprint on a given system. On an IMR violation Quark SoC X1000 systems are configured to reset the system, so ensuring that the IMR memory map is consistent with the EFI provided memory map is critical to ensure no IMR violations reset the system. The API for accessing IMRs is based on MTRR code but doesn't provide a /proc or /sys interface to manipulate IMRs. Defining the size and extent of IMRs is exclusively the domain of in-kernel code. Quark firmware sets up a series of locked IMRs around pieces of memory that firmware owns such as ACPI runtime data. During boot a series of unlocked IMRs are placed around items in memory to guarantee no DMA modification of those items can take place. Grub also places an unlocked IMR around the kernel boot params data structure and compressed kernel image. It is necessary for the kernel to tear down all unlocked IMRs in order to ensure that the kernel's view of memory passed via the EFI memory map is consistent with the IMR memory map. Without tearing down all unlocked IMRs on boot transitory IMRs such as those used to protect the compressed kernel image will cause IMR violations and system reboots. The IMR init code tears down all unlocked IMRs and sets a protective IMR around the kernel .text and .rodata as one contiguous block. This sanitizes the IMR memory map with respect to the EFI memory map and protects the read-only portions of the kernel from unwarranted DMA access. Tested-by: Ong, Boon Leong Signed-off-by: Bryan O'Donoghue Reviewed-by: Andy Shevchenko Reviewed-by: Darren Hart Reviewed-by: Ong, Boon Leong Cc: andy.shevchenko@gmail.com Cc: dvhart@infradead.org Link: http://lkml.kernel.org/r/1422635379-12476-2-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 13 + arch/x86/include/asm/imr.h | 60 +++ arch/x86/platform/intel-quark/Makefile | 2 + arch/x86/platform/intel-quark/imr.c | 668 +++++++++++++++++++++++++++ arch/x86/platform/intel-quark/imr_selftest.c | 129 ++++++ drivers/platform/x86/Kconfig | 25 + 6 files changed, 897 insertions(+) create mode 100644 arch/x86/include/asm/imr.h create mode 100644 arch/x86/platform/intel-quark/Makefile create mode 100644 arch/x86/platform/intel-quark/imr.c create mode 100644 arch/x86/platform/intel-quark/imr_selftest.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 61bd2ad94281..20028da8ae18 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -313,6 +313,19 @@ config DEBUG_NMI_SELFTEST If unsure, say N. +config DEBUG_IMR_SELFTEST + bool "Isolated Memory Region self test" + default n + depends on INTEL_IMR + ---help--- + This option enables automated sanity testing of the IMR code. + Some simple tests are run to verify IMR bounds checking, alignment + and overlapping. 
This option is really only useful if you are + debugging an IMR memory map or are modifying the IMR code and want to + test your changes. + + If unsure say N here. + config X86_DEBUG_STATIC_CPU_HAS bool "Debug alternatives" depends on DEBUG_KERNEL diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h new file mode 100644 index 000000000000..cd2ce4068441 --- /dev/null +++ b/arch/x86/include/asm/imr.h @@ -0,0 +1,60 @@ +/* + * imr.h: Isolated Memory Region API + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _IMR_H +#define _IMR_H + +#include + +/* + * IMR agent access mask bits + * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register + * definitions. + */ +#define IMR_ESRAM_FLUSH BIT(31) +#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */ +#define IMR_RMU BIT(29) +#define IMR_VC1_SAI_ID3 BIT(15) +#define IMR_VC1_SAI_ID2 BIT(14) +#define IMR_VC1_SAI_ID1 BIT(13) +#define IMR_VC1_SAI_ID0 BIT(12) +#define IMR_VC0_SAI_ID3 BIT(11) +#define IMR_VC0_SAI_ID2 BIT(10) +#define IMR_VC0_SAI_ID1 BIT(9) +#define IMR_VC0_SAI_ID0 BIT(8) +#define IMR_CPU_0 BIT(1) /* SMM mode */ +#define IMR_CPU BIT(0) /* Non SMM mode */ +#define IMR_ACCESS_NONE 0 + +/* + * Read/Write access-all bits here include some reserved bits + * These are the values firmware uses and are accepted by hardware. + * The kernel defines read/write access-all in the same way as firmware + * in order to have a consistent and crisp definition across firmware, + * bootloader and kernel. + */ +#define IMR_READ_ACCESS_ALL 0xBFFFFFFF +#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF + +/* Number of IMRs provided by Quark X1000 SoC */ +#define QUARK_X1000_IMR_MAX 0x08 +#define QUARK_X1000_IMR_REGBASE 0x40 + +/* IMR alignment bits - only bits 31:10 are checked for IMR validity */ +#define IMR_ALIGN 0x400 +#define IMR_MASK (IMR_ALIGN - 1) + +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock); + +int imr_remove_range(phys_addr_t base, size_t size); + +#endif /* _IMR_H */ diff --git a/arch/x86/platform/intel-quark/Makefile b/arch/x86/platform/intel-quark/Makefile new file mode 100644 index 000000000000..9cc57ed36022 --- /dev/null +++ b/arch/x86/platform/intel-quark/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INTEL_IMR) += imr.o +obj-$(CONFIG_DEBUG_IMR_SELFTEST) += imr_selftest.o diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c new file mode 100644 index 000000000000..16e4df1c9290 --- /dev/null +++ b/arch/x86/platform/intel-quark/imr.c @@ -0,0 +1,668 @@ +/** + * imr.c + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * IMR registers define an isolated region of memory that can + * be masked to prohibit certain system agents from accessing memory. + * When a device behind a masked port performs an access - snooped or + * not, an IMR may optionally prevent that transaction from changing + * the state of memory or from getting correct data in response to the + * operation. + * + * Write data will be dropped and reads will return 0xFFFFFFFF, the + * system will reset and system BIOS will print out an error message to + * inform the user that an IMR has been violated. 
+ * + * This code is based on the Linux MTRR code and reference code from + * Intel's Quark BSP EFI, Linux and grub code. + * + * See quark-x1000-datasheet.pdf for register definitions. + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/quark-x1000-datasheet.pdf + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct imr_device { + struct dentry *file; + bool init; + struct mutex lock; + int max_imr; + int reg_base; +}; + +static struct imr_device imr_dev; + +/* + * IMR read/write mask control registers. + * See quark-x1000-datasheet.pdf sections 12.7.4.5 and 12.7.4.6 for + * bit definitions. + * + * addr_hi + * 31 Lock bit + * 30:24 Reserved + * 23:2 1 KiB aligned lo address + * 1:0 Reserved + * + * addr_hi + * 31:24 Reserved + * 23:2 1 KiB aligned hi address + * 1:0 Reserved + */ +#define IMR_LOCK BIT(31) + +struct imr_regs { + u32 addr_lo; + u32 addr_hi; + u32 rmask; + u32 wmask; +}; + +#define IMR_NUM_REGS (sizeof(struct imr_regs)/sizeof(u32)) +#define IMR_SHIFT 8 +#define imr_to_phys(x) ((x) << IMR_SHIFT) +#define phys_to_imr(x) ((x) >> IMR_SHIFT) + +/** + * imr_is_enabled - true if an IMR is enabled false otherwise. + * + * Determines if an IMR is enabled based on address range and read/write + * mask. An IMR set with an address range set to zero and a read/write + * access mask set to all is considered to be disabled. An IMR in any + * other state - for example set to zero but without read/write access + * all is considered to be enabled. This definition of disabled is how + * firmware switches off an IMR and is maintained in kernel for + * consistency. + * + * @imr: pointer to IMR descriptor. + * @return: true if IMR enabled false if disabled. + */ +static inline int imr_is_enabled(struct imr_regs *imr) +{ + return !(imr->rmask == IMR_READ_ACCESS_ALL && + imr->wmask == IMR_WRITE_ACCESS_ALL && + imr_to_phys(imr->addr_lo) == 0 && + imr_to_phys(imr->addr_hi) == 0); +} + +/** + * imr_read - read an IMR at a given index. + * + * Requires caller to hold imr mutex. + * + * @idev: pointer to imr_device structure. + * @imr_id: IMR entry to read. + * @imr: IMR structure representing address and access masks. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) +{ + u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; + int ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->addr_lo); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->addr_hi); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->rmask); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->wmask); + if (ret) + return ret; + + return 0; +} + +/** + * imr_write - write an IMR at a given index. + * + * Requires caller to hold imr mutex. + * Note lock bits need to be written independently of address bits. + * + * @idev: pointer to imr_device structure. + * @imr_id: IMR entry to write. + * @imr: IMR structure representing address and access masks. + * @lock: indicates if the IMR lock bit should be applied. + * @return: 0 on success or error code passed from mbi_iosf on failure. 
+ */ +static int imr_write(struct imr_device *idev, u32 imr_id, + struct imr_regs *imr, bool lock) +{ + unsigned long flags; + u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; + int ret; + + local_irq_save(flags); + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++, + imr->addr_lo); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->addr_hi); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->rmask); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->wmask); + if (ret) + goto failed; + + /* Lock bit must be set separately to addr_lo address bits. */ + if (lock) { + imr->addr_lo |= IMR_LOCK; + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg - IMR_NUM_REGS, imr->addr_lo); + if (ret) + goto failed; + } + + local_irq_restore(flags); + return 0; +failed: + /* + * If writing to the IOSF failed then we're in an unknown state, + * likely a very bad state. An IMR in an invalid state will almost + * certainly lead to a memory access violation. + */ + local_irq_restore(flags); + WARN(ret, "IOSF-MBI write fail range 0x%08x-0x%08x unreliable\n", + imr_to_phys(imr->addr_lo), imr_to_phys(imr->addr_hi) + IMR_MASK); + + return ret; +} + +/** + * imr_dbgfs_state_show - print state of IMR registers. + * + * @s: pointer to seq_file for output. + * @unused: unused parameter. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_dbgfs_state_show(struct seq_file *s, void *unused) +{ + phys_addr_t base; + phys_addr_t end; + int i; + struct imr_device *idev = s->private; + struct imr_regs imr; + size_t size; + int ret = -ENODEV; + + mutex_lock(&idev->lock); + + for (i = 0; i < idev->max_imr; i++) { + + ret = imr_read(idev, i, &imr); + if (ret) + break; + + /* + * Remember to add IMR_ALIGN bytes to size to indicate the + * inherent IMR_ALIGN size bytes contained in the masked away + * lower ten bits. + */ + if (imr_is_enabled(&imr)) { + base = imr_to_phys(imr.addr_lo); + end = imr_to_phys(imr.addr_hi) + IMR_MASK; + } else { + base = 0; + end = 0; + } + size = end - base; + seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx " + "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i, + &base, &end, size, imr.rmask, imr.wmask, + imr_is_enabled(&imr) ? "enabled " : "disabled", + imr.addr_lo & IMR_LOCK ? "locked" : "unlocked"); + } + + mutex_unlock(&idev->lock); + return ret; +} + +/** + * imr_state_open - debugfs open callback. + * + * @inode: pointer to struct inode. + * @file: pointer to struct file. + * @return: result of single open. + */ +static int imr_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, imr_dbgfs_state_show, inode->i_private); +} + +static const struct file_operations imr_state_ops = { + .open = imr_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * imr_debugfs_register - register debugfs hooks. + * + * @idev: pointer to imr_device structure. + * @return: 0 on success - errno on failure. + */ +static int imr_debugfs_register(struct imr_device *idev) +{ + idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL, + idev, &imr_state_ops); + if (IS_ERR(idev->file)) + return PTR_ERR(idev->file); + + return 0; +} + +/** + * imr_debugfs_unregister - unregister debugfs hooks. + * + * @idev: pointer to imr_device structure. 
+ * @return: + */ +static void imr_debugfs_unregister(struct imr_device *idev) +{ + debugfs_remove(idev->file); +} + +/** + * imr_check_params - check passed address range IMR alignment and non-zero size + * + * @base: base address of intended IMR. + * @size: size of intended IMR. + * @return: zero on valid range -EINVAL on unaligned base/size. + */ +static int imr_check_params(phys_addr_t base, size_t size) +{ + if ((base & IMR_MASK) || (size & IMR_MASK)) { + pr_err("base %pa size 0x%08zx must align to 1KiB\n", + &base, size); + return -EINVAL; + } + if (size == 0) + return -EINVAL; + + return 0; +} + +/** + * imr_raw_size - account for the IMR_ALIGN bytes that addr_hi appends. + * + * IMR addr_hi has a built in offset of plus IMR_ALIGN (0x400) bytes from the + * value in the register. We need to subtract IMR_ALIGN bytes from input sizes + * as a result. + * + * @size: input size bytes. + * @return: reduced size. + */ +static inline size_t imr_raw_size(size_t size) +{ + return size - IMR_ALIGN; +} + +/** + * imr_address_overlap - detects an address overlap. + * + * @addr: address to check against an existing IMR. + * @imr: imr being checked. + * @return: true for overlap false for no overlap. + */ +static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) +{ + return addr >= imr_to_phys(imr->addr_lo) && addr <= imr_to_phys(imr->addr_hi); +} + +/** + * imr_add_range - add an Isolated Memory Region. + * + * @base: physical base address of region aligned to 1KiB. + * @size: physical size of region in bytes must be aligned to 1KiB. + * @read_mask: read access mask. + * @write_mask: write access mask. + * @lock: indicates whether or not to permanently lock this region. + * @return: zero on success or negative value indicating error. + */ +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock) +{ + phys_addr_t end; + unsigned int i; + struct imr_device *idev = &imr_dev; + struct imr_regs imr; + size_t raw_size; + int reg; + int ret; + + if (WARN_ONCE(idev->init == false, "driver not initialized")) + return -ENODEV; + + ret = imr_check_params(base, size); + if (ret) + return ret; + + /* Tweak the size value. */ + raw_size = imr_raw_size(size); + end = base + raw_size; + + /* + * Check for reserved IMR value common to firmware, kernel and grub + * indicating a disabled IMR. + */ + imr.addr_lo = phys_to_imr(base); + imr.addr_hi = phys_to_imr(end); + imr.rmask = rmask; + imr.wmask = wmask; + if (!imr_is_enabled(&imr)) + return -ENOTSUPP; + + mutex_lock(&idev->lock); + + /* + * Find a free IMR while checking for an existing overlapping range. + * Note there's no restriction in silicon to prevent IMR overlaps. + * For the sake of simplicity and ease in defining/debugging an IMR + * memory map we exclude IMR overlaps. + */ + reg = -1; + for (i = 0; i < idev->max_imr; i++) { + ret = imr_read(idev, i, &imr); + if (ret) + goto failed; + + /* Find overlap @ base or end of requested range. */ + ret = -EINVAL; + if (imr_is_enabled(&imr)) { + if (imr_address_overlap(base, &imr)) + goto failed; + if (imr_address_overlap(end, &imr)) + goto failed; + } else { + reg = i; + } + } + + /* Error out if we have no free IMR entries. */ + if (reg == -1) { + ret = -ENOMEM; + goto failed; + } + + pr_debug("add %d phys %pa-%pa size %zx mask 0x%08x wmask 0x%08x\n", + reg, &base, &end, raw_size, rmask, wmask); + + /* Enable IMR at specified range and access mask. 
*/ + imr.addr_lo = phys_to_imr(base); + imr.addr_hi = phys_to_imr(end); + imr.rmask = rmask; + imr.wmask = wmask; + + ret = imr_write(idev, reg, &imr, lock); + if (ret < 0) { + /* + * In the highly unlikely event iosf_mbi_write failed + * attempt to rollback the IMR setup skipping the trapping + * of further IOSF write failures. + */ + imr.addr_lo = 0; + imr.addr_hi = 0; + imr.rmask = IMR_READ_ACCESS_ALL; + imr.wmask = IMR_WRITE_ACCESS_ALL; + imr_write(idev, reg, &imr, false); + } +failed: + mutex_unlock(&idev->lock); + return ret; +} +EXPORT_SYMBOL_GPL(imr_add_range); + +/** + * __imr_remove_range - delete an Isolated Memory Region. + * + * This function allows you to delete an IMR by its index specified by reg or + * by address range specified by base and size respectively. If you specify an + * index on its own the base and size parameters are ignored. + * imr_remove_range(0, base, size); delete IMR at index 0 base/size ignored. + * imr_remove_range(-1, base, size); delete IMR from base to base+size. + * + * @reg: imr index to remove. + * @base: physical base address of region aligned to 1 KiB. + * @size: physical size of region in bytes aligned to 1 KiB. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +static int __imr_remove_range(int reg, phys_addr_t base, size_t size) +{ + phys_addr_t end; + bool found = false; + unsigned int i; + struct imr_device *idev = &imr_dev; + struct imr_regs imr; + size_t raw_size; + int ret = 0; + + if (WARN_ONCE(idev->init == false, "driver not initialized")) + return -ENODEV; + + /* + * Validate address range if deleting by address, else we are + * deleting by index where base and size will be ignored. + */ + if (reg == -1) { + ret = imr_check_params(base, size); + if (ret) + return ret; + } + + /* Tweak the size value. */ + raw_size = imr_raw_size(size); + end = base + raw_size; + + mutex_lock(&idev->lock); + + if (reg >= 0) { + /* If a specific IMR is given try to use it. */ + ret = imr_read(idev, reg, &imr); + if (ret) + goto failed; + + if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) { + ret = -ENODEV; + goto failed; + } + found = true; + } else { + /* Search for match based on address range. */ + for (i = 0; i < idev->max_imr; i++) { + ret = imr_read(idev, i, &imr); + if (ret) + goto failed; + + if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) + continue; + + if ((imr_to_phys(imr.addr_lo) == base) && + (imr_to_phys(imr.addr_hi) == end)) { + found = true; + reg = i; + break; + } + } + } + + if (!found) { + ret = -ENODEV; + goto failed; + } + + pr_debug("remove %d phys %pa-%pa size %zx\n", reg, &base, &end, raw_size); + + /* Tear down the IMR. */ + imr.addr_lo = 0; + imr.addr_hi = 0; + imr.rmask = IMR_READ_ACCESS_ALL; + imr.wmask = IMR_WRITE_ACCESS_ALL; + + ret = imr_write(idev, reg, &imr, false); + +failed: + mutex_unlock(&idev->lock); + return ret; +} + +/** + * imr_remove_range - delete an Isolated Memory Region by address + * + * This function allows you to delete an IMR by an address range specified + * by base and size respectively. + * imr_remove_range(base, size); delete IMR from base to base+size. + * + * @base: physical base address of region aligned to 1 KiB. + * @size: physical size of region in bytes aligned to 1 KiB. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. 
+ */ +int imr_remove_range(phys_addr_t base, size_t size) +{ + return __imr_remove_range(-1, base, size); +} +EXPORT_SYMBOL_GPL(imr_remove_range); + +/** + * imr_clear - delete an Isolated Memory Region by index + * + * This function allows you to delete an IMR by an address range specified + * by the index of the IMR. Useful for initial sanitization of the IMR + * address map. + * imr_ge(base, size); delete IMR from base to base+size. + * + * @reg: imr index to remove. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +static inline int imr_clear(int reg) +{ + return __imr_remove_range(reg, 0, 0); +} + +/** + * imr_fixup_memmap - Tear down IMRs used during bootup. + * + * BIOS and Grub both setup IMRs around compressed kernel, initrd memory + * that need to be removed before the kernel hands out one of the IMR + * encased addresses to a downstream DMA agent such as the SD or Ethernet. + * IMRs on Galileo are setup to immediately reset the system on violation. + * As a result if you're running a root filesystem from SD - you'll need + * the boot-time IMRs torn down or you'll find seemingly random resets when + * using your filesystem. + * + * @idev: pointer to imr_device structure. + * @return: + */ +static void __init imr_fixup_memmap(struct imr_device *idev) +{ + phys_addr_t base = virt_to_phys(&_text); + size_t size = virt_to_phys(&__end_rodata) - base; + int i; + int ret; + + /* Tear down all existing unlocked IMRs. */ + for (i = 0; i < idev->max_imr; i++) + imr_clear(i); + + /* + * Setup a locked IMR around the physical extent of the kernel + * from the beginning of the .text secton to the end of the + * .rodata section as one physically contiguous block. + */ + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true); + if (ret < 0) { + pr_err("unable to setup IMR for kernel: (%p - %p)\n", + &_text, &__end_rodata); + } else { + pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n", + size / 1024, &_text, &__end_rodata); + } + +} + +static const struct x86_cpu_id imr_ids[] __initconst = { + { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ + {} +}; +MODULE_DEVICE_TABLE(x86cpu, imr_ids); + +/** + * imr_init - entry point for IMR driver. + * + * return: -ENODEV for no IMR support 0 if good to go. + */ +static int __init imr_init(void) +{ + struct imr_device *idev = &imr_dev; + int ret; + + if (!x86_match_cpu(imr_ids) || !iosf_mbi_available()) + return -ENODEV; + + idev->max_imr = QUARK_X1000_IMR_MAX; + idev->reg_base = QUARK_X1000_IMR_REGBASE; + idev->init = true; + + mutex_init(&idev->lock); + ret = imr_debugfs_register(idev); + if (ret != 0) + pr_warn("debugfs register failed!\n"); + imr_fixup_memmap(idev); + return 0; +} + +/** + * imr_exit - exit point for IMR code. + * + * Deregisters debugfs, leave IMR state as-is. + * + * return: + */ +static void __exit imr_exit(void) +{ + imr_debugfs_unregister(&imr_dev); +} + +module_init(imr_init); +module_exit(imr_exit); + +MODULE_AUTHOR("Bryan O'Donoghue "); +MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c new file mode 100644 index 000000000000..c9a0838890e2 --- /dev/null +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -0,0 +1,129 @@ +/** + * imr_selftest.c + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * IMR self test. 
The purpose of this module is to run a set of tests on the + * IMR API to validate it's sanity. We check for overlapping, reserved + * addresses and setup/teardown sanity. + * + */ + +#include +#include +#include +#include +#include +#include + +#define SELFTEST KBUILD_MODNAME ": " +/** + * imr_self_test_result - Print result string for self test. + * + * @res: result code - true if test passed false otherwise. + * @fmt: format string. + * ... variadic argument list. + */ +static void __init imr_self_test_result(int res, const char *fmt, ...) +{ + va_list vlist; + + /* Print pass/fail. */ + if (res) + pr_info(SELFTEST "pass "); + else + pr_info(SELFTEST "fail "); + + /* Print variable string. */ + va_start(vlist, fmt); + vprintk(fmt, vlist); + va_end(vlist); + + /* Optional warning. */ + WARN(res == 0, "test failed"); +} +#undef SELFTEST + +/** + * imr_self_test + * + * Verify IMR self_test with some simple tests to verify overlap, + * zero sized allocations and 1 KiB sized areas. + * + */ +static void __init imr_self_test(void) +{ + phys_addr_t base = virt_to_phys(&_text); + size_t size = virt_to_phys(&__end_rodata) - base; + const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n"; + int ret; + + /* Test zero zero. */ + ret = imr_add_range(0, 0, 0, 0, false); + imr_self_test_result(ret < 0, "zero sized IMR\n"); + + /* Test exact overlap. */ + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test overlap with base inside of existing. */ + base += size - IMR_ALIGN; + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test overlap with end inside of existing. */ + base -= size + IMR_ALIGN * 2; + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ + ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, + IMR_WRITE_ACCESS_ALL, false); + imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); + + /* Test that a 1 KiB IMR @ zero with CPU only will work. */ + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); + if (ret >= 0) { + ret = imr_remove_range(0, IMR_ALIGN); + imr_self_test_result(ret == 0, "teardown - cpu-access\n"); + } + + /* Test 2 KiB works. */ + size = IMR_ALIGN * 2; + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, + IMR_WRITE_ACCESS_ALL, false); + imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); + if (ret >= 0) { + ret = imr_remove_range(0, size); + imr_self_test_result(ret == 0, "teardown 2KiB\n"); + } +} + +/** + * imr_self_test_init - entry point for IMR driver. + * + * return: -ENODEV for no IMR support 0 if good to go. + */ +static int __init imr_self_test_init(void) +{ + imr_self_test(); + return 0; +} + +/** + * imr_self_test_exit - exit point for IMR code. 
+ * + * return: + */ +static void __exit imr_self_test_exit(void) +{ +} + +module_init(imr_self_test_init); +module_exit(imr_self_test_exit); + +MODULE_AUTHOR("Bryan O'Donoghue "); +MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 638e797037da..97527614141b 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -735,6 +735,31 @@ config INTEL_IPS functionality. If in doubt, say Y here; it will only load on supported platforms. +config INTEL_IMR + bool "Intel Isolated Memory Region support" + default n + depends on X86_INTEL_QUARK && IOSF_MBI + ---help--- + This option provides a means to manipulate Isolated Memory Regions. + IMRs are a set of registers that define read and write access masks + to prohibit certain system agents from accessing memory with 1 KiB + granularity. + + IMRs make it possible to control read/write access to an address + by hardware agents inside the SoC. Read and write masks can be + defined for: + - eSRAM flush + - Dirty CPU snoop (write only) + - RMU access + - PCI Virtual Channel 0/Virtual Channel 1 + - SMM mode + - Non SMM mode + + Quark contains a set of eight IMR registers and makes use of those + registers during its bootup process. + + If you are running on a Galileo/Quark say Y here. + config IBM_RTL tristate "Device driver to enable PRTL support" depends on X86 && PCI -- cgit v1.2.3 From 8bbc2a135b63bee6b41fa90f415521a97995a49f Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 30 Jan 2015 16:29:39 +0000 Subject: x86/intel/quark: Add Intel Quark platform support Add Intel Quark platform support. Quark needs to pull down all unlocked IMRs to ensure agreement with the EFI memory map post boot. This patch adds an entry in Kconfig for Quark as a platform and makes IMR support mandatory if selected. Suggested-by: Thomas Gleixner Suggested-by: Andy Shevchenko Tested-by: Ong, Boon Leong Signed-off-by: Bryan O'Donoghue Reviewed-by: Andy Shevchenko Reviewed-by: Darren Hart Reviewed-by: Ong, Boon Leong Cc: dvhart@infradead.org Link: http://lkml.kernel.org/r/1422635379-12476-3-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 16 ++++++++++++++++ arch/x86/platform/Makefile | 1 + 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5e28e2be3a41..1f97c7f0d049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -486,6 +486,22 @@ config X86_INTEL_MID Intel MID platforms are based on an Intel processor and chipset which consume less power than most of the x86 derivatives. +config X86_INTEL_QUARK + bool "Intel Quark platform support" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on X86_TSC + depends on PCI + depends on PCI_GOANY + depends on X86_IO_APIC + select IOSF_MBI + select INTEL_IMR + ---help--- + Select to include support for Quark X1000 SoC. + Say Y here if you have a Quark based system such as the Arduino + compatible Intel Galileo. 
+ config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on ACPI diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 85afde1fa3e5..a62e0be3a2f1 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -5,6 +5,7 @@ obj-y += geode/ obj-y += goldfish/ obj-y += iris/ obj-y += intel-mid/ +obj-y += intel-quark/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ -- cgit v1.2.3 From b0bd96fe9a1428a04c82f8c3834db69468869d65 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Feb 2015 12:30:49 +1030 Subject: lguest: now depends on PCI Reported-by: Randy Dunlap Signed-off-by: Rusty Russell --- arch/x86/lguest/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 4a0890f815c4..21e89807244c 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST_GUEST bool "Lguest guest support" - depends on X86_32 && PARAVIRT + depends on X86_32 && PARAVIRT && PCI select TTY select VIRTUALIZATION select VIRTIO -- cgit v1.2.3 From f476893459318cb2eff3ecd2a05d4ceacf82e73e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Feb 2015 14:43:21 +1030 Subject: lguest: update help text. We now add about 10k, not 6k, when lguest support is compiled in. Signed-off-by: Rusty Russell --- arch/x86/lguest/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 21e89807244c..08f41caada45 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -8,7 +8,7 @@ config LGUEST_GUEST help Lguest is a tiny in-kernel hypervisor. Selecting this will allow your kernel to boot under lguest. This option will increase - your kernel size by about 6k. If in doubt, say N. + your kernel size by about 10k. If in doubt, say N. If you say Y here, make sure you say Y (or M) to the virtio block and net drivers which lguest needs. 
-- cgit v1.2.3 From 32d39169d7f56849b8c6c8c51aca7b73194d05f1 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 19 Feb 2015 16:14:32 +0800 Subject: x86/intel/quark: Fix ptr_ret.cocci warnings arch/x86/platform/intel-quark/imr.c:280:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Fengguang Wu Cc: Andy Shevchenko Cc: Ong, Boon Leong Cc: Bryan O'Donoghue Cc: Darren Hart Cc: kbuild-all@01.org Link: http://lkml.kernel.org/r/20150219081432.GA21983@waimea Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 16e4df1c9290..60c01eb47dd6 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -277,10 +277,7 @@ static int imr_debugfs_register(struct imr_device *idev) { idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL, idev, &imr_state_ops); - if (IS_ERR(idev->file)) - return PTR_ERR(idev->file); - - return 0; + return PTR_ERR_OR_ZERO(idev->file); } /** -- cgit v1.2.3 From c11a25f443e9bee06fe302b6a78ff44dac554036 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 19 Feb 2015 16:14:32 +0800 Subject: x86/intel/quark: Fix simple_return.cocci warnings arch/x86/platform/intel-quark/imr.c:129:1-4: WARNING: end returns can be simpified Simplify a trivial if-return sequence. Possibly combine with a preceding function call. Generated by: scripts/coccinelle/misc/simple_return.cocci Signed-off-by: Fengguang Wu Cc: Andy Shevchenko Cc: Ong, Boon Leong Cc: Bryan O'Donoghue Cc: Darren Hart Cc: kbuild-all@01.org Link: http://lkml.kernel.org/r/20150219081432.GA21996@waimea Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 60c01eb47dd6..0ee619f9fcb7 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -126,12 +126,8 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) if (ret) return ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, reg++, &imr->wmask); - if (ret) - return ret; - - return 0; } /** -- cgit v1.2.3 From f47233c2d34f243ecdaac179c3408a39ff9216a7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 13 Feb 2015 16:04:55 +0100 Subject: x86/mm/ASLR: Propagate base load address calculation Commit: e2b32e678513 ("x86, kaslr: randomize module base load address") makes the base address for module to be unconditionally randomized in case when CONFIG_RANDOMIZE_BASE is defined and "nokaslr" option isn't present on the commandline. This is not consistent with how choose_kernel_location() decides whether it will randomize kernel load base. Namely, CONFIG_HIBERNATION disables kASLR (unless "kaslr" option is explicitly specified on kernel commandline), which makes the state space larger than what module loader is looking at. IOW CONFIG_HIBERNATION && CONFIG_RANDOMIZE_BASE is a valid config option, kASLR wouldn't be applied by default in that case, but module loader is not aware of that. Instead of fixing the logic in module.c, this patch takes more generic aproach. 
It introduces a new bootparam setup data_type SETUP_KASLR and uses that to pass the information whether kaslr has been applied during kernel decompression, and sets a global 'kaslr_enabled' variable accordingly, so that any kernel code (module loading, livepatching, ...) can make decisions based on its value. x86 module loader is converted to make use of this flag. Signed-off-by: Jiri Kosina Acked-by: Kees Cook Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/alpine.LNX.2.00.1502101411280.10719@pobox.suse.cz [ Always dump correct kaslr status when panicking ] Signed-off-by: Borislav Petkov --- arch/x86/boot/compressed/aslr.c | 34 +++++++++++++++++++++++++++++++++- arch/x86/boot/compressed/misc.c | 3 ++- arch/x86/boot/compressed/misc.h | 6 ++++-- arch/x86/include/asm/page_types.h | 3 +++ arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/kernel/module.c | 11 ++--------- arch/x86/kernel/setup.c | 22 ++++++++++++++++++---- 7 files changed, 63 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index bb1376381985..7083c16cccba 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -14,6 +14,13 @@ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; +struct kaslr_setup_data { + __u64 next; + __u32 type; + __u32 len; + __u8 data[1]; +} kaslr_setup_data; + #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 #define I8254_CMD_READBACK 0xC0 @@ -295,7 +302,29 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(unsigned char *input, +static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled) +{ + struct setup_data *data; + + kaslr_setup_data.type = SETUP_KASLR; + kaslr_setup_data.len = 1; + kaslr_setup_data.next = 0; + kaslr_setup_data.data[0] = enabled; + + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + if (data) + data->next = (unsigned long)&kaslr_setup_data; + else + params->hdr.setup_data = (unsigned long)&kaslr_setup_data; + +} + +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) @@ -306,14 +335,17 @@ unsigned char *choose_kernel_location(unsigned char *input, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { debug_putstr("KASLR disabled by default...\n"); + add_kaslr_setup_data(params, 0); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled by cmdline...\n"); + add_kaslr_setup_data(params, 0); goto out; } #endif + add_kaslr_setup_data(params, 1); /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a950864a64da..5903089c818f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -401,7 +401,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(input_data, input_len, output, + output = choose_kernel_location(real_mode, input_data, input_len, + output, output_len > run_size ? 
output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 24e3e569a13c..6d6730743024 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -56,7 +56,8 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* aslr.c */ -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -64,7 +65,8 @@ unsigned char *choose_kernel_location(unsigned char *input, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..3d43ce36eaba 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -51,6 +52,8 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern bool kaslr_enabled; + static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b0988043a..44e6dd7e36a2 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,6 +7,7 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 +#define SETUP_KASLR 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e69f9882bf95..c3c59a3a14ad 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -32,6 +32,7 @@ #include #include +#include #if 0 #define DEBUGP(fmt, ...) \ @@ -46,21 +47,13 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; -static int randomize_modules = 1; /* Mutex protects the module_load_offset. 
*/ static DEFINE_MUTEX(module_kaslr_mutex); -static int __init parse_nokaslr(char *p) -{ - randomize_modules = 0; - return 0; -} -early_param("nokaslr", parse_nokaslr); - static unsigned long int get_module_load_offset(void) { - if (randomize_modules) { + if (kaslr_enabled) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab4734e5411d..16b6043cb073 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -121,6 +121,8 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +bool __read_mostly kaslr_enabled = false; + #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); #endif @@ -424,6 +426,11 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init parse_kaslr_setup(u64 pa_data, u32 data_len) +{ + kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data)); +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -451,6 +458,9 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_KASLR: + parse_kaslr_setup(pa_data, data_len); + break; default: break; } @@ -833,10 +843,14 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - pr_emerg("Kernel Offset: 0x%lx from 0x%lx " - "(relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, __START_KERNEL, - __START_KERNEL_map, MODULES_VADDR-1); + if (kaslr_enabled) + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + else + pr_emerg("Kernel Offset: disabled\n"); return 0; } -- cgit v1.2.3 From f15e05186c3244e9195378a0a568283a8ccc60b0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Feb 2015 13:20:30 -0800 Subject: x86/mm/init: Fix incorrect page size in init_memory_mapping() printks With 32-bit non-PAE kernels, we have 2 page sizes available (at most): 4k and 4M. Enabling PAE replaces that 4M size with a 2M one (which 64-bit systems use too). But, when booting a 32-bit non-PAE kernel, in one of our early-boot printouts, we say: init_memory_mapping: [mem 0x00000000-0x000fffff] [mem 0x00000000-0x000fffff] page 4k init_memory_mapping: [mem 0x37000000-0x373fffff] [mem 0x37000000-0x373fffff] page 2M init_memory_mapping: [mem 0x00100000-0x36ffffff] [mem 0x00100000-0x003fffff] page 4k [mem 0x00400000-0x36ffffff] page 2M init_memory_mapping: [mem 0x37400000-0x377fdfff] [mem 0x37400000-0x377fdfff] page 4k Which is obviously wrong. There is no 2M page available. This is probably because of a badly-named variable: in the map_range code: PG_LEVEL_2M. Instead of renaming all the PG_LEVEL_2M's. 
This patch just fixes the printout: init_memory_mapping: [mem 0x00000000-0x000fffff] [mem 0x00000000-0x000fffff] page 4k init_memory_mapping: [mem 0x37000000-0x373fffff] [mem 0x37000000-0x373fffff] page 4M init_memory_mapping: [mem 0x00100000-0x36ffffff] [mem 0x00100000-0x003fffff] page 4k [mem 0x00400000-0x36ffffff] page 4M init_memory_mapping: [mem 0x37400000-0x377fdfff] [mem 0x37400000-0x377fdfff] page 4k BRK [0x03206000, 0x03206fff] PGTABLE Signed-off-by: Dave Hansen Cc: Pekka Enberg Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20150210212030.665EC267@viggo.jf.intel.com Signed-off-by: Borislav Petkov --- arch/x86/mm/init.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6a3ff1..7ff24240d863 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr, } } +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<page_size_mask & (1<page_size_mask & (1< Date: Sat, 14 Feb 2015 09:33:50 -0800 Subject: x86, mm/ASLR: Fix stack randomization on 64-bit systems The issue is that the stack for processes is not properly randomized on 64 bit architectures due to an integer overflow. The affected function is randomize_stack_top() in file "fs/binfmt_elf.c": static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned int random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { random_variable = get_random_int() & STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } return PAGE_ALIGN(stack_top) + random_variable; return PAGE_ALIGN(stack_top) - random_variable; } Note that, it declares the "random_variable" variable as "unsigned int". Since the result of the shifting operation between STACK_RND_MASK (which is 0x3fffff on x86_64, 22 bits) and PAGE_SHIFT (which is 12 on x86_64): random_variable <<= PAGE_SHIFT; then the two leftmost bits are dropped when storing the result in the "random_variable". This variable shall be at least 34 bits long to hold the (22+12) result. These two dropped bits have an impact on the entropy of process stack. Concretely, the total stack entropy is reduced by four: from 2^28 to 2^30 (One fourth of expected entropy). This patch restores back the entropy by correcting the types involved in the operations in the functions randomize_stack_top() and stack_maxrandom_size(). The successful fix can be tested with: $ for i in `seq 1 10`; do cat /proc/self/maps | grep stack; done 7ffeda566000-7ffeda587000 rw-p 00000000 00:00 0 [stack] 7fff5a332000-7fff5a353000 rw-p 00000000 00:00 0 [stack] 7ffcdb7a1000-7ffcdb7c2000 rw-p 00000000 00:00 0 [stack] 7ffd5e2c4000-7ffd5e2e5000 rw-p 00000000 00:00 0 [stack] ... Once corrected, the leading bytes should be between 7ffc and 7fff, rather than always being 7fff. 
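The truncation is easy to reproduce outside the kernel. The following standalone
userspace program is only an illustration and is not part of this patch; it assumes
the x86_64 values STACK_RND_MASK = 0x3fffff (22 bits) and PAGE_SHIFT = 12 and shows
how storing the shifted value in an unsigned int drops the two uppermost bits:

#include <stdio.h>

#define STACK_RND_MASK	0x3fffffUL	/* x86_64: 22 random bits */
#define PAGE_SHIFT	12

int main(void)
{
	unsigned long sample = STACK_RND_MASK;	/* worst case: all 22 bits set */

	/* Old code: 22 + 12 = 34 significant bits stored in a 32-bit variable. */
	unsigned int truncated = sample;
	truncated <<= PAGE_SHIFT;

	/* Fixed code: the shift is performed on an unsigned long. */
	unsigned long full = sample;
	full <<= PAGE_SHIFT;

	printf("32-bit result: 0x%08x\n", truncated);	/* 0xfffff000 - top bits lost */
	printf("64-bit result: 0x%010lx\n", full);	/* 0x3fffff000 */
	return 0;
}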
Signed-off-by: Hector Marco-Gisbert Signed-off-by: Ismael Ripoll [ Rebased, fixed 80 char bugs, cleaned up commit message, added test example and CVE ] Signed-off-by: Kees Cook Cc: Cc: Linus Torvalds Cc: Andrew Morton Cc: Al Viro Fixes: CVE-2015-1593 Link: http://lkml.kernel.org/r/20150214173350.GA18393@www.outflux.net Signed-off-by: Borislav Petkov --- arch/x86/mm/mmap.c | 6 +++--- fs/binfmt_elf.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 919b91205cd4..df4552bd239e 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -35,12 +35,12 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned int stack_maxrandom_size(void) +static unsigned long stack_maxrandom_size(void) { - unsigned int max = 0; + unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; + max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; } return max; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 02b16910f4c9..995986b8e36b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -645,11 +645,12 @@ out: static unsigned long randomize_stack_top(unsigned long stack_top) { - unsigned int random_variable = 0; + unsigned long random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = get_random_int() & STACK_RND_MASK; + random_variable = (unsigned long) get_random_int(); + random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP -- cgit v1.2.3 From f84598bd7c851f8b0bf8cd0d7c3be0d73c432ff4 Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Tue, 3 Feb 2015 13:00:22 +0100 Subject: x86/microcode/intel: Guard against stack overflow in the loader mc_saved_tmp is a static array allocated on the stack, we need to make sure mc_saved_count stays within its bounds, otherwise we're overflowing the stack in _save_mc(). A specially crafted microcode header could lead to a kernel crash or potentially kernel execution. Signed-off-by: Quentin Casasnovas Cc: "H. Peter Anvin" Cc: Fenghua Yu Link: http://lkml.kernel.org/r/1422964824-22056-1-git-send-email-quentin.casasnovas@oracle.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/microcode/intel_early.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index ec9df6f9cd47..5e109a31f62b 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover) { + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); -- cgit v1.2.3 From 35a9ff4eec7a1725ac4364972fc6c156e4feedd0 Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Tue, 3 Feb 2015 13:00:24 +0100 Subject: x86/microcode/intel: Handle truncated microcode images more robustly We do not check the input data bounds containing the microcode before copying a struct microcode_intel_header from it. A specially crafted microcode could cause the kernel to read invalid memory and lead to a denial-of-service. Signed-off-by: Quentin Casasnovas Cc: "H. 
Peter Anvin" Cc: Fenghua Yu Link: http://lkml.kernel.org/r/1422964824-22056-3-git-send-email-quentin.casasnovas@oracle.com [ Made error message differ from the next one and flipped comparison. ] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/microcode/intel.c | 5 +++++ arch/x86/kernel/cpu/microcode/intel_early.c | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c6826d1e8082..746e7fd08aad 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, struct microcode_header_intel mc_header; unsigned int mc_size; + if (leftover < sizeof(mc_header)) { + pr_err("error! Truncated header in microcode data file\n"); + break; + } + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) break; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 5e109a31f62b..420eb933189c 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -322,6 +322,10 @@ get_matching_model_microcode(int cpu, unsigned long start, int i; while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); -- cgit v1.2.3 From e3a1f6cac1fe20e7ac01d96c914c25726723a64e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 19 Feb 2015 13:06:53 +0000 Subject: x86: pte_protnone() and pmd_protnone() must check entry is not present Since _PAGE_PROTNONE aliases _PAGE_GLOBAL it is only valid if _PAGE_PRESENT is clear. Make pte_protnone() and pmd_protnone() check for this. This fixes a 64-bit Xen PV guest regression introduced by 8a0516ed8b90 ("mm: convert p[te|md]_numa users to p[te|md]_protnone_numa"). Any userspace process would endlessly fault. In a 64-bit PV guest, userspace page table entries have _PAGE_GLOBAL set by the hypervisor. This meant that any fault on a present userspace entry (e.g., a write to a read-only mapping) would be misinterpreted as a NUMA hinting fault and the fault would not be correctly handled, resulting in the access endlessly faulting. Signed-off-by: David Vrabel Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 67fc3d2b0aab..a0c35bf6cb92 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -476,12 +476,14 @@ static inline int pmd_present(pmd_t pmd) */ static inline int pte_protnone(pte_t pte) { - return pte_flags(pte) & _PAGE_PROTNONE; + return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; } static inline int pmd_protnone(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PROTNONE; + return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 570e1aa84c376ff39809442f09c7606ddf62cfd1 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 20 Feb 2015 10:18:59 +0100 Subject: x86/mm/ASLR: Avoid PAGE_SIZE redefinition for UML subarch Commit f47233c2d34 ("x86/mm/ASLR: Propagate base load address calculation") causes PAGE_SIZE redefinition warnings for UML subarch builds. 
This is caused by added includes that were leftovers from previous patch versions are are not actually needed (especially page_types.h inlcude in module.c). Drop those stray includes. Reported-by: kbuild test robot Signed-off-by: Jiri Kosina Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Kees Cook Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1502201017240.28769@pobox.suse.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_types.h | 1 - arch/x86/kernel/module.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 3d43ce36eaba..95e11f79f123 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,7 +3,6 @@ #include #include -#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index c3c59a3a14ad..ef00116e8270 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -32,7 +32,6 @@ #include #include -#include #if 0 #define DEBUGP(fmt, ...) \ -- cgit v1.2.3 From 650b7b23cb1e32d77daeefbac1ceb1329abf3b23 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Feb 2015 15:07:29 +0100 Subject: kprobes/x86: Use 5-byte NOP when the code might be modified by ftrace can_probe() checks if the given address points to the beginning of an instruction. It analyzes all the instructions from the beginning of the function until the given address. The code might be modified by another Kprobe. In this case, the current code is read into a buffer, int3 breakpoint is replaced by the saved opcode in the buffer, and can_probe() analyzes the buffer instead. There is a bug that __recover_probed_insn() tries to restore the original code even for Kprobes using the ftrace framework. But in this case, the opcode is not stored. See the difference between arch_prepare_kprobe() and arch_prepare_kprobe_ftrace(). The opcode is stored by arch_copy_kprobe() only from arch_prepare_kprobe(). This patch makes Kprobe to use the ideal 5-byte NOP when the code can be modified by ftrace. It is the original instruction, see ftrace_make_nop() and ftrace_nop_replace(). Note that we always need to use the NOP for ftrace locations. Kprobes do not block ftrace and the instruction might get modified at anytime. It might even be in an inconsistent state because it is modified step by step using the int3 breakpoint. The patch also fixes indentation of the touched comment. Note that I found this problem when playing with Kprobes. I did it on x86_64 with gcc-4.8.3 that supported -mfentry. I modified samples/kprobes/kprobe_example.c and added offset 5 to put the probe right after the fentry area: static struct kprobe kp = { .symbol_name = "do_fork", + .offset = 5, }; Then I was able to load kprobe_example before jprobe_example but not the other way around: $> modprobe jprobe_example $> modprobe kprobe_example modprobe: ERROR: could not insert 'kprobe_example': Invalid or incomplete multibyte or wide character It did not make much sense and debugging pointed to the bug described above. Signed-off-by: Petr Mladek Acked-by: Masami Hiramatsu Cc: Ananth NMavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1424441250-27146-2-git-send-email-pmladek@suse.cz Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6a1146ea4d4d..c3b4b46b4797 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -223,27 +223,41 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; + unsigned long faddr; kp = get_kprobe((void *)addr); - /* There is no probe, return original address */ - if (!kp) + faddr = ftrace_location(addr); + /* + * Use the current code if it is not modified by Kprobe + * and it cannot be modified by ftrace. + */ + if (!kp && !faddr) return addr; /* - * Basically, kp->ainsn.insn has an original instruction. - * However, RIP-relative instruction can not do single-stepping - * at different place, __copy_instruction() tweaks the displacement of - * that instruction. In that case, we can't recover the instruction - * from the kp->ainsn.insn. + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. * - * On the other hand, kp->opcode has a copy of the first byte of - * the probed instruction, which is overwritten by int3. And - * the instruction at kp->addr is not modified by kprobes except - * for the first byte, we can recover the original instruction - * from it and kp->opcode. + * On the other hand, in case on normal Kprobe, kp->opcode has a copy + * of the first byte of the probed instruction, which is overwritten + * by int3. And the instruction at kp->addr is not modified by kprobes + * except for the first byte, we can recover the original instruction + * from it and kp->opcode. + * + * In case of Kprobes using ftrace, we do not have a copy of + * the original instruction. In fact, the ftrace location might + * be modified at anytime and even could be in an inconsistent state. + * Fortunately, we know that the original code is the ideal 5-byte + * long NOP. */ - memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - buf[0] = kp->opcode; + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (faddr) + memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + else + buf[0] = kp->opcode; return (unsigned long)buf; } -- cgit v1.2.3 From 2a6730c8b6e075adf826a89a3e2caa705807afdb Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Feb 2015 15:07:30 +0100 Subject: kprobes/x86: Check for invalid ftrace location in __recover_probed_insn() __recover_probed_insn() should always be called from an address where an instructions starts. The check for ftrace_location() might help to discover a potential inconsistency. This patch adds WARN_ON() when the inconsistency is detected. Also it adds handling of the situation when the original code can not get recovered. Suggested-by: Masami Hiramatsu Signed-off-by: Petr Mladek Cc: Ananth NMavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1424441250-27146-3-git-send-email-pmladek@suse.cz Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 12 ++++++++++++ arch/x86/kernel/kprobes/opt.c | 2 ++ 2 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index c3b4b46b4797..4e3d5a9621fe 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -227,6 +227,13 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) kp = get_kprobe((void *)addr); faddr = ftrace_location(addr); + /* + * Addresses inside the ftrace location are refused by + * arch_check_ftrace_location(). Something went terribly wrong + * if such an address is checked here. + */ + if (WARN_ON(faddr && faddr != addr)) + return 0UL; /* * Use the current code if it is not modified by Kprobe * and it cannot be modified by ftrace. @@ -265,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. + * Returns zero if the instruction can not get recovered. */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -299,6 +307,8 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); @@ -347,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src) unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + if (!recovered_insn) + return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bbf3dc8..3aef248ec1ee 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr) */ return 0; recovered_insn = recover_probed_instruction(buf, addr); + if (!recovered_insn) + return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ -- cgit v1.2.3 From a927792c196f1c24410f3c12ccf45238a353783a Mon Sep 17 00:00:00 2001 From: Yannick Guerrini Date: Sat, 21 Feb 2015 23:41:50 +0100 Subject: x86/cpu/intel: Fix trivial typo in intel_tlb_table[] Change 'ssociative' to 'associative' Signed-off-by: Yannick Guerrini Cc: Borislav Petkov Cc: Bryan O'Donoghue Cc: Chris Bainbridge Cc: Dave Hansen Cc: Steven Honeyman Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1424558510-1420-1-git-send-email-yguerrini@tomshardware.fr Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 94d7dcb12145..50163fa9034f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -565,8 +565,8 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, { 0xb4, TLB_DATA_4K, 256, " 
TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, -- cgit v1.2.3 From 31795b470b0872b66f7fa26f791b695c79821220 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 11 Feb 2015 14:39:18 -0500 Subject: x86/xen: Make sure X2APIC_ENABLE bit of MSR_IA32_APICBASE is not set Commit d524165cb8db ("x86/apic: Check x2apic early") tests X2APIC_ENABLE bit of MSR_IA32_APICBASE when CONFIG_X86_X2APIC is off and panics the kernel when this bit is set. Xen's PV guests will pass this MSR read to the hypervisor which will return its version of the MSR, where this bit might be set. Make sure we clear it before returning MSR value to the caller. Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bd8b8459c3d0..efee14db009b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1070,6 +1070,23 @@ static inline void xen_write_cr8(unsigned long val) BUG_ON(val); } #endif + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + u64 val; + + val = native_read_msr_safe(msr, err); + switch (msr) { + case MSR_IA32_APICBASE: +#ifdef CONFIG_X86_X2APIC + if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) +#endif + val &= ~X2APIC_ENABLE; + break; + } + return val; +} + static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -1240,7 +1257,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, + .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, -- cgit v1.2.3 From fdfd811ddde3678247248ca9a27faa999ca4cd51 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 19 Feb 2015 15:23:17 +0000 Subject: x86/xen: allow privcmd hypercalls to be preempted Hypercalls submitted by user space tools via the privcmd driver can take a long time (potentially many 10s of seconds) if the hypercall has many sub-operations. A fully preemptible kernel may deschedule such as task in any upcall called from a hypercall continuation. However, in a kernel with voluntary or no preemption, hypercall continuations in Xen allow event handlers to be run but the task issuing the hypercall will not be descheduled until the hypercall is complete and the ioctl returns to user space. These long running tasks may also trigger the kernel's soft lockup detection. Add xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() to bracket hypercalls that may be preempted. Use these in the privcmd driver. When returning from an upcall, call xen_maybe_preempt_hcall() which adds a schedule point if if the current task was within a preemptible hypercall. Since _cond_resched() can move the task to a different CPU, clear and set xen_in_preemptible_hcall around the call. 
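The usage pattern is the one added to the privcmd driver in the diff below; sketched
in isolation it looks as follows (the surrounding function is hypothetical, only
xen_preemptible_hcall_begin()/xen_preemptible_hcall_end() and privcmd_call() are the
interfaces touched by this patch). Only the potentially long-running hypercall itself
is bracketed, so the task can be voluntarily rescheduled from the upcall path while
Xen processes the hypercall continuation:

static long example_issue_hypercall(struct privcmd_hypercall *hc)
{
	long ret;

	xen_preemptible_hcall_begin();
	ret = privcmd_call(hc->op, hc->arg[0], hc->arg[1],
			   hc->arg[2], hc->arg[3], hc->arg[4]);
	xen_preemptible_hcall_end();

	return ret;
}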
Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ drivers/xen/Makefile | 2 +- drivers/xen/preempt.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ drivers/xen/privcmd.c | 2 ++ include/xen/xen-ops.h | 26 ++++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/preempt.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 000d4199b03e..31e2d5bf3e38 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp ret_from_intr CFI_ENDPROC ENDPROC(xen_hypervisor_callback) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index db13655c3a2a..10074ad9ebf8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1208,6 +1208,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) popq %rsp CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp error_exit CFI_ENDPROC END(xen_do_hypervisor_callback) diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 2140398a2a8c..2ccd3592d41f 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -2,7 +2,7 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o balloon.o manage.o +obj-y += grant-table.o features.o balloon.o manage.o preempt.o obj-y += events/ obj-y += xenbus/ diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c new file mode 100644 index 000000000000..a1800c150839 --- /dev/null +++ b/drivers/xen/preempt.c @@ -0,0 +1,44 @@ +/* + * Preemptible hypercalls + * + * Copyright (C) 2014 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + */ + +#include +#include + +#ifndef CONFIG_PREEMPT + +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ + +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +asmlinkage __visible void xen_maybe_preempt_hcall(void) +{ + if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) + && should_resched())) { + /* + * Clear flag as we may be rescheduled on a different + * cpu. 
+ */ + __this_cpu_write(xen_in_preemptible_hcall, false); + _cond_resched(); + __this_cpu_write(xen_in_preemptible_hcall, true); + } +} +#endif /* CONFIG_PREEMPT */ diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 569a13b9e856..59ac71c4a043 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -56,10 +56,12 @@ static long privcmd_ioctl_hypercall(void __user *udata) if (copy_from_user(&hypercall, udata, sizeof(hypercall))) return -EFAULT; + xen_preemptible_hcall_begin(); ret = privcmd_call(hypercall.op, hypercall.arg[0], hypercall.arg[1], hypercall.arg[2], hypercall.arg[3], hypercall.arg[4]); + xen_preemptible_hcall_end(); return ret; } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 7491ee5d8164..83338210ee04 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -46,4 +46,30 @@ static inline efi_system_table_t __init *xen_efi_probe(void) } #endif +#ifdef CONFIG_PREEMPT + +static inline void xen_preemptible_hcall_begin(void) +{ +} + +static inline void xen_preemptible_hcall_end(void) +{ +} + +#else + +DECLARE_PER_CPU(bool, xen_in_preemptible_hcall); + +static inline void xen_preemptible_hcall_begin(void) +{ + __this_cpu_write(xen_in_preemptible_hcall, true); +} + +static inline void xen_preemptible_hcall_end(void) +{ + __this_cpu_write(xen_in_preemptible_hcall, false); +} + +#endif /* CONFIG_PREEMPT */ + #endif /* INCLUDE_XEN_OPS_H */ -- cgit v1.2.3 From 5054daa285beaf706f051fbd395dc36c9f0f907f Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 23 Feb 2015 11:01:00 -0500 Subject: x86/xen: Initialize cr4 shadow for 64-bit PV(H) guests Commit 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") introduced CR4 shadows. These shadows are initialized in early boot code. The commit missed initialization for 64-bit PV(H) guests that this patch adds. 
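For context, the shadow scheme that this initialization feeds works roughly as in the
sketch below. This is a standalone illustration of the pattern, not the kernel
implementation: hw_cr4 stands in for the real control register, a plain global stands
in for the per-cpu shadow, the initial register value is made up, and only the helper
names mirror the kernel ones. It shows why a missing cr4_init_shadow() is harmful: a
read-modify-write through a stale (zero) shadow rewrites the register and silently
drops bits that were already set.

#include <stdio.h>

static unsigned long hw_cr4 = 0x000406b0;	/* pretend boot-time CR4 contents */
static unsigned long cr4_shadow;		/* stands in for the per-cpu shadow */

static void cr4_init_shadow(void)
{
	cr4_shadow = hw_cr4;			/* seed the shadow from the register */
}

static void cr4_set_bits(unsigned long mask)
{
	/* The read-modify-write goes through the shadow, not the register. */
	if ((cr4_shadow | mask) != cr4_shadow) {
		cr4_shadow |= mask;
		hw_cr4 = cr4_shadow;
	}
}

int main(void)
{
	/* Shadow never seeded: the first cr4_set_bits() call rewrites "CR4"
	 * with only the newly requested bit, losing everything else. */
	cr4_set_bits(1UL << 7);
	printf("no init:   CR4 = %#lx\n", hw_cr4);

	/* With the shadow seeded first, the existing bits are preserved. */
	hw_cr4 = 0x000406b0;
	cr4_init_shadow();
	cr4_set_bits(1UL << 7);
	printf("with init: CR4 = %#lx\n", hw_cr4);
	return 0;
}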
Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index efee14db009b..5240f563076d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1758,6 +1758,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #ifdef CONFIG_X86_32 i386_start_kernel(); #else + cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } -- cgit v1.2.3 From 579deee571a755c485ad702ef82c77a98a2ccc05 Mon Sep 17 00:00:00 2001 From: Yannick Guerrini Date: Mon, 23 Feb 2015 17:52:38 +0100 Subject: x86/platform/intel-mid: Fix trivial printk message typo in intel_mid_arch_setup() Change 'Uknown' to 'Unknown' Signed-off-by: Yannick Guerrini Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1424710358-10140-1-git-send-email-yguerrini@tomshardware.fr Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/intel-mid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 1bbedc4b0f88..3005f0c89f2e 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -130,7 +130,7 @@ static void intel_mid_arch_setup(void) intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); else { intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Uknown SoC, assuming PENWELL!\n"); + pr_info("ARCH: Unknown SoC, assuming PENWELL!\n"); } out: -- cgit v1.2.3 From 5b2bdbc84556774afbe11bcfd24c2f6411cfa92b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2015 14:50:19 -0500 Subject: x86: Init per-cpu shadow copy of CR4 on 32-bit CPUs too Commit: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") added a shadow CR4 such that reads and writes that do not modify the CR4 execute much faster than always reading the register itself. The change modified cpu_init() in common.c, so that the shadow CR4 gets initialized before anything uses it. Unfortunately, there's two cpu_init()s in common.c. There's one for 64-bit and one for 32-bit. The commit only added the shadow init to the 64-bit path, but the 32-bit path needs the init too. Link: http://lkml.kernel.org/r/20150227125208.71c36402@gandalf.local.home Fixes: 1e02ce4cccdc "x86: Store a per-cpu shadow copy of CR4" Signed-off-by: Steven Rostedt Acked-by: Andy Lutomirski Cc: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150227145019.2bdd4354@gandalf.local.home Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b5c8ff5e9dfc..2346c95c6ab1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1396,6 +1396,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + show_ucode_info_early(); printk(KERN_INFO "Initializing CPU#%d\n", cpu); -- cgit v1.2.3