From 886d751a2ea99a160f2d0a472231566d9cb0cf58 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 20 Dec 2012 14:11:33 -0500 Subject: x86, efi: correct precedence of operators in setup_efi_pci With the current code, the condition in the if() doesn't make much sense due to precedence of operators. Signed-off-by: Sasha Levin Link: http://lkml.kernel.org/r/1356030701-16284-25-git-send-email-sasha.levin@oracle.com Cc: Matt Fleming Cc: Matthew Garrett Cc: Bjorn Helgaas Cc: Seth Forshee Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/eboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index b1942e222768..18e329ca108e 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -302,7 +302,7 @@ static efi_status_t setup_efi_pci(struct boot_params *params) if (status != EFI_SUCCESS) continue; - if (!attributes & EFI_PCI_IO_ATTRIBUTE_EMBEDDED_ROM) + if (!(attributes & EFI_PCI_IO_ATTRIBUTE_EMBEDDED_ROM)) continue; if (!pci->romimage || !pci->romsize) -- cgit v1.2.3 From a706d965dcfdff73bf2bad1c300f8119900714c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Dec 2012 19:56:07 -0700 Subject: perf x86: revert 20b279 - require exclude_guest to use PEBS - kernel side This patch is brought to you by the letter 'H'. Commit 20b279 breaks compatiblity with older perf binaries when run with precise modifier (:p or :pp) by requiring the exclude_guest attribute to be set. Older binaries default exclude_guest to 0 (ie., wanting guest-based samples) unless host only profiling is requested (:H modifier). The workaround for older binaries is to add H to the modifier list (e.g., -e cycles:ppH - toggles exclude_guest to 1). This was deemed unacceptable by Linus: https://lkml.org/lkml/2012/12/12/570 Between family in town and the fresh snow in Breckenridge there is no time left to be working on the proper fix for this over the holidays. In the New Year I have more pressing problems to resolve -- like some memory leaks in perf which are proving to be elusive -- although the aforementioned snow is probably why they are proving to be elusive. Either way I do not have any spare time to work on this and from the time I have managed to spend on it the solution is more difficult than just moving to a new exclude_guest flag (does not work) or flipping the logic to include_guest (which is not as trivial as one would think). So, two options: silently force exclude_guest on as suggested by Gleb which means no impact to older perf binaries or revert the original patch which caused the breakage. This patch does the latter -- reverts the original patch that introduced the regression. The problem can be revisited in the future as time allows. Signed-off-by: David Ahern Cc: Avi Kivity Cc: Gleb Natapov Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Link: http://lkml.kernel.org/r/1356749767-17322-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4428fd178bce..6774c17a5576 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -340,9 +340,6 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; - - if (!attr->exclude_guest) - return -EOPNOTSUPP; } hwc->config |= config; @@ -385,9 +382,6 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { int precise = 0; - if (!event->attr.exclude_guest) - return -EOPNOTSUPP; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; -- cgit v1.2.3 From a9acc5365dbda29f7be2884efb63771dc24bd815 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Wed, 14 Nov 2012 20:43:31 +0000 Subject: x86/Sandy Bridge: reserve pages when integrated graphics is present SNB graphics devices have a bug that prevent them from accessing certain memory ranges, namely anything below 1M and in the pages listed in the table. So reserve those at boot if set detect a SNB gfx device on the CPU to avoid GPU hangs. Stephane Marchesin had a similar patch to the page allocator awhile back, but rather than reserving pages up front, it leaked them at allocation time. [ hpa: made a number of stylistic changes, marked arrays as static const, and made less verbose; use "memblock=debug" for full verbosity. ] Signed-off-by: Jesse Barnes Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 23ddd558fbd5..9dcb32545032 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -610,6 +610,81 @@ static __init void reserve_ibft_region(void) static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; +static bool __init snb_gfx_workaround_needed(void) +{ + int i; + u16 vendor, devid; + static const u16 snb_ids[] = { + 0x0102, + 0x0112, + 0x0122, + 0x0106, + 0x0116, + 0x0126, + 0x010a, + }; + + /* Assume no if something weird is going on with PCI */ + if (!early_pci_allowed()) + return false; + + vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); + if (vendor != 0x8086) + return false; + + devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); + for (i = 0; i < ARRAY_SIZE(snb_ids); i++) + if (devid == snb_ids[i]) + return true; + + return false; +} + +/* + * Sandy Bridge graphics has trouble with certain ranges, exclude + * them from allocation. + */ +static void __init trim_snb_memory(void) +{ + static const unsigned long bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + int i; + + if (!snb_gfx_workaround_needed()) + return; + + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* + * Reserve all memory below the 1 MB mark that has not + * already been reserved. + */ + memblock_reserve(0, 1<<20); + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", + bad_pages[i]); + } +} + +/* + * Here we put platform-specific memory range workarounds, i.e. + * memory known to be corrupt or otherwise in need to be reserved on + * specific platforms. + * + * If this gets used more widely it could use a real dispatch mechanism. + */ +static void __init trim_platform_memory_ranges(void) +{ + trim_snb_memory(); +} + static void __init trim_bios_range(void) { /* @@ -630,6 +705,7 @@ static void __init trim_bios_range(void) * take them out. */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -908,6 +984,8 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); + trim_platform_memory_ranges(); + init_gbpages(); /* max_pfn_mapped is updated here */ -- cgit v1.2.3 From ab3cd8670e0b3fcde7f029e1503ed3c5138e9571 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 13 Jan 2013 20:36:39 -0800 Subject: x86/Sandy Bridge: mark arrays in __init functions as __initconst Mark static arrays as __initconst so they get removed when the init sections are flushed. Reported-by: Mathias Krause Link: http://lkml.kernel.org/r/75F4BEE6-CB0E-4426-B40B-697451677738@googlemail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9dcb32545032..18182d19b71b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -614,7 +614,7 @@ static bool __init snb_gfx_workaround_needed(void) { int i; u16 vendor, devid; - static const u16 snb_ids[] = { + static const __initconst u16 snb_ids[] = { 0x0102, 0x0112, 0x0122, @@ -646,7 +646,7 @@ static bool __init snb_gfx_workaround_needed(void) */ static void __init trim_snb_memory(void) { - static const unsigned long bad_pages[] = { + static const __initconst unsigned long bad_pages[] = { 0x20050000, 0x20110000, 0x20130000, -- cgit v1.2.3 From e43b3cec711a61edf047adf6204d542f3a659ef8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 13 Jan 2013 20:56:41 -0800 Subject: x86/Sandy Bridge: Sandy Bridge workaround depends on CONFIG_PCI early_pci_allowed() and read_pci_config_16() are only available if CONFIG_PCI is defined. Signed-off-by: H. Peter Anvin Cc: Jesse Barnes --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 18182d19b71b..00f6c1472b85 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -612,6 +612,7 @@ static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; static bool __init snb_gfx_workaround_needed(void) { +#ifdef CONFIG_PCI int i; u16 vendor, devid; static const __initconst u16 snb_ids[] = { @@ -636,6 +637,7 @@ static bool __init snb_gfx_workaround_needed(void) for (i = 0; i < ARRAY_SIZE(snb_ids); i++) if (devid == snb_ids[i]) return true; +#endif return false; } -- cgit v1.2.3 From d55bf532d72b3cfdfe84e696ace995067324c96c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 15 Jan 2013 22:40:26 -0500 Subject: Revert "xen/smp: Fix CPU online/offline bug triggering a BUG: scheduling while atomic." This reverts commit 41bd956de3dfdc3a43708fe2e0c8096c69064a1e. The fix is incorrect and not appropiate for the latest kernels. In fact it _causes_ the BUG: scheduling while atomic while doing vCPU hotplug. Suggested-by: Wei Liu Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4f7d2599b484..34bc4cee8887 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -432,13 +432,6 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); cpu_bringup(); - /* - * Balance out the preempt calls - as we are running in cpu_idle - * loop which has been called at bootup from cpu_bringup_and_idle. - * The cpucpu_bringup_and_idle called cpu_bringup which made a - * preempt_disable() So this preempt_enable will balance it out. - */ - preempt_enable(); } #else /* !CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 9174adbee4a9a49d0139f5d71969852b36720809 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 16 Jan 2013 12:00:55 +0000 Subject: xen: Fix stack corruption in xen_failsafe_callback for 32bit PVOPS guests. This fixes CVE-2013-0190 / XSA-40 There has been an error on the xen_failsafe_callback path for failed iret, which causes the stack pointer to be wrong when entering the iret_exc error path. This can result in the kernel crashing. In the classic kernel case, the relevant code looked a little like: popl %eax # Error code from hypervisor jz 5f addl $16,%esp jmp iret_exc # Hypervisor said iret fault 5: addl $16,%esp # Hypervisor said segment selector fault Here, there are two identical addls on either option of a branch which appears to have been optimised by hoisting it above the jz, and converting it to an lea, which leaves the flags register unaffected. In the PVOPS case, the code looks like: popl_cfi %eax # Error from the hypervisor lea 16(%esp),%esp # Add $16 before choosing fault path CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp # Incorrectly adjust %esp again jmp iret_exc It is possible unprivileged userspace applications to cause this behaviour, for example by loading an LDT code selector, then changing the code selector to be not-present. At this point, there is a race condition where it is possible for the hypervisor to return back to userspace from an interrupt, fault on its own iret, and inject a failsafe_callback into the kernel. This bug has been present since the introduction of Xen PVOPS support in commit 5ead97c84 (xen: Core Xen implementation), in 2.6.23. Signed-off-by: Frediano Ziglio Signed-off-by: Andrew Cooper Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/entry_32.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 88b725aa1d52..cf8639b4dcf3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1084,7 +1084,6 @@ ENTRY(xen_failsafe_callback) lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f - addl $16,%esp jmp iret_exc 5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL -- cgit v1.2.3 From 9899d11f654474d2d54ea52ceaa2a1f4db3abd68 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:48:00 +0100 Subject: ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL putreg() assumes that the tracee is not running and pt_regs_access() can safely play with its stack. However a killed tracee can return from ptrace_stop() to the low-level asm code and do RESTORE_REST, this means that debugger can actually read/modify the kernel stack until the tracee does SAVE_REST again. set_task_blockstep() can race with SIGKILL too and in some sense this race is even worse, the very fact the tracee can be woken up breaks the logic. As Linus suggested we can clear TASK_WAKEKILL around the arch_ptrace() call, this ensures that nobody can ever wakeup the tracee while the debugger looks at it. Not only this fixes the mentioned problems, we can do some cleanups/simplifications in arch_ptrace() paths. Probably ptrace_unfreeze_traced() needs more callers, for example it makes sense to make the tracee killable for oom-killer before access_process_vm(). While at it, add the comment into may_ptrace_stop() to explain why ptrace_stop() still can't rely on SIGKILL and signal_pending_state(). Reported-by: Salman Qazi Reported-by: Suleiman Souhlal Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- arch/x86/kernel/step.c | 9 +++---- kernel/ptrace.c | 64 ++++++++++++++++++++++++++++++++++++++++++-------- kernel/signal.c | 5 ++++ 3 files changed, 64 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index cd3b2438a980..9b4d51d0c0d0 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -165,10 +165,11 @@ void set_task_blockstep(struct task_struct *task, bool on) * Ensure irq/preemption can't change debugctl in between. * Note also that both TIF_BLOCKSTEP and debugctl should * be changed atomically wrt preemption. - * FIXME: this means that set/clear TIF_BLOCKSTEP is simply - * wrong if task != current, SIGKILL can wakeup the stopped - * tracee and set/clear can play with the running task, this - * can confuse the next __switch_to_xtra(). + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but + * PTRACE_KILL is not safe. */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 62f7c2774b16..6cbeaae4406d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -122,6 +122,40 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ + bool ret = false; + + /* Lockless, nobody but us can set this flag */ + if (task->jobctl & JOBCTL_LISTENING) + return ret; + + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !__fatal_signal_pending(task)) { + task->state = __TASK_TRACED; + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ + if (task->state != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); + + spin_lock_irq(&task->sighand->siglock); + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + spin_unlock_irq(&task->sighand->siglock); +} + /** * ptrace_check_attach - check whether ptracee is ready for ptrace operation * @child: ptracee to check for @@ -151,24 +185,29 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { + if (child->ptrace && child->parent == current) { + WARN_ON(child->state == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (ignore_state || (task_is_traced(child) && - !(child->jobctl & JOBCTL_LISTENING))) + if (ignore_state || ptrace_freeze_traced(child)) ret = 0; - spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + if (!ret && !ignore_state) { + if (!wait_task_inactive(child, __TASK_TRACED)) { + /* + * This can only happen if may_ptrace_stop() fails and + * ptrace_stop() changes ->state back to TASK_RUNNING, + * so we should not worry about leaking __TASK_TRACED. + */ + WARN_ON(child->state == __TASK_TRACED); + ret = -ESRCH; + } + } - /* All systems go.. */ return ret; } @@ -900,6 +939,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); @@ -1039,8 +1080,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); - if (!ret) + if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); + } out_put_task_struct: put_task_struct(child); diff --git a/kernel/signal.c b/kernel/signal.c index 6e97aa6fa32c..3d09cf6cde75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1794,6 +1794,10 @@ static inline int may_ptrace_stop(void) * If SIGKILL was already sent before the caller unlocked * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). + * + * This is almost outdated, a task with the pending SIGKILL can't + * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported + * after SIGKILL was already dequeued. */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) @@ -1919,6 +1923,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done) do_notify_parent_cldstop(current, false, why); + /* tasklist protects us from ptrace_freeze_traced() */ __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; -- cgit v1.2.3