summaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/fault.c61
-rw-r--r--arch/x86/mm/numa.c21
-rw-r--r--arch/x86/mm/numa_32.c2
-rw-r--r--arch/x86/mm/srat.c16
-rw-r--r--arch/x86/mm/tlb.c52
5 files changed, 82 insertions, 70 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d591c895803..a10c8c792161 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address)
static inline bool smap_violation(int error_code, struct pt_regs *regs)
{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
if (error_code & PF_USER)
return false;
@@ -1014,13 +1020,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
*/
-static void __kprobes
-__do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static void __kprobes noinline
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- unsigned long address;
struct mm_struct *mm;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1028,9 +1038,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
tsk = current;
mm = tsk->mm;
- /* Get the faulting address: */
- address = read_cr2();
-
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
@@ -1087,11 +1094,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- if (static_cpu_has(X86_FEATURE_SMAP)) {
- if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
}
/*
@@ -1244,32 +1249,50 @@ good_area:
up_read(&mm->mmap_sem);
}
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
+ unsigned long address = read_cr2(); /* Get the faulting address */
enum ctx_state prev_state;
+ /*
+ * We must have this function tagged with __kprobes, notrace and call
+ * read_cr2() before calling anything else. To avoid calling any kind
+ * of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contain all sorts of tracepoints.
+ */
+
prev_state = exception_enter();
- __do_page_fault(regs, error_code);
+ __do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
-static void trace_page_fault_entries(struct pt_regs *regs,
+#ifdef CONFIG_TRACING
+static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
{
if (user_mode(regs))
- trace_page_fault_user(read_cr2(), regs, error_code);
+ trace_page_fault_user(address, regs, error_code);
else
- trace_page_fault_kernel(read_cr2(), regs, error_code);
+ trace_page_fault_kernel(address, regs, error_code);
}
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
+ /*
+ * The exception_enter and tracepoint processing could
+ * trigger another page faults (user space callchain
+ * reading) and destroy the original cr2 value, so read
+ * the faulting address now.
+ */
+ unsigned long address = read_cr2();
enum ctx_state prev_state;
prev_state = exception_enter();
- trace_page_fault_entries(regs, error_code);
- __do_page_fault(regs, error_code);
+ trace_page_fault_entries(address, regs, error_code);
+ __do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
+#endif /* CONFIG_TRACING */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 81b2750f3666..27aa0455fab3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
struct numa_memblk *mb = &mi->blk[i];
memblock_set_node(mb->start, mb->end - mb->start,
&memblock.memory, mb->nid);
-
- /*
- * At this time, all memory regions reserved by memblock are
- * used by the kernel. Set the nid in memblock.reserved will
- * mark out all the nodes the kernel resides in.
- */
- memblock_set_node(mb->start, mb->end - mb->start,
- &memblock.reserved, mb->nid);
}
/*
@@ -565,10 +557,21 @@ static void __init numa_init_array(void)
static void __init numa_clear_kernel_node_hotplug(void)
{
int i, nid;
- nodemask_t numa_kernel_nodes;
+ nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
unsigned long start, end;
struct memblock_type *type = &memblock.reserved;
+ /*
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel. Set the nid in memblock.reserved will
+ * mark out all the nodes the kernel resides in.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = &numa_meminfo.blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ }
+
/* Mark all kernel nodes. */
for (i = 0; i < type->cnt; i++)
node_set(type->regions[i].nid, numa_kernel_nodes);
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca798..47b6436e41c2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
+ start = round_down(start, PAGES_PER_SECTION);
+ end = round_up(end, PAGES_PER_SECTION);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1a25187e151e..1953e9c9391a 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,25 @@ static __init inline int srat_disabled(void)
return acpi_numa < 0;
}
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
int i, j;
- for (i = 0; i < slit->locality_count; i++)
- for (j = 0; j < slit->locality_count; j++)
+ for (i = 0; i < slit->locality_count; i++) {
+ if (pxm_to_node(i) == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < slit->locality_count; j++) {
+ if (pxm_to_node(j) == NUMA_NO_NODE)
+ continue;
numa_set_distance(pxm_to_node(i), pxm_to_node(j),
slit->entry[slit->locality_count * i + j]);
+ }
+ }
}
/* Callback for Proximity Domain -> x2APIC mapping */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
preempt_disable();
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr = ALIGN(start, HPAGE_SIZE);
- for (; addr < end; addr += HPAGE_SIZE) {
- pgd = pgd_offset(mm, addr);
- if (likely(!pgd_none(*pgd))) {
- pud = pud_offset(pgd, addr);
- if (likely(!pud_none(*pud))) {
- pmd = pmd_offset(pud, addr);
- if (likely(!pmd_none(*pmd)))
- if (pmd_large(*pmd))
- return addr;
- }
- }
- }
- return 0;
-}
-
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
preempt_disable();
if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
tlb_entries = tlb_lli_4k[ENTRIES];
else
tlb_entries = tlb_lld_4k[ENTRIES];
+
/* Assume all of TLB entries was occupied by this task */
- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
- if (has_large_page(mm, start, end)) {
- local_flush_tlb();
- goto flush_all;
- }
/* flush range by one by one 'invlpg' */
for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}