diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries/lpar.c')
-rw-r--r-- | arch/powerpc/platforms/pseries/lpar.c | 618 |
1 files changed, 586 insertions, 32 deletions
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 1034ef1fe2b4..09bb878c21e0 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * pSeries_lpar.c * Copyright (C) 2001 Todd Inglett, IBM Corporation * * pSeries LPAR support. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Enables debugging of low-level hash table routines - careful! */ @@ -30,6 +17,10 @@ #include <linux/jump_label.h> #include <linux/delay.h> #include <linux/stop_machine.h> +#include <linux/spinlock.h> +#include <linux/cpuhotplug.h> +#include <linux/workqueue.h> +#include <linux/proc_fs.h> #include <asm/processor.h> #include <asm/mmu.h> #include <asm/page.h> @@ -65,13 +56,591 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +static u8 dtl_mask = DTL_LOG_PREEMPT; +#else +static u8 dtl_mask; +#endif + +void alloc_dtl_buffers(unsigned long *time_limit) +{ + int cpu; + struct paca_struct *pp; + struct dtl_entry *dtl; + + for_each_possible_cpu(cpu) { + pp = paca_ptrs[cpu]; + if (pp->dispatch_log) + continue; + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); + if (!dtl) { + pr_warn("Failed to allocate dispatch trace log for cpu %d\n", + cpu); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + pr_warn("Stolen time statistics will be unreliable\n"); +#endif + break; + } + + pp->dtl_ridx = 0; + pp->dispatch_log = dtl; + pp->dispatch_log_end = dtl + N_DISPATCH_LOG; + pp->dtl_curr = dtl; + + if (time_limit && time_after(jiffies, *time_limit)) { + cond_resched(); + *time_limit = jiffies + HZ; + } + } +} + +void register_dtl_buffer(int cpu) +{ + long ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + int hwcpu = get_hard_smp_processor_id(cpu); + + pp = paca_ptrs[cpu]; + dtl = pp->dispatch_log; + if (dtl && dtl_mask) { + pp->dtl_ridx = 0; + pp->dtl_curr = dtl; + lppaca_of(cpu).dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); + ret = register_dtl(hwcpu, __pa(dtl)); + if (ret) + pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n", + cpu, hwcpu, ret); + + lppaca_of(cpu).dtl_enable_mask = dtl_mask; + } +} + +#ifdef CONFIG_PPC_SPLPAR +struct dtl_worker { + struct delayed_work work; + int cpu; +}; + +struct vcpu_dispatch_data { + int last_disp_cpu; + + int total_disp; + + int same_cpu_disp; + int same_chip_disp; + int diff_chip_disp; + int far_chip_disp; + + int numa_home_disp; + int numa_remote_disp; + int numa_far_disp; +}; + +/* + * This represents the number of cpus in the hypervisor. Since there is no + * architected way to discover the number of processors in the host, we + * provision for dealing with NR_CPUS. This is currently 2048 by default, and + * is sufficient for our purposes. This will need to be tweaked if + * CONFIG_NR_CPUS is changed. + */ +#define NR_CPUS_H NR_CPUS + +DEFINE_RWLOCK(dtl_access_lock); +static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data); +static DEFINE_PER_CPU(u64, dtl_entry_ridx); +static DEFINE_PER_CPU(struct dtl_worker, dtl_workers); +static enum cpuhp_state dtl_worker_state; +static DEFINE_MUTEX(dtl_enable_mutex); +static int vcpudispatch_stats_on __read_mostly; +static int vcpudispatch_stats_freq = 50; +static __be32 *vcpu_associativity, *pcpu_associativity; + + +static void free_dtl_buffers(unsigned long *time_limit) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + int cpu; + struct paca_struct *pp; + + for_each_possible_cpu(cpu) { + pp = paca_ptrs[cpu]; + if (!pp->dispatch_log) + continue; + kmem_cache_free(dtl_cache, pp->dispatch_log); + pp->dtl_ridx = 0; + pp->dispatch_log = 0; + pp->dispatch_log_end = 0; + pp->dtl_curr = 0; + + if (time_limit && time_after(jiffies, *time_limit)) { + cond_resched(); + *time_limit = jiffies + HZ; + } + } +#endif +} + +static int init_cpu_associativity(void) +{ + vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core, + VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL); + pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core, + VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL); + + if (!vcpu_associativity || !pcpu_associativity) { + pr_err("error allocating memory for associativity information\n"); + return -ENOMEM; + } + + return 0; +} + +static void destroy_cpu_associativity(void) +{ + kfree(vcpu_associativity); + kfree(pcpu_associativity); + vcpu_associativity = pcpu_associativity = 0; +} + +static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag) +{ + __be32 *assoc; + int rc = 0; + + assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE]; + if (!assoc[0]) { + rc = hcall_vphn(cpu, flag, &assoc[0]); + if (rc) + return NULL; + } + + return assoc; +} + +static __be32 *get_pcpu_associativity(int cpu) +{ + return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU); +} + +static __be32 *get_vcpu_associativity(int cpu) +{ + return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU); +} + +static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) +{ + __be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc; + + if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H) + return -EINVAL; + + last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu); + cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu); + + if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) + return -EIO; + + return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); +} + +static int cpu_home_node_dispatch_distance(int disp_cpu) +{ + __be32 *disp_cpu_assoc, *vcpu_assoc; + int vcpu_id = smp_processor_id(); + + if (disp_cpu >= NR_CPUS_H) { + pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n", + disp_cpu, NR_CPUS_H); + return -EINVAL; + } + + disp_cpu_assoc = get_pcpu_associativity(disp_cpu); + vcpu_assoc = get_vcpu_associativity(vcpu_id); + + if (!disp_cpu_assoc || !vcpu_assoc) + return -EIO; + + return cpu_distance(disp_cpu_assoc, vcpu_assoc); +} + +static void update_vcpu_disp_stat(int disp_cpu) +{ + struct vcpu_dispatch_data *disp; + int distance; + + disp = this_cpu_ptr(&vcpu_disp_data); + if (disp->last_disp_cpu == -1) { + disp->last_disp_cpu = disp_cpu; + return; + } + + disp->total_disp++; + + if (disp->last_disp_cpu == disp_cpu || + (cpu_first_thread_sibling(disp->last_disp_cpu) == + cpu_first_thread_sibling(disp_cpu))) + disp->same_cpu_disp++; + else { + distance = cpu_relative_dispatch_distance(disp->last_disp_cpu, + disp_cpu); + if (distance < 0) + pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n", + smp_processor_id()); + else { + switch (distance) { + case 0: + disp->same_chip_disp++; + break; + case 1: + disp->diff_chip_disp++; + break; + case 2: + disp->far_chip_disp++; + break; + default: + pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n", + smp_processor_id(), + disp->last_disp_cpu, + disp_cpu, + distance); + } + } + } + + distance = cpu_home_node_dispatch_distance(disp_cpu); + if (distance < 0) + pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n", + smp_processor_id()); + else { + switch (distance) { + case 0: + disp->numa_home_disp++; + break; + case 1: + disp->numa_remote_disp++; + break; + case 2: + disp->numa_far_disp++; + break; + default: + pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n", + smp_processor_id(), + disp_cpu, + distance); + } + } + + disp->last_disp_cpu = disp_cpu; +} + +static void process_dtl_buffer(struct work_struct *work) +{ + struct dtl_entry dtle; + u64 i = __this_cpu_read(dtl_entry_ridx); + struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + struct dtl_worker *d = container_of(work, struct dtl_worker, work.work); + + if (!local_paca->dispatch_log) + return; + + /* if we have been migrated away, we cancel ourself */ + if (d->cpu != smp_processor_id()) { + pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n", + smp_processor_id()); + return; + } + + if (i == be64_to_cpu(vpa->dtl_idx)) + goto out; + + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtle = *dtl; + barrier(); + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { + /* buffer has overflowed */ + pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n", + d->cpu, + be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i); + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id)); + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + + __this_cpu_write(dtl_entry_ridx, i); + +out: + schedule_delayed_work_on(d->cpu, to_delayed_work(work), + HZ / vcpudispatch_stats_freq); +} + +static int dtl_worker_online(unsigned int cpu) +{ + struct dtl_worker *d = &per_cpu(dtl_workers, cpu); + + memset(d, 0, sizeof(*d)); + INIT_DELAYED_WORK(&d->work, process_dtl_buffer); + d->cpu = cpu; + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + per_cpu(dtl_entry_ridx, cpu) = 0; + register_dtl_buffer(cpu); +#else + per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx); +#endif + + schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq); + return 0; +} + +static int dtl_worker_offline(unsigned int cpu) +{ + struct dtl_worker *d = &per_cpu(dtl_workers, cpu); + + cancel_delayed_work_sync(&d->work); + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + unregister_dtl(get_hard_smp_processor_id(cpu)); +#endif + + return 0; +} + +static void set_global_dtl_mask(u8 mask) +{ + int cpu; + + dtl_mask = mask; + for_each_present_cpu(cpu) + lppaca_of(cpu).dtl_enable_mask = dtl_mask; +} + +static void reset_global_dtl_mask(void) +{ + int cpu; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + dtl_mask = DTL_LOG_PREEMPT; +#else + dtl_mask = 0; +#endif + for_each_present_cpu(cpu) + lppaca_of(cpu).dtl_enable_mask = dtl_mask; +} + +static int dtl_worker_enable(unsigned long *time_limit) +{ + int rc = 0, state; + + if (!write_trylock(&dtl_access_lock)) { + rc = -EBUSY; + goto out; + } + + set_global_dtl_mask(DTL_LOG_ALL); + + /* Setup dtl buffers and register those */ + alloc_dtl_buffers(time_limit); + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online", + dtl_worker_online, dtl_worker_offline); + if (state < 0) { + pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n"); + free_dtl_buffers(time_limit); + reset_global_dtl_mask(); + write_unlock(&dtl_access_lock); + rc = -EINVAL; + goto out; + } + dtl_worker_state = state; + +out: + return rc; +} + +static void dtl_worker_disable(unsigned long *time_limit) +{ + cpuhp_remove_state(dtl_worker_state); + free_dtl_buffers(time_limit); + reset_global_dtl_mask(); + write_unlock(&dtl_access_lock); +} + +static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p, + size_t count, loff_t *ppos) +{ + unsigned long time_limit = jiffies + HZ; + struct vcpu_dispatch_data *disp; + int rc, cmd, cpu; + char buf[16]; + + if (count > 15) + return -EINVAL; + + if (copy_from_user(buf, p, count)) + return -EFAULT; + + buf[count] = 0; + rc = kstrtoint(buf, 0, &cmd); + if (rc || cmd < 0 || cmd > 1) { + pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n"); + return rc ? rc : -EINVAL; + } + + mutex_lock(&dtl_enable_mutex); + + if ((cmd == 0 && !vcpudispatch_stats_on) || + (cmd == 1 && vcpudispatch_stats_on)) + goto out; + + if (cmd) { + rc = init_cpu_associativity(); + if (rc) + goto out; + + for_each_possible_cpu(cpu) { + disp = per_cpu_ptr(&vcpu_disp_data, cpu); + memset(disp, 0, sizeof(*disp)); + disp->last_disp_cpu = -1; + } + + rc = dtl_worker_enable(&time_limit); + if (rc) { + destroy_cpu_associativity(); + goto out; + } + } else { + dtl_worker_disable(&time_limit); + destroy_cpu_associativity(); + } + + vcpudispatch_stats_on = cmd; + +out: + mutex_unlock(&dtl_enable_mutex); + if (rc) + return rc; + return count; +} + +static int vcpudispatch_stats_display(struct seq_file *p, void *v) +{ + int cpu; + struct vcpu_dispatch_data *disp; + + if (!vcpudispatch_stats_on) { + seq_puts(p, "off\n"); + return 0; + } + + for_each_online_cpu(cpu) { + disp = per_cpu_ptr(&vcpu_disp_data, cpu); + seq_printf(p, "cpu%d", cpu); + seq_put_decimal_ull(p, " ", disp->total_disp); + seq_put_decimal_ull(p, " ", disp->same_cpu_disp); + seq_put_decimal_ull(p, " ", disp->same_chip_disp); + seq_put_decimal_ull(p, " ", disp->diff_chip_disp); + seq_put_decimal_ull(p, " ", disp->far_chip_disp); + seq_put_decimal_ull(p, " ", disp->numa_home_disp); + seq_put_decimal_ull(p, " ", disp->numa_remote_disp); + seq_put_decimal_ull(p, " ", disp->numa_far_disp); + seq_puts(p, "\n"); + } + + return 0; +} + +static int vcpudispatch_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, vcpudispatch_stats_display, NULL); +} + +static const struct file_operations vcpudispatch_stats_proc_ops = { + .open = vcpudispatch_stats_open, + .read = seq_read, + .write = vcpudispatch_stats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static ssize_t vcpudispatch_stats_freq_write(struct file *file, + const char __user *p, size_t count, loff_t *ppos) +{ + int rc, freq; + char buf[16]; + + if (count > 15) + return -EINVAL; + + if (copy_from_user(buf, p, count)) + return -EFAULT; + + buf[count] = 0; + rc = kstrtoint(buf, 0, &freq); + if (rc || freq < 1 || freq > HZ) { + pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n", + HZ); + return rc ? rc : -EINVAL; + } + + vcpudispatch_stats_freq = freq; + + return count; +} + +static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v) +{ + seq_printf(p, "%d\n", vcpudispatch_stats_freq); + return 0; +} + +static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file) +{ + return single_open(file, vcpudispatch_stats_freq_display, NULL); +} + +static const struct file_operations vcpudispatch_stats_freq_proc_ops = { + .open = vcpudispatch_stats_freq_open, + .read = seq_read, + .write = vcpudispatch_stats_freq_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init vcpudispatch_stats_procfs_init(void) +{ + if (!lppaca_shared_proc(get_lppaca())) + return 0; + + if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL, + &vcpudispatch_stats_proc_ops)) + pr_err("vcpudispatch_stats: error creating procfs file\n"); + else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL, + &vcpudispatch_stats_freq_proc_ops)) + pr_err("vcpudispatch_stats_freq: error creating procfs file\n"); + + return 0; +} + +machine_device_initcall(pseries, vcpudispatch_stats_procfs_init); +#endif /* CONFIG_PPC_SPLPAR */ + void vpa_init(int cpu) { int hwcpu = get_hard_smp_processor_id(cpu); unsigned long addr; long ret; - struct paca_struct *pp; - struct dtl_entry *dtl; /* * The spec says it "may be problematic" if CPU x registers the VPA of @@ -112,22 +681,7 @@ void vpa_init(int cpu) /* * Register dispatch trace log, if one has been allocated. */ - pp = paca_ptrs[cpu]; - dtl = pp->dispatch_log; - if (dtl) { - pp->dtl_ridx = 0; - pp->dtl_curr = dtl; - lppaca_of(cpu).dtl_idx = 0; - - /* hypervisor reads buffer length from this field */ - dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); - ret = register_dtl(hwcpu, __pa(dtl)); - if (ret) - pr_err("WARNING: DTL registration of cpu %d (hw %d) " - "failed with %ld\n", smp_processor_id(), - hwcpu, ret); - lppaca_of(cpu).dtl_enable_mask = 2; - } + register_dtl_buffer(cpu); } #ifdef CONFIG_PPC_BOOK3S_64 |