Diffstat (limited to 'arch/x86')
362 files changed, 15532 insertions, 20838 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e103236b754..0e9dec6cadd1 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -15,3 +15,4 @@
 obj-y += vdso/
 obj-$(CONFIG_IA32_EMULATION) += ia32/
 obj-y += platform/
+obj-y += net/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d30aad..da349723d411 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT

 config X86_32
 	def_bool !64BIT
+	select CLKSRC_I8253

 config X86_64
 	def_bool 64BIT
@@ -16,8 +17,6 @@ config X86_64
 config X86
 	def_bool y
 	select HAVE_AOUT if X86_32
-	select HAVE_READQ
-	select HAVE_WRITEQ
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
@@ -64,9 +63,13 @@ config X86
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
+	select GENERIC_IRQ_SHOW
+	select IRQ_FORCED_THREADING
 	select USE_GENERIC_SMP_HELPERS if SMP
+	select HAVE_BPF_JIT if (X86_64 && NET)

 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -107,7 +110,14 @@ config MMU
 	def_bool y

 config ZONE_DMA
-	def_bool y
+	bool "DMA memory allocation support" if EXPERT
+	default y
+	help
+	  DMA memory allocation support allows devices with less than 32-bit
+	  addressing to allocate within the first 16MB of address space.
+	  Disable if no such devices will be used.
+
+	  If unsure, say Y.

 config SBUS
 	bool
@@ -119,7 +129,7 @@ config NEED_SG_DMA_LENGTH
 	def_bool y

 config GENERIC_ISA_DMA
-	def_bool y
+	def_bool ISA_DMA_API

 config GENERIC_IOMAP
 	def_bool y
@@ -139,7 +149,7 @@ config GENERIC_GPIO
 	bool

 config ARCH_MAY_HAVE_PC_FDC
-	def_bool y
+	def_bool ISA_DMA_API

 config RWSEM_GENERIC_SPINLOCK
 	def_bool !X86_XADD
@@ -217,10 +227,6 @@ config X86_HT
 	def_bool y
 	depends on SMP

-config X86_TRAMPOLINE
-	def_bool y
-	depends on SMP || (64BIT && ACPI_SLEEP)
-
 config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
@@ -364,17 +370,6 @@ config X86_UV
 # Following is an alphabetically sorted list of 32 bit extended platforms
 # Please maintain the alphabetic order if and when there are additions

-config X86_ELAN
-	bool "AMD Elan"
-	depends on X86_32
-	depends on X86_EXTENDED_PLATFORM
-	---help---
-	  Select this for an AMD Elan processor.
-
-	  Do not use this option for K6/Athlon/Opteron processors!
-
-	  If unsure, choose "PC-compatible" instead.
-
 config X86_INTEL_CE
 	bool "CE4100 TV platform"
 	depends on PCI
@@ -382,6 +377,8 @@ config X86_INTEL_CE
 	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
 	select X86_REBOOTFIXUPS
+	select OF
+	select OF_EARLY_FLATTREE
 	---help---
 	  Select for the Intel CE media processor (CE4100) SOC.
 	  This option compiles in support for the CE4100 SOC for settop
@@ -687,6 +684,7 @@ config AMD_IOMMU
 	bool "AMD IOMMU support"
 	select SWIOTLB
 	select PCI_MSI
+	select PCI_IOV
 	depends on X86_64 && PCI && ACPI
 	---help---
 	  With this option you can enable support for AMD IOMMU hardware in
@@ -811,7 +809,7 @@ config X86_LOCAL_APIC

 config X86_IO_APIC
 	def_bool y
-	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC

 config X86_VISWS_APIC
 	def_bool y
@@ -916,6 +914,7 @@ config TOSHIBA

 config I8K
 	tristate "Dell laptop support"
+	select HWMON
 	---help---
 	  This adds a driver to safely access the System Management Mode
 	  of the CPU on the Dell Inspiron 8000. The System Management Mode
@@ -1171,7 +1170,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
-	depends on X86_64 && NUMA && PCI
+	depends on NUMA && PCI
 	---help---
 	  Enable AMD NUMA node topology detection. You should say Y here if
 	  you have a multi processor AMD system. This uses an old method to
@@ -1198,7 +1197,7 @@ config NODES_SPAN_OTHER_NODES

 config NUMA_EMU
 	bool "NUMA emulation"
-	depends on X86_64 && NUMA
+	depends on NUMA
 	---help---
 	  Enable NUMA emulation. A flat machine will be split
 	  into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1220,6 +1219,10 @@ config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA

+config HAVE_ARCH_ALLOC_REMAP
+	def_bool y
+	depends on X86_32 && NUMA
+
 config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
@@ -1228,13 +1231,9 @@ config NEED_NODE_MEMMAP_SIZE
 	def_bool y
 	depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)

-config HAVE_ARCH_ALLOC_REMAP
-	def_bool y
-	depends on X86_32 && NUMA
-
 config ARCH_FLATMEM_ENABLE
 	def_bool y
-	depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+	depends on X86_32 && !NUMA

 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
@@ -1244,20 +1243,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
 	def_bool y
 	depends on NUMA && X86_32

-config ARCH_PROC_KCORE_TEXT
-	def_bool y
-	depends on X86_64 && PROC_KCORE
-
-config ARCH_SPARSEMEM_DEFAULT
-	def_bool y
-	depends on X86_64
-
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
 	depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
 	select SPARSEMEM_STATIC if X86_32
 	select SPARSEMEM_VMEMMAP_ENABLE if X86_64

+config ARCH_SPARSEMEM_DEFAULT
+	def_bool y
+	depends on X86_64
+
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 	depends on ARCH_SPARSEMEM_ENABLE
@@ -1266,6 +1261,10 @@ config ARCH_MEMORY_PROBE
 	def_bool X86_64
 	depends on MEMORY_HOTPLUG

+config ARCH_PROC_KCORE_TEXT
+	def_bool y
+	depends on X86_64 && PROC_KCORE
+
 config ILLEGAL_POINTER_VALUE
 	hex
 	default 0 if X86_32
@@ -1700,12 +1699,8 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
 	def_bool y
 	depends on MEMORY_HOTPLUG

-config HAVE_ARCH_EARLY_PFN_TO_NID
-	def_bool X86_64
-	depends on NUMA
-
 config USE_PERCPU_NUMA_NODE_ID
-	def_bool X86_64
+	def_bool y
 	depends on NUMA

 menu "Power management and ACPI options"
@@ -1845,7 +1840,7 @@ config APM_ALLOW_INTS

 endif # APM

-source "arch/x86/kernel/cpu/cpufreq/Kconfig"
+source "drivers/cpufreq/Kconfig"

 source "drivers/cpuidle/Kconfig"
@@ -2000,9 +1995,13 @@ source "drivers/pci/pcie/Kconfig"

 source "drivers/pci/Kconfig"

-# x86_64 have no ISA slots, but do have ISA-style DMA.
+# x86_64 have no ISA slots, but can have ISA-style DMA.
 config ISA_DMA_API
-	def_bool y
+	bool "ISA-style DMA support" if (X86_64 && EXPERT)
+	default y
+	help
+	  Enables ISA-style DMA support for devices requiring such controllers.
+	  If unsure, say Y.

 if X86_32
@@ -2066,9 +2065,10 @@ config SCx200HR_TIMER

 config OLPC
 	bool "One Laptop Per Child support"
+	depends on !X86_PAE
 	select GPIOLIB
-	select OLPC_OPENFIRMWARE
-	depends on !X86_64 && !X86_PAE
+	select OF
+	select OF_PROMTREE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
@@ -2079,21 +2079,6 @@ config OLPC_XO1
 	---help---
 	  Add support for non-essential features of the OLPC XO-1 laptop.
-config OLPC_OPENFIRMWARE
-	bool "Support for OLPC's Open Firmware"
-	depends on !X86_64 && !X86_PAE
-	default n
-	select OF
-	help
-	  This option adds support for the implementation of Open Firmware
-	  that is used on the OLPC XO-1 Children's Machine.
-	  If unsure, say N here.
-
-config OLPC_OPENFIRMWARE_DT
-	bool
-	default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
-	select OF_PROMTREE
-
 endif # X86_32

 config AMD_NB
@@ -2104,6 +2089,16 @@ source "drivers/pcmcia/Kconfig"

 source "drivers/pci/hotplug/Kconfig"

+config RAPIDIO
+	bool "RapidIO support"
+	depends on PCI
+	default n
+	help
+	  If you say Y here, the kernel will include drivers and
+	  infrastructure code to support RapidIO interconnect devices.
+
+source "drivers/rapidio/Kconfig"
+
 endmenu
@@ -2138,6 +2133,11 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC

+config KEYS_COMPAT
+	bool
+	depends on COMPAT && KEYS
+	default y
+
 endmenu
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 283c5a6a03a6..6a7cfdf8ff69 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
 # Put here option for CPU selection and depending optimization
-if !X86_ELAN
-
 choice
 	prompt "Processor family"
 	default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
 	  stores for this CPU, which can increase performance of some
 	  operations.

+config MELAN
+	bool "AMD Elan"
+	depends on X86_32
+	---help---
+	  Select this for an AMD Elan processor.
+
+	  Do not use this option for K6/Athlon/Opteron processors!
+
 config MGEODEGX1
 	bool "GeodeGX1"
 	depends on X86_32
@@ -292,13 +298,6 @@ config X86_GENERIC
 	  This is really intended for distributors who need more
 	  generic optimizations.

-endif
-
-config X86_CPU
-	def_bool y
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_FIND_NEXT_BIT
-
 #
 # Define implied options from the CPU selection here
 config X86_INTERNODE_CACHE_SHIFT
@@ -317,7 +316,7 @@ config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
 	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
-	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+	default "4" if MELAN || M486 || M386 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX

 config X86_XADD
@@ -331,7 +330,7 @@ config X86_PPRO_FENCE
 	  Old PentiumPro multiprocessor systems had errata that could cause
 	  memory operations to violate the x86 ordering standard in rare cases.
 	  Enabling this option will attempt to work around some (but not all)
-	  occurances of this problem, at the cost of much heavier spinlock and
+	  occurrences of this problem, at the cost of much heavier spinlock and
 	  memory barrier operations.

 	  If unsure, say n here. Even distro kernels should think twice before
@@ -363,7 +362,7 @@ config X86_POPAD_OK

 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1

 config X86_INTEL_USERCOPY
 	def_bool y
@@ -371,7 +370,7 @@ config X86_INTEL_USERCOPY

 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM

 config X86_USE_3DNOW
 	def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 615e18810f48..c0f8a5c88910 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -66,26 +66,6 @@ config DEBUG_STACKOVERFLOW
 	  This option will cause messages to be printed if free stack space
 	  drops below a certain limit.

-config DEBUG_STACK_USAGE
-	bool "Stack utilization instrumentation"
-	depends on DEBUG_KERNEL
-	---help---
-	  Enables the display of the minimum amount of free stack which each
-	  task has ever had available in the sysrq-T and sysrq-P debug output.
-
-	  This option will slow down process creation somewhat.
-
-config DEBUG_PER_CPU_MAPS
-	bool "Debug access to per_cpu maps"
-	depends on DEBUG_KERNEL
-	depends on SMP
-	---help---
-	  Say Y to verify that the per_cpu map being accessed has
-	  been setup. Adds a fair amount of code to kernel memory
-	  and decreases performance.
-
-	  Say N if unsure.
-
 config X86_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index f2ee1abb1df9..86cee7b749e1 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM)	+= $(call cc-option,-march=atom,$(call cc-option,-march=
 	$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))

 # AMD Elan support
-cflags-$(CONFIG_X86_ELAN)	+= -march=i486
+cflags-$(CONFIG_MELAN)	+= -march=i486

 # Geode GX1 support
 cflags-$(CONFIG_MGEODEGX1)	+= -march=pentium-mmx
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035e..db75d07c3645 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -91,7 +91,7 @@ static int detect_memory_e801(void)
 	if (oreg.ax > 15*1024) {
 		return -1;	/* Bogus! */
 	} else if (oreg.ax == 15*1024) {
-		boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+		boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
 	} else {
 		/*
 		 * This ignores memory above 16MB if we have a memory
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6f9872658dd2..2bf18059fbea 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=18
 CONFIG_CGROUPS=y
-CONFIG_CGROUP_NS=y
 CONFIG_CGROUP_FREEZER=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ee01a9d5d4f0..22a0dc8e51dd 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=18
 CONFIG_CGROUPS=y
-CONFIG_CGROUP_NS=y
 CONFIG_CGROUP_FREEZER=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf7..c04f1b7a9139 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
 # Arch-specific CryptoAPI modules.
 #

-obj-$(CONFIG_CRYPTO_FPU) += fpu.o
-
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o

-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o

 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 8fe2a4966b7a..be6d9e365a80 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1346,7 +1346,7 @@ _zero_cipher_left_decrypt:
 	and $15, %r13			# %r13 = arg4 (mod 16)
 	je _multiple_of_16_bytes_decrypt

-	# Handle the last <16 byte block seperately
+	# Handle the last <16 byte block separately

 	paddd ONE(%rip), %xmm0		# increment CNT to get Yn
 	movdqa SHUF_MASK(%rip), %xmm10
@@ -1355,7 +1355,7 @@ _zero_cipher_left_decrypt:
 	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
 	sub $16, %r11
 	add %r13, %r11
-	movdqu (%arg3,%r11,1), %xmm1	# recieve the last <16 byte block
+	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block

 	lea SHIFT_MASK+16(%rip), %r12
 	sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
@@ -1607,11 +1607,12 @@ _zero_cipher_left_encrypt:
 	and $15, %r13			# %r13 = arg4 (mod 16)
 	je _multiple_of_16_bytes_encrypt

-	# Handle the last <16 Byte block seperately
+	# Handle the last <16 Byte block separately

 	paddd ONE(%rip), %xmm0		# INCR CNT to get Yn
 	movdqa SHUF_MASK(%rip), %xmm10
 	PSHUFB_XMM %xmm10, %xmm0
+
 	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
 	sub $16, %r11
 	add %r13, %r11
@@ -1634,7 +1635,9 @@ _zero_cipher_left_encrypt:
 	# GHASH computation for the last <16 byte block
 	sub %r13, %r11
 	add $16, %r11
-	PSHUFB_XMM %xmm10, %xmm1
+
+	movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10, %xmm0

 	# shuffle xmm0 back to output as ciphertext
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e1e60c7d5813..feee8ff1d05e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -94,6 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx,
 			      u8 *out, const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+
+int crypto_fpu_init(void);
+void crypto_fpu_exit(void);
+
 #ifdef CONFIG_X86_64
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
@@ -828,9 +832,15 @@ static int rfc4106_init(struct crypto_tfm *tfm)
 	struct cryptd_aead *cryptd_tfm;
 	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
 		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	struct crypto_aead *cryptd_child;
+	struct aesni_rfc4106_gcm_ctx *child_ctx;
 	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
+
+	cryptd_child = cryptd_aead_child(cryptd_tfm);
+	child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child);
+	memcpy(child_ctx, ctx, sizeof(*ctx));
 	ctx->cryptd_tfm = cryptd_tfm;
 	tfm->crt_aead.reqsize = sizeof(struct aead_request)
 		+ crypto_aead_reqsize(&cryptd_tfm->base);
@@ -873,22 +883,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)

 	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
 	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
-	if (ret) {
-		crypto_free_ablkcipher(ctr_tfm);
-		return ret;
-	}
+	if (ret)
+		goto out_free_ablkcipher;

+	ret = -ENOMEM;
 	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
-	if (!req) {
-		crypto_free_ablkcipher(ctr_tfm);
-		return -EINVAL;
-	}
+	if (!req)
+		goto out_free_ablkcipher;

 	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
-	if (!req_data) {
-		crypto_free_ablkcipher(ctr_tfm);
-		return -ENOMEM;
-	}
+	if (!req_data)
+		goto out_free_request;
+
 	memset(req_data->iv, 0, sizeof(req_data->iv));

 	/* Clear the data in the hash sub key container to zero.*/
@@ -913,8 +919,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
 		if (!ret)
 			ret = req_data->result.err;
 	}
-	ablkcipher_request_free(req);
 	kfree(req_data);
+out_free_request:
+	ablkcipher_request_free(req);
+out_free_ablkcipher:
 	crypto_free_ablkcipher(ctr_tfm);
 	return ret;
 }
@@ -925,6 +933,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 	int ret = 0;
 	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+	struct aesni_rfc4106_gcm_ctx *child_ctx =
+				aesni_rfc4106_gcm_ctx_get(cryptd_child);
 	u8 *new_key_mem = NULL;

 	if (key_len < 4) {
@@ -968,6 +979,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 		goto exit;
 	}
 	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+	memcpy(child_ctx, ctx, sizeof(*ctx));
 exit:
 	kfree(new_key_mem);
 	return ret;
@@ -999,7 +1011,6 @@ static int rfc4106_encrypt(struct aead_request *req)
 	int ret;
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);

 	if (!irq_fpu_usable()) {
 		struct aead_request *cryptd_req =
@@ -1008,6 +1019,7 @@ static int rfc4106_encrypt(struct aead_request *req)
 		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
 		return crypto_aead_encrypt(cryptd_req);
 	} else {
+		struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
 		kernel_fpu_begin();
 		ret = cryptd_child->base.crt_aead.encrypt(req);
 		kernel_fpu_end();
@@ -1020,7 +1032,6 @@ static int rfc4106_decrypt(struct aead_request *req)
 	int ret;
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);

 	if (!irq_fpu_usable()) {
 		struct aead_request *cryptd_req =
@@ -1029,6 +1040,7 @@ static int rfc4106_decrypt(struct aead_request *req)
 		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
 		return crypto_aead_decrypt(cryptd_req);
 	} else {
+		struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
 		kernel_fpu_begin();
 		ret = cryptd_child->base.crt_aead.decrypt(req);
 		kernel_fpu_end();
@@ -1249,6 +1261,8 @@ static int __init aesni_init(void)
 		return -ENODEV;
 	}

+	if ((err = crypto_fpu_init()))
+		goto fpu_err;
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
 	if ((err = crypto_register_alg(&__aesni_alg)))
@@ -1326,6 +1340,7 @@ blk_ecb_err:
 __aes_err:
 	crypto_unregister_alg(&aesni_alg);
 aes_err:
+fpu_err:
 	return err;
 }
@@ -1355,6 +1370,8 @@ static void __exit aesni_exit(void)
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
 	crypto_unregister_alg(&aesni_alg);
+
+	crypto_fpu_exit();
 }

 module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c035..98d7a188f46b 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
 	.module = THIS_MODULE,
 };

-static int __init crypto_fpu_module_init(void)
+int __init crypto_fpu_init(void)
 {
 	return crypto_register_template(&crypto_fpu_tmpl);
 }

-static void __exit crypto_fpu_module_exit(void)
+void __exit crypto_fpu_exit(void)
 {
 	crypto_unregister_template(&crypto_fpu_tmpl);
 }
-
-module_init(crypto_fpu_module_init);
-module_exit(crypto_fpu_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2d93bdbc9ac0..fd843877e841 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	/* OK, This is the point of no return */
 	set_personality(PER_LINUX);
 	set_thread_flag(TIF_IA32);
+	current->mm->context.ia32_compat = 1;

 	setup_new_exec(bprm);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99c3394..c1870dddd322 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif

+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)

 	.macro IA32_ARG_FIXUP noebp=0
@@ -126,26 +128,20 @@ ENTRY(ia32_sysenter_target)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	movl	%ebp,%ebp		/* zero extension */
-	pushq	$__USER32_DS
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi $__USER32_DS
 	/*CFI_REL_OFFSET ss,0*/
-	pushq	%rbp
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rbp
 	CFI_REL_OFFSET rsp,0
-	pushfq
-	CFI_ADJUST_CFA_OFFSET 8
+	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
 	movl	8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
 	CFI_REGISTER rip,r10
-	pushq	$__USER32_CS
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
 	movl	%eax, %eax
-	pushq	%r10
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %r10
 	CFI_REL_OFFSET rip,0
-	pushq	%rax
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rax
 	cld
 	SAVE_ARGS 0,0,1
 	/* no need to do an access_ok check here because rbp has been
@@ -182,11 +178,9 @@ sysexit_from_sys_call:
 	xorq	%r9,%r9
 	xorq	%r10,%r10
 	xorq	%r11,%r11
-	popfq
-	CFI_ADJUST_CFA_OFFSET -8
+	popfq_cfi
 	/*CFI_RESTORE rflags*/
-	popq	%rcx			/* User %esp */
-	CFI_ADJUST_CFA_OFFSET -8
+	popq_cfi %rcx			/* User %esp */
 	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +415,7 @@ ENTRY(ia32_syscall)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	movl %eax,%eax
-	pushq %rax
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rax
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
@@ -851,4 +844,10 @@ ia32_sys_call_table:
 	.quad sys_fanotify_init
 	.quad sys32_fanotify_mark
 	.quad sys_prlimit64		/* 340 */
+	.quad sys_name_to_handle_at
+	.quad compat_sys_open_by_handle_at
+	.quad compat_sys_clock_adjtime
+	.quad sys_syncfs
+	.quad compat_sys_sendmmsg	/* 345 */
+	.quad sys_setns
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 52fd57f95c50..610001d385dd 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/mpspec.h>
+#include <asm/trampoline.h>

 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -113,11 +114,11 @@ static inline void acpi_disable_pci(void)
 	acpi_noirq_set();
 }

-/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern void acpi_restore_state_mem(void);
+/* Low-level suspend routine. */
+extern int acpi_suspend_lowlevel(void);

-extern unsigned long acpi_wakeup_address;
+extern const unsigned char acpi_wakeup_code[];
+#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))

 /* early initialization routine */
 extern void acpi_reserve_wakeup_memory(void);
@@ -182,19 +183,9 @@ static inline void disable_acpi(void) { }

 #define ARCH_HAS_POWER_INIT	1

-struct bootnode;
-
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
-extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
-				unsigned long end);
-extern int acpi_scan_nodes(unsigned long start, unsigned long end);
-#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-
-#ifdef CONFIG_NUMA_EMU
-extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
-				   int num_nodes);
-#endif
+extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */

 #define acpi_unlazy_tlb(x)	leave_mm(x)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cce..94d420b360d1 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
 	.endm
 #endif

+.macro altinstruction_entry orig alt feature orig_len alt_len
+	.align 8
+	.quad \orig
+	.quad \alt
+	.word \feature
+	.byte \orig_len
+	.byte \alt_len
+.endm
+
 #endif  /*  __ASSEMBLY__  */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13009d1af99a..bf535f947e8c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,7 +4,6 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/stringify.h>
-#include <linux/jump_label.h>
 #include <asm/asm.h>

 /*
@@ -191,12 +190,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
 extern void text_poke_smp_batch(struct text_poke_param *params, int n);

-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
-#define IDEAL_NOP_SIZE_5 5
-extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
-extern void arch_init_ideal_nop5(void);
-#else
-static inline void arch_init_ideal_nop5(void) {}
-#endif
-
 #endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index 916bc8111a01..55d95eb789b3 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -19,13 +19,12 @@
 #ifndef _ASM_X86_AMD_IOMMU_PROTO_H
 #define _ASM_X86_AMD_IOMMU_PROTO_H

-struct amd_iommu;
+#include <asm/amd_iommu_types.h>

 extern int amd_iommu_init_dma_ops(void);
 extern int amd_iommu_init_passthrough(void);
+extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
 extern void amd_iommu_apply_erratum_63(u16 devid);
 extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
 extern int amd_iommu_init_devices(void);
@@ -44,4 +43,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev)
 	       (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
 }

+static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
+{
+	if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
+		return false;
+
+	return !!(iommu->features & f);
+}
+
 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index e3509fc303bf..4c9982995414 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -68,12 +68,25 @@
 #define MMIO_CONTROL_OFFSET     0x0018
 #define MMIO_EXCL_BASE_OFFSET   0x0020
 #define MMIO_EXCL_LIMIT_OFFSET  0x0028
+#define MMIO_EXT_FEATURES	0x0030
 #define MMIO_CMD_HEAD_OFFSET	0x2000
 #define MMIO_CMD_TAIL_OFFSET	0x2008
 #define MMIO_EVT_HEAD_OFFSET	0x2010
 #define MMIO_EVT_TAIL_OFFSET	0x2018
 #define MMIO_STATUS_OFFSET	0x2020

+
+/* Extended Feature Bits */
+#define FEATURE_PREFETCH	(1ULL<<0)
+#define FEATURE_PPR		(1ULL<<1)
+#define FEATURE_X2APIC		(1ULL<<2)
+#define FEATURE_NX		(1ULL<<3)
+#define FEATURE_GT		(1ULL<<4)
+#define FEATURE_IA		(1ULL<<6)
+#define FEATURE_GA		(1ULL<<7)
+#define FEATURE_HE		(1ULL<<8)
+#define FEATURE_PC		(1ULL<<9)
+
 /* MMIO status bits */
 #define MMIO_STATUS_COM_WAIT_INT_MASK	0x04
@@ -113,7 +126,9 @@
 /* command specific defines */
 #define CMD_COMPL_WAIT          0x01
 #define CMD_INV_DEV_ENTRY       0x02
-#define CMD_INV_IOMMU_PAGES     0x03
+#define CMD_INV_IOMMU_PAGES	0x03
+#define CMD_INV_IOTLB_PAGES	0x04
+#define CMD_INV_ALL		0x08

 #define CMD_COMPL_WAIT_STORE_MASK	0x01
 #define CMD_COMPL_WAIT_INT_MASK		0x02
@@ -215,6 +230,8 @@
 #define IOMMU_PTE_IR (1ULL << 61)
 #define IOMMU_PTE_IW (1ULL << 62)

+#define DTE_FLAG_IOTLB	0x01
+
 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
 #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -227,6 +244,7 @@
 /* IOMMU capabilities */
 #define IOMMU_CAP_IOTLB   24
 #define IOMMU_CAP_NPCACHE 26
+#define IOMMU_CAP_EFR     27

 #define MAX_DOMAIN_ID 65536
@@ -249,6 +267,8 @@ extern bool amd_iommu_dump;

 /* global flag if IOMMUs cache non-present entries */
 extern bool amd_iommu_np_cache;
+/* Only true if all IOMMUs support device IOTLBs */
+extern bool amd_iommu_iotlb_sup;

 /*
  * Make iterating over all IOMMUs easier
@@ -371,6 +391,9 @@ struct amd_iommu {
 	/* flags read from acpi table */
 	u8 acpi_flags;

+	/* Extended features */
+	u64 features;
+
 	/*
 	 * Capability pointer. There could be more than one IOMMU per PCI
 	 * device function if there are more than one AMD IOMMU capability
@@ -409,9 +432,6 @@ struct amd_iommu {
 	/* if one, we need to send a completion wait command */
 	bool need_sync;

-	/* becomes true if a command buffer reset is running */
-	bool reset_in_progress;
-
 	/* default dma_ops domain for that IOMMU */
 	struct dma_ops_domain *default_dom;
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 64dc82ee19f0..67f87f257611 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -9,23 +9,19 @@ struct amd_nb_bus_dev_range {
 	u8 dev_limit;
 };

-extern struct pci_device_id amd_nb_misc_ids[];
+extern const struct pci_device_id amd_nb_misc_ids[];
 extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
-struct bootnode;

-extern int early_is_amd_nb(u32 value);
+extern bool early_is_amd_nb(u32 value);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
-extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int amd_scan_nodes(void);
-
-#ifdef CONFIG_NUMA_EMU
-extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
-extern void amd_get_nodes(struct bootnode *nodes);
-#endif
+extern int amd_numa_init(void);
+extern int amd_get_subcaches(int);
+extern int amd_set_subcaches(int, int);

 struct amd_northbridge {
 	struct pci_dev *misc;
+	struct pci_dev *link;
 };

 struct amd_northbridge_info {
@@ -35,17 +31,18 @@ struct amd_northbridge_info {
 };
 extern struct amd_northbridge_info amd_northbridges;

-#define AMD_NB_GART			0x1
-#define AMD_NB_L3_INDEX_DISABLE		0x2
+#define AMD_NB_GART			BIT(0)
+#define AMD_NB_L3_INDEX_DISABLE		BIT(1)
+#define AMD_NB_L3_PARTITIONING		BIT(2)

 #ifdef CONFIG_AMD_NB

-static inline int amd_nb_num(void)
+static inline u16 amd_nb_num(void)
 {
 	return amd_northbridges.num;
 }

-static inline int amd_nb_has_feature(int feature)
+static inline bool amd_nb_has_feature(unsigned feature)
 {
 	return ((amd_northbridges.flags & feature) == feature);
 }
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3c896946f4cc..4a0b7c7e2cce 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_APIC_H

 #include <linux/cpumask.h>
-#include <linux/delay.h>
 #include <linux/pm.h>

 #include <asm/alternative.h>
@@ -220,7 +219,6 @@ extern void enable_IR_x2apic(void);

 extern int get_physical_broadcast(void);

-extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC(void);
@@ -228,7 +226,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
 extern void disable_local_APIC(void);
 extern void lapic_shutdown(void);
 extern int verify_local_APIC(void);
-extern void cache_APIC_registers(void);
 extern void sync_Arb_IDs(void);
 extern void init_bsp_APIC(void);
 extern void setup_local_APIC(void);
@@ -239,8 +236,7 @@ void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
-extern void enable_NMI_through_LVT0(void);
-extern int apic_force_enable(void);
+extern int apic_force_enable(unsigned long addr);

 /*
  * On 32bit this is mach-xxx local
@@ -261,7 +257,6 @@ static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok		1
 static inline void init_apic_mappings(void) { }
 static inline void disable_local_APIC(void) { }
-static inline void apic_disable(void) { }
 # define setup_boot_APIC_clock x86_init_noop
 # define setup_secondary_APIC_clock x86_init_noop
 #endif /* !CONFIG_X86_LOCAL_APIC */
@@ -307,8 +302,6 @@ struct apic {

 	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
-	int (*apicid_to_node)(int logical_apicid);
-	int (*cpu_to_logical_apicid)(int cpu);
 	int (*cpu_present_to_apicid)(int mps_cpu);
 	void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
 	void (*setup_portio_remap)(void);
@@ -356,6 +349,28 @@ struct apic {
 	void (*icr_write)(u32 low, u32 high);
 	void (*wait_icr_idle)(void);
 	u32 (*safe_wait_icr_idle)(void);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Called very early during boot from get_smp_config().  It should
+	 * return the logical apicid.  x86_[bios]_cpu_to_apicid is
+	 * initialized before this function is called.
+	 *
+	 * If logical apicid can't be determined that early, the function
+	 * may return BAD_APICID.  Logical apicid will be configured after
+	 * init_apic_ldr() while bringing up CPUs.  Note that NUMA affinity
+	 * won't be applied properly during early boot in this case.
+	 */
+	int (*x86_32_early_logical_apicid)(int cpu);
+
+	/*
+	 * Optional method called from setup_local_APIC() after logical
+	 * apicid is guaranteed to be known to initialize apicid -> node
+	 * mapping if NUMA initialization hasn't done so already.  Don't
+	 * add new users.
+	 */
+	int (*x86_32_numa_cpu_node)(int cpu);
+#endif
 };

 /*
@@ -366,6 +381,26 @@ struct apic {
 extern struct apic *apic;

 /*
+ * APIC drivers are probed based on how they are listed in the .apicdrivers
+ * section. So the order is important and enforced by the ordering
+ * of different apic driver files in the Makefile.
+ *
+ * For the files having two apic drivers, we use apic_drivers()
+ * to enforce the order with in them.
+ */
+#define apic_driver(sym)					\
+	static struct apic *__apicdrivers_##sym __used		\
+	__aligned(sizeof(struct apic *))			\
+	__section(.apicdrivers) = { &sym }
+
+#define apic_drivers(sym1, sym2)					\
+	static struct apic *__apicdrivers_##sym1##sym2[2] __used	\
+	__aligned(sizeof(struct apic *))				\
+	__section(.apicdrivers) = { &sym1, &sym2 }
+
+extern struct apic *__apicdrivers[], *__apicdrivers_end[];
+
+/*
  * APIC functionality to boot other CPUs - only used on SMP:
  */
 #ifdef CONFIG_SMP
@@ -443,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x)
 #define DEFAULT_TRAMPOLINE_PHYS_HIGH		0x469

 #ifdef CONFIG_X86_64
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2apic_cluster;
-extern struct apic apic_x2apic_phys;
 extern int default_acpi_madt_oem_check(char *, char *);

 extern void apic_send_IPI_self(int vector);

-extern struct apic apic_x2apic_uv_x;
 DECLARE_PER_CPU(int, x2apic_extra_bits);

 extern int default_cpu_present_to_apicid(int mps_cpu);
@@ -465,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
 	return;
 }

-extern void generic_bigsmp_probe(void);
+extern struct apic *generic_bigsmp_probe(void);

 #ifdef CONFIG_X86_LOCAL_APIC
@@ -501,7 +531,10 @@ extern struct apic apic_noop;

 #ifdef CONFIG_X86_32

-extern struct apic apic_default;
+static inline int noop_x86_32_early_logical_apicid(int cpu)
+{
+	return BAD_APICID;
+}

 /*
  * Set up the logical destination ID.
@@ -522,8 +555,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
 	return cpuid_apic >> index_msb;
 }

-extern int default_apicid_to_node(int logical_apicid);
-
 #endif

 static inline unsigned int
@@ -558,12 +589,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
 	*retmap = *phys_map;
 }

-/* Mapping from cpu number to logical apicid */
-static inline int default_cpu_to_logical_apicid(int cpu)
-{
-	return 1 << cpu;
-}
-
 static inline int __default_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -596,8 +621,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);

 #endif /* CONFIG_X86_LOCAL_APIC */

-#ifdef CONFIG_X86_32
-extern u8 cpu_2_logical_apicid[NR_CPUS];
-#endif
-
 #endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 47a30ff8e517..34595d5e1038 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -78,6 +78,7 @@
 #define		APIC_DEST_LOGICAL	0x00800
 #define		APIC_DEST_PHYSICAL	0x00000
 #define		APIC_DM_FIXED		0x00000
+#define		APIC_DM_FIXED_MASK	0x00700
 #define		APIC_DM_LOWEST		0x00100
 #define		APIC_DM_SMI		0x00200
 #define		APIC_DM_REMRD		0x00300
@@ -426,4 +427,16 @@ struct local_apic {
 #else
  #define BAD_APICID 0xFFFFu
 #endif
+
+enum ioapic_irq_destination_types {
+	dest_Fixed		= 0,
+	dest_LowestPrio		= 1,
+	dest_SMI		= 2,
+	dest__reserved_1	= 3,
+	dest_NMI		= 4,
+	dest_INIT		= 5,
+	dest__reserved_2	= 6,
+	dest_ExtINT		= 7
+};
+
 #endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3f..aa6a3170ab5a 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -4,16 +4,40 @@
 #include <asm/io.h>

 /*
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E.
+ * Returns physical address of EBDA.  Returns 0 if there is no EBDA.
  */
 static inline unsigned int get_bios_ebda(void)
 {
+	/*
+	 * There is a real-mode segmented pointer pointing to the
+	 * 4K EBDA area at 0x40E.
+	 */
 	unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
 	address <<= 4;
 	return address;	/* 0 means none */
 }

+/*
+ * Return the sanitized length of the EBDA in bytes, if it exists.
+ */
+static inline unsigned int get_bios_ebda_length(void)
+{
+	unsigned int address;
+	unsigned int length;
+
+	address = get_bios_ebda();
+	if (!address)
+		return 0;
+
+	/* EBDA length is byte 0 of the EBDA (stored in KiB) */
+	length = *(unsigned char *)phys_to_virt(address);
+	length <<= 10;
+
+	/* Trim the length if it extends beyond 640KiB */
+	length = min_t(unsigned int, (640 * 1024) - address, length);
+	return length;
+}
+
 void reserve_ebda_region(void);

 #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 903683b07e42..69d58131bc8e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -456,14 +456,12 @@ static inline int fls(int x)

 #ifdef __KERNEL__

-#include <asm-generic/bitops/ext2-non-atomic.h>
+#include <asm-generic/bitops/le.h>

 #define ext2_set_bit_atomic(lock, nr, addr)			\
 	test_and_set_bit((nr), (unsigned long *)(addr))
 #define ext2_clear_bit_atomic(lock, nr, addr)			\
 	test_and_clear_bit((nr), (unsigned long *)(addr))

-#include <asm-generic/bitops/minix.h>
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index c8bfe63a06de..e020d88ec02d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
 /* setup data types */
 #define SETUP_NONE			0
 #define SETUP_E820_EXT			1
+#define SETUP_DTB			2

 /* extensible setup data list node */
 struct setup_data {
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 62f084478f7e..4e12668711e5 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -71,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
  * Read/Write    : ReadOnly, ReadWrite
  * Presence      : NotPresent
  *
- * Within a catagory, the attributes are mutually exclusive.
+ * Within a category, the attributes are mutually exclusive.
 *
 * The implementation of this API will take care of various aspects that
 * are associated with changing such attributes, such as:
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 220e2ea08e80..71cc3800712c 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,7 +125,7 @@
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
 #define X86_FEATURE_AVX		(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_F16C	(4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRND	(4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_RDRAND	(4*32+30) /* The RDRAND instruction */
 #define X86_FEATURE_HYPERVISOR	(4*32+31) /* Running on a hypervisor */

 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -160,6 +160,7 @@
 #define X86_FEATURE_NODEID_MSR	(6*32+19) /* NodeId MSR */
 #define X86_FEATURE_TBM		(6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT	(6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */

 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -194,6 +195,8 @@

 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_SMEP	(9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */

 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -206,8 +209,7 @@ extern const char * const x86_power_flags[32];
 #define test_cpu_cap(c, bit)						\
 	 test_bit(bit, (unsigned long *)((c)->x86_capability))

-#define cpu_has(c, bit)							\
-	(__builtin_constant_p(bit) &&					\
+#define REQUIRED_MASK_BIT_SET(bit)					\
 	 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||	\
 	   (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||	\
 	   (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||	\
@@ -217,10 +219,16 @@ extern const char * const x86_power_flags[32];
 	   (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||	\
 	   (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ||	\
 	   (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) ||	\
-	   (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )	\
-	  ? 1 :								\
+	   (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+
+#define cpu_has(c, bit)							\
+	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
 	 test_cpu_cap(c, bit))

+#define this_cpu_has(bit)						\
+	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
+	 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
 #define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)

 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -279,6 +287,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
+#define cpu_has_perfctr_core	boot_cpu_has(X86_FEATURE_PERFCTR_CORE)

 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b3070..7b439d9aea2a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,30 +4,33 @@
 #include <asm/desc_defs.h>
 #include <asm/ldt.h>
 #include <asm/mmu.h>
+
 #include <linux/smp.h>

-static inline void fill_ldt(struct desc_struct *desc,
-			    const struct user_desc *info)
-{
-	desc->limit0 = info->limit & 0x0ffff;
-	desc->base0 = info->base_addr & 0x0000ffff;
-
-	desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
-	desc->type = (info->read_exec_only ^ 1) << 1;
-	desc->type |= info->contents << 2;
-	desc->s = 1;
-	desc->dpl = 0x3;
-	desc->p = info->seg_not_present ^ 1;
-	desc->limit = (info->limit & 0xf0000) >> 16;
-	desc->avl = info->useable;
-	desc->d = info->seg_32bit;
-	desc->g = info->limit_in_pages;
-	desc->base2 = (info->base_addr & 0xff000000) >> 24;
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+	desc->limit0		= info->limit & 0x0ffff;
+
+	desc->base0		= (info->base_addr & 0x0000ffff);
+	desc->base1		= (info->base_addr & 0x00ff0000) >> 16;
+
+	desc->type		= (info->read_exec_only ^ 1) << 1;
+	desc->type	       |= info->contents << 2;
+
+	desc->s			= 1;
+	desc->dpl		= 0x3;
+	desc->p			= info->seg_not_present ^ 1;
+	desc->limit		= (info->limit & 0xf0000) >> 16;
+	desc->avl		= info->useable;
+	desc->d			= info->seg_32bit;
+	desc->g			= info->limit_in_pages;
+
+	desc->base2		= (info->base_addr & 0xff000000) >> 24;
 	/*
 	 * Don't allow setting of the lm bit. It is useless anyway
 	 * because 64bit system calls require __USER_CS:
 	 */
-	desc->l = 0;
+	desc->l			= 0;
 }

 extern struct desc_ptr idt_descr;
@@ -36,6 +39,7 @@ extern gate_desc idt_table[];
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
+
 DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
@@ -48,16 +52,16 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
-	gate->offset_low = PTR_LOW(func);
-	gate->segment = __KERNEL_CS;
-	gate->ist = ist;
-	gate->p = 1;
-	gate->dpl = dpl;
-	gate->zero0 = 0;
-	gate->zero1 = 0;
-	gate->type = type;
-	gate->offset_middle = PTR_MIDDLE(func);
-	gate->offset_high = PTR_HIGH(func);
+	gate->offset_low	= PTR_LOW(func);
+	gate->segment		= __KERNEL_CS;
+	gate->ist		= ist;
+	gate->p			= 1;
+	gate->dpl		= dpl;
+	gate->zero0		= 0;
+	gate->zero1		= 0;
+	gate->type		= type;
+	gate->offset_middle	= PTR_MIDDLE(func);
+	gate->offset_high	= PTR_HIGH(func);
 }

 #else
@@ -66,8 +70,7 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
 			     unsigned short seg)
 {
 	gate->a = (seg << 16) | (base & 0xffff);
-	gate->b = (base & 0xffff0000) |
-		  (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+	gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
 }

 #endif
@@ -75,31 +78,29 @@
 static inline int desc_empty(const void *ptr)
 {
 	const u32 *desc = ptr;
+
 	return !(desc[0] | desc[1]);
 }

 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, desc) \
-	native_write_ldt_entry(dt, entry, desc)
-#define write_gdt_entry(dt, entry, desc, type) \
-	native_write_gdt_entry(dt, entry, desc, type)
-#define write_idt_entry(dt, entry, g) \
-	native_write_idt_entry(dt, entry, g)
+#define load_TR_desc()				native_load_tr_desc()
+#define load_gdt(dtr)				native_load_gdt(dtr)
+#define load_idt(dtr)				native_load_idt(dtr)
+#define load_tr(tr)				asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt)				asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr)				native_store_gdt(dtr)
+#define store_idt(dtr)				native_store_idt(dtr)
+#define store_tr(tr)				(tr = native_store_tr())
+
+#define load_TLS(t, cpu)			native_load_tls(t, cpu)
+#define set_ldt					native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc)	native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type)	native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g)		native_write_idt_entry(dt, entry, g)

 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
@@ -112,33 +113,27 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)

 #define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))

-static inline void native_write_idt_entry(gate_desc *idt, int entry,
-					  const gate_desc *gate)
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
 {
 	memcpy(&idt[entry], gate, sizeof(*gate));
 }

-static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
-					  const void *desc)
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
 {
 	memcpy(&ldt[entry], desc, 8);
 }

-static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
-					  const void *desc, int type)
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
 {
 	unsigned int size;
+
 	switch (type) {
-	case DESC_TSS:
-		size = sizeof(tss_desc);
-		break;
-	case DESC_LDT:
-		size = sizeof(ldt_desc);
-		break;
-	default:
-		size = sizeof(struct desc_struct);
-		break;
+	case DESC_TSS:	size = sizeof(tss_desc);	break;
+	case DESC_LDT:	size = sizeof(ldt_desc);	break;
+	default:	size = sizeof(*gdt);		break;
 	}
+
 	memcpy(&gdt[entry], desc, size);
 }
@@ -154,20 +149,21 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
 }

-static inline void set_tssldt_descriptor(void *d, unsigned long addr,
-					 unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
 {
 #ifdef CONFIG_X86_64
 	struct ldttss_desc64 *desc = d;
+
 	memset(desc, 0, sizeof(*desc));
-	desc->limit0 = size & 0xFFFF;
-	desc->base0 = PTR_LOW(addr);
-	desc->base1 = PTR_MIDDLE(addr) & 0xFF;
-	desc->type = type;
-	desc->p = 1;
-	desc->limit1 = (size >> 16) & 0xF;
-	desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
-	desc->base3 = PTR_HIGH(addr);
+
+	desc->limit0		= size & 0xFFFF;
+	desc->base0		= PTR_LOW(addr);
+	desc->base1		= PTR_MIDDLE(addr) & 0xFF;
+	desc->type		= type;
+	desc->p			= 1;
+	desc->limit1		= (size >> 16) & 0xF;
+	desc->base2		= (PTR_MIDDLE(addr) >> 8) & 0xFF;
+	desc->base3		= PTR_HIGH(addr);
 #else
 	pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
 #endif
@@ -237,14 +233,16 @@ static inline void native_store_idt(struct desc_ptr *dtr)
 static inline unsigned long native_store_tr(void)
 {
 	unsigned long tr;
+
 	asm volatile("str %0":"=r" (tr));
+
 	return tr;
 }

 static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-	unsigned int i;
 	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	unsigned int i;

 	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
 		gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
@@ -313,6 +311,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
 	gate_desc s;
+
 	pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
 	/*
 	 * does not need to be atomic because it is only done once at
@@ -343,8 +342,9 @@ static inline void alloc_system_vector(int vector)
 		set_bit(vector, used_vectors);
 		if (first_system_vector > vector)
 			first_system_vector = vector;
-	} else
+	} else {
 		BUG();
+	}
 }

 static inline void alloc_intr_gate(unsigned int n, void *addr)
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..0bdb0c54d9a1 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -10,7 +10,6 @@
 #include <linux/spinlock.h>	/* And spinlocks */
 #include <asm/io.h>		/* need byte IO */
-#include <linux/delay.h>

 #ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
 #define dma_outb	outb_p
@@ -70,22 +69,18 @@

 #define MAX_DMA_CHANNELS	8

-#ifdef CONFIG_X86_32
-
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS      (PAGE_OFFSET + 0x1000000)
-
-#else
-
 /* 16MB ISA DMA zone */
 #define MAX_DMA_PFN   ((16 * 1024 * 1024) >> PAGE_SHIFT)

 /* 4GB broken PCI/AGP hardware bus master zone */
 #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

+#ifdef CONFIG_X86_32
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS      (PAGE_OFFSET + 0x1000000)
+#else
 /* Compat define for old dma zone */
 #define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
-
 #endif

 /* 8237 DMA controllers */
@@ -151,6 +146,7 @@

 #define DMA_AUTOINIT	0x10

+#ifdef CONFIG_ISA_DMA_API
 extern spinlock_t  dma_spin_lock;

 static inline unsigned long claim_dma_lock(void)
@@ -164,6 +160,7 @@ static inline void release_dma_lock(unsigned long flags)
 {
 	spin_unlock_irqrestore(&dma_spin_lock, flags);
 }
+#endif /* CONFIG_ISA_DMA_API */

 /* enable/disable a specific DMA channel */
 static inline void enable_dma(unsigned int dmanr)
@@ -303,9 +300,11 @@ static inline int get_dma_residue(unsigned int dmanr)
 }

-/* These are in kernel/dma.c: */
+/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
+#ifdef CONFIG_ISA_DMA_API
 extern int request_dma(unsigned int dmanr, const char *device_id);
 extern void free_dma(unsigned int dmanr);
+#endif

 /* From PCI */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index e99d55d74df5..908b96957d88 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -96,7 +96,7 @@ extern void e820_setup_gap(void);
 extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
 			unsigned long start_addr, unsigned long long end_addr);
 struct setup_data;
-extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
+extern void parse_e820_ext(struct setup_data *data);

 #if defined(CONFIG_X86_64) || \
 	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8e4a16508d4e..7093e4a6a0bc 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 #endif /* CONFIG_X86_32 */

 extern int add_efi_memmap;
+extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
 extern void efi_memblock_x86_reserve_range(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab4a5f5..1cd6d26a0a8d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
 BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)

-.irpc idx, "01234567"
+.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
 BUILD_INTERRUPT3(invalidate_interrupt\idx,
 		 (INVALIDATE_TLB_VECTOR_START)+\idx,
 		 smp_invalidate_interrupt)
+.endif
 .endr
 #endif
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e1..2c6fc9e62812 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
 	frame pointer later */
 #ifdef CONFIG_FRAME_POINTER
 	.macro FRAME
-	pushl %ebp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebp
 	CFI_REL_OFFSET ebp,0
 	movl %esp,%ebp
 	.endm
 	.macro ENDFRAME
-	popl %ebp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebp
 	CFI_RESTORE ebp
 	.endm
 #else
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index db24c2278be0..268c783ab1c0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -38,11 +38,10 @@ extern void mcount(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
 	/*
-	 * call mcount is "e8 <4 byte offset>"
-	 * The addr points to the 4 byte offset and the caller of this
-	 * function wants the pointer to e8. Simply subtract one.
+	 * addr is the address of the mcount call instruction.
+	 * recordmcount does the necessary offset calculation.
 	 */
-	return addr - 1;
+	return addr;
 }

 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e956..d09bb03653f0 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
 		       "+m" (*uaddr), "=&r" (tem)	\
 		     : "r" (oparg), "i" (-EFAULT), "1" (0))

-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;

-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;

 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	return ret;
 }

-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
-						int newval)
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+						u32 oldval, u32 newval)
 {
+	int ret = 0;

 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
 	/* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 		return -ENOSYS;
 #endif

-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;

-	asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
+	asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
 		     "2:\t.section .fixup, \"ax\"\n"
-		     "3:\tmov     %2, %0\n"
+		     "3:\tmov     %3, %0\n"
 		     "\tjmp     2b\n"
 		     "\t.previous\n"
 		     _ASM_EXTABLE(1b, 3b)
-		     : "=a" (oldval), "+m" (*uaddr)
-		     : "i" (-EFAULT), "r" (newval), "0" (oldval)
+		     : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+		     : "i" (-EFAULT), "r" (newval), "1" (oldval)
 		     : "memory"
 	);

-	return oldval;
+	*uval = oldval;
+	return ret;
 }

 #endif
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 43085bfc99c3..156cd5d18d2a 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
 	 * Don't enable translation but enable GART IO and CPU accesses.
 	 * Also, set DISTLBWALKPRB since GART tables memory is UC.
 	 */
-	ctl = DISTLBWALKPRB | order << 1;
+	ctl = order << 1;

 	pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 }
@@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
 {
 	u32 tmp, ctl;

-	/* address of the mappings table */
-	addr >>= 12;
-	tmp = (u32) addr<<4;
-	tmp &= ~0xf;
-	pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
-
-	/* Enable GART translation for this hammer. */
-	pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
-	ctl |= GARTEN;
-	ctl &= ~(DISGARTCPU | DISGARTIO);
-	pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+	/* address of the mappings table */
+	addr >>= 12;
+	tmp = (u32) addr<<4;
+	tmp &= ~0xf;
+	pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+	/* Enable GART translation for this hammer. */
+	pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+	ctl |= GARTEN | DISTLBWALKPRB;
+	ctl &= ~(DISGARTCPU | DISGARTIO);
+	pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 }

 static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5a7e62..bb9efe8706e2 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
 extern void invalidate_interrupt5(void);
 extern void invalidate_interrupt6(void);
 extern void invalidate_interrupt7(void);
+extern void invalidate_interrupt8(void);
+extern void invalidate_interrupt9(void);
+extern void invalidate_interrupt10(void);
+extern void invalidate_interrupt11(void);
+extern void invalidate_interrupt12(void);
+extern void invalidate_interrupt13(void);
+extern void invalidate_interrupt14(void);
+extern void invalidate_interrupt15(void);
+extern void invalidate_interrupt16(void);
+extern void invalidate_interrupt17(void);
+extern void invalidate_interrupt18(void);
+extern void invalidate_interrupt19(void);
+extern void invalidate_interrupt20(void);
+extern void invalidate_interrupt21(void);
+extern void invalidate_interrupt22(void);
+extern void invalidate_interrupt23(void);
+extern void invalidate_interrupt24(void);
+extern void invalidate_interrupt25(void);
+extern void invalidate_interrupt26(void);
+extern void invalidate_interrupt27(void);
+extern void invalidate_interrupt28(void);
+extern void invalidate_interrupt29(void);
+extern void invalidate_interrupt30(void);
+extern void invalidate_interrupt31(void);

 extern void irq_move_cleanup_interrupt(void);
 extern void reboot_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ef328901c802..c9e09ea05644 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -237,7 +237,7 @@ static inline void fpu_save_init(struct fpu *fpu)
 	} else if (use_fxsr()) {
 		fpu_fxsave(fpu);
 	} else {
-		asm volatile("fsave %[fx]; fwait"
+		asm volatile("fnsave %[fx]; fwait"
 			     : [fx] "=m" (fpu->state->fsave));
 		return;
 	}
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index fc1f579fb965..65aaa91d5850 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,6 +6,8 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42

+#define PIT_LATCH	LATCH
+
 extern raw_spinlock_t i8253_lock;

 extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..8dbe353e41e1 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
 			     unsigned long page_size_mask);

-extern unsigned long __initdata e820_table_start;
-extern unsigned long __meminitdata e820_table_end;
-extern unsigned long __meminitdata e820_table_top;
+extern unsigned long __initdata pgt_buf_start;
+extern unsigned long __meminitdata pgt_buf_end;
+extern unsigned long __meminitdata pgt_buf_top;

 #endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 072273082528..d02804d650c4 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -38,7 +38,6 @@

 #include <linux/string.h>
 #include <linux/compiler.h>
-#include <asm-generic/int-ll64.h>
 #include <asm/page.h>

 #include <xen/xen.h>
@@ -87,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
 build_mmio_write(writeq, "q", unsigned long, "r", :"memory")

-#else
-
-static inline __u64 readq(const volatile void __iomem *addr)
-{
-	const volatile u32 __iomem *p = addr;
-	u32 low, high;
-
-	low = readl(p);
-	high = readl(p + 1);
-
-	return low + ((u64)high << 32);
-}
-
-static inline void writeq(__u64 val, volatile void __iomem *addr)
-{
-	writel(val, addr);
-	writel(val >> 32, addr+4);
-}
-
-#endif
-
 #define readq_relaxed(a)	readq(a)

 #define __raw_readq(a)		readq(a)
@@ -117,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
 #define readq			readq
 #define writeq			writeq

+#endif
+
 /**
  *	virt_to_phys	-	map virtual addresses to physical
  *	@address: address to remap
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index f327d386d6cc..690d1cc9a877 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
 	} __attribute__ ((packed)) bits;
 };

-enum ioapic_irq_destination_types {
-	dest_Fixed = 0,
-	dest_LowestPrio = 1,
-	dest_SMI = 2,
-	dest__reserved_1 = 3,
-	dest_NMI = 4,
-	dest_INIT = 5,
-	dest__reserved_2 = 6,
-	dest_ExtINT = 7
-};
-
 struct IO_APIC_route_entry {
 	__u32	vector		:  8,
 		delivery_mode	:  3,	/* 000: FIXED
@@ -106,18 +95,22 @@ struct IR_IO_APIC_route_entry {
 		index		: 15;
 } __attribute__ ((packed));

+#define IOAPIC_AUTO		-1
+#define IOAPIC_EDGE		0
+#define IOAPIC_LEVEL		1
+
 #ifdef CONFIG_X86_IO_APIC

 /*
  * # of IO-APICs and # of IRQ routing registers
  */
 extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];

-#define MP_MAX_IOAPIC_PIN 127
+extern int mpc_ioapic_id(int ioapic);
+extern unsigned int mpc_ioapic_addr(int ioapic);
+extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);

-/* I/O APIC entries */
-extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
+#define MP_MAX_IOAPIC_PIN 127

 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
@@ -150,11 +143,6 @@ extern int timer_through_8259;
 #define io_apic_assign_pci_irqs \
 	(mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)

-extern u8 io_apic_unique_id(u8 id);
-extern int io_apic_get_unique_id(int ioapic, int apic_id);
-extern int io_apic_get_version(int ioapic);
-extern int io_apic_get_redir_entries(int ioapic);
-
 struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
@@ -162,11 +150,11 @@ void setup_IO_APIC_irq_extra(u32 gsi);
 extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);

-extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
-extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
-extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
+
+extern int save_ioapic_entries(void);
+extern void mask_ioapic_entries(void);
+extern int restore_ioapic_entries(void);

 extern int
get_nr_irqs_gsi(void); @@ -186,6 +174,8 @@ extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); +extern void disable_ioapic_support(void); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 @@ -199,6 +189,20 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; } struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } + +static inline int save_ioapic_entries(void) +{ + return -ENOMEM; +} + +static inline void mask_ioapic_entries(void) { } +static inline int restore_ioapic_entries(void) +{ + return -ENOMEM; +} + +static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void disable_ioapic_support(void) { } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 0b7228268a63..615fa9061b57 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); -extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector); -extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, - int vector); /* Avoid include hell */ #define NMI_VECTOR 0x02 @@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector) } #ifdef CONFIG_X86_32 +extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, + int vector); +extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, + int vector); extern void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); extern void default_send_IPI_allbutself(int vector); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index c704b38c57a2..ba870bb6dd8e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -10,9 +10,6 @@ #include <asm/apicdef.h> #include <asm/irq_vectors.h> -/* Even though we don't support this, supply it to appease OF */ -static inline void irq_dispose_mapping(unsigned int virq) { } - static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 9 : irq); diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h new file mode 100644 index 000000000000..423bbbddf36d --- /dev/null +++ b/arch/x86/include/asm/irq_controller.h @@ -0,0 +1,12 @@ +#ifndef __IRQ_CONTROLLER__ +#define __IRQ_CONTROLLER__ + +struct irq_domain { + int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type); + void *priv; + struct device_node *controller; + struct list_head l; +}; + +#endif diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6af0894dafb4..6e976ee3b3ef 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_IRQ_VECTORS_H #define _ASM_X86_IRQ_VECTORS_H +#include <linux/threads.h> /* * Linux IRQ vector layout. * @@ -16,8 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... 237 : device interrupts - * Vectors 238 ... 255 : special interrupts + * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts + * Vectors INVALIDATE_TLB_VECTOR_START ... 
255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * @@ -96,37 +97,43 @@ #define THRESHOLD_APIC_VECTOR 0xf9 #define REBOOT_VECTOR 0xf8 -/* f0-f7 used for spreading out TLB flushes: */ -#define INVALIDATE_TLB_VECTOR_END 0xf7 -#define INVALIDATE_TLB_VECTOR_START 0xf0 -#define NUM_INVALIDATE_TLB_VECTORS 8 - -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - /* * Generic system vector for platform specific use */ -#define X86_PLATFORM_IPI_VECTOR 0xed +#define X86_PLATFORM_IPI_VECTOR 0xf7 /* * IRQ work vector: */ -#define IRQ_WORK_VECTOR 0xec +#define IRQ_WORK_VECTOR 0xf6 -#define UV_BAU_MESSAGE 0xea +#define UV_BAU_MESSAGE 0xf5 /* * Self IPI vector for machine checks */ -#define MCE_SELF_VECTOR 0xeb +#define MCE_SELF_VECTOR 0xf4 /* Xen vector callback to receive events in a HVM domain */ -#define XEN_HVM_EVTCHN_CALLBACK 0xe9 +#define XEN_HVM_EVTCHN_CALLBACK 0xf3 + +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef + +/* up to 32 vectors used for spreading out TLB flushes: */ +#if NR_CPUS <= 32 +# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS) +#else +# define NUM_INVALIDATE_TLB_VECTORS (32) +#endif + +#define INVALIDATE_TLB_VECTOR_END (0xee) +#define INVALIDATE_TLB_VECTOR_START \ + (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1) #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 574dbc22893a..a32b18ce6ead 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -5,20 +5,25 @@ #include <linux/types.h> #include <asm/nops.h> +#include <asm/asm.h> #define JUMP_LABEL_NOP_SIZE 5 -# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" - -# define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:" \ - JUMP_LABEL_INITIAL_NOP \ - ".pushsection __jump_table, \"aw\" \n\t"\ - _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label); \ - } while (0) +#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:" + JUMP_LABEL_INITIAL_NOP + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" + _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index ca242d35e873..fe2cc6e105fa 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -13,7 +13,6 @@ enum die_val { DIE_PANIC, DIE_NMI, DIE_DIE, - DIE_NMIWATCHDOG, DIE_KERNELDEBUG, DIE_TRAP, DIE_GPF, @@ -27,7 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp); + unsigned long *sp, unsigned long bp); extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 396f5b5fc4d7..77e95f54570a 100644 --- 
a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void) } #define BREAK_INSTR_SIZE 1 #define CACHE_FLUSH_IS_SAFE 1 +#define GDB_ADJUSTS_BREAK_OFFSET extern int kgdb_ll_trap(int cmd, const char *str, struct pt_regs *regs, long err, int trap, int sig); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8e37deb1eb38..0049211959c0 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -14,6 +14,8 @@ #include <asm/desc_defs.h> struct x86_emulate_ctxt; +enum x86_intercept; +enum x86_intercept_stage; struct x86_exception { u8 vector; @@ -24,6 +26,24 @@ struct x86_exception { }; /* + * This struct is used to carry enough information from the instruction + * decoder to main KVM so that a decision can be made whether the + * instruction needs to be intercepted or not. + */ +struct x86_instruction_info { + u8 intercept; /* which intercept */ + u8 rep_prefix; /* rep prefix? */ + u8 modrm_mod; /* mod part of modrm */ + u8 modrm_reg; /* index of register used */ + u8 modrm_rm; /* rm part of modrm */ + u64 src_val; /* value of source operand */ + u8 src_bytes; /* size of source operand */ + u8 dst_bytes; /* size of destination operand */ + u8 ad_bytes; /* size of src/dst address */ + u64 next_rip; /* rip following the instruction */ +}; + +/* * x86_emulate_ops: * * These operations represent the instruction emulator's interface to memory. @@ -62,6 +82,7 @@ struct x86_exception { #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ +#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { /* @@ -71,8 +92,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*read_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, + unsigned int bytes, struct x86_exception *fault); /* @@ -82,8 +104,8 @@ struct x86_emulate_ops { * @val: [OUT] Value write to memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*write_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * fetch: Read bytes of standard (non-emulated/special) memory. @@ -92,8 +114,8 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*fetch)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*fetch)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* @@ -102,11 +124,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. 
*/ - int (*read_emulated)(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + int (*read_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, + struct x86_exception *fault); /* * write_emulated: Write bytes to emulated/special memory area. @@ -115,11 +135,10 @@ struct x86_emulate_ops { * required). * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_emulated)(unsigned long addr, - const void *val, + int (*write_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + struct x86_exception *fault); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -129,40 +148,54 @@ struct x86_emulate_ops { * @new: [IN ] Value to write to @addr. * @bytes: [IN ] Number of bytes to access using CMPXCHG. */ - int (*cmpxchg_emulated)(unsigned long addr, + int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); - - int (*pio_in_emulated)(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - int (*pio_out_emulated)(int size, unsigned short port, const void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - bool (*get_cached_descriptor)(struct desc_struct *desc, - int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, - int seg, struct kvm_vcpu *vcpu); - u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); - void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); - unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); - void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); - int (*cpl)(struct kvm_vcpu *vcpu); - int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); - int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + struct x86_exception *fault); + void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); + + int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count); + + int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, const void *val, + unsigned int count); + + bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, int seg); + void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, int seg); + unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, + int seg); + void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); + int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + int (*cpl)(struct x86_emulate_ctxt *ctxt); + int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong 
*dest); + int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); + int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + void (*halt)(struct x86_emulate_ctxt *ctxt); + void (*wbinvd)(struct x86_emulate_ctxt *ctxt); + int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); + void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ + void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ + int (*intercept)(struct x86_emulate_ctxt *ctxt, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); }; +typedef u32 __attribute__((vector_size(16))) sse128_t; + /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -174,11 +207,13 @@ struct operand { ulong ea; unsigned seg; } mem; + unsigned xmm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; + sse128_t vec_val; }; }; @@ -197,6 +232,7 @@ struct read_cache { struct decode_cache { u8 twobyte; u8 b; + u8 intercept; u8 lock_prefix; u8 rep_prefix; u8 op_bytes; @@ -209,6 +245,7 @@ struct decode_cache { u8 seg_override; unsigned int d; int (*execute)(struct x86_emulate_ctxt *ctxt); + int (*check_perm)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ @@ -227,18 +264,17 @@ struct x86_emulate_ctxt { struct x86_emulate_ops *ops; /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ + bool only_vendor_specific_insn; bool have_exception; struct x86_exception exception; @@ -248,8 +284,8 @@ struct x86_emulate_ctxt { }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 0xf3 +#define REPNE_PREFIX 0xf2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ @@ -258,6 +294,69 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. 
*/ +/* any protected mode */ +#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ + X86EMUL_MODE_PROT64) + +enum x86_intercept_stage { + X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ + X86_ICPT_PRE_EXCEPT, + X86_ICPT_POST_EXCEPT, + X86_ICPT_POST_MEMACCESS, +}; + +enum x86_intercept { + x86_intercept_none, + x86_intercept_cr_read, + x86_intercept_cr_write, + x86_intercept_clts, + x86_intercept_lmsw, + x86_intercept_smsw, + x86_intercept_dr_read, + x86_intercept_dr_write, + x86_intercept_lidt, + x86_intercept_sidt, + x86_intercept_lgdt, + x86_intercept_sgdt, + x86_intercept_lldt, + x86_intercept_sldt, + x86_intercept_ltr, + x86_intercept_str, + x86_intercept_rdtsc, + x86_intercept_rdpmc, + x86_intercept_pushf, + x86_intercept_popf, + x86_intercept_cpuid, + x86_intercept_rsm, + x86_intercept_iret, + x86_intercept_intn, + x86_intercept_invd, + x86_intercept_pause, + x86_intercept_hlt, + x86_intercept_invlpg, + x86_intercept_invlpga, + x86_intercept_vmrun, + x86_intercept_vmload, + x86_intercept_vmsave, + x86_intercept_vmmcall, + x86_intercept_stgi, + x86_intercept_clgi, + x86_intercept_skinit, + x86_intercept_rdtscp, + x86_intercept_icebp, + x86_intercept_wbinvd, + x86_intercept_monitor, + x86_intercept_mwait, + x86_intercept_rdmsr, + x86_intercept_wrmsr, + x86_intercept_in, + x86_intercept_ins, + x86_intercept_out, + x86_intercept_outs, + + nr_x86_intercepts +}; + /* Host execution mode. */ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 @@ -269,6 +368,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 +#define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ffd7f8d29187..d2ac8e2ee897 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -30,14 +30,30 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MMIO_SIZE 16 #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) + #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + + #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) @@ -85,7 +101,7 @@ #define ASYNC_PF_PER_VCPU 64 -extern spinlock_t kvm_lock; +extern raw_spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_vcpu; @@ -118,6 +134,9 @@ enum kvm_reg { enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, + VCPU_EXREG_RFLAGS, + VCPU_EXREG_CPL, + VCPU_EXREG_SEGMENTS, }; enum { @@ -255,6 +274,8 @@ struct kvm_mmu { int (*sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -335,16 +356,9 @@ struct kvm_vcpu_arch { u64 *last_pte_updated; gfn_t last_pte_gfn; - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - struct fpu guest_fpu; u64 xcr0; - gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; @@ -371,18 +385,22 @@ struct kvm_vcpu_arch { /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; + bool emulate_regs_need_sync_to_vcpu; + bool emulate_regs_need_sync_from_vcpu; gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; - u64 last_host_tsc; u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u32 virtual_tsc_khz; bool tsc_catchup; + u32 tsc_catchup_mult; + s8 tsc_catchup_shift; bool nmi_pending; bool nmi_injected; @@ -448,13 +466,10 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; - spinlock_t tsc_write_lock; + raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; - u32 virtual_tsc_khz; - u32 virtual_tsc_mult; - s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; @@ -506,6 +521,8 @@ struct kvm_vcpu_stat { u32 nmi_injections; }; +struct x86_instruction_info; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -590,9 +607,17 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + + int (*check_intercept)(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); + const struct trace_print_flags *exit_reasons_str; }; @@ -631,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +/* control of guest tsc rate supported? 
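The exports that follow (kvm_has_tsc_control, kvm_min_guest_tsc_khz, kvm_max_guest_tsc_khz), together with the new set_tsc_khz/compute_tsc_offset kvm_x86_ops above, are the kernel half of guest TSC-rate control; hardware support comes from TSC-ratio features such as the MSR_AMD64_TSC_RATIO MSR added in the msr-index.h hunk further down. From user space the feature surfaces through the KVM_CAP_TSC_CONTROL capability and the KVM_SET_TSC_KHZ vCPU ioctl. A hedged sketch, assuming a kernel and <linux/kvm.h> new enough to define both:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* returns 0 on success, negative if the guest TSC cannot be scaled */
static int set_guest_tsc_khz_model(int kvm_fd, int vcpu_fd, unsigned long khz)
{
        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL) <= 0)
                return -1;      /* no hardware TSC scaling available */
        return ioctl(vcpu_fd, KVM_SET_TSC_KHZ, khz);
}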
*/ +extern bool kvm_has_tsc_control; +/* minimum supported tsc_khz for guests */ +extern u32 kvm_min_guest_tsc_khz; +/* maximum supported tsc_khz for guests */ +extern u32 kvm_max_guest_tsc_khz; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ @@ -649,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); - void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); @@ -661,8 +690,6 @@ struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -725,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 12d55e773eb6..48142971b25d 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,11 +8,6 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) -/* - * For 32-bit UML - mark functions implemented in assembly that use - * regparm input parameters: - */ -#define asmregparm __attribute__((regparm(3))) /* * Make sure the compiler doesn't do anything stupid with the diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index eb16e94ae04f..021979a6e23f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif -extern void (*x86_mce_decode_callback)(struct mce *m); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 80a1dee5bea5..5f55e6962769 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -11,6 +11,12 @@ typedef struct { void *ldt; int size; + +#ifdef CONFIG_X86_64 + /* True if mm supports a task running in 32 bit compatibility mode. */ + unsigned short ia32_compat; +#endif + struct mutex lock; void *vdso; } mm_context_t; diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 91df7c51806c..5e83a416eca8 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -13,31 +13,11 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #include <asm/numaq.h> -/* summit or generic arch */ -#include <asm/srat.h> - -extern int get_memcfg_numa_flat(void); -/* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. 
- */ -static inline void get_memcfg_numa(void) -{ - - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ -#define get_memcfg_numa get_memcfg_numa_flat - static inline void resume_map_numa_kva(pgd_t *pgd) {} #endif /* CONFIG_NUMA */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 288b96f815a6..b3f88d7867c7 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -4,36 +4,13 @@ #ifndef _ASM_X86_MMZONE_64_H #define _ASM_X86_MMZONE_64_H - #ifdef CONFIG_NUMA #include <linux/mmdebug.h> - #include <asm/smp.h> -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - unsigned int mapsize; - s16 *map; - s16 embedded_map[64 - 8]; -} ____cacheline_aligned; /* total size = 128 bytes */ -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map -#define memnodemapsize memnode.mapsize - extern struct pglist_data *node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) -{ - unsigned nid; - VIRTUAL_BUG_ON(!memnodemap); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 67763c5d8b4e..9eae7752ae9b 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -35,7 +35,7 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN +#elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE #define MODULE_PROC_FAMILY "CRUSOE " diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 0c90dd9f0505..9c7d95f6174b 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -25,7 +25,6 @@ extern int pic_mode; #define MAX_IRQ_SOURCES 256 extern unsigned int def_to_bigsmp; -extern u8 apicid_2_node[]; #ifdef CONFIG_X86_NUMAQ extern int mp_bus_id_to_node[MAX_MP_BUSSES]; @@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES]; extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif -#define MAX_APICID 256 - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 43a18c77676d..485b4f1f079b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -43,6 +43,7 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 @@ -52,6 +53,9 @@ #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 @@ -92,11 +96,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_AMD64_MC0_MASK 0xc0010044 + #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 
4*(x)) +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) @@ -110,6 +118,7 @@ complete list. */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c76f5b92b840..4886a68f267e 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -7,7 +7,6 @@ #ifdef CONFIG_X86_LOCAL_APIC -extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); @@ -30,8 +29,8 @@ void arch_trigger_all_cpu_backtrace(void); * external nmis, because the local ones are more frequent. * * Also setup some default high/normal/low settings for - * subsystems to registers with. Using 4 bits to seperate - * the priorities. This can go alot higher if needed be. + * subsystems to registers with. Using 4 bits to separate + * the priorities. This can go a lot higher if needed be. */ #define NMI_LOCAL_SHIFT 16 /* randomly picked */ diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index 6d8723a766cc..405b4032a60b 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H -/* Define nops for use with alternative() */ +/* + * Define nops for use with alternative() and for tracing. + * + * *_NOP5_ATOMIC must be a single instruction. + */ + +#define NOP_DS_PREFIX 0x3e /* generic versions from gas 1: nop @@ -13,14 +19,15 @@ 6: leal 0x00000000(%esi),%esi 7: leal 0x00000000(,%esi,1),%esi */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 +#define GENERIC_NOP1 0x90 +#define GENERIC_NOP2 0x89,0xf6 +#define GENERIC_NOP3 0x8d,0x76,0x00 +#define GENERIC_NOP4 0x8d,0x74,0x26,0x00 +#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4 +#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7 +#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4 /* Opteron 64bit nops 1: nop @@ -29,16 +36,17 @@ 4: osp osp osp nop */ #define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 +#define K8_NOP2 0x66,K8_NOP1 +#define K8_NOP3 0x66,K8_NOP2 +#define K8_NOP4 0x66,K8_NOP3 +#define K8_NOP5 K8_NOP3,K8_NOP2 +#define K8_NOP6 K8_NOP3,K8_NOP3 +#define K8_NOP7 K8_NOP4,K8_NOP3 +#define K8_NOP8 K8_NOP4,K8_NOP4 +#define K8_NOP5_ATOMIC 0x66,K8_NOP4 /* K7 nops - uses eax dependencies (arbitary choice) + uses eax dependencies (arbitrary choice) 1: nop 2: movl %eax,%eax 3: leal (,%eax,1),%eax @@ -47,13 +55,14 @@ 7: leal 0x00000000(,%eax,1),%eax */ 
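The nops.h rework in this hunk switches every NOP definition from a ready-made ".byte ..." string to a bare comma-separated byte list, with a single _ASM_MK_NOP() helper (introduced near the end of the hunk) doing the stringification. The payoff is that the same byte lists can also be dropped straight into C arrays for the ideal_nops runtime-selection table declared below. A stand-alone illustration of the trick:

#include <stdio.h>

#define __stringify_1(x...) #x
#define __stringify(x...)   __stringify_1(x)
#define _ASM_MK_NOP(x)      ".byte " __stringify(x) "\n"

#define GENERIC_NOP1 0x90
#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4

static const unsigned char nop5[] = { GENERIC_NOP5 }; /* usable as data */

int main(void)
{
        /* prints: .byte 0x90,0x8d,0x74,0x26,0x00 */
        fputs(_ASM_MK_NOP(GENERIC_NOP5), stdout);
        return (int)sizeof(nop5);       /* 5 */
}

The two-level __stringify() is the usual kernel idiom: the extra indirection forces GENERIC_NOP5 to be macro-expanded before # turns it into a string.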
#define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 ".byte 0x8b,0xc0\n" -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" -#define K7_NOP5 K7_NOP4 ASM_NOP1 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 +#define K7_NOP2 0x8b,0xc0 +#define K7_NOP3 0x8d,0x04,0x20 +#define K7_NOP4 0x8d,0x44,0x20,0x00 +#define K7_NOP5 K7_NOP4,K7_NOP1 +#define K7_NOP6 0x8d,0x80,0,0,0,0 +#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0 +#define K7_NOP8 K7_NOP7,K7_NOP1 +#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4 /* P6 nops uses eax dependencies (Intel-recommended choice) @@ -69,52 +78,65 @@ There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 -#define P6_NOP2 ".byte 0x66,0x90\n" -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" +#define P6_NOP2 0x66,0x90 +#define P6_NOP3 0x0f,0x1f,0x00 +#define P6_NOP4 0x0f,0x1f,0x40,0 +#define P6_NOP5 0x0f,0x1f,0x44,0x00,0 +#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0 +#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0 +#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0 +#define P6_NOP5_ATOMIC P6_NOP5 + +#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" #if defined(CONFIG_MK7) -#define ASM_NOP1 K7_NOP1 -#define ASM_NOP2 K7_NOP2 -#define ASM_NOP3 K7_NOP3 -#define ASM_NOP4 K7_NOP4 -#define ASM_NOP5 K7_NOP5 -#define ASM_NOP6 K7_NOP6 -#define ASM_NOP7 K7_NOP7 -#define ASM_NOP8 K7_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC) #elif defined(CONFIG_X86_P6_NOP) -#define ASM_NOP1 P6_NOP1 -#define ASM_NOP2 P6_NOP2 -#define ASM_NOP3 P6_NOP3 -#define ASM_NOP4 P6_NOP4 -#define ASM_NOP5 P6_NOP5 -#define ASM_NOP6 P6_NOP6 -#define ASM_NOP7 P6_NOP7 -#define ASM_NOP8 P6_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC) #elif defined(CONFIG_X86_64) -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC) #else -#define ASM_NOP1 GENERIC_NOP1 -#define ASM_NOP2 GENERIC_NOP2 -#define ASM_NOP3 GENERIC_NOP3 -#define ASM_NOP4 GENERIC_NOP4 -#define ASM_NOP5 GENERIC_NOP5 -#define ASM_NOP6 GENERIC_NOP6 -#define ASM_NOP7 GENERIC_NOP7 -#define ASM_NOP8 GENERIC_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1) +#define 
ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC) #endif #define ASM_NOP_MAX 8 +#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */ + +#ifndef __ASSEMBLY__ +extern const unsigned char * const *ideal_nops; +extern void arch_init_ideal_nops(void); +#endif #endif /* _ASM_X86_NOPS_H */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 27da400d3138..bfacd2ccf651 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,5 +1,85 @@ +#ifndef _ASM_X86_NUMA_H +#define _ASM_X86_NUMA_H + +#include <linux/nodemask.h> + +#include <asm/topology.h> +#include <asm/apicdef.h> + +#ifdef CONFIG_NUMA + +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) + +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + +extern int numa_off; + +/* + * __apicid_to_node[] stores the raw mapping between physical apicid and + * node and is used to initialize cpu_to_node mapping. + * + * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus + * should be accessed by the accessors - set_apicid_to_node() and + * numa_cpu_node(). + */ +extern s16 __apicid_to_node[MAX_LOCAL_APIC]; +extern nodemask_t numa_nodes_parsed __initdata; + +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); + +static inline void set_apicid_to_node(int apicid, s16 node) +{ + __apicid_to_node[apicid] = node; +} + +extern int __cpuinit numa_cpu_node(int cpu); + +#else /* CONFIG_NUMA */ +static inline void set_apicid_to_node(int apicid, s16 node) +{ +} + +static inline int numa_cpu_node(int cpu) +{ + return NUMA_NO_NODE; +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_X86_32 # include "numa_32.h" #else # include "numa_64.h" #endif + +#ifdef CONFIG_NUMA +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +extern void __init init_cpu_to_node(void); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); +#else /* CONFIG_NUMA */ +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } +static inline void init_cpu_to_node(void) { } +static inline void numa_add_cpu(int cpu) { } +static inline void numa_remove_cpu(int cpu) { } +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +void debug_cpumask_set_cpu(int cpu, int node, bool enable); +#endif + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +void numa_emu_cmdline(char *); +#endif /* CONFIG_NUMA_EMU */ + +#endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index b0ef2b449a9d..e7d6b8254742 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -1,11 +1,6 @@ #ifndef _ASM_X86_NUMA_32_H #define _ASM_X86_NUMA_32_H -extern int numa_off; - -extern int pxm_to_nid(int pxm); -extern void numa_remove_cpu(int cpu); 
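The asm/numa.h consolidation above moves the apicid-to-node table out of the 64-bit-only header and behind accessors, which is what lets the hunks that follow strip the duplicated declarations from numa_32.h and numa_64.h. A minimal stand-alone model of the bookkeeping (the _model names, MAX_LOCAL_APIC value, and NUMA_NO_NODE stand-in are illustrative, not the kernel's definitions):

#include <stdint.h>

#define MAX_LOCAL_APIC_MODEL 256
#define NUMA_NO_NODE_MODEL   (-1)

static int16_t apicid_to_node_model[MAX_LOCAL_APIC_MODEL];

static void numa_init_model(void)
{
        for (int i = 0; i < MAX_LOCAL_APIC_MODEL; i++)
                apicid_to_node_model[i] = NUMA_NO_NODE_MODEL;
}

static void set_apicid_to_node_model(int apicid, int16_t node)
{
        apicid_to_node_model[apicid] = node;
}

/* what numa_cpu_node() reduces to once the cpu's apicid is looked up */
static int numa_cpu_node_model(int apicid)
{
        if (apicid < 0 || apicid >= MAX_LOCAL_APIC_MODEL)
                return NUMA_NO_NODE_MODEL;
        return apicid_to_node_model[apicid];
}

Keeping the raw array private to the accessors is the point of the change: 32-bit apic drivers can override the mapping via apic->numa_cpu_node() without every caller poking the table directly.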
- #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 0493be39607c..0c05f7ae46e8 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,53 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#include <linux/nodemask.h> -#include <asm/apicdef.h> - -struct bootnode { - u64 start; - u64 end; -}; - -extern int compute_hash_shift(struct bootnode *nodes, int numblks, - int *nodeids); - -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) - -extern void numa_init_array(void); -extern int numa_off; - -extern s16 apicid_to_node[MAX_LOCAL_APIC]; - extern unsigned long numa_free_all_bootmem(void); -extern void setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end); - -#ifdef CONFIG_NUMA -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - -extern void __init init_cpu_to_node(void); -extern void __cpuinit numa_set_node(int cpu, int node); -extern void __cpuinit numa_clear_node(int cpu); -extern void __cpuinit numa_add_cpu(int cpu); -extern void __cpuinit numa_remove_cpu(int cpu); - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); -#endif /* CONFIG_NUMA_EMU */ -#else -static inline void init_cpu_to_node(void) { } -static inline void numa_set_node(int cpu, int node) { } -static inline void numa_clear_node(int cpu) { } -static inline void numa_add_cpu(int cpu, int node) { } -static inline void numa_remove_cpu(int cpu) { } -#endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 37c516545ec8..c3b3c322fd87 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -29,7 +29,7 @@ #ifdef CONFIG_X86_NUMAQ extern int found_numaq; -extern int get_memcfg_numaq(void); +extern int numaq_numa_init(void); extern int pci_numaq_init(void); extern void *xquad_portio; @@ -166,11 +166,6 @@ struct sys_cfg_data { void numaq_tsc_disable(void); -#else -static inline int get_memcfg_numaq(void) -{ - return 0; -} #endif /* CONFIG_X86_NUMAQ */ #endif /* _ASM_X86_NUMAQ_H */ diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index f482010350fb..5ca6801b75f3 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info; /* * OLPC board IDs contain the major build number within the mask 0x0ff0, - * and the minor build number withing 0x000f. Pre-builds have a minor + * and the minor build number within 0x000f. Pre-builds have a minor * number less than 8, and normal builds start at 8. For example, 0x0B10 * is a PreB1, and 0x0C18 is a C1. 
*/ diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 641988efe063..24487712e0b1 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -6,7 +6,7 @@ #define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ -#ifdef CONFIG_OLPC_OPENFIRMWARE +#ifdef CONFIG_OLPC extern bool olpc_ofw_is_installed(void); @@ -26,19 +26,12 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); -#else /* !CONFIG_OLPC_OPENFIRMWARE */ +extern void olpc_dt_build_devicetree(void); -static inline bool olpc_ofw_is_installed(void) { return false; } +#else /* !CONFIG_OLPC */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } -static inline bool olpc_ofw_present(void) { return false; } - -#endif /* !CONFIG_OLPC_OPENFIRMWARE */ - -#ifdef CONFIG_OLPC_OPENFIRMWARE_DT -extern void olpc_dt_build_devicetree(void); -#else static inline void olpc_dt_build_devicetree(void) { } -#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ +#endif /* !CONFIG_OLPC */ #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 1df66211fd1b..bce688d54c12 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -2,6 +2,7 @@ #define _ASM_X86_PAGE_DEFS_H #include <linux/const.h> +#include <linux/types.h> /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +static inline phys_addr_t get_max_mapped(void) +{ + return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; +} + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8); +extern void initmem_init(void); extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 676129229630..d498943b906c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -135,8 +135,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev); #include "pci_64.h" #endif -void dma32_reserve_bootmem(void); - /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include <asm-generic/pci-dma-compat.h> diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 7e172955ee57..a0a9779084d1 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -45,7 +45,7 @@ #include <linux/stringify.h> #ifdef CONFIG_SMP -#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x +#define __percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset percpu_read(this_cpu_off) /* @@ -62,9 +62,11 @@ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) #else -#define __percpu_arg(x) "%P" #x +#define __percpu_prefix "" #endif +#define __percpu_arg(x) __percpu_prefix "%P" #x + /* * Initialized pointers to per-cpu variables needed for the boot * processor need to use these macros to get the proper address @@ -451,6 +453,26 @@ do { \ #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #endif /* !CONFIG_M386 */ +#ifdef CONFIG_X86_CMPXCHG64 +#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy = n2; 
\ + asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ + : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ + : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#endif /* CONFIG_X86_CMPXCHG64 */ + /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -480,6 +502,39 @@ do { \ #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +/* + * Pretty complex macro to generate cmpxchg16 instruction. The instruction + * is not supported on early AMD64 processors so we must be able to emulate + * it in software. The address used in the cmpxchg16 instruction must be + * aligned to a 16 byte boundary. + */ +#ifdef CONFIG_SMP +#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3 +#else +#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2 +#endif +#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy; \ + alternative_io(CMPXCHG16B_EMU_CALL, \ + "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ + X86_FEATURE_CX16, \ + ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ + "S" (&pcp1), "b"(__n1), "c"(__n2), \ + "a"(__o1), "d"(__o2) : "memory"); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) + #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ @@ -492,6 +547,33 @@ do { \ old__; \ }) +static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, + const unsigned long __percpu *addr) +{ + unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + + return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +} + +static inline int x86_this_cpu_variable_test_bit(int nr, + const unsigned long __percpu *addr) +{ + int oldbit; + + asm volatile("bt "__percpu_arg(2)",%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#define x86_this_cpu_test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? x86_this_cpu_constant_test_bit((nr), (addr)) \ + : x86_this_cpu_variable_test_bit((nr), (addr))) + + #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). 
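The x86_this_cpu_test_bit() helper added in the percpu.h hunk above dispatches on __builtin_constant_p: a compile-time-constant bit number folds to a plain shift-and-mask, while a variable one uses the bt instruction so the word/bit split happens in hardware. The same pattern, modelled here on ordinary memory rather than the gs-relative percpu segment (names are illustrative):

#include <limits.h>

#define BPL_MODEL (sizeof(unsigned long) * CHAR_BIT)

static inline int test_bit_const_model(unsigned int nr,
                                       const unsigned long *addr)
{
        /* nr constant: index and mask are folded at compile time */
        return (addr[nr / BPL_MODEL] >> (nr % BPL_MODEL)) & 1;
}

static inline int test_bit_var_model(int nr, const unsigned long *addr)
{
        int oldbit;

        /* bt copies the selected bit into CF; sbb turns CF into 0 or -1 */
        asm volatile("btl %2,%1\n\t"
                     "sbb %0,%0"
                     : "=r" (oldbit)
                     : "m" (*addr), "r" (nr));
        return oldbit;  /* nonzero iff the bit was set */
}

#define test_bit_model(nr, addr)                        \
        (__builtin_constant_p(nr)                       \
         ? test_bit_const_model((nr), (addr))           \
         : test_bit_var_model((nr), (addr)))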
*/ diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index cc29086e30cd..56fd9e3abbda 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -1,5 +1,5 @@ /* - * Netburst Perfomance Events (P4, old Xeon) + * Netburst Performance Events (P4, old Xeon) */ #ifndef PERF_EVENT_P4_H @@ -9,7 +9,7 @@ #include <linux/bitops.h> /* - * NetBurst has perfomance MSRs shared between + * NetBurst has performance MSRs shared between * threads if HT is turned on, ie for both logical * processors (mem: in turn in Atom with HT support * perf-MSRs are not shared and every thread has its diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 94b979d1b58d..effff47a3c82 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd) static inline void pud_clear(pud_t *pudp) { - unsigned long pgd; - set_pud(pudp, __pud(0)); /* @@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp) * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... * - * Make sure the pud entry we're updating is within the - * current pgd to avoid unnecessary TLB flushes. + * Currently all places where pud_clear() is called either have + * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or + * pud_clear_bad()), so we don't need TLB flush here. */ - pgd = read_cr3(); - if (__pa(pudp) >= pgd && __pa(pudp) < - (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) - write_cr3(pgd); } #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7db7723d1f32..d56187c6b838 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. 
*/ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h new file mode 100644 index 000000000000..4950a0b1d09c --- /dev/null +++ b/arch/x86/include/asm/probe_roms.h @@ -0,0 +1,8 @@ +#ifndef _PROBE_ROMS_H_ +#define _PROBE_ROMS_H_ +struct pci_dev; + +extern void __iomem *pci_map_biosrom(struct pci_dev *pdev); +extern void pci_unmap_biosrom(void __iomem *rom); +extern size_t pci_biosrom_size(struct pci_dev *pdev); +#endif diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 7a3e836eb2a9..59ab4dffa377 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -7,7 +7,7 @@ */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ #define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ #define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ @@ -60,6 +60,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ +#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ /* * x86-64 Task Priority Register, CR8 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b9c03fb3369a..219371546afd 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -94,10 +94,6 @@ struct cpuinfo_x86 { int x86_cache_alignment; /* In bytes */ int x86_power; unsigned long loops_per_jiffy; -#ifdef CONFIG_SMP - /* cpus sharing the last level cache: */ - cpumask_var_t llc_shared_map; -#endif /* cpuid returned max cores value: */ u16 x86_max_cores; u16 apicid; diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b4ec95f07518..971e0b46446e 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -1 +1,69 @@ -/* dummy prom.h; here to make linux/of.h's #includes happy */ +/* + * Definitions for Device tree / OpenFirmware handling on X86 + * + * based on arch/powerpc/include/asm/prom.h which is + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_X86_PROM_H +#define _ASM_X86_PROM_H +#ifndef __ASSEMBLY__ + +#include <linux/of.h> +#include <linux/types.h> +#include <linux/pci.h> + +#include <asm/irq.h> +#include <asm/atomic.h> +#include <asm/setup.h> +#include <asm/irq_controller.h> + +#ifdef CONFIG_OF +extern int of_ioapic; +extern u64 initial_dtb; +extern void add_dtb(u64 data); +extern void x86_add_irq_domains(void); +void __cpuinit x86_of_pci_init(void); +void x86_dtb_init(void); + +static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev) +{ + return pdev ? 
pdev->dev.of_node : NULL; +} + +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + return pci_device_to_OF_node(bus->self); +} + +#else +static inline void add_dtb(u64 data) { } +static inline void x86_add_irq_domains(void) { } +static inline void x86_of_pci_init(void) { } +static inline void x86_dtb_init(void) { } +#define of_ioapic 0 +#endif + +extern char cmd_line[COMMAND_LINE_SIZE]; + +#define pci_address_to_pio pci_address_to_pio +unsigned long pci_address_to_pio(phys_addr_t addr); + +/** + * irq_dispose_mapping - Unmap an interrupt + * @virq: linux virq number of the interrupt to unmap + * + * FIXME: We really should implement proper virq handling like power, + * but that's going to be major surgery. + */ +static inline void irq_dispose_mapping(unsigned int virq) { } + +#define HAVE_ARCH_DEVTREE_FIXUPS + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h index 52b098a6eebb..7b0a55a88851 100644 --- a/arch/x86/include/asm/ptrace-abi.h +++ b/arch/x86/include/asm/ptrace-abi.h @@ -31,7 +31,7 @@ #define R12 24 #define RBP 32 #define RBX 40 -/* arguments: interrupts/non tracing syscalls only save upto here*/ +/* arguments: interrupts/non tracing syscalls only save up to here*/ #define R11 48 #define R10 56 #define R9 64 diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 78cd1ea94500..94e7618fcac8 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -73,7 +73,7 @@ struct pt_regs { unsigned long r12; unsigned long rbp; unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save upto here*/ +/* arguments: non interrupts/non tracing syscalls only save up to here*/ unsigned long r11; unsigned long r10; unsigned long r9; @@ -103,7 +103,7 @@ struct pt_regs { unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save upto here*/ +/* arguments: non interrupts/non tracing syscalls only save up to here*/ unsigned long r11; unsigned long r10; unsigned long r9; @@ -136,6 +136,7 @@ struct cpuinfo_x86; struct task_struct; extern unsigned long profile_pc(struct pt_regs *regs); +#define profile_pc profile_pc extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); @@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) #endif } -static inline unsigned long instruction_pointer(struct pt_regs *regs) -{ - return regs->ip; -} - -static inline unsigned long frame_pointer(struct pt_regs *regs) -{ - return regs->bp; -} +#define GET_IP(regs) ((regs)->ip) +#define GET_FP(regs) ((regs)->bp) +#define GET_USP(regs) ((regs)->sp) -static inline unsigned long user_stack_pointer(struct pt_regs *regs) -{ - return regs->sp; -} +#include <asm-generic/ptrace.h> /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 562d4fd31ba8..3250e3d605d9 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,7 +18,10 @@ extern struct machine_ops machine_ops; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); -void machine_real_restart(const unsigned char *code, int length); +void machine_real_restart(unsigned int type); +/* These must match dispatch_table in reboot_32.S */ +#define MRR_BIOS 0 +#define MRR_APM 1 typedef void 
(*nmi_shootdown_cb)(int, struct die_args*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index d1e41b0f9b60..df4cd32b4cc6 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -37,26 +37,9 @@ #endif #ifdef __KERNEL__ - -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/lockdep.h> #include <asm/asm.h> -struct rwsem_waiter; - -extern asmregparm struct rw_semaphore * - rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore * - rwsem_downgrade_wake(struct rw_semaphore *sem); - /* - * the semaphore definition - * * The bias values and the counter type limits the number of * potential readers/writers to 32767 for 32 bits and 2147483647 * for 64 bits. @@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -typedef signed long rwsem_count_t; - -struct rw_semaphore { - rwsem_count_t count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - - -#define __RWSEM_INITIALIZER(name) \ -{ \ - RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \ -} - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - /* * lock for reading */ @@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - rwsem_count_t result, tmp; + long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning down_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* adds 0xffff0001, returns the old value */ @@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - rwsem_count_t ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ @@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning __up_write\n\t" 
LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 0xffff0001, returns the old value */ @@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(rwsem_count_t delta, - struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) { asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) @@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta, /* * implement exchange and add functionality */ -static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, - struct rw_semaphore *sem) +static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) { - rwsem_count_t tmp = delta; + long tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) @@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, return tmp + delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 231f1c1d6607..cd84f7208f76 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -1,14 +1,16 @@ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H +#include <linux/const.h> + /* Constructor for a conventional segment GDT (or LDT) entry */ /* This is a macro so it can be used in initializers */ #define GDT_ENTRY(flags, base, limit) \ - ((((base) & 0xff000000ULL) << (56-24)) | \ - (((flags) & 0x0000f0ffULL) << 40) | \ - (((limit) & 0x000f0000ULL) << (48-16)) | \ - (((base) & 0x00ffffffULL) << 16) | \ - (((limit) & 0x0000ffffULL))) + ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ + (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ + (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ + (((base) & _AC(0x00ffffff,ULL)) << 16) | \ + (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19a08a2..9756551ec760 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align); * executable.) 
*/ #define RESERVE_BRK(name,sz) \ - static void __section(.discard.text) __used \ + static void __section(.discard.text) __used notrace \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ @@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align); type *name; \ RESERVE_BRK(name, sizeof(type) * entries) +extern void probe_roms(void); #ifdef __i386__ void __init i386_start_kernel(void); -extern void probe_roms(void); #else void __init x86_64_start_kernel(char *real_mode); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1f4695136776..73b11bc0ae6f 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -17,12 +17,24 @@ #endif #include <asm/thread_info.h> #include <asm/cpumask.h> +#include <asm/cpufeature.h> extern int smp_num_siblings; extern unsigned int num_processors; +static inline bool cpu_has_ht_siblings(void) +{ + bool has_siblings = false; +#ifdef CONFIG_SMP + has_siblings = cpu_has_ht && smp_num_siblings > 1; +#endif + return has_siblings; +} + DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +/* cpus sharing the last level cache: */ +DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU(u16, cpu_llc_id); DECLARE_PER_CPU(int, cpu_number); @@ -36,8 +48,16 @@ static inline struct cpumask *cpu_core_mask(int cpu) return per_cpu(cpu_core_map, cpu); } +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +#endif /* Static state in head.S used to set up a CPU */ extern unsigned long stack_start; /* Initial stack pointer address */ diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h deleted file mode 100644 index b508d639d1a7..000000000000 --- a/arch/x86/include/asm/srat.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
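[Not part of the patch: a hypothetical consumer of the cpu_llc_shared_mask() accessor introduced in the smp.h hunk above, now that the mask lives in a plain per-cpu variable rather than in struct cpuinfo_x86:]

#include <linux/cpumask.h>
#include <asm/smp.h>

static int llc_sibling_count(int cpu)   /* hypothetical */
{
        int sibling, n = 0;

        for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
                n++;            /* CPUs sharing cpu's last level cache */
        return n;
}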
- * - * Send feedback to Pat Gaughen <gone@us.ibm.com> - */ - -#ifndef _ASM_X86_SRAT_H -#define _ASM_X86_SRAT_H - -#ifdef CONFIG_ACPI_NUMA -extern int get_memcfg_from_srat(void); -#else -static inline int get_memcfg_from_srat(void) -{ - return 0; -} -#endif - -#endif /* _ASM_X86_SRAT_H */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 52b5c7ed3608..70bbe39043a9 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*warning)(void *data, char *msg); - /* msg must contain %s for the symbol */ - void (*warning_symbol)(void *data, char *msg, unsigned long symbol); void (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); @@ -47,7 +44,7 @@ struct stacktrace_ops { }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data); #ifdef CONFIG_X86_32 @@ -86,11 +83,11 @@ stack_frame(struct task_struct *task, struct pt_regs *regs) extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl); + unsigned long *stack, unsigned long bp, char *log_lvl); extern void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); + unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index fd921c3a6841..487055c8c1aa 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -9,8 +9,6 @@ #include <asm/desc.h> #include <asm/i387.h> -static inline int arch_prepare_suspend(void) { return 0; } - /* image of the saved processor state */ struct saved_context { u16 es, fs, gs, ss; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 8d942afae681..09b0bf104156 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -9,11 +9,6 @@ #include <asm/desc.h> #include <asm/i387.h> -static inline int arch_prepare_suspend(void) -{ - return 0; -} - /* * Image of the saved processor state, used by the low level ACPI suspend to * RAM code and by the low level hibernation code. 
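[Not part of the patch: with the ->warning/->warning_symbol callbacks gone and dump_trace() taking bp explicitly, a minimal stacktrace_ops user would now look roughly like this. Names are hypothetical; print_context_stack and stack_frame() are the helpers already declared in this header.]

#include <linux/kernel.h>
#include <linux/sched.h>
#include <asm/stacktrace.h>

static int my_stack(void *data, char *name)     /* hypothetical */
{
        return 0;               /* non-negative: keep dumping */
}

static void my_address(void *data, unsigned long addr, int reliable)
{
        printk("%s[<%016lx>] %pS\n", reliable ? "" : "? ",
               addr, (void *)addr);
}

static const struct stacktrace_ops my_ops = {
        .stack          = my_stack,
        .address        = my_address,
        .walk_stack     = print_context_stack,
};

static void show_my_trace(void)
{
        /* bp is now an explicit argument rather than re-derived inside */
        dump_trace(current, NULL, NULL, stack_frame(current, NULL),
                   &my_ops, NULL);
}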
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 33ecc3ea8782..c2ff2a1d845e 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -98,8 +98,6 @@ do { \ */ #define HAVE_DISABLE_HLT #else -#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" -#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" /* frame pointer must be last for get_wchan */ #define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" @@ -305,24 +303,81 @@ static inline void native_wbinvd(void) #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else -#define read_cr0() (native_read_cr0()) -#define write_cr0(x) (native_write_cr0(x)) -#define read_cr2() (native_read_cr2()) -#define write_cr2(x) (native_write_cr2(x)) -#define read_cr3() (native_read_cr3()) -#define write_cr3(x) (native_write_cr3(x)) -#define read_cr4() (native_read_cr4()) -#define read_cr4_safe() (native_read_cr4_safe()) -#define write_cr4(x) (native_write_cr4(x)) -#define wbinvd() (native_wbinvd()) + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + #ifdef CONFIG_X86_64 -#define read_cr8() (native_read_cr8()) -#define write_cr8(x) (native_write_cr8(x)) -#define load_gs_index native_load_gs_index + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + #endif /* Clear the 'TS' bit */ -#define clts() (native_clts()) +static inline void clts(void) +{ + native_clts(); +} #endif/* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a0..1f2e61e28981 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -161,8 +161,14 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) +#define alloc_thread_info_node(tsk, node) \ +({ \ + struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ + THREAD_ORDER); \ + struct thread_info *ret = page ? 
page_address(page) : NULL; \ + \ + ret; \ +}) #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21899cc31e52..c00692476e9f 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -47,21 +47,6 @@ #include <asm/mpspec.h> -#ifdef CONFIG_X86_32 - -/* Mappings between logical cpu number and node number */ -extern int cpu_to_node_map[]; - -/* Returns the number of the node containing CPU 'cpu' */ -static inline int __cpu_to_node(int cpu) -{ - return cpu_to_node_map[cpu]; -} -#define early_cpu_to_node __cpu_to_node -#define cpu_to_node __cpu_to_node - -#else /* CONFIG_X86_64 */ - /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_X86_64 */ - /* Mappings between node number and cpus on that node. */ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; @@ -110,19 +93,11 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) #ifdef CONFIG_X86_32 -extern unsigned long node_start_pfn[]; -extern unsigned long node_end_pfn[]; -extern unsigned long node_remap_size[]; -#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) - # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 - #else - # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 - #endif /* sched_domains SD_NODE_INIT for NUMA machines */ @@ -155,7 +130,7 @@ extern unsigned long node_remap_size[]; .balance_interval = 1, \ } -#ifdef CONFIG_X86_64_ACPI_NUMA +#ifdef CONFIG_X86_64 extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #endif diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index f4500fb3b485..feca3118a73b 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,25 +3,36 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_X86_TRAMPOLINE +#include <linux/types.h> +#include <asm/io.h> + /* - * Trampoline 80x86 program as an array. + * Trampoline 80x86 program as an array. These are in the init rodata + * segment, but that's okay, because we only care about the relative + * addresses of the symbols. 
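[Not part of the patch: the trampoline.h rework above replaces setup_trampoline() with symbol rebasing. A hypothetical bring-up fragment using the new helpers; the poll loop and its exit condition are illustrative, not the real smpboot code.]

#include <asm/trampoline.h>
#include <asm/processor.h>

static void sketch_wakeup(void)         /* hypothetical */
{
        /* physical start address to put into the INIT/SIPI sequence */
        unsigned long start_ip = trampoline_address();

        /* same symbol, rebased onto the low-memory copy the AP runs */
        volatile u32 *status = TRAMPOLINE_SYM(trampoline_status);

        while (*status == 0)            /* wait for the AP to check in */
                cpu_relax();
        (void)start_ip;
}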
*/ -extern const unsigned char trampoline_data []; -extern const unsigned char trampoline_end []; -extern unsigned char *trampoline_base; +extern const unsigned char x86_trampoline_start []; +extern const unsigned char x86_trampoline_end []; +extern unsigned char *x86_trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; extern unsigned long initial_gs; -#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) +extern void __init setup_trampolines(void); + +extern const unsigned char trampoline_data[]; +extern const unsigned char trampoline_status[]; + +#define TRAMPOLINE_SYM(x) \ + ((void *)(x86_trampoline_base + \ + ((const unsigned char *)(x) - x86_trampoline_start))) -extern unsigned long setup_trampoline(void); -extern void __init reserve_trampoline_memory(void); -#else -static inline void reserve_trampoline_memory(void) {} -#endif /* CONFIG_X86_TRAMPOLINE */ +/* Address of the SMP trampoline */ +static inline unsigned long trampoline_address(void) +{ + return virt_to_phys(TRAMPOLINE_SYM(trampoline_data)); +} #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 1ca132fc0d03..9db5583b6d38 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void) static __always_inline cycles_t vget_cycles(void) { /* - * We only do VDSOs on TSC capable CPUs, so this shouldnt + * We only do VDSOs on TSC capable CPUs, so this shouldn't * access boot_cpu_data (which is not VDSO-safe): */ #ifndef CONFIG_X86_TSC @@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +#ifdef CONFIG_X86_64 +extern cycles_t vread_tsc(void); +#endif + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index df1da20f4534..8e8c23fef08c 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,22 +1,6 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#define dma_addr_t dma_addr_t - #include <asm-generic/types.h> -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -typedef u64 dma64_addr_t; -#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) -/* DMA addresses come in 32-bit and 64-bit flavours. */ -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif - -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index abd3e0ea762a..99ddd148a760 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -6,7 +6,6 @@ #include <linux/errno.h> #include <linux/compiler.h> #include <linux/thread_info.h> -#include <linux/prefetch.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> @@ -42,7 +41,7 @@ * Returns 0 if the range is valid, nonzero otherwise. * * This is equivalent to the following test: - * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) + * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) * * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... 
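[Not part of the patch's code, but worth spelling out: the comment fix above turns ">=" into ">" because a range ending exactly at the segment limit is still valid. In carry-safe 64-bit form (illustrative helper, not the kernel's):]

#include <linux/types.h>

static inline bool range_bad(u32 addr, u32 size, u32 seg) /* hypothetical */
{
        return (u64)addr + size > seg;  /* the "u33" test, done in u64 */
}

/* With seg = 0xC0000000 (i386 TASK_SIZE), addr = 0xBFFFF000 and
 * size = 0x1000: addr + size == seg exactly, the last byte touched is
 * 0xBFFFFFFF, and the access is fine -- hence ">" rather than ">=". */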
*/ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 088d09fb1615..566e803cc602 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -6,7 +6,6 @@ */ #include <linux/errno.h> #include <linux/thread_info.h> -#include <linux/prefetch.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 316708d5af92..1c66d30971ad 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -6,7 +6,6 @@ */ #include <linux/compiler.h> #include <linux/errno.h> -#include <linux/prefetch.h> #include <linux/lockdep.h> #include <asm/alternative.h> #include <asm/cpufeature.h> diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0e..593485b38ab3 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -346,10 +346,16 @@ #define __NR_fanotify_init 338 #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_name_to_handle_at 341 +#define __NR_open_by_handle_at 342 +#define __NR_clock_adjtime 343 +#define __NR_syncfs 344 +#define __NR_sendmmsg 345 +#define __NR_setns 346 #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 347 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715b..705bf139288c 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -669,6 +669,18 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init) __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_name_to_handle_at 303 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 304 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#define __NR_clock_adjtime 305 +__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) +#define __NR_syncfs 306 +__SYSCALL(__NR_syncfs, sys_syncfs) +#define __NR_sendmmsg 307 +__SYSCALL(__NR_sendmmsg, sys_sendmmsg) +#define __NR_setns 308 +__SYSCALL(__NR_setns, sys_setns) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 3e094af443c3..a291c40efd43 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -5,7 +5,7 @@ * * SGI UV Broadcast Assist Unit definitions * - * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_BAU_H @@ -35,17 +35,20 @@ #define MAX_CPUS_PER_UVHUB 64 #define MAX_CPUS_PER_SOCKET 32 -#define UV_ADP_SIZE 64 /* hardware-provided max. */ -#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */ -#define UV_ITEMS_PER_DESCRIPTOR 8 +#define ADP_SZ 64 /* hardware-provided max. */ +#define UV_CPUS_PER_AS 32 /* hardware-provided max. */ +#define ITEMS_PER_DESC 8 /* the 'throttle' to prevent the hardware stay-busy bug */ #define MAX_BAU_CONCURRENT 3 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 -#define UV_NET_ENDPOINT_INTD 0x38 -#define UV_DESC_BASE_PNODE_SHIFT 49 +#define UV1_NET_ENDPOINT_INTD 0x38 +#define UV2_NET_ENDPOINT_INTD 0x28 +#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? 
\ + UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD) +#define UV_DESC_PSHIFT 49 #define UV_PAYLOADQ_PNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" #define UV_BAU_BASENAME "sgi_uv/bau_tunables" @@ -53,29 +56,64 @@ #define UV_BAU_TUNABLES_FILE "bau_tunables" #define WHITESPACE " \t\n" #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) -#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL +#define cpubit_isset(cpu, bau_local_cpumask) \ + test_bit((cpu), (bau_local_cpumask).bits) + /* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ -#define BAU_MISC_CONTROL_MULT_MASK 3 +/* + * UV2: Bit 19 selects between + * (0): 10 microsecond timebase and + * (1): 80 microseconds + * we're using 655us, similar to UV1: 65 units of 10us + */ +#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) +#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL) + +#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \ + UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ + UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) -#define UVH_AGING_PRESCALE_SEL 0x000000b000UL +#define BAU_MISC_CONTROL_MULT_MASK 3 + +#define UVH_AGING_PRESCALE_SEL 0x000000b000UL /* [30:28] URGENCY_7 an index into a table of times */ -#define BAU_URGENCY_7_SHIFT 28 -#define BAU_URGENCY_7_MASK 7 +#define BAU_URGENCY_7_SHIFT 28 +#define BAU_URGENCY_7_MASK 7 -#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL +#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL /* [45:40] BAU - BAU transaction timeout select - a multiplier */ -#define BAU_TRANS_SHIFT 40 -#define BAU_TRANS_MASK 0x3f +#define BAU_TRANS_SHIFT 40 +#define BAU_TRANS_MASK 0x3f + +/* + * shorten some awkward names + */ +#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT +#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT +#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT +#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD +#define write_gmmr uv_write_global_mmr64 +#define write_lmmr uv_write_local_mmr +#define read_lmmr uv_read_local_mmr +#define read_gmmr uv_read_global_mmr64 /* * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */ -#define DESC_STATUS_IDLE 0 -#define DESC_STATUS_ACTIVE 1 -#define DESC_STATUS_DESTINATION_TIMEOUT 2 -#define DESC_STATUS_SOURCE_TIMEOUT 3 +#define DS_IDLE 0 +#define DS_ACTIVE 1 +#define DS_DESTINATION_TIMEOUT 2 +#define DS_SOURCE_TIMEOUT 3 +/* + * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2 + * values 1 and 5 will not occur + */ +#define UV2H_DESC_IDLE 0 +#define UV2H_DESC_DEST_TIMEOUT 2 +#define UV2H_DESC_DEST_STRONG_NACK 3 +#define UV2H_DESC_BUSY 4 +#define UV2H_DESC_SOURCE_TIMEOUT 6 +#define UV2H_DESC_DEST_PUT_ERR 7 /* * delay for 'plugged' timeout retries, in microseconds @@ -86,13 +124,24 @@ * threshholds at which to use IPI to free resources */ /* after this # consecutive 'plugged' timeouts, use IPI to release resources */ -#define PLUGSB4RESET 100 +#define PLUGSB4RESET 100 /* after this many consecutive timeouts, use IPI to release resources */ -#define TIMEOUTSB4RESET 1 +#define TIMEOUTSB4RESET 1 /* at this number uses of IPI to release resources, giveup the request */ -#define IPI_RESET_LIMIT 1 +#define IPI_RESET_LIMIT 1 /* after this # consecutive successes, bump up the throttle if it was lowered */ -#define COMPLETE_THRESHOLD 5 +#define COMPLETE_THRESHOLD 5 + +#define UV_LB_SUBNODEID 0x10 + +/* these two are the same for UV1 and UV2: */ +#define 
UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT +#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK +/* 4 bits of software ack period */ +#define UV2_ACK_MASK 0x7UL +#define UV2_ACK_UNITS_SHFT 3 +#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT +#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT /* * number of entries in the destination side payload queue @@ -113,9 +162,16 @@ /* * tuning the action when the numalink network is extremely delayed */ -#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */ -#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ -#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */ +#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in + microseconds */ +#define CONGESTED_REPS 10 /* long delays averaged over + this many broadcasts */ +#define CONGESTED_PERIOD 30 /* time for the bau to be + disabled, in seconds */ +/* see msg_type: */ +#define MSG_NOOP 0 +#define MSG_REGULAR 1 +#define MSG_RETRY 2 /* * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) @@ -124,11 +180,11 @@ * The distribution specification (32 bytes) is interpreted as a 256-bit * distribution vector. Adjacent bits correspond to consecutive even numbered * nodeIDs. The result of adding the index of a given bit to the 15-bit - * 'base_dest_nodeid' field of the header corresponds to the + * 'base_dest_nasid' field of the header corresponds to the * destination nodeID associated with that specified bit. */ -struct bau_target_uvhubmask { - unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; +struct bau_targ_hubmask { + unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; }; /* @@ -137,7 +193,7 @@ struct bau_target_uvhubmask { * enough bits for max. cpu's per uvhub) */ struct bau_local_cpumask { - unsigned long bits; + unsigned long bits; }; /* @@ -158,14 +214,14 @@ struct bau_local_cpumask { * The payload is software-defined for INTD transactions */ struct bau_msg_payload { - unsigned long address; /* signifies a page or all TLB's - of the cpu */ + unsigned long address; /* signifies a page or all + TLB's of the cpu */ /* 64 bits */ - unsigned short sending_cpu; /* filled in by sender */ + unsigned short sending_cpu; /* filled in by sender */ /* 16 bits */ - unsigned short acknowledge_count;/* filled in by destination */ + unsigned short acknowledge_count; /* filled in by destination */ /* 16 bits */ - unsigned int reserved1:32; /* not usable */ + unsigned int reserved1:32; /* not usable */ }; @@ -174,93 +230,96 @@ struct bau_msg_payload { * see table 4.2.3.0.1 in broacast_assist spec. 
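[Not part of the patch: a hypothetical helper showing how the shortened names above (UV_SA_SHFT/UV_SA_MASK, plus the read/write_mmr_misc_control() wrappers defined further down in this header) compose when programming the soft-ack timeout field:]

static void set_softack_period(int pnode, unsigned long period)
{
        unsigned long mmr = read_mmr_misc_control(pnode);

        mmr &= ~UV_SA_MASK;             /* the mask is pre-shifted */
        mmr |= (period << UV_SA_SHFT) & UV_SA_MASK;
        write_mmr_misc_control(pnode, mmr);
}
/* e.g.: set_softack_period(pnode, SOFTACK_TIMEOUT_PERIOD); */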
*/ struct bau_msg_header { - unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ + unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid of the */ - /* bits 20:6 */ /* first bit in uvhub map */ - unsigned int command:8; /* message type */ + unsigned int base_dest_nasid:15; /* nasid of the first bit */ + /* bits 20:6 */ /* in uvhub map */ + unsigned int command:8; /* message type */ /* bits 28:21 */ - /* 0x38: SN3net EndPoint Message */ - unsigned int rsvd_1:3; /* must be zero */ + /* 0x38: SN3net EndPoint Message */ + unsigned int rsvd_1:3; /* must be zero */ /* bits 31:29 */ - /* int will align on 32 bits */ - unsigned int rsvd_2:9; /* must be zero */ + /* int will align on 32 bits */ + unsigned int rsvd_2:9; /* must be zero */ /* bits 40:32 */ - /* Suppl_A is 56-41 */ - unsigned int sequence:16;/* message sequence number */ - /* bits 56:41 */ /* becomes bytes 16-17 of msg */ - /* Address field (96:57) is never used as an - address (these are address bits 42:3) */ - - unsigned int rsvd_3:1; /* must be zero */ + /* Suppl_A is 56-41 */ + unsigned int sequence:16; /* message sequence number */ + /* bits 56:41 */ /* becomes bytes 16-17 of msg */ + /* Address field (96:57) is + never used as an address + (these are address bits + 42:3) */ + + unsigned int rsvd_3:1; /* must be zero */ /* bit 57 */ - /* address bits 27:4 are payload */ + /* address bits 27:4 are payload */ /* these next 24 (58-81) bits become bytes 12-14 of msg */ - /* bits 65:58 land in byte 12 */ - unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ + unsigned int replied_to:1; /* sent as 0 by the source to + byte 12 */ /* bit 58 */ - unsigned int msg_type:3; /* software type of the message*/ + unsigned int msg_type:3; /* software type of the + message */ /* bits 61:59 */ - unsigned int canceled:1; /* message canceled, resource to be freed*/ + unsigned int canceled:1; /* message canceled, resource + is to be freed*/ /* bit 62 */ - unsigned int payload_1a:1;/* not currently used */ + unsigned int payload_1a:1; /* not currently used */ /* bit 63 */ - unsigned int payload_1b:2;/* not currently used */ + unsigned int payload_1b:2; /* not currently used */ /* bits 65:64 */ /* bits 73:66 land in byte 13 */ - unsigned int payload_1ca:6;/* not currently used */ + unsigned int payload_1ca:6; /* not currently used */ /* bits 71:66 */ - unsigned int payload_1c:2;/* not currently used */ + unsigned int payload_1c:2; /* not currently used */ /* bits 73:72 */ /* bits 81:74 land in byte 14 */ - unsigned int payload_1d:6;/* not currently used */ + unsigned int payload_1d:6; /* not currently used */ /* bits 79:74 */ - unsigned int payload_1e:2;/* not currently used */ + unsigned int payload_1e:2; /* not currently used */ /* bits 81:80 */ - unsigned int rsvd_4:7; /* must be zero */ + unsigned int rsvd_4:7; /* must be zero */ /* bits 88:82 */ - unsigned int sw_ack_flag:1;/* software acknowledge flag */ + unsigned int swack_flag:1; /* software acknowledge flag */ /* bit 89 */ - /* INTD trasactions at destination are to - wait for software acknowledge */ - unsigned int rsvd_5:6; /* must be zero */ + /* INTD trasactions at + destination are to wait for + software acknowledge */ + unsigned int rsvd_5:6; /* must be zero */ /* bits 95:90 */ - unsigned int rsvd_6:5; /* must be zero */ + unsigned int rsvd_6:5; /* must be zero */ /* bits 100:96 */ - unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */ + unsigned int int_both:1; /* if 1, 
interrupt both sockets + on the uvhub */ /* bit 101*/ - unsigned int fairness:3;/* usually zero */ + unsigned int fairness:3; /* usually zero */ /* bits 104:102 */ - unsigned int multilevel:1; /* multi-level multicast format */ + unsigned int multilevel:1; /* multi-level multicast + format */ /* bit 105 */ - /* 0 for TLB: endpoint multi-unicast messages */ - unsigned int chaining:1;/* next descriptor is part of this activation*/ + /* 0 for TLB: endpoint multi-unicast messages */ + unsigned int chaining:1; /* next descriptor is part of + this activation*/ /* bit 106 */ - unsigned int rsvd_7:21; /* must be zero */ + unsigned int rsvd_7:21; /* must be zero */ /* bits 127:107 */ }; -/* see msg_type: */ -#define MSG_NOOP 0 -#define MSG_REGULAR 1 -#define MSG_RETRY 2 - /* * The activation descriptor: * The format of the message to send, plus all accompanying control * Should be 64 bytes */ struct bau_desc { - struct bau_target_uvhubmask distribution; + struct bau_targ_hubmask distribution; /* * message template, consisting of header and payload: */ - struct bau_msg_header header; - struct bau_msg_payload payload; + struct bau_msg_header header; + struct bau_msg_payload payload; }; /* * -payload-- ---------header------ @@ -279,59 +338,51 @@ struct bau_desc { * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from - * sw_ack_vector and payload_2) + * swack_vec and payload_2) * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload * operation." */ -struct bau_payload_queue_entry { - unsigned long address; /* signifies a page or all TLB's - of the cpu */ +struct bau_pq_entry { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ /* 64 bits, bytes 0-7 */ - - unsigned short sending_cpu; /* cpu that sent the message */ + unsigned short sending_cpu; /* cpu that sent the message */ /* 16 bits, bytes 8-9 */ - - unsigned short acknowledge_count; /* filled in by destination */ + unsigned short acknowledge_count; /* filled in by destination */ /* 16 bits, bytes 10-11 */ - /* these next 3 bytes come from bits 58-81 of the message header */ - unsigned short replied_to:1; /* sent as 0 by the source */ - unsigned short msg_type:3; /* software message type */ - unsigned short canceled:1; /* sent as 0 by the source */ - unsigned short unused1:3; /* not currently using */ + unsigned short replied_to:1; /* sent as 0 by the source */ + unsigned short msg_type:3; /* software message type */ + unsigned short canceled:1; /* sent as 0 by the source */ + unsigned short unused1:3; /* not currently using */ /* byte 12 */ - - unsigned char unused2a; /* not currently using */ + unsigned char unused2a; /* not currently using */ /* byte 13 */ - unsigned char unused2; /* not currently using */ + unsigned char unused2; /* not currently using */ /* byte 14 */ - - unsigned char sw_ack_vector; /* filled in by the hardware */ + unsigned char swack_vec; /* filled in by the hardware */ /* byte 15 (bits 127:120) */ - - unsigned short sequence; /* message sequence number */ + unsigned short sequence; /* message sequence number */ /* bytes 16-17 */ - unsigned char unused4[2]; /* not currently using bytes 18-19 */ + unsigned char unused4[2]; /* not currently using bytes 18-19 */ /* bytes 18-19 */ - - int number_of_cpus; /* filled in at destination */ + int number_of_cpus; /* 
filled in at destination */ /* 32 bits, bytes 20-23 (aligned) */ - - unsigned char unused5[8]; /* not using */ + unsigned char unused5[8]; /* not using */ /* bytes 24-31 */ }; struct msg_desc { - struct bau_payload_queue_entry *msg; - int msg_slot; - int sw_ack_slot; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; + struct bau_pq_entry *msg; + int msg_slot; + int swack_slot; + struct bau_pq_entry *queue_first; + struct bau_pq_entry *queue_last; }; struct reset_args { - int sender; + int sender; }; /* @@ -339,105 +390,226 @@ struct reset_args { */ struct ptc_stats { /* sender statistics */ - unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ - unsigned long s_requestor; /* number of shootdown requests */ - unsigned long s_stimeout; /* source side timeouts */ - unsigned long s_dtimeout; /* destination side timeouts */ - unsigned long s_time; /* time spent in sending side */ - unsigned long s_retriesok; /* successful retries */ - unsigned long s_ntargcpu; /* total number of cpu's targeted */ - unsigned long s_ntargself; /* times the sending cpu was targeted */ - unsigned long s_ntarglocals; /* targets of cpus on the local blade */ - unsigned long s_ntargremotes; /* targets of cpus on remote blades */ - unsigned long s_ntarglocaluvhub; /* targets of the local hub */ - unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ - unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ - unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ - unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ - unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */ - unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */ - unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */ - unsigned long s_resets_plug; /* ipi-style resets from plug state */ - unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ - unsigned long s_busy; /* status stayed busy past s/w timer */ - unsigned long s_throttles; /* waits in throttle */ - unsigned long s_retry_messages; /* retry broadcasts */ - unsigned long s_bau_reenabled; /* for bau enable/disable */ - unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_giveup; /* number of fall backs to + IPI-style flushes */ + unsigned long s_requestor; /* number of shootdown + requests */ + unsigned long s_stimeout; /* source side timeouts */ + unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_time; /* time spent in sending side */ + unsigned long s_retriesok; /* successful retries */ + unsigned long s_ntargcpu; /* total number of cpu's + targeted */ + unsigned long s_ntargself; /* times the sending cpu was + targeted */ + unsigned long s_ntarglocals; /* targets of cpus on the local + blade */ + unsigned long s_ntargremotes; /* targets of cpus on remote + blades */ + unsigned long s_ntarglocaluvhub; /* targets of the local hub */ + unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ + unsigned long s_ntarguvhub; /* total number of uvhubs + targeted */ + unsigned long s_ntarguvhub16; /* number of times target + hubs >= 16*/ + unsigned long s_ntarguvhub8; /* number of times target + hubs >= 8 */ + unsigned long s_ntarguvhub4; /* number of times target + hubs >= 4 */ + unsigned long s_ntarguvhub2; /* number of times target + hubs >= 2 */ + unsigned long s_ntarguvhub1; /* number of times target + hubs == 1 */ + unsigned long s_resets_plug; /* ipi-style resets from plug + 
state */ + unsigned long s_resets_timeout; /* ipi-style resets from + timeouts */ + unsigned long s_busy; /* status stayed busy past + s/w timer */ + unsigned long s_throttles; /* waits in throttle */ + unsigned long s_retry_messages; /* retry broadcasts */ + unsigned long s_bau_reenabled; /* for bau enable/disable */ + unsigned long s_bau_disabled; /* for bau enable/disable */ /* destination statistics */ - unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ - unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ - unsigned long d_multmsg; /* interrupts with multiple messages */ - unsigned long d_nomsg; /* interrupts with no message */ - unsigned long d_time; /* time spent on destination side */ - unsigned long d_requestee; /* number of messages processed */ - unsigned long d_retries; /* number of retry messages processed */ - unsigned long d_canceled; /* number of messages canceled by retries */ - unsigned long d_nocanceled; /* retries that found nothing to cancel */ - unsigned long d_resets; /* number of ipi-style requests processed */ - unsigned long d_rcanceled; /* number of messages canceled by resets */ + unsigned long d_alltlb; /* times all tlb's on this + cpu were flushed */ + unsigned long d_onetlb; /* times just one tlb on this + cpu was flushed */ + unsigned long d_multmsg; /* interrupts with multiple + messages */ + unsigned long d_nomsg; /* interrupts with no message */ + unsigned long d_time; /* time spent on destination + side */ + unsigned long d_requestee; /* number of messages + processed */ + unsigned long d_retries; /* number of retry messages + processed */ + unsigned long d_canceled; /* number of messages canceled + by retries */ + unsigned long d_nocanceled; /* retries that found nothing + to cancel */ + unsigned long d_resets; /* number of ipi-style requests + processed */ + unsigned long d_rcanceled; /* number of messages canceled + by resets */ +}; + +struct tunables { + int *tunp; + int deflt; +}; + +struct hub_and_pnode { + short uvhub; + short pnode; +}; + +struct socket_desc { + short num_cpus; + short cpu_number[MAX_CPUS_PER_SOCKET]; +}; + +struct uvhub_desc { + unsigned short socket_mask; + short num_cpus; + short uvhub; + short pnode; + struct socket_desc socket[2]; }; /* * one per-cpu; to locate the software tables */ struct bau_control { - struct bau_desc *descriptor_base; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; - struct bau_payload_queue_entry *bau_msg_head; - struct bau_control *uvhub_master; - struct bau_control *socket_master; - struct ptc_stats *statp; - unsigned long timeout_interval; - unsigned long set_bau_on_time; - atomic_t active_descriptor_count; - int plugged_tries; - int timeout_tries; - int ipi_attempts; - int conseccompletes; - int baudisabled; - int set_bau_off; - short cpu; - short uvhub_cpu; - short uvhub; - short cpus_in_socket; - short cpus_in_uvhub; - unsigned short message_number; - unsigned short uvhub_quiesce; - short socket_acknowledge_count[DEST_Q_SIZE]; - cycles_t send_message; - spinlock_t uvhub_lock; - spinlock_t queue_lock; + struct bau_desc *descriptor_base; + struct bau_pq_entry *queue_first; + struct bau_pq_entry *queue_last; + struct bau_pq_entry *bau_msg_head; + struct bau_control *uvhub_master; + struct bau_control *socket_master; + struct ptc_stats *statp; + unsigned long timeout_interval; + unsigned long set_bau_on_time; + atomic_t active_descriptor_count; + int plugged_tries; + int timeout_tries; + int ipi_attempts; + 
int conseccompletes; + int baudisabled; + int set_bau_off; + short cpu; + short osnode; + short uvhub_cpu; + short uvhub; + short cpus_in_socket; + short cpus_in_uvhub; + short partition_base_pnode; + unsigned short message_number; + unsigned short uvhub_quiesce; + short socket_acknowledge_count[DEST_Q_SIZE]; + cycles_t send_message; + spinlock_t uvhub_lock; + spinlock_t queue_lock; /* tunables */ - int max_bau_concurrent; - int max_bau_concurrent_constant; - int plugged_delay; - int plugsb4reset; - int timeoutsb4reset; - int ipi_reset_limit; - int complete_threshold; - int congested_response_us; - int congested_reps; - int congested_period; - cycles_t period_time; - long period_requests; + int max_concurr; + int max_concurr_const; + int plugged_delay; + int plugsb4reset; + int timeoutsb4reset; + int ipi_reset_limit; + int complete_threshold; + int cong_response_us; + int cong_reps; + int cong_period; + cycles_t period_time; + long period_requests; + struct hub_and_pnode *thp; }; -static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) +static unsigned long read_mmr_uv2_status(void) +{ + return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); +} + +static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); +} + +static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image); +} + +static void write_mmr_activation(unsigned long index) +{ + write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); +} + +static void write_gmmr_activation(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image); +} + +static void write_mmr_payload_first(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image); +} + +static void write_mmr_payload_tail(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image); +} + +static void write_mmr_payload_last(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image); +} + +static void write_mmr_misc_control(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); +} + +static unsigned long read_mmr_misc_control(int pnode) +{ + return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL); +} + +static void write_mmr_sw_ack(unsigned long mr) +{ + uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + +static unsigned long read_mmr_sw_ack(void) +{ + return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); +} + +static unsigned long read_gmmr_sw_ack(int pnode) +{ + return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); +} + +static void write_mmr_data_config(int pnode, unsigned long mr) +{ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); +} + +static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); } -static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) +static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp) { - __set_bit(uvhub, &dstp->bits[0]); + __set_bit(pnode, &dstp->bits[0]); } -static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, +static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp, int nbits) { bitmap_zero(&dstp->bits[0], nbits); } -static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp) +static 
inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp) { return bitmap_weight((unsigned long *)&dstp->bits[0], UV_DISTRIBUTION_SIZE); @@ -448,9 +620,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) bitmap_zero(&dstp->bits, nbits); } -#define cpubit_isset(cpu, bau_local_cpumask) \ - test_bit((cpu), (bau_local_cpumask).bits) - extern void uv_bau_message_intr1(void); extern void uv_bau_timeout_intr1(void); @@ -458,7 +627,7 @@ struct atomic_short { short counter; }; -/** +/* * atomic_read_short - read a short atomic variable * @v: pointer of type atomic_short * @@ -469,14 +638,14 @@ static inline int atomic_read_short(const struct atomic_short *v) return v->counter; } -/** - * atomic_add_short_return - add and return a short int +/* + * atom_asr - add and return a short int * @i: short value to add * @v: pointer of type atomic_short * * Atomically adds @i to @v and returns @i + @v */ -static inline int atomic_add_short_return(short i, struct atomic_short *v) +static inline int atom_asr(short i, struct atomic_short *v) { short __i = i; asm volatile(LOCK_PREFIX "xaddw %0, %1" @@ -485,4 +654,26 @@ static inline int atomic_add_short_return(short i, struct atomic_short *v) return i + __i; } +/* + * conditionally add 1 to *v, unless *v is >= u + * return 0 if we cannot add 1 to *v because it is >= u + * return 1 if we can add 1 to *v because it is < u + * the add is atomic + * + * This is close to atomic_add_unless(), but this allows the 'u' value + * to be lowered below the current 'v'. atomic_add_unless can only stop + * on equal. + */ +static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +{ + spin_lock(lock); + if (atomic_read(v) >= u) { + spin_unlock(lock); + return 0; + } + atomic_inc(v); + spin_unlock(lock); + return 1; +} + #endif /* _ASM_X86_UV_UV_BAU_H */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a501741c2335..f26544a15214 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -77,8 +77,9 @@ * * 1111110000000000 * 5432109876543210 - * pppppppppplc0cch Nehalem-EX - * ppppppppplcc0cch Westmere-EX + * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg) + * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg) + * pppppppppppcccch SandyBridge (15 bits in hdw reg) * sssssssssss * * p = pnode bits @@ -87,7 +88,7 @@ * h = hyperthread * s = bits that are in the SOCKET_ID CSR * - * Note: Processor only supports 12 bits in the APICID register. The ACPI + * Note: Processor may support fewer bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * * Unless otherwise specified, all references to APICID refer to @@ -138,6 +139,8 @@ struct uv_hub_info_s { unsigned long global_mmr_base; unsigned long gpa_mask; unsigned int gnode_extra; + unsigned char hub_revision; + unsigned char apic_pnode_shift; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -149,13 +152,31 @@ struct uv_hub_info_s { unsigned char m_val; unsigned char n_val; struct uv_scir_s scir; - unsigned char apic_pnode_shift; }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) +/* + * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2 + * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE. 
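[Not part of the patch: how atomic_inc_unless_ge() above is meant to be used as a descriptor throttle, sketched against the struct bau_control fields shown earlier. The function name is hypothetical; the hub master owns both the lock and the count.]

#include <asm/processor.h>

static void wait_for_descriptor_slot(struct bau_control *bcp)
{
        struct bau_control *hmaster = bcp->uvhub_master;

        /* unlike atomic_add_unless(), the limit may be lowered below
         * the current count at run time, hence the ">=" comparison */
        while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
                                     &hmaster->active_descriptor_count,
                                     hmaster->max_concurr))
                cpu_relax();    /* spin until an in-flight descriptor retires */
}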
+ * This is a software convention - NOT the hardware revision numbers in + * the hub chip. + */ +#define UV1_HUB_REVISION_BASE 1 +#define UV2_HUB_REVISION_BASE 3 + +static inline int is_uv1_hub(void) +{ + return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE; +} + +static inline int is_uv2_hub(void) +{ + return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; +} + union uvh_apicid { unsigned long v; struct uvh_apicid_s { @@ -180,11 +201,25 @@ union uvh_apicid { #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) #define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) -#define UV_LOCAL_MMR_BASE 0xf4000000UL -#define UV_GLOBAL_MMR32_BASE 0xf8000000UL +#define UV1_LOCAL_MMR_BASE 0xf4000000UL +#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL +#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024) +#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) + +#define UV2_LOCAL_MMR_BASE 0xfa000000UL +#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) + +#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \ + : UV2_LOCAL_MMR_BASE) +#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \ + : UV2_GLOBAL_MMR32_BASE) +#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ + UV2_LOCAL_MMR_SIZE) +#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\ + UV2_GLOBAL_MMR32_SIZE) #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) -#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) -#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) #define UV_GLOBAL_GRU_MMR_BASE 0x4000000 @@ -301,6 +336,17 @@ static inline int uv_apicid_to_pnode(int apicid) } /* + * Convert an apicid to the socket number on the blade + */ +static inline int uv_apicid_to_socket(int apicid) +{ + if (is_uv1_hub()) + return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1; + else + return 0; +} + +/* * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ @@ -398,6 +444,8 @@ struct uv_blade_info { unsigned short nr_online_cpus; unsigned short pnode; short memory_nid; + spinlock_t nmi_lock; + unsigned long nmi_count; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; @@ -517,14 +565,13 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) /* * Get the minimum revision number of the hub chips within the partition. - * 1 - initial rev 1.0 silicon - * 2 - rev 2.0 production silicon + * 1 - UV1 rev 1.0 initial silicon + * 2 - UV1 rev 2.0 production silicon + * 3 - UV2 rev 1.0 initial silicon */ static inline int uv_get_min_hub_revision_id(void) { - extern int uv_min_hub_revision_id; - - return uv_min_hub_revision_id; + return uv_hub_info->hub_revision; } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 20cafeac7455..4be52c863448 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,19 +5,70 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H #define _ASM_X86_UV_UV_MMRS_H +/* + * This file contains MMR definitions for both UV1 & UV2 hubs. + * + * In general, MMR addresses and structures are identical on both hubs. 
+ * These MMRs are identified as: + * #define UVH_xxx <address> + * union uvh_xxx { + * unsigned long v; + * struct uvh_int_cmpd_s { + * } s; + * }; + * + * If the MMR exists on both hub types but has different addresses or + * contents, the MMR definition is similar to: + * #define UV1H_xxx <uv1 address> + * #define UV2H_xxx <uv2 address> + * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx) + * union uvh_xxx { + * unsigned long v; + * struct uvh_int_cmpd_s { (Common fields only) + * } s; + * struct uv1h_int_cmpd_s { (Full UV1 definition) + * } s1; + * struct uv2h_int_cmpd_s { (Full UV2 definition) + * } s2; + * }; + * + * Only essential differences are enumerated. For example, if the address is + * the same for both UV1 & UV2, only a single #define is generated. Likewise, + * if the contents are the same for both hubs, only the "s" structure is + * generated. + * + * If the MMR exists on only one type of hub, no generic definition is + * generated: + * #define UVnH_xxx <uvn address> + * union uvnh_xxx { + * unsigned long v; + * struct uvnh_int_cmpd_s { + * } sn; + * }; + */ + #define UV_MMR_ENABLE (1UL << 63) +#define UV1_HUB_PART_NUMBER 0x88a5 +#define UV2_HUB_PART_NUMBER 0x8eb8 + +/* Compat: if this #define is present, UV headers support UV2 */ +#define UV2_HUB_IS_SUPPORTED 1 + +/* KABI compat: if this #define is present, KABI hacks are present */ +#define UV2_HUB_KABI_HACKS 1 + /* ========================================================================= */ /* UVH_BAU_DATA_BROADCAST */ /* ========================================================================= */ #define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UVH_BAU_DATA_BROADCAST_32 0x0440 +#define UVH_BAU_DATA_BROADCAST_32 0x440 #define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 #define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL @@ -34,7 +85,7 @@ union uvh_bau_data_broadcast_u { /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x0438 +#define UVH_BAU_DATA_CONFIG_32 0x438 #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 #define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL @@ -73,125 +124,245 @@ union uvh_bau_data_config_u { /* UVH_EVENT_OCCURRED0 */ /* ========================================================================= */ #define UVH_EVENT_OCCURRED0 0x70000UL -#define UVH_EVENT_OCCURRED0_32 0x005e8 - -#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 -#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL -#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 -#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL -#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 -#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL -#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3 -#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL -#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4 -#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL -#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5 -#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL -#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6 -#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL -#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 -#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL -#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 -#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL -#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 -#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL -#define 
UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 -#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL -#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 -#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 -#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL -#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 -#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL -#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 -#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL -#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 -#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL -#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 -#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL -#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 -#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL -#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 -#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL -#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 -#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL -#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 -#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL -#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 -#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL -#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 -#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL -#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 -#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL -#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 -#define 
UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL -#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 -#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL -#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 -#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL -#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43 -#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL -#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 -#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL -#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45 -#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 -#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 -#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 -#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 -#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL -#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 -#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL -#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51 -#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL -#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52 -#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL -#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53 -#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL -#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54 -#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL -#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55 -#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL -#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 -#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL +#define UVH_EVENT_OCCURRED0_32 0x5e8 + +#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 +#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL +#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 +#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL +#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 +#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL +#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4 +#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL +#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5 +#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL +#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6 +#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL +#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 +#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL +#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 +#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL +#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 +#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL +#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 +#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL +#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 +#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL +#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 +#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL +#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 +#define 
UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL +#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 +#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL +#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 +#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL +#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 +#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL +#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 +#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL +#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 +#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL +#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 +#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL +#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 +#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL +#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 +#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL +#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 +#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL +#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 +#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 +#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43 +#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL +#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL +#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45 +#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL +#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 +#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51 +#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52 +#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53 +#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 +#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL +#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 +#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL +#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 +#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL + +#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2 +#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 +#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 +#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 +#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 +#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 +#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 +#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 +#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 +#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define 
UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + union uvh_event_occurred0_u { unsigned long v; - struct uvh_event_occurred0_s { + struct uv1h_event_occurred0_s { unsigned long lb_hcerr : 1; /* RW, W1C */ unsigned long gr0_hcerr : 1; /* RW, W1C */ unsigned long gr1_hcerr : 1; /* RW, W1C */ @@ -250,14 +421,76 @@ union uvh_event_occurred0_u { unsigned long bau_data : 1; /* RW, W1C */ unsigned long power_management_req : 1; /* RW, W1C */ unsigned long rsvd_57_63 : 7; /* */ - } s; + } s1; + struct uv2h_event_occurred0_s { + unsigned long lb_hcerr : 1; /* RW */ + unsigned long qp_hcerr : 1; /* RW */ + unsigned long rh_hcerr : 1; /* RW */ + unsigned long lh0_hcerr : 1; /* RW */ + unsigned long lh1_hcerr : 1; /* RW */ + unsigned long gr0_hcerr : 1; /* RW */ + unsigned long gr1_hcerr : 1; /* RW */ + unsigned long ni0_hcerr : 1; /* RW */ + unsigned long ni1_hcerr : 1; /* RW */ + unsigned long lb_aoerr0 : 1; /* RW */ + unsigned long qp_aoerr0 : 1; /* RW */ + unsigned long rh_aoerr0 : 1; /* RW */ + unsigned long lh0_aoerr0 : 1; /* RW */ + unsigned long lh1_aoerr0 : 1; /* RW */ + unsigned long gr0_aoerr0 : 1; /* RW */ + unsigned long gr1_aoerr0 : 1; /* RW */ + unsigned long xb_aoerr0 : 1; /* RW */ + unsigned long rt_aoerr0 : 1; /* RW */ + unsigned long ni0_aoerr0 : 1; /* RW */ + unsigned long ni1_aoerr0 : 1; /* RW */ + unsigned long lb_aoerr1 : 1; /* RW */ + unsigned long qp_aoerr1 : 1; /* RW */ + unsigned long rh_aoerr1 : 1; /* RW */ + unsigned long lh0_aoerr1 : 1; /* RW */ + unsigned long lh1_aoerr1 : 1; /* RW */ + unsigned long gr0_aoerr1 : 1; /* RW */ + unsigned long gr1_aoerr1 : 1; /* RW */ + unsigned long xb_aoerr1 : 1; /* RW */ + unsigned long rt_aoerr1 : 1; /* RW */ + unsigned long ni0_aoerr1 : 1; /* RW */ + unsigned long ni1_aoerr1 : 1; /* RW */ + unsigned long system_shutdown_int : 1; /* RW */ + unsigned long lb_irq_int_0 : 1; /* RW */ + unsigned long lb_irq_int_1 : 1; /* RW */ + unsigned long lb_irq_int_2 : 1; /* RW */ + unsigned long lb_irq_int_3 : 1; /* RW */ + unsigned long lb_irq_int_4 : 1; /* RW */ + unsigned long lb_irq_int_5 : 1; /* RW */ + unsigned long lb_irq_int_6 : 1; /* RW */ + unsigned long lb_irq_int_7 : 1; /* RW */ + unsigned long lb_irq_int_8 : 1; /* RW */ + unsigned long lb_irq_int_9 : 1; /* RW */ + unsigned long lb_irq_int_10 : 1; /* RW */ + unsigned long lb_irq_int_11 : 1; /* RW */ + unsigned long lb_irq_int_12 
: 1; /* RW */ + unsigned long lb_irq_int_13 : 1; /* RW */ + unsigned long lb_irq_int_14 : 1; /* RW */ + unsigned long lb_irq_int_15 : 1; /* RW */ + unsigned long l1_nmi_int : 1; /* RW */ + unsigned long stop_clock : 1; /* RW */ + unsigned long asic_to_l1 : 1; /* RW */ + unsigned long l1_to_asic : 1; /* RW */ + unsigned long la_seq_trigger : 1; /* RW */ + unsigned long ipi_int : 1; /* RW */ + unsigned long extio_int0 : 1; /* RW */ + unsigned long extio_int1 : 1; /* RW */ + unsigned long extio_int2 : 1; /* RW */ + unsigned long extio_int3 : 1; /* RW */ + unsigned long profile_int : 1; /* RW */ + unsigned long rsvd_59_63 : 5; /* */ + } s2; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0_ALIAS */ /* ========================================================================= */ #define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL -#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 +#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 /* ========================================================================= */ /* UVH_GR0_TLB_INT0_CONFIG */ @@ -432,8 +665,16 @@ union uvh_int_cmpb_u { /* ========================================================================= */ #define UVH_INT_CMPC 0x22100UL -#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 -#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL +#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT (is_uv1_hub() ? \ + UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT : \ + UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT) +#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL +#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL +#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK (is_uv1_hub() ? \ + UV1H_INT_CMPC_REAL_TIME_CMPC_MASK : \ + UV2H_INT_CMPC_REAL_TIME_CMPC_MASK) union uvh_int_cmpc_u { unsigned long v; @@ -448,8 +689,16 @@ union uvh_int_cmpc_u { /* ========================================================================= */ #define UVH_INT_CMPD 0x22180UL -#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 -#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL +#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT (is_uv1_hub() ? \ + UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT : \ + UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT) +#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL +#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL +#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK (is_uv1_hub() ? 
\ + UV1H_INT_CMPD_REAL_TIME_CMPD_MASK : \ + UV2H_INT_CMPD_REAL_TIME_CMPD_MASK) union uvh_int_cmpd_u { unsigned long v; @@ -463,7 +712,7 @@ union uvh_int_cmpd_u { /* UVH_IPI_INT */ /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x0348 +#define UVH_IPI_INT_32 0x348 #define UVH_IPI_INT_VECTOR_SHFT 0 #define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL @@ -493,7 +742,7 @@ union uvh_ipi_int_u { /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL @@ -515,7 +764,7 @@ union uvh_lb_bau_intd_payload_queue_first_u { /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL @@ -533,7 +782,7 @@ union uvh_lb_bau_intd_payload_queue_last_u { /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL @@ -551,7 +800,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL @@ -585,6 +834,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u { #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + union uvh_lb_bau_intd_software_acknowledge_u { unsigned long v; struct uvh_lb_bau_intd_software_acknowledge_s { @@ -612,13 +862,13 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ #define UVH_LB_BAU_MISC_CONTROL 0x320170UL -#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10 +#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define 
UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL @@ -628,8 +878,8 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 @@ -650,8 +900,86 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define 
UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL union uvh_lb_bau_misc_control_u { unsigned long v; 
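A minimal sketch of how the UV1H/UV2H field definitions above are meant to be used together with the s1/s2 union views defined just below, assuming the read_mmr_misc_control()/write_mmr_misc_control() helpers added earlier in this diff; the field updated here is only an example:

/*
 * Sketch: update INTD_SOFT_ACK_TIMEOUT_PERIOD through the view that
 * matches the running hub type (s1 for UV1, s2 for UV2).
 */
static void set_soft_ack_timeout(int pnode, unsigned long period)
{
	union uvh_lb_bau_misc_control_u ctl;

	ctl.v = read_mmr_misc_control(pnode);
	if (is_uv1_hub())
		ctl.s1.intd_soft_ack_timeout_period = period;
	else
		ctl.s2.intd_soft_ack_timeout_period = period;
	write_mmr_misc_control(pnode, ctl.v);
}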
@@ -660,7 +988,25 @@ union uvh_lb_bau_misc_control_u { unsigned long apic_mode : 1; /* RW */ unsigned long force_broadcast : 1; /* RW */ unsigned long force_lock_nop : 1; /* RW */ - unsigned long csi_agent_presence_vector : 3; /* RW */ + unsigned long qpi_agent_presence_vector : 3; /* RW */ + unsigned long descriptor_fetch_mode : 1; /* RW */ + unsigned long enable_intd_soft_ack_mode : 1; /* RW */ + unsigned long intd_soft_ack_timeout_period : 4; /* RW */ + unsigned long enable_dual_mapping_mode : 1; /* RW */ + unsigned long vga_io_port_decode_enable : 1; /* RW */ + unsigned long vga_io_port_16_bit_decode : 1; /* RW */ + unsigned long suppress_dest_registration : 1; /* RW */ + unsigned long programmed_initial_priority : 3; /* RW */ + unsigned long use_incoming_priority : 1; /* RW */ + unsigned long enable_programmed_initial_priority : 1; /* RW */ + unsigned long rsvd_29_63 : 35; + } s; + struct uv1h_lb_bau_misc_control_s { + unsigned long rejection_delay : 8; /* RW */ + unsigned long apic_mode : 1; /* RW */ + unsigned long force_broadcast : 1; /* RW */ + unsigned long force_lock_nop : 1; /* RW */ + unsigned long qpi_agent_presence_vector : 3; /* RW */ unsigned long descriptor_fetch_mode : 1; /* RW */ unsigned long enable_intd_soft_ack_mode : 1; /* RW */ unsigned long intd_soft_ack_timeout_period : 4; /* RW */ @@ -673,14 +1019,40 @@ union uvh_lb_bau_misc_control_u { unsigned long enable_programmed_initial_priority : 1; /* RW */ unsigned long rsvd_29_47 : 19; /* */ unsigned long fun : 16; /* RW */ - } s; + } s1; + struct uv2h_lb_bau_misc_control_s { + unsigned long rejection_delay : 8; /* RW */ + unsigned long apic_mode : 1; /* RW */ + unsigned long force_broadcast : 1; /* RW */ + unsigned long force_lock_nop : 1; /* RW */ + unsigned long qpi_agent_presence_vector : 3; /* RW */ + unsigned long descriptor_fetch_mode : 1; /* RW */ + unsigned long enable_intd_soft_ack_mode : 1; /* RW */ + unsigned long intd_soft_ack_timeout_period : 4; /* RW */ + unsigned long enable_dual_mapping_mode : 1; /* RW */ + unsigned long vga_io_port_decode_enable : 1; /* RW */ + unsigned long vga_io_port_16_bit_decode : 1; /* RW */ + unsigned long suppress_dest_registration : 1; /* RW */ + unsigned long programmed_initial_priority : 3; /* RW */ + unsigned long use_incoming_priority : 1; /* RW */ + unsigned long enable_programmed_initial_priority : 1; /* RW */ + unsigned long enable_automatic_apic_mode_selection : 1; /* RW */ + unsigned long apic_mode_status : 1; /* RO */ + unsigned long suppress_interrupts_to_self : 1; /* RW */ + unsigned long enable_lock_based_system_flush : 1; /* RW */ + unsigned long enable_extended_sb_status : 1; /* RW */ + unsigned long suppress_int_prio_udt_to_self : 1; /* RW */ + unsigned long use_legacy_descriptor_formats : 1; /* RW */ + unsigned long rsvd_36_47 : 12; /* */ + unsigned long fun : 16; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL @@ -703,7 +1075,7 @@ union uvh_lb_bau_sb_activation_control_u { /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ #define 
UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL @@ -719,7 +1091,7 @@ union uvh_lb_bau_sb_activation_status_0_u { /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL @@ -735,7 +1107,7 @@ union uvh_lb_bau_sb_activation_status_1_u { /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ #define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL @@ -754,23 +1126,6 @@ union uvh_lb_bau_sb_descriptor_base_u { }; /* ========================================================================= */ -/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */ -/* ========================================================================= */ -#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL -#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0 - -#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 -#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL - -union uvh_lb_target_physical_apic_id_mask_u { - unsigned long v; - struct uvh_lb_target_physical_apic_id_mask_s { - unsigned long bit_enables : 32; /* RW */ - unsigned long rsvd_32_63 : 32; /* */ - } s; -}; - -/* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ #define UVH_NODE_ID 0x0UL @@ -785,10 +1140,36 @@ union uvh_lb_target_physical_apic_id_mask_u { #define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL #define UVH_NODE_ID_NODE_ID_SHFT 32 #define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48 -#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL -#define UVH_NODE_ID_NI_PORT_SHFT 56 -#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL + +#define UV1H_NODE_ID_FORCE1_SHFT 0 +#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV1H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV1H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV1H_NODE_ID_REVISION_SHFT 28 +#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV1H_NODE_ID_NODE_ID_SHFT 32 +#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48 +#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL +#define UV1H_NODE_ID_NI_PORT_SHFT 56 +#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL + +#define UV2H_NODE_ID_FORCE1_SHFT 0 +#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV2H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV2H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV2H_NODE_ID_PART_NUMBER_MASK 
0x000000000ffff000UL +#define UV2H_NODE_ID_REVISION_SHFT 28 +#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV2H_NODE_ID_NODE_ID_SHFT 32 +#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV2H_NODE_ID_NI_PORT_SHFT 57 +#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL union uvh_node_id_u { unsigned long v; @@ -798,12 +1179,31 @@ union uvh_node_id_u { unsigned long part_number : 16; /* RO */ unsigned long revision : 4; /* RO */ unsigned long node_id : 15; /* RW */ + unsigned long rsvd_47_63 : 17; + } s; + struct uv1h_node_id_s { + unsigned long force1 : 1; /* RO */ + unsigned long manufacturer : 11; /* RO */ + unsigned long part_number : 16; /* RO */ + unsigned long revision : 4; /* RO */ + unsigned long node_id : 15; /* RW */ unsigned long rsvd_47 : 1; /* */ unsigned long nodes_per_bit : 7; /* RW */ unsigned long rsvd_55 : 1; /* */ unsigned long ni_port : 4; /* RO */ unsigned long rsvd_60_63 : 4; /* */ - } s; + } s1; + struct uv2h_node_id_s { + unsigned long force1 : 1; /* RO */ + unsigned long manufacturer : 11; /* RO */ + unsigned long part_number : 16; /* RO */ + unsigned long revision : 4; /* RO */ + unsigned long node_id : 15; /* RW */ + unsigned long rsvd_47_49 : 3; /* */ + unsigned long nodes_per_bit : 7; /* RO */ + unsigned long ni_port : 5; /* RO */ + unsigned long rsvd_62_63 : 2; /* */ + } s2; }; /* ========================================================================= */ @@ -954,18 +1354,38 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL -#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 -#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL + +#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 +#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL + +#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL union uvh_rh_gam_config_mmr_u { unsigned long v; struct uvh_rh_gam_config_mmr_s { unsigned long m_skt : 6; /* RW */ unsigned long n_skt : 4; /* RW */ + unsigned long rsvd_10_63 : 54; + } s; + struct uv1h_rh_gam_config_mmr_s { + unsigned long m_skt : 6; /* RW */ + unsigned long n_skt : 4; /* RW */ unsigned long rsvd_10_11: 2; /* */ unsigned long mmiol_cfg : 1; /* RW */ unsigned long rsvd_13_63: 51; /* */ - } s; + } s1; + struct uv2h_rh_gam_config_mmr_s { + unsigned long m_skt : 6; /* RW */ + unsigned long n_skt : 4; /* RW */ + unsigned long rsvd_10_63: 54; /* */ + } s2; }; /* ========================================================================= */ @@ -975,25 +1395,49 @@ union uvh_rh_gam_config_mmr_u { #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 
0x00f0000000000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_gru_overlay_config_mmr_s { unsigned long rsvd_0_27: 28; /* */ unsigned long base : 18; /* RW */ + unsigned long rsvd_46_62 : 17; + unsigned long enable : 1; /* RW */ + } s; + struct uv1h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27: 28; /* */ + unsigned long base : 18; /* RW */ unsigned long rsvd_46_47: 2; /* */ unsigned long gr4 : 1; /* RW */ unsigned long rsvd_49_51: 3; /* */ unsigned long n_gru : 4; /* RW */ unsigned long rsvd_56_62: 7; /* */ unsigned long enable : 1; /* RW */ - } s; + } s1; + struct uv2h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27: 28; /* */ + unsigned long base : 18; /* RW */ + unsigned long rsvd_46_51: 6; /* */ + unsigned long n_gru : 4; /* RW */ + unsigned long rsvd_56_62: 7; /* */ + unsigned long enable : 1; /* RW */ + } s2; }; /* ========================================================================= */ @@ -1001,25 +1445,42 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { /* ========================================================================= */ #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define 
UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_mmioh_overlay_config_mmr_u { unsigned long v; - struct uvh_rh_gam_mmioh_overlay_config_mmr_s { + struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { unsigned long rsvd_0_29: 30; /* */ unsigned long base : 16; /* RW */ unsigned long m_io : 6; /* RW */ unsigned long n_io : 4; /* RW */ unsigned long rsvd_56_62: 7; /* */ unsigned long enable : 1; /* RW */ - } s; + } s1; + struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_26: 27; /* */ + unsigned long base : 19; /* RW */ + unsigned long m_io : 6; /* RW */ + unsigned long n_io : 4; /* RW */ + unsigned long rsvd_56_62: 7; /* */ + unsigned long enable : 1; /* RW */ + } s2; }; /* ========================================================================= */ @@ -1029,20 +1490,40 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_mmr_overlay_config_mmr_s { unsigned long rsvd_0_25: 26; /* */ unsigned long base : 20; /* RW */ + unsigned long rsvd_46_62 : 17; + unsigned long enable : 1; /* RW */ + } s; + struct uv1h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25: 26; /* */ + unsigned long base : 20; /* RW */ unsigned long dual_hub : 1; /* RW */ unsigned long rsvd_47_62: 16; /* */ unsigned long enable : 1; /* RW */ - } s; + } s1; + struct uv2h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25: 26; /* */ + unsigned long base : 20; /* RW */ + unsigned long rsvd_46_62: 17; /* */ + unsigned long enable : 1; /* RW */ + } s2; }; /* ========================================================================= */ @@ -1099,5 +1580,170 @@ union uvh_rtc1_int_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SCRATCH5 */ +/* ========================================================================= */ +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x778 + +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL + +union uvh_scratch5_u { + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5 : 64; /* RW, W1CS */ + } s; +}; + 
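A minimal sketch of the same convention for a field whose position changed between hub generations, assuming the uvh_node_id_u union above and the existing uv_read_local_mmr() accessor from uv_hub.h:

/*
 * Sketch: NODES_PER_BIT moved from bit 48 (UV1) to bit 50 (UV2), so the
 * common "s" view cannot express it; pick the per-hub view at run time.
 */
static int uv_nodes_per_bit(void)
{
	union uvh_node_id_u node;

	node.v = uv_read_local_mmr(UVH_NODE_ID);
	return is_uv1_hub() ? node.s1.nodes_per_bit : node.s2.nodes_per_bit;
}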
+/* ========================================================================= */ +/* UV2H_EVENT_OCCURRED2 */ +/* ========================================================================= */ +#define UV2H_EVENT_OCCURRED2 0x70100UL +#define UV2H_EVENT_OCCURRED2_32 0xb68 + +#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define 
UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +union uv2h_event_occurred2_u { + unsigned long v; + struct uv2h_event_occurred2_s { + unsigned long rtc_0 : 1; /* RW */ + unsigned long rtc_1 : 1; /* RW */ + unsigned long rtc_2 : 1; /* RW */ + unsigned long rtc_3 : 1; /* RW */ + unsigned long rtc_4 : 1; /* RW */ + unsigned long rtc_5 : 1; /* RW */ + unsigned long rtc_6 : 1; /* RW */ + unsigned long rtc_7 : 1; /* RW */ + unsigned long rtc_8 : 1; /* RW */ + unsigned long rtc_9 : 1; /* RW */ + unsigned long rtc_10 : 1; /* RW */ + unsigned long rtc_11 : 1; /* RW */ + unsigned long rtc_12 : 1; /* RW */ + unsigned long rtc_13 : 1; /* RW */ + unsigned long rtc_14 : 1; /* RW */ + unsigned long rtc_15 : 1; /* RW */ + unsigned long rtc_16 : 1; /* RW */ + unsigned long rtc_17 : 1; /* RW */ + unsigned long rtc_18 : 1; /* RW */ + unsigned long rtc_19 : 1; /* RW */ + unsigned long rtc_20 : 1; /* RW */ + unsigned long rtc_21 : 1; /* RW */ + unsigned long rtc_22 : 1; /* RW */ + unsigned long rtc_23 : 1; /* RW */ + unsigned long rtc_24 : 1; /* RW */ + unsigned long rtc_25 : 1; /* RW */ + unsigned long rtc_26 : 1; /* RW */ + unsigned long rtc_27 : 1; /* RW */ + unsigned long rtc_28 : 1; /* RW */ + unsigned long rtc_29 : 1; /* RW */ + unsigned long rtc_30 : 1; /* RW */ + unsigned long rtc_31 : 1; /* RW */ + unsigned long rsvd_32_63: 32; /* */ + } s1; +}; + +/* ========================================================================= */ +/* UV2H_EVENT_OCCURRED2_ALIAS */ +/* ========================================================================= */ +#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL +#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 + +/* ========================================================================= */ +/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */ +/* ========================================================================= */ +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 + +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + +union uv2h_lb_bau_sb_activation_status_2_u { + unsigned long v; + struct uv2h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error : 64; /* RW */ + } s1; +}; + +/* ========================================================================= */ +/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */ +/* ========================================================================= */ +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0 + +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL + +union uv1h_lb_target_physical_apic_id_mask_u { + unsigned long v; + struct uv1h_lb_target_physical_apic_id_mask_s { + unsigned long bit_enables : 32; /* RW */ + unsigned long rsvd_32_63 : 32; /* */ + } s1; +}; + #endif /* __ASM_UV_MMRS_X86_H__ */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 9064052b73de..bb0522850b74 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -1,20 +1,6 @@ #ifndef _ASM_X86_VDSO_H #define _ASM_X86_VDSO_H -#ifdef CONFIG_X86_64 -extern const char VDSO64_PRELINK[]; - -/* - * Given a pointer to the vDSO image, find the pointer to VDSO64_name - * as that symbol is defined in the vDSO sources or linker 
script. - */ -#define VDSO64_SYMBOL(base, name) \ -({ \ - extern const char VDSO64_##name[]; \ - (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \ -}) -#endif - #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const char VDSO32_PRELINK[]; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e204826f..646b4c1ca695 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -23,8 +23,6 @@ struct vsyscall_gtod_data { struct timespec wall_to_monotonic; struct timespec wall_time_coarse; }; -extern struct vsyscall_gtod_data __vsyscall_gtod_data -__section_vsyscall_gtod_data; extern struct vsyscall_gtod_data vsyscall_gtod_data; #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d0983d255fbd..d55597351f6a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -16,27 +16,19 @@ enum vsyscall_num { #ifdef __KERNEL__ #include <linux/seqlock.h> -#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) -#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) - /* Definitions for CONFIG_GENERIC_TIME definitions */ -#define __section_vsyscall_gtod_data __attribute__ \ - ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) -#define __section_vsyscall_clock __attribute__ \ - ((unused, __section__ (".vsyscall_clock"),aligned(16))) #define __vsyscall_fn \ __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 -extern int __vgetcpu_mode; -extern volatile unsigned long __jiffies; - /* kernel space (writeable) */ extern int vgetcpu_mode; extern struct timezone sys_tz; +#include <asm/vvar.h> + extern void map_vsyscall(void); #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h new file mode 100644 index 000000000000..341b3559452b --- /dev/null +++ b/arch/x86/include/asm/vvar.h @@ -0,0 +1,52 @@ +/* + * vvar.h: Shared vDSO/kernel variable declarations + * Copyright (c) 2011 Andy Lutomirski + * Subject to the GNU General Public License, version 2 + * + * A handful of variables are accessible (read-only) from userspace + * code in the vsyscall page and the vdso. They are declared here. + * Some other file must define them with DEFINE_VVAR. + * + * In normal kernel code, they are used like any other variable. + * In user code, they are accessed through the VVAR macro. + * + * Each of these variables lives in the vsyscall page, and each + * one needs a unique offset within the little piece of the page + * reserved for vvars. Specify that offset in DECLARE_VVAR. + * (There are 896 bytes available. If you mess up, the linker will + * catch it.) + */ + +/* Offset of vars within vsyscall page */ +#define VSYSCALL_VARS_OFFSET (3072 + 128) + +#if defined(__VVAR_KERNEL_LDS) + +/* The kernel linker script defines its own magic to put vvars in the + * right place. 
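+ * (Illustrative pairing, using the macros defined in this file:
+ * kernel code writes DEFINE_VVAR(int, vgetcpu_mode), which places
+ * the variable __vvar_vgetcpu_mode in its own section, while vDSO
+ * and vsyscall code read it through VVAR(vgetcpu_mode); the
+ * DECLARE_VVAR() list at the bottom fixes each variable's offset.)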
+ */ +#define DECLARE_VVAR(offset, type, name) \ + EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) + +#else + +#define DECLARE_VVAR(offset, type, name) \ + static type const * const vvaraddr_ ## name = \ + (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); + +#define DEFINE_VVAR(type, name) \ + type __vvar_ ## name \ + __attribute__((section(".vsyscall_var_" #name), aligned(16))) + +#define VVAR(name) (*vvaraddr_ ## name) + +#endif + +/* DECLARE_VVAR(offset, type, name) */ + +DECLARE_VVAR(0, volatile unsigned long, jiffies) +DECLARE_VVAR(8, int, vgetcpu_mode) +DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) + +#undef DECLARE_VVAR +#undef VSYSCALL_VARS_OFFSET diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h new file mode 100644 index 000000000000..6bf5b8e478c0 --- /dev/null +++ b/arch/x86/include/asm/x2apic.h @@ -0,0 +1,62 @@ +/* + * Common bits for X2APIC cluster/physical modes. + */ + +#ifndef _ASM_X86_X2APIC_H +#define _ASM_X86_X2APIC_H + +#include <asm/apic.h> +#include <asm/ipi.h> +#include <linux/cpumask.h> + +/* + * Need to use more than cpu 0, because we need more vectors + * when MSI-X are used. + */ +static const struct cpumask *x2apic_target_cpus(void) +{ + return cpu_online_mask; +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +/* + * For now each logical cpu is in its own vector allocation domain. + */ +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +static void +__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +static unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +static unsigned long x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +static void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +#endif /* _ASM_X86_X2APIC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 64642ad019fb..d3d859035af9 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,6 +68,17 @@ struct x86_init_oem { }; /** + * struct x86_init_mapping - platform specific initial kernel pagetable setup + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage + * + * For more details on the purpose of this hook, look in + * init_memory_mapping and the commit that added it. 
+ */ +struct x86_init_mapping { + void (*pagetable_reserve)(u64 start, u64 end); +}; + +/** * struct x86_init_paging - platform specific paging functions * @pagetable_setup_start: platform specific pre paging_init() call * @pagetable_setup_done: platform specific post paging_init() call @@ -83,11 +94,13 @@ struct x86_init_paging { * boot cpu * @tsc_pre_init: platform function called before TSC init * @timer_init: initialize the platform timer (default PIT/HPET) + * @wallclock_init: init the wallclock device */ struct x86_init_timers { void (*setup_percpu_clockev)(void); void (*tsc_pre_init)(void); void (*timer_init)(void); + void (*wallclock_init)(void); }; /** @@ -121,6 +134,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a3c28ae4025b..d240ea950519 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set) static inline int HYPERVISOR_sched_op(int cmd, void *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long @@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value) #endif static inline int -HYPERVISOR_suspend(unsigned long srec) +HYPERVISOR_suspend(unsigned long start_info_mfn) { - return _hypercall3(int, sched_op, SCHEDOP_shutdown, - SHUTDOWN_suspend, srec); + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; + + /* + * For a PV guest the tools require that the start_info mfn be + * present in rdx/edx when the hypercall is made. Per the + * hypercall calling convention this is the third hypercall + * argument, which is start_info_mfn here. + */ + return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn); } static inline int @@ -440,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg) return _hypercall2(unsigned long, hvm_op, op, arg); } +static inline int +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 1c10c88ee4e1..5d4922ad4b9b 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void); * The privilege level specifies which modes may enter a trap via a software * interrupt. 
On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: - * Level == 0: Noone may enter + * Level == 0: No one may enter * Level == 1: Kernel may enter * Level == 2: Kernel may enter * Level == 3: Everyone may enter diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f25bdf238a33..64a619d47d34 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -29,8 +29,10 @@ typedef struct xpaddr { /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ #define INVALID_P2M_ENTRY (~0UL) -#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) +#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) /* Maximum amount of memory we can handle in a domain in pages */ #define MAX_DOMAIN_PAGES \ @@ -41,12 +43,19 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); -extern int m2p_add_override(unsigned long mfn, struct page *page); -extern int m2p_remove_override(struct page *page); +extern int m2p_add_override(unsigned long mfn, struct page *page, + bool clear_pte); +extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); +#ifdef CONFIG_XEN_DEBUG_FS +extern int p2m_dump_show(struct seq_file *m, void *v); +#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -57,7 +66,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) mfn = get_phys_to_machine(pfn); if (mfn != INVALID_P2M_ENTRY) - mfn &= ~FOREIGN_FRAME_BIT; + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); return mfn; } @@ -73,25 +82,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + int ret = 0; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; + if (unlikely((mfn >> machine_to_phys_order) != 0)) { + pfn = ~0; + goto try_override; + } pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ - __get_user(pfn, &machine_to_phys_mapping[mfn]); - - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); +try_override: + /* ret might be < 0 if there are no entries in the m2p for mfn */ + if (ret < 0) + pfn = ~0; + else if (get_phys_to_machine(pfn) != mfn) + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + * + * m2p_find_override_pfn returns ~0 if it doesn't find anything. + */ + pfn = m2p_find_override_pfn(mfn, ~0); + + /* + * pfn is ~0 if there are no entries in the m2p for mfn or if the + * entry doesn't map back to the mfn and m2p_override doesn't have a + * valid entry for it. 
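+	 * As a last resort, if the p2m marks the frame as identity
+	 * mapped (IDENTITY_FRAME(mfn)), the mfn itself is returned as
+	 * the pfn.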
*/ - if (get_phys_to_machine(pfn) != mfn) - pfn = m2p_find_override_pfn(mfn, pfn); + if (pfn == ~0 && + get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) + pfn = mfn; return pfn; } diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 2329b3eaf8d3..4fbda9a3f339 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void) #endif #if defined(CONFIG_XEN_DOM0) void __init xen_setup_pirqs(void); +int xen_find_device_domain_owner(struct pci_dev *dev); +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); +int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline void __init xen_setup_pirqs(void) { } +static inline int xen_find_device_domain_owner(struct pci_dev *dev) +{ + return -1; +} +static inline int xen_register_device_domain_owner(struct pci_dev *dev, + uint16_t domain) +{ + return -1; +} +static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + return -1; +} #endif #if defined(CONFIG_PCI_MSI) @@ -27,16 +43,16 @@ static inline void __init xen_setup_pirqs(void) * its own functions. */ struct xen_pci_frontend_ops { - int (*enable_msi)(struct pci_dev *dev, int **vectors); + int (*enable_msi)(struct pci_dev *dev, int vectors[]); void (*disable_msi)(struct pci_dev *dev); - int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); + int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec); void (*disable_msix)(struct pci_dev *dev); }; extern struct xen_pci_frontend_ops *xen_pci_frontend; static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, - int **vectors) + int vectors[]) { if (xen_pci_frontend && xen_pci_frontend->enable_msi) return xen_pci_frontend->enable_msi(dev, vectors); @@ -48,7 +64,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) xen_pci_frontend->disable_msi(dev); } static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, - int **vectors, int nvec) + int vectors[], int nvec) { if (xen_pci_frontend && xen_pci_frontend->enable_msix) return xen_pci_frontend->enable_msix(dev, vectors, nvec); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34244b2cd880..f5abe3a245b8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities -CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg @@ -24,30 +23,33 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_tsc.o := $(nostackp) +CFLAGS_vread_tsc_64.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n GCOV_PROFILE_paravirt.o := n +# vread_tsc_64 is hot and should be fully optimized: +CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls + obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-y += trampoline.o trampoline_$(BITS).o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -55,10 +57,12 @@ obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o +obj-$(CONFIG_ISA_DMA_API) += i8237.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o +obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o @@ -66,10 +70,9 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o +obj-$(CONFIG_SMP) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o @@ -109,13 +112,14 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_OF) += devicetree.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o + obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3e6e2d68f761..4558f0d0822d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -595,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) nid = acpi_get_node(handle); if (nid == -1 || !node_online(nid)) return; -#ifdef CONFIG_X86_64 - apicid_to_node[physid] = nid; + set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); -#else /* CONFIG_X86_32 */ - apicid_2_node[physid] = nid; - cpu_to_node_map[cpu] = nid; -#endif - #endif } @@ -976,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irq.irqflag = (trigger << 2) | polarity; mp_irq.srcbus = MP_ISA_BUS; mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ + mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ mp_irq.dstirq = pin; /* INTIN# */ mp_save_irq(&mp_irq); @@ -1027,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) continue; pin = mp_find_ioapic_pin(ioapic, gsi); - dstapic = mp_ioapics[ioapic].apicid; + dstapic = mpc_ioapic_id(ioapic); for (idx = 0; idx < mp_irq_entries; idx++) { struct mpc_intsrc *irq = mp_irqs + idx; @@ -1088,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapics[ioapic].apicid; + mp_irq.dstapic = mpc_ioapic_id(ioapic); mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); mp_save_irq(&mp_irq); @@ -1119,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int 
trigger, int polarity) if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapics[ioapic].apicid, + "%d-%d\n", mpc_ioapic_id(ioapic), ioapic_pin); return gsi; } diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 28595d6df47c..ead21b663117 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -6,11 +6,17 @@ #include <asm/page_types.h> #include <asm/pgtable_types.h> #include <asm/processor-flags.h> +#include "wakeup.h" .code16 - .section ".header", "a" + .section ".jump", "ax" + .globl _start +_start: + cli + jmp wakeup_code /* This should match the structure in wakeup.h */ + .section ".header", "a" .globl wakeup_header wakeup_header: video_mode: .short 0 /* Video mode number */ @@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */ wakeup_jmp_off: .word 3f wakeup_jmp_seg: .word 0 wakeup_gdt: .quad 0, 0, 0 -signature: .long 0x51ee1111 +signature: .long WAKEUP_HEADER_SIGNATURE .text - .globl _start .code16 wakeup_code: -_start: - cli cld /* Apparently some dimwit BIOS programmers don't know how to @@ -77,12 +80,12 @@ _start: /* Check header signature... */ movl signature, %eax - cmpl $0x51ee1111, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax jne bogus_real_magic /* Check we really have everything... */ movl end_signature, %eax - cmpl $0x65a22c82, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax jne bogus_real_magic /* Call the C code */ @@ -147,3 +150,7 @@ wakeup_heap: wakeup_stack: .space 2048 wakeup_stack_end: + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index 69d38d0b2b64..e1828c07e79c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -35,7 +35,8 @@ struct wakeup_header { extern struct wakeup_header wakeup_header; #endif -#define HEADER_OFFSET 0x3f00 -#define WAKEUP_SIZE 0x4000 +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 060fff8f5c5b..d4f8010a5b1b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -13,9 +13,19 @@ ENTRY(_start) SECTIONS { . = 0; + .jump : { + *(.jump) + } = 0x90909090 + + . = WAKEUP_HEADER_OFFSET; + .header : { + *(.header) + } + + . = ALIGN(16); .text : { *(.text*) - } + } = 0x90909090 . = ALIGN(16); .rodata : { @@ -33,11 +43,6 @@ SECTIONS *(.data*) } - .signature : { - end_signature = .; - LONG(0x65a22c82) - } - . = ALIGN(16); .bss : { __bss_start = .; @@ -45,20 +50,13 @@ SECTIONS __bss_end = .; } - . = HEADER_OFFSET; - .header : { - *(.header) + .signature : { + *(.signature) } - . = ALIGN(16); _end = .; /DISCARD/ : { *(.note*) } - - /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: - */ - . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 68d1537b8c81..18a857ba7a25 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -18,37 +18,28 @@ #include "realmode/wakeup.h" #include "sleep.h" -unsigned long acpi_wakeup_address; unsigned long acpi_realmode_flags; -/* address in low memory of the wakeup routine. 
*/ -static unsigned long acpi_realmode; - #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) static char temp_stack[4096]; #endif /** - * acpi_save_state_mem - save kernel state + * acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. - * - * Note that this is too late to change acpi_wakeup_address. */ -int acpi_save_state_mem(void) +int acpi_suspend_lowlevel(void) { struct wakeup_header *header; + /* address in low memory of the wakeup routine. */ + char *acpi_realmode; - if (!acpi_realmode) { - printk(KERN_ERR "Could not allocate memory during boot, " - "S3 disabled\n"); - return -ENOMEM; - } - memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE); + acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); - if (header->signature != 0x51ee1111) { + header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); + if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; } @@ -68,9 +59,7 @@ int acpi_save_state_mem(void) /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)(acpi_wakeup_address + - ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) - << 16); + ((u64)__pa(&header->wakeup_gdt) << 16); /* GDT[1]: big real mode-like code segment */ header->wakeup_gdt[1] = GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); @@ -96,7 +85,7 @@ int acpi_save_state_mem(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = setup_trampoline() >> 4; + header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = @@ -107,56 +96,10 @@ int acpi_save_state_mem(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + do_suspend_lowlevel(); return 0; } -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - - -/** - * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. 
- */ -void __init acpi_reserve_wakeup_memory(void) -{ - phys_addr_t mem; - - if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - - if (mem == MEMBLOCK_ERROR) { - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); - return; - } - acpi_realmode = (unsigned long) phys_to_virt(mem); - acpi_wakeup_address = mem; - memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); -} - -int __init acpi_configure_wakeup_memory(void) -{ - if (acpi_realmode) - set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT); - - return 0; -} -arch_initcall(acpi_configure_wakeup_memory); - - static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { @@ -169,11 +112,6 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) { - pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " - "please use acpi_sleep=nonvs instead"); - acpi_nvs_nosave(); - } #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index adbcbaa6f1df..416d4be13fef 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -4,13 +4,12 @@ #include <asm/trampoline.h> -extern char wakeup_code_start, wakeup_code_end; - extern unsigned long saved_video_mode; extern long saved_magic; extern int wakeup_pmode_return; -extern char swsusp_pg_dir[PAGE_SIZE]; extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S index 6ff3b5730575..63b8ab524f2c 100644 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ b/arch/x86/kernel/acpi/wakeup_rm.S @@ -2,9 +2,11 @@ * Wrapper script for the realmode binary as a transport object * before copying to low memory. */ - .section ".rodata","a" - .globl wakeup_code_start, wakeup_code_end -wakeup_code_start: +#include <asm/page_types.h> + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .globl acpi_wakeup_code +acpi_wakeup_code: .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" -wakeup_code_end: - .size wakeup_code_start, .-wakeup_code_start + .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7038b95d363f..a81f2d52f869 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. + */ #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. 
*/ -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8 - "\t.previous"); -extern const unsigned char intelnops[]; -static const unsigned char *const __initconst_or_module -intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ NULL, intelnops, intelnops + 1, @@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5, intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef K8_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8 - "\t.previous"); -extern const unsigned char k8nops[]; -static const unsigned char *const __initconst_or_module -k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ NULL, k8nops, k8nops + 1, @@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5, k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #if defined(K7_NOP1) && !defined(CONFIG_X86_64) -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8 - "\t.previous"); -extern const unsigned char k7nops[]; -static const unsigned char *const __initconst_or_module -k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ NULL, k7nops, k7nops + 1, @@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5, k7nops + 1 + 2 + 3 + 4 + 5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef P6_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " - P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8 - "\t.previous"); -extern const unsigned char p6nops[]; -static const unsigned char *const __initconst_or_module -p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char __initconst_or_module p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ NULL, p6nops, p6nops + 1, @@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = { p6nops + 1 + 2 + 3 + 4 + 5, p6nops + 1 + 2 + 3 + 4 + 5 + 6, p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif +/* Initialize these to a safe default */ #ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif -extern char __vsyscall_0; -static const unsigned char *const *__init_or_module find_nop_table(void) +void __init arch_init_ideal_nops(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return k8_nops; -} - -#else /* CONFIG_X86_64 */ + switch 
(boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		/*
+		 * Due to a decoder implementation quirk, some
+		 * specific Intel CPUs actually perform better with
+		 * the "k8_nops" than with the SDM-recommended NOPs.
+		 */
+		if (boot_cpu_data.x86 == 6 &&
+		    boot_cpu_data.x86_model >= 0x0f &&
+		    boot_cpu_data.x86_model != 0x1c &&
+		    boot_cpu_data.x86_model != 0x26 &&
+		    boot_cpu_data.x86_model != 0x27 &&
+		    boot_cpu_data.x86_model < 0x30) {
+			ideal_nops = k8_nops;
+		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+			ideal_nops = p6_nops;
+		} else {
+#ifdef CONFIG_X86_64
+			ideal_nops = k8_nops;
+#else
+			ideal_nops = intel_nops;
+#endif
+		}
+		break;

-static const unsigned char *const *__init_or_module find_nop_table(void)
-{
-	if (boot_cpu_has(X86_FEATURE_K8))
-		return k8_nops;
-	else if (boot_cpu_has(X86_FEATURE_K7))
-		return k7_nops;
-	else if (boot_cpu_has(X86_FEATURE_NOPL))
-		return p6_nops;
-	else
-		return intel_nops;
+	default:
+#ifdef CONFIG_X86_64
+		ideal_nops = k8_nops;
+#else
+		if (boot_cpu_has(X86_FEATURE_K8))
+			ideal_nops = k8_nops;
+		else if (boot_cpu_has(X86_FEATURE_K7))
+			ideal_nops = k7_nops;
+		else
+			ideal_nops = intel_nops;
+#endif
+	}
 }
-#endif /* CONFIG_X86_64 */
-
 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
 static void __init_or_module add_nops(void *insns, unsigned int len)
 {
-	const unsigned char *const *noptable = find_nop_table();
-
 	while (len > 0) {
 		unsigned int noplen = len;
 		if (noplen > ASM_NOP_MAX)
 			noplen = ASM_NOP_MAX;
-		memcpy(insns, noptable[noplen], noplen);
+		memcpy(insns, ideal_nops[noplen], noplen);
 		insns += noplen;
 		len -= noplen;
 	}
@@ -195,11 +250,12 @@ static void __init_or_module add_nops(void *insns, unsigned int len)

 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
+extern char __vsyscall_0;
 void *text_poke_early(void *addr, const void *opcode, size_t len);

 /* Replace instructions with better alternatives for this CPU type.
    This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that assymetric systems where
+   self modifying code. This implies that asymmetric systems where
    APs have less capabilities than the boot processor are not
    handled. Tough. Make sure you disable such features by hand. */

@@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	u8 insnbuf[MAX_PATCH_LEN];

 	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	/*
+	 * The scan order should be from start to end. Alternative code
+	 * scanned later can overwrite alternative code scanned earlier.
+	 * Some kernel functions (e.g. memcpy, memset, etc) rely on this
+	 * order to patch code.
+	 *
+	 * So be careful if you want to change the scan order to any other
+	 * order.
+	 */
 	for (a = start; a < end; a++) {
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
@@ -620,7 +685,12 @@ static int __kprobes stop_machine_text_poke(void *data)
 		flush_icache_range((unsigned long)p->addr,
 				   (unsigned long)p->addr + p->len);
 	}
-
+	/*
+	 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
+	 * that a core serializing instruction such as "cpuid" should be
+	 * executed on _each_ core before the new instruction is made visible.
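+	 * The sync_core() below provides that serialization on each CPU
+	 * running this stop_machine callback.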
+ */ + sync_core(); return 0; } @@ -673,29 +743,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) wrote_text = 0; __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } - -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) - -#ifdef CONFIG_X86_64 -unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; -#else -unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; -#endif - -void __init arch_init_ideal_nop5(void) -{ - /* - * There is no good nop for all x86 archs. This selection - * algorithm should be unified with the one in find_nop_table(), - * but this should be good enough for now. - * - * For cases other than the ones below, use the safe (as in - * always functional) defaults above. - */ -#ifdef CONFIG_X86_64 - /* Don't use these on 32 bits due to broken virtualizers */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - memcpy(ideal_nop5, p6_nops[5], 5); -#endif -} -#endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c index c01ffa5b9b87..b117efd24f71 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -27,7 +27,7 @@ #include <linux/kdebug.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/io.h> #include <linux/gfp.h> #include <asm/atomic.h> @@ -81,6 +81,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; @@ -589,7 +596,7 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static void gart_fixup_northbridges(struct sys_device *dev) +static void gart_fixup_northbridges(void) { int i; @@ -613,33 +620,20 @@ static void gart_fixup_northbridges(struct sys_device *dev) } } -static int gart_resume(struct sys_device *dev) +static void gart_resume(void) { pr_info("PCI-DMA: Resuming GART IOMMU\n"); - gart_fixup_northbridges(dev); + gart_fixup_northbridges(); enable_gart_translations(); - - return 0; } -static int gart_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, +static struct syscore_ops gart_syscore_ops = { .resume = gart_resume, }; -static struct sys_device device_gart = { - .cls = &gart_sysdev_class, -}; - /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. 
@@ -650,7 +644,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i, error; + int i; pr_info("PCI-DMA: Disabling AGP.\n"); @@ -685,12 +679,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - error = sysdev_class_register(&gart_sysdev_class); - if (!error) - error = sysdev_register(&device_gart); - if (error) - panic("Could not register gart_sysdev -- " - "would corrupt data on next suspend"); + register_syscore_ops(&gart_syscore_ops); flush_gart(); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 57ca77787220..cd8cbeb5fa34 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,6 +18,7 @@ */ #include <linux/pci.h> +#include <linux/pci-ats.h> #include <linux/bitmap.h> #include <linux/slab.h> #include <linux/debugfs.h> @@ -25,6 +26,7 @@ #include <linux/dma-mapping.h> #include <linux/iommu-helper.h> #include <linux/iommu.h> +#include <linux/delay.h> #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -34,7 +36,7 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define EXIT_LOOP_COUNT 10000000 +#define LOOP_TIMEOUT 100000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -57,7 +59,6 @@ struct iommu_cmd { u32 data[4]; }; -static void reset_iommu_command_buffer(struct amd_iommu *iommu); static void update_domain(struct protection_domain *domain); /**************************************************************************** @@ -322,8 +323,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); - iommu->reset_in_progress = true; - reset_iommu_command_buffer(iommu); dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: @@ -367,7 +366,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } -irqreturn_t amd_iommu_int_handler(int irq, void *data) +irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu; @@ -377,192 +376,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) return IRQ_HANDLED; } +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + return IRQ_WAKE_THREAD; +} + /**************************************************************************** * * IOMMU command queuing functions * ****************************************************************************/ -/* - * Writes the command to the IOMMUs command buffer and informs the - * hardware about the new command. Must be called with iommu->lock held. 
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+static int wait_on_sem(volatile u64 *sem)
+{
+	int i = 0;
+
+	while (*sem == 0 && i < LOOP_TIMEOUT) {
+		udelay(1);
+		i += 1;
+	}
+
+	if (i == LOOP_TIMEOUT) {
+		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void copy_cmd_to_buffer(struct amd_iommu *iommu,
+			       struct iommu_cmd *cmd,
+			       u32 tail)
 {
-	u32 tail, head;
 	u8 *target;

-	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
-	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 	target = iommu->cmd_buf + tail;
-	memcpy_toio(target, cmd, sizeof(*cmd));
-	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
-	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	if (tail == head)
-		return -ENOMEM;
+
+	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+
+	/* Copy command to buffer */
+	memcpy(target, cmd, sizeof(*cmd));
+
+	/* Tell the IOMMU about it */
 	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+}

-	return 0;
+static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
+{
+	WARN_ON(address & 0x7ULL);
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
+	cmd->data[1] = upper_32_bits(__pa(address));
+	cmd->data[2] = 1;
+	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
+}
+
+static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0] = devid;
+	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
+}
+
+static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+				  size_t size, u16 domid, int pde)
+{
+	u64 pages;
+	int s;
+
+	pages = iommu_num_pages(address, size, PAGE_SIZE);
+	s = 0;
+
+	if (pages > 1) {
+		/*
+		 * If we have to flush more than one page, flush all
+		 * TLB entries for this domain
+		 */
+		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+		s = 1;
+	}
+
+	address &= PAGE_MASK;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[1] |= domid;
+	cmd->data[2] = lower_32_bits(address);
+	cmd->data[3] = upper_32_bits(address);
+	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+	if (s) /* size bit - we flush more than one 4kb page */
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
+static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
+				  u64 address, size_t size)
+{
+	u64 pages;
+	int s;
+
+	pages = iommu_num_pages(address, size, PAGE_SIZE);
+	s = 0;
+
+	if (pages > 1) {
+		/*
+		 * If we have to flush more than one page, flush all
+		 * TLB entries for this domain
+		 */
+		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+		s = 1;
+	}
+
+	address &= PAGE_MASK;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0] = devid;
+	cmd->data[0] |= (qdep & 0xff) << 24;
+	cmd->data[1] = devid;
+	cmd->data[2] = lower_32_bits(address);
+	cmd->data[3] = upper_32_bits(address);
+	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+	if (s)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+}
+
+static void build_inv_all(struct iommu_cmd *cmd)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	CMD_SET_TYPE(cmd, CMD_INV_ALL);
 }

 /*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
+ * Writes the command to the IOMMU's command buffer and informs the
+ * hardware about the new command.
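+ * If fewer than two slots are left in the command ring, a
+ * COMPLETION_WAIT is queued first and we spin on its semaphore
+ * (wait_on_sem()) until the IOMMU has drained the ring, then the
+ * queuing is retried.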
*/ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { + u32 left, tail, head, next_tail; unsigned long flags; - int ret; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + +again: spin_lock_irqsave(&iommu->lock, flags); - ret = __iommu_queue_command(iommu, cmd); - if (!ret) - iommu->need_sync = true; - spin_unlock_irqrestore(&iommu->lock, flags); - return ret; -} + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + left = (head - next_tail) % iommu->cmd_buf_size; -/* - * This function waits until an IOMMU has completed a completion - * wait command - */ -static void __iommu_wait_for_completion(struct amd_iommu *iommu) -{ - int ready = 0; - unsigned status = 0; - unsigned long i = 0; + if (left <= 2) { + struct iommu_cmd sync_cmd; + volatile u64 sem = 0; + int ret; + + build_completion_wait(&sync_cmd, (u64)&sem); + copy_cmd_to_buffer(iommu, &sync_cmd, tail); - INC_STATS_COUNTER(compl_wait); + spin_unlock_irqrestore(&iommu->lock, flags); + + if ((ret = wait_on_sem(&sem)) != 0) + return ret; - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + goto again; } - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + copy_cmd_to_buffer(iommu, cmd, tail); + + /* We need to sync now to make sure all commands are processed */ + iommu->need_sync = true; + + spin_unlock_irqrestore(&iommu->lock, flags); - if (unlikely(i == EXIT_LOOP_COUNT)) - iommu->reset_in_progress = true; + return 0; } /* * This function queues a completion wait command into the command * buffer of an IOMMU */ -static int __iommu_completion_wait(struct amd_iommu *iommu) +static int iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; + volatile u64 sem = 0; + int ret; + + if (!iommu->need_sync) + return 0; - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + build_completion_wait(&cmd, (u64)&sem); - return __iommu_queue_command(iommu, &cmd); + ret = iommu_queue_command(iommu, &cmd); + if (ret) + return ret; + + return wait_on_sem(&sem); } -/* - * This function is called whenever we need to ensure that the IOMMU has - * completed execution of all commands we sent. It sends a - * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs - * us about that by writing a value to a physical address we pass with - * the command. - */ -static int iommu_completion_wait(struct amd_iommu *iommu) +static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) { - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&iommu->lock, flags); + struct iommu_cmd cmd; - if (!iommu->need_sync) - goto out; + build_inv_dte(&cmd, devid); - ret = __iommu_completion_wait(iommu); + return iommu_queue_command(iommu, &cmd); +} - iommu->need_sync = false; +static void iommu_flush_dte_all(struct amd_iommu *iommu) +{ + u32 devid; - if (ret) - goto out; + for (devid = 0; devid <= 0xffff; ++devid) + iommu_flush_dte(iommu, devid); - __iommu_wait_for_completion(iommu); + iommu_completion_wait(iommu); +} -out: - spin_unlock_irqrestore(&iommu->lock, flags); +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. 
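+ * It walks all 2^16 domain ids and queues one CMD_INV_IOMMU_PAGES
+ * command covering the whole address range for each of them, then
+ * waits for completion.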
+ */ +static void iommu_flush_tlb_all(struct amd_iommu *iommu) +{ + u32 dom_id; - if (iommu->reset_in_progress) - reset_iommu_command_buffer(iommu); + for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { + struct iommu_cmd cmd; + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + dom_id, 1); + iommu_queue_command(iommu, &cmd); + } - return 0; + iommu_completion_wait(iommu); } -static void iommu_flush_complete(struct protection_domain *domain) +static void iommu_flush_all(struct amd_iommu *iommu) { - int i; + struct iommu_cmd cmd; - for (i = 0; i < amd_iommus_present; ++i) { - if (!domain->dev_iommu[i]) - continue; + build_inv_all(&cmd); - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); + iommu_queue_command(iommu, &cmd); + iommu_completion_wait(iommu); +} + +void iommu_flush_all_caches(struct amd_iommu *iommu) +{ + if (iommu_feature(iommu, FEATURE_IA)) { + iommu_flush_all(iommu); + } else { + iommu_flush_dte_all(iommu); + iommu_flush_tlb_all(iommu); } } /* - * Command send function for invalidating a device table entry + * Command send function for flushing on-device TLB */ -static int iommu_flush_device(struct device *dev) +static int device_flush_iotlb(struct device *dev, u64 address, size_t size) { + struct pci_dev *pdev = to_pci_dev(dev); struct amd_iommu *iommu; struct iommu_cmd cmd; u16 devid; + int qdep; + qdep = pci_ats_queue_depth(pdev); devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - /* Build command */ - memset(&cmd, 0, sizeof(cmd)); - CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); - cmd.data[0] = devid; + build_inv_iotlb_pages(&cmd, devid, qdep, address, size); return iommu_queue_command(iommu, &cmd); } -static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - u16 domid, int pde, int s) -{ - memset(cmd, 0, sizeof(*cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - cmd->data[1] |= domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; -} - /* - * Generic command send function for invalidaing TLB entries + * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, - u64 address, u16 domid, int pde, int s) +static int device_flush_dte(struct device *dev) { - struct iommu_cmd cmd; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; int ret; - __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); + pdev = to_pci_dev(dev); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; - ret = iommu_queue_command(iommu, &cmd); + ret = iommu_flush_dte(iommu, devid); + if (ret) + return ret; + + if (pci_ats_enabled(pdev)) + ret = device_flush_iotlb(dev, 0, ~0UL); return ret; } @@ -572,23 +679,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. 
*/ -static void __iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) +static void __domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0, i; - unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); - - address &= PAGE_MASK; - - if (pages > 1) { - /* - * If we have to flush more than one page, flush all - * TLB entries for this domain - */ - address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - s = 1; - } + struct iommu_dev_data *dev_data; + struct iommu_cmd cmd; + int ret = 0, i; + build_inv_iommu_pages(&cmd, address, size, domain->id, pde); for (i = 0; i < amd_iommus_present; ++i) { if (!domain->dev_iommu[i]) @@ -598,101 +696,70 @@ static void __iommu_flush_pages(struct protection_domain *domain, * Devices of this domain are behind this IOMMU * We need a TLB flush */ - iommu_queue_inv_iommu_pages(amd_iommus[i], address, - domain->id, pde, s); + ret |= iommu_queue_command(amd_iommus[i], &cmd); + } + + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); + + if (!pci_ats_enabled(pdev)) + continue; + + ret |= device_flush_iotlb(dev_data->dev, address, size); } - return; + WARN_ON(ret); } -static void iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size) +static void domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - __iommu_flush_pages(domain, address, size, 0); + __domain_flush_pages(domain, address, size, 0); } /* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct protection_domain *domain) +static void domain_flush_tlb(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct protection_domain *domain) +static void domain_flush_tlb_pde(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -static void iommu_flush_domain_devices(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - - list_for_each_entry(dev_data, &domain->dev_list, list) - iommu_flush_device(dev_data->dev); - - spin_unlock_irqrestore(&domain->lock, flags); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } -static void iommu_flush_all_domain_devices(void) +static void domain_flush_complete(struct protection_domain *domain) { - struct protection_domain *domain; - unsigned long flags; + int i; - spin_lock_irqsave(&amd_iommu_pd_lock, flags); + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - iommu_flush_domain_devices(domain); - iommu_flush_complete(domain); + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -void amd_iommu_flush_all_devices(void) -{ - iommu_flush_all_domain_devices(); -} /* - * This function uses heavy locking and may disable irqs for some time. But - * this is no issue because it is only called during resume. 
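The open-coded page-invalidate builder deleted above moves behind build_inv_iommu_pages(), whose definition is outside these hunks; __domain_flush_pages() now builds one command and fans it out. A sketch of the single-page-versus-flush-all decision that helper presumably keeps, under the same page-mask rules (simplified command struct and names, standing in for the real cmd->data[] words):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define SK_PAGE_SHIFT	12
#define SK_PAGE_SIZE	(1ULL << SK_PAGE_SHIFT)
#define SK_PAGE_MASK	(~(SK_PAGE_SIZE - 1))
#define SK_ALL_PAGES	0x7fffffffffffffffULL	/* "flush everything" address */

struct inv_pages_cmd {
	uint64_t address;
	bool	 size_bit;	/* set: range spans more than one 4K page */
};

static void encode_inv_pages(struct inv_pages_cmd *cmd,
			     uint64_t address, size_t size)
{
	/* Round the byte range up to whole pages, including the offset. */
	uint64_t pages = (size + (address & ~SK_PAGE_MASK) + SK_PAGE_SIZE - 1)
			 >> SK_PAGE_SHIFT;

	if (pages > 1) {
		/* Multi-page range: cheaper to flush the whole domain. */
		cmd->address  = SK_ALL_PAGES;
		cmd->size_bit = true;
	} else {
		cmd->address  = address & SK_PAGE_MASK;
		cmd->size_bit = false;
	}
}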
+ * This function flushes the DTEs for all devices in domain */ -void amd_iommu_flush_all_domains(void) +static void domain_flush_devices(struct protection_domain *domain) { - struct protection_domain *domain; + struct iommu_dev_data *dev_data; unsigned long flags; - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - spin_lock(&domain->lock); - iommu_flush_tlb_pde(domain); - iommu_flush_complete(domain); - spin_unlock(&domain->lock); - } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - -static void reset_iommu_command_buffer(struct amd_iommu *iommu) -{ - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + spin_lock_irqsave(&domain->lock, flags); - amd_iommu_reset_cmd_buffer(iommu); - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); + list_for_each_entry(dev_data, &domain->dev_list, list) + device_flush_dte(dev_data->dev); - iommu->reset_in_progress = false; + spin_unlock_irqrestore(&domain->lock, flags); } /**************************************************************************** @@ -1410,17 +1477,22 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -static void set_dte_entry(u16 devid, struct protection_domain *domain) +static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { u64 pte_root = virt_to_phys(domain->pt_root); + u32 flags = 0; pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + if (ats) + flags |= DTE_FLAG_IOTLB; + + amd_iommu_dev_table[devid].data[3] |= flags; + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); } static void clear_dte_entry(u16 devid) @@ -1437,23 +1509,29 @@ static void do_attach(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; + bool ats = false; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); + + if (amd_iommu_iotlb_sup) + ats = pci_ats_enabled(pdev); /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, ats); /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; domain->dev_cnt += 1; /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } static void do_detach(struct device *dev) @@ -1476,7 +1554,7 @@ static void do_detach(struct device *dev) clear_dte_entry(devid); /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } /* @@ -1539,9 +1617,13 @@ out_unlock: static int attach_device(struct device *dev, struct protection_domain *domain) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; int ret; + if (amd_iommu_iotlb_sup) + pci_enable_ats(pdev, PAGE_SHIFT); + write_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); @@ -1551,7 +1633,7 @@ static int 
attach_device(struct device *dev, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return ret; } @@ -1598,12 +1680,16 @@ static void __detach_device(struct device *dev) */ static void detach_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev)) + pci_disable_ats(pdev); } /* @@ -1615,10 +1701,9 @@ static struct protection_domain *domain_for_device(struct device *dev) struct protection_domain *dom; struct iommu_dev_data *dev_data, *alias_data; unsigned long flags; - u16 devid, alias; + u16 devid; devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); if (!alias_data) @@ -1692,7 +1777,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); out: @@ -1753,8 +1838,9 @@ static void update_device_table(struct protection_domain *domain) struct iommu_dev_data *dev_data; list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); u16 devid = get_device_id(dev_data->dev); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, pci_ats_enabled(pdev)); } } @@ -1764,8 +1850,9 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - iommu_flush_domain_devices(domain); - iommu_flush_tlb_pde(domain); + + domain_flush_devices(domain); + domain_flush_tlb_pde(domain); domain->updated = false; } @@ -1924,10 +2011,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(&dma_dom->domain); + domain_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; } else if (unlikely(amd_iommu_np_cache)) - iommu_flush_pages(&dma_dom->domain, address, size); + domain_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1976,7 +2063,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, flush_addr, size); + domain_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } @@ -2012,7 +2099,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, if (addr == DMA_ERROR_CODE) goto out; - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -2039,7 +2126,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, __unmap_single(domain->priv, dma_addr, size, dir); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2104,7 +2191,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -2150,7 +2237,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ 
-2200,7 +2287,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out_free; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2232,7 +2319,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2476,7 +2563,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); } @@ -2542,7 +2629,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, unmap_size = iommu_unmap_page(domain, iova, page_size); mutex_unlock(&domain->api_lock); - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return get_order(unmap_size); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6e11c8134158..9179c21120a8 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -21,7 +21,7 @@ #include <linux/acpi.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/interrupt.h> #include <linux/msi.h> #include <asm/pci-direct.h> @@ -137,6 +137,7 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? */ bool amd_iommu_np_cache __read_mostly; +bool amd_iommu_iotlb_sup __read_mostly = true; /* * The ACPI table parsing functions set this variable on an error @@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */ static u32 alias_table_size; /* size of the alias table */ static u32 rlookup_table_size; /* size if the rlookup table */ +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. 
+ */ +extern void iommu_flush_all_caches(struct amd_iommu *iommu); + static inline void update_last_devid(u16 devid) { if (devid > amd_iommu_last_bdf) @@ -293,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", + static const char * const feat_str[] = { + "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", + "IA", "GA", "HE", "PC", NULL + }; + int i; + + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx", dev_name(&iommu->dev->dev), iommu->cap_ptr); + if (iommu->cap & (1 << IOMMU_CAP_EFR)) { + printk(KERN_CONT " extended features: "); + for (i = 0; feat_str[i]; ++i) + if (iommu_feature(iommu, (1ULL << i))) + printk(KERN_CONT " %s", feat_str[i]); + } + printk(KERN_CONT "\n"); + iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } @@ -651,7 +672,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; - u32 range, misc; + u32 range, misc, low, high; int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, @@ -667,6 +688,15 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) + amd_iommu_iotlb_sup = false; + + /* read extended feature bits */ + low = readl(iommu->mmio_base + MMIO_EXT_FEATURES); + high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); + + iommu->features = ((u64)high << 32) | low; + if (!is_rd890_iommu(iommu->dev)) return; @@ -1004,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu) if (pci_enable_msi(iommu->dev)) return 1; - r = request_irq(iommu->dev->irq, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD-Vi", - NULL); + r = request_threaded_irq(iommu->dev->irq, + amd_iommu_int_handler, + amd_iommu_int_thread, + 0, "AMD-Vi", + iommu->dev); if (r) { pci_disable_msi(iommu->dev); @@ -1244,6 +1275,7 @@ static void enable_iommus(void) iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); + iommu_flush_all_caches(iommu); } } @@ -1260,7 +1292,7 @@ static void disable_iommus(void) * disable suspend until real resume implemented */ -static int amd_iommu_resume(struct sys_device *dev) +static void amd_iommu_resume(void) { struct amd_iommu *iommu; @@ -1274,13 +1306,11 @@ static int amd_iommu_resume(struct sys_device *dev) * we have to flush after the IOMMUs are enabled because a * disabled IOMMU will never execute the commands we send */ - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); - - return 0; + for_each_iommu(iommu) + iommu_flush_all_caches(iommu); } -static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) +static int amd_iommu_suspend(void) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -1288,17 +1318,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static struct sysdev_class amd_iommu_sysdev_class = { - .name = "amd_iommu", +static struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; -static struct sys_device device_amd_iommu = { - .id = 0, - .cls = &amd_iommu_sysdev_class, -}; - /* * This is the core init function for AMD IOMMU hardware in the system. 
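init_iommu_from_pci() now assembles the 64-bit extended feature register from two 32-bit reads, and iommu_enable() walks feat_str[] testing one bit per name. The same pattern in isolation, with a fake register pair standing in for the readl() MMIO window (a sketch, not driver code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Low word first, high word at offset +4, as in the patch. */
static uint64_t read_ext_features(const volatile uint32_t efr[2])
{
	uint32_t low  = efr[0];
	uint32_t high = efr[1];

	return ((uint64_t)high << 32) | low;
}

static bool has_feature(uint64_t features, unsigned int bit)
{
	return features & (1ULL << bit);
}

int main(void)
{
	static const char * const feat_str[] = {
		"PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
		"IA", "GA", "HE", "PC", NULL
	};
	volatile uint32_t fake_efr[2] = { 0x4d, 0x0 };	/* made-up value */
	uint64_t features = read_ext_features(fake_efr);
	int i;

	for (i = 0; feat_str[i]; ++i)
		if (has_feature(features, i))
			printf(" %s", feat_str[i]);
	printf("\n");
	return 0;
}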
* This function is called from the generic x86 DMA layer initialization @@ -1415,14 +1439,6 @@ static int __init amd_iommu_init(void) goto free; } - ret = sysdev_class_register(&amd_iommu_sysdev_class); - if (ret) - goto free; - - ret = sysdev_register(&device_amd_iommu); - if (ret) - goto free; - ret = amd_iommu_init_devices(); if (ret) goto free; @@ -1441,6 +1457,8 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); + register_syscore_ops(&amd_iommu_syscore_ops); + if (iommu_pass_through) goto out; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 0a99f7198bc3..4c39baa8facc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,14 +12,19 @@ static u32 *flush_words; -struct pci_device_id amd_nb_misc_ids[] = { +const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); +static struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + {} +}; + const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { 0x00, 0x18, 0x20 }, { 0xff, 0x00, 0x20 }, @@ -31,7 +36,7 @@ struct amd_northbridge_info amd_northbridges; EXPORT_SYMBOL(amd_northbridges); static struct pci_dev *next_northbridge(struct pci_dev *dev, - struct pci_device_id *ids) + const struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); @@ -43,9 +48,9 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev, int amd_cache_northbridges(void) { - int i = 0; + u16 i = 0; struct amd_northbridge *nb; - struct pci_dev *misc; + struct pci_dev *misc, *link; if (amd_nb_num()) return 0; @@ -64,10 +69,12 @@ int amd_cache_northbridges(void) amd_northbridges.nb = nb; amd_northbridges.num = i; - misc = NULL; + link = misc = NULL; for (i = 0; i != amd_nb_num(); i++) { node_to_amd_nb(i)->misc = misc = next_northbridge(misc, amd_nb_misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, amd_nb_link_ids); } /* some CPU families (e.g. 
family 0x11) do not support GART */ @@ -85,26 +92,95 @@ int amd_cache_northbridges(void) boot_cpu_data.x86_mask >= 0x1)) amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; + return 0; } EXPORT_SYMBOL_GPL(amd_cache_northbridges); -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_amd_nb(u32 device) +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) { - struct pci_device_id *id; + const struct pci_device_id *id; u32 vendor = device & 0xffff; + device >>= 16; for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) - return 1; + return true; + return false; +} + +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + return (mask >> (4 * cuid)) & 0xf; +} + +int amd_set_subcaches(int cpu, int mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + return 0; } -int amd_cache_gart(void) +static int amd_cache_gart(void) { - int i; + u16 i; if (!amd_nb_has_feature(AMD_NB_GART)) return 0; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 51d4e1663066..289e92862fd9 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = { .rating = APBT_CLOCKSOURCE_RATING, .read = apbt_read_clocksource, .mask = APBT_MASK, - .shift = APBT_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = apbt_restart_clocksource, }; @@ -316,7 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev) irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); if (system_state == SYSTEM_BOOTING) { if (request_irq(adev->irq, apbt_interrupt_handler, @@ -508,64 +507,12 @@ static int apbt_next_event(unsigned long delta, 
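The amd_get_subcaches()/amd_set_subcaches() helpers added to amd_nb.c above treat the L3 partitioning register at offset 0x1d4 as an array of 4-bit fields, one per compute unit, with the mask of the other compute units packed from bit 26 up. The field arithmetic with a worked example (wrapper names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Each compute unit owns a 4-bit field in the 0x1d4 register. */
static unsigned int subcache_get(uint32_t reg, int cuid)
{
	return (reg >> (4 * cuid)) & 0xf;
}

static uint32_t subcache_set(unsigned int mask, int cuid)
{
	uint32_t reg = (uint32_t)mask << (4 * cuid);

	/* Bits 26+ name the *other* compute units, as in the patch. */
	reg |= (0xfu ^ (1u << cuid)) << 26;
	return reg;
}

int main(void)
{
	/* Worked example: compute unit 2's field sits at bits 8-11. */
	printf("cu2 mask: 0x%x\n", subcache_get(0x00000f00, 2));	/* 0xf */
	printf("register: 0x%08x\n",
	       (unsigned int)subcache_set(0xf, 2));	/* 0x2c000f00 */
	return 0;
}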
return 0; } -/* - * APB timer clock is not in sync with pclk on Langwell, which translates to - * unreliable read value caused by sampling error. the error does not add up - * overtime and only happens when sampling a 0 as a 1 by mistake. so the time - * would go backwards. the following code is trying to prevent time traveling - * backwards. little bit paranoid. - */ static cycle_t apbt_read_clocksource(struct clocksource *cs) { - unsigned long t0, t1, t2; - static unsigned long last_read; - -bad_count: - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if (unlikely(t1 < t2)) { - pr_debug("APBT: read current count error %lx:%lx:%lx\n", - t1, t2, t2 - t1); - goto bad_count; - } - /* - * check against cached last read, makes sure time does not go back. - * it could be a normal rollover but we will do tripple check anyway - */ - if (unlikely(t2 > last_read)) { - /* check if we have a normal rollover */ - unsigned long raw_intr_status = - apbt_readl_reg(APBTMRS_RAW_INT_STATUS); - /* - * cs timer interrupt is masked but raw intr bit is set if - * rollover occurs. then we read EOI reg to clear it. - */ - if (raw_intr_status & (1 << phy_cs_timer_id)) { - apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); - goto out; - } - pr_debug("APB CS going back %lx:%lx:%lx ", - t2, last_read, t2 - last_read); -bad_count_x3: - pr_debug("triple check enforced\n"); - t0 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if ((t2 > t1) || (t1 > t0)) { - printk(KERN_ERR "Error: APB CS tripple check failed\n"); - goto bad_count_x3; - } - } -out: - last_read = t2; - return (cycle_t)~t2; + unsigned long current_count; + + current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); + return (cycle_t)~current_count; } static int apbt_clocksource_register(void) @@ -595,14 +542,7 @@ static int apbt_clocksource_register(void) if (t1 == apbt_read_clocksource(&clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); + clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); return 0; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5955a7800a96..3d2661ca6542 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -13,7 +13,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mmzone.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -30,6 +30,22 @@ #include <asm/amd_nb.h> #include <asm/x86_init.h> +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the in-place decompress, and could overlap + * with the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area becomes e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_big (uncompressed size will be bigger than 64M or 128M) + * So don't place the gart iommu below 512M; leave that space for kernel + * code, to be safe.
+ */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; @@ -57,7 +73,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + unsigned long addr; /* aper_size should <= 1G */ if (fallback_aper_order > 5) @@ -70,40 +86,27 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - /* - * using 512M as goal, in case kexec will load kernel_big - * that will do the on position decompress, and could overlap with - * that positon with gart that is used. - * sequende: - * kernel_small - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kernel_small(gart area become e820_reserved) - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kerne_big (uncompressed size will be big than 64M or 128M) - * so don't use 512M below as gart iommu, leave the space for kernel - * code for safe - */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { + printk(KERN_ERR + "Cannot allocate aperture memory hole (%lx,%uK)\n", + addr, aper_size>>10); + return 0; + } + memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. */ - kmemleak_ignore(p); - if (!p || __pa(p)+aper_size > 0xffffffff) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); - if (p) - free_bootmem(__pa(p), aper_size); - return 0; - } + kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); - insert_aperture_resource((u32)__pa(p), aper_size); - register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, - (u32)__pa(p+aper_size) >> PAGE_SHIFT); + aper_size >> 10, addr); + insert_aperture_resource((u32)addr, aper_size); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); - return (u32)__pa(p); + return (u32)addr; } @@ -500,7 +503,7 @@ out: * Don't enable translation yet but enable GART IO and CPU * accesses and set DISTLBWALKPRB since GART table memory is UC. 
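allocate_aperture() switches from bootmem to memblock's find-then-reserve idiom, with explicit [512M, 4G) bounds: high enough to stay clear of a kexec'd kernel's decompression area, low enough to remain 32-bit addressable. Reduced to its skeleton (a condensed restatement of the hunk above, not a drop-in):

static u32 __init find_and_reserve_hole(u32 aper_size)
{
	unsigned long addr;

	/* Search [512M, 4G) for a naturally aligned hole. */
	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
				      aper_size, aper_size);
	if (addr == MEMBLOCK_ERROR)
		return 0;

	/* find_in_range() only locates memory; reserving it is a
	 * separate, explicit step. */
	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");

	return (u32)addr;
}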
*/ - u32 ctl = DISTLBWALKPRB | aper_order << 1; + u32 ctl = aper_order << 1; bus = amd_nb_bus_dev_ranges[i].bus; dev_base = amd_nb_bus_dev_ranges[i].dev_base; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3966b564ea47..767fd04f2843 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,20 +2,25 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) -obj-y += apic_flat_64.o -obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o -obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +# APIC probe will depend on the listing order here obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o +obj-y += apic_flat_64.o endif -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o +# APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT) += summit_32.o +obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o +obj-$(CONFIG_X86_ES7000) += es7000_32.o + +# For 32bit, probe_32 needs to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 76b96d74978a..b961af86bfea 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -24,7 +24,7 @@ #include <linux/ftrace.h> #include <linux/ioport.h> #include <linux/module.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/timex.h> #include <linux/dmar.h> @@ -43,6 +43,7 @@ #include <asm/i8259.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> #include <asm/idle.h> @@ -78,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #ifdef CONFIG_X86_32 + +/* + * On x86_32, the mapping between cpu and logical apicid may vary + * depending on apic in use. The following early percpu variable is + * used for the mapping. This is where the behaviors of x86_64 and 32 + * actually diverge. Let's keep it ugly for now. + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); + /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ -static int force_enable_local_apic; +static int force_enable_local_apic __initdata; /* * APIC command line parameters */ @@ -153,7 +163,7 @@ early_param("nox2apic", setup_nox2apic); unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; +static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -177,29 +187,8 @@ static struct resource lapic_resource = { static unsigned int calibration_result; -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(const struct cpumask *mask); static void apic_pm_activate(void); -/* - * The local apic timer can be used for any function which is CPU local.
- */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - static unsigned long apic_phys; /* @@ -238,7 +227,7 @@ static int modern_apic(void) * right after this call apic become NOOP driven * so apic->write/read doesn't do anything */ -void apic_disable(void) +static void __init apic_disable(void) { pr_info("APIC: switched to apic NOOP\n"); apic = &apic_noop; @@ -282,23 +271,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - #ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs @@ -508,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask) #endif } + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -516,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { + if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; @@ -1209,7 +1198,7 @@ void __cpuinit setup_local_APIC(void) rdtscll(tsc); if (disable_apic) { - arch_disable_smp_support(); + disable_ioapic_support(); return; } @@ -1237,6 +1226,30 @@ void __cpuinit setup_local_APIC(void) */ apic->init_apic_ldr(); +#ifdef CONFIG_X86_32 + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches the + * actual value. + */ + i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + /* always use the value from LDR */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + logical_smp_processor_id(); + + /* + * Some NUMA implementations (NUMAQ) don't initialize apicid to + * node mapping during NUMA init. Now that logical apicid is + * guaranteed to be known, give it another chance. This is already + * a bit too late - percpu allocation has already happened without + * proper NUMA affinity. + */ + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); +#endif + /* * Set Task Priority to 'accept all'. We never change this * later on. 
@@ -1448,7 +1461,6 @@ int __init enable_IR(void) void __init enable_IR_x2apic(void) { unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; int dmar_table_init_ret; @@ -1456,13 +1468,7 @@ void __init enable_IR_x2apic(void) if (dmar_table_init_ret && !x2apic_supported()) return; - ioapic_entries = alloc_ioapic_entries(); - if (!ioapic_entries) { - pr_err("Allocate ioapic_entries failed\n"); - goto out; - } - - ret = save_IO_APIC_setup(ioapic_entries); + ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); goto out; @@ -1470,7 +1476,7 @@ void __init enable_IR_x2apic(void) local_irq_save(flags); legacy_pic->mask_all(); - mask_IO_APIC_setup(ioapic_entries); + mask_ioapic_entries(); if (dmar_table_init_ret) ret = 0; @@ -1501,14 +1507,11 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ - restore_IO_APIC_setup(ioapic_entries); + restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); out: - if (ioapic_entries) - free_ioapic_entries(ioapic_entries); - if (x2apic_enabled) return; @@ -1537,7 +1540,7 @@ static int __init detect_init_APIC(void) } #else -static int apic_verify(void) +static int __init apic_verify(void) { u32 features, h, l; @@ -1562,7 +1565,7 @@ static int apic_verify(void) return 0; } -int apic_force_enable(void) +int __init apic_force_enable(unsigned long addr) { u32 h, l; @@ -1578,7 +1581,7 @@ int apic_force_enable(void) if (!(l & MSR_IA32_APICBASE_ENABLE)) { pr_info("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; wrmsr(MSR_IA32_APICBASE, l, h); enabled_via_apicbase = 1; } @@ -1619,7 +1622,7 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - if (apic_force_enable()) + if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) return -1; } else { if (apic_verify()) @@ -1810,30 +1813,41 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - u32 v, v1; + u32 v0, v1; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. 
-- REW */ - v = apic_read(APIC_ESR); + v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - /* - * Here is what the APIC error bits mean: - * 0: Send CS error - * 1: Receive CS error - * 2: Send accept error - * 3: Receive accept error - * 4: Reserved - * 5: Send illegal vector - * 6: Received illegal vector - * 7: Illegal register address - */ - pr_debug("APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", + smp_processor_id(), v0, v1); + + v1 = v1 & 0xff; + while (v1) { + if (v1 & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v1 >>= 1; + } + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + irq_exit(); } @@ -1930,17 +1944,6 @@ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - /* - * Validate version - */ - if (version == 0x0) { - pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - if (num_processors >= nr_cpu_ids) { int max = nr_cpu_ids; int thiscpu = max + disabled_cpus; @@ -1954,22 +1957,34 @@ void __cpuinit generic_processor_info(int apicid, int version) } num_processors++; - cpu = cpumask_next_zero(-1, cpu_present_mask); - - if (version != apic_version[boot_cpu_physical_apicid]) - WARN_ONCE(1, - "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); - - physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. + * boot_cpu_init() already holds bit 0 in cpu_present_mask + * for BSP.
*/ cpu = 0; + } else + cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* + * Validate version + */ + if (version == 0x0) { + pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; } + apic_version[apicid] = version; + + if (version != apic_version[boot_cpu_physical_apicid]) { + pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + apic_version[boot_cpu_physical_apicid], cpu, version); + } + + physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -1977,7 +1992,10 @@ void __cpuinit generic_processor_info(int apicid, int version) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; #endif - +#ifdef CONFIG_X86_32 + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + apic->x86_32_early_logical_apicid(cpu); +#endif set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -1997,17 +2015,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifdef CONFIG_X86_32 -int default_apicid_to_node(int logical_apicid) -{ -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; -#else - return 0; -#endif -} -#endif - /* * Power management */ @@ -2036,7 +2043,7 @@ static struct { unsigned int apic_thmr; } apic_pm_state; -static int lapic_suspend(struct sys_device *dev, pm_message_t state) +static int lapic_suspend(void) { unsigned long flags; int maxlvt; @@ -2074,34 +2081,24 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int lapic_resume(struct sys_device *dev) +static void lapic_resume(void) { unsigned int l, h; unsigned long flags; int maxlvt; - int ret = 0; - struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) - return 0; + return; local_irq_save(flags); if (intr_remapping_enabled) { - ioapic_entries = alloc_ioapic_entries(); - if (!ioapic_entries) { - WARN(1, "Alloc ioapic_entries in lapic resume failed."); - ret = -ENOMEM; - goto restore; - } - - ret = save_IO_APIC_setup(ioapic_entries); - if (ret) { - WARN(1, "Saving IO-APIC state failed: %d\n", ret); - free_ioapic_entries(ioapic_entries); - goto restore; - } - - mask_IO_APIC_setup(ioapic_entries); + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); legacy_pic->mask_all(); } @@ -2144,16 +2141,10 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) { + if (intr_remapping_enabled) reenable_intr_remapping(x2apic_mode); - legacy_pic->restore_mask(); - restore_IO_APIC_setup(ioapic_entries); - free_ioapic_entries(ioapic_entries); - } -restore: - local_irq_restore(flags); - return ret; + local_irq_restore(flags); } /* @@ -2161,17 +2152,11 @@ restore: * are needed on every CPU up until machine_halt/restart/poweroff. 
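The lapic suspend/resume conversion below mirrors the amd_iommu_init.c one earlier: sysdev classes and devices give way to syscore_ops, whose callbacks take no device argument, run with interrupts disabled, and need a single registration call. The minimal shape, with placeholder names:

#include <linux/syscore_ops.h>

static int foo_suspend(void)
{
	/* quiesce hardware; nonzero aborts the system suspend */
	return 0;
}

static void foo_resume(void)
{
	/* reprogram hardware; resume cannot fail, hence void */
}

static struct syscore_ops foo_syscore_ops = {
	.suspend = foo_suspend,
	.resume  = foo_resume,
};

static int __init foo_init(void)
{
	register_syscore_ops(&foo_syscore_ops);
	return 0;
}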
*/ -static struct sysdev_class lapic_sysclass = { - .name = "lapic", +static struct syscore_ops lapic_syscore_ops = { .resume = lapic_resume, .suspend = lapic_suspend, }; -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; @@ -2179,16 +2164,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { - int error; - - if (!cpu_has_apic) - return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (cpu_has_apic) + register_syscore_ops(&lapic_syscore_ops); - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; + return 0; } /* local apic needs to resume before other devices access its registers. */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 09d3b17ce0c2..f7a41e4cae47 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -16,6 +16,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/hardirq.h> +#include <linux/module.h> #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> @@ -24,6 +25,12 @@ #include <acpi/acpi_bus.h> #endif +static struct apic apic_physflat; +static struct apic apic_flat; + +struct apic __read_mostly *apic = &apic_flat; +EXPORT_SYMBOL_GPL(apic); + static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 1; @@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -struct apic apic_flat = { +static struct apic apic_flat = { .name = "flat", .probe = NULL, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -185,8 +192,6 @@ struct apic apic_flat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -314,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_apicid, cpu); } -struct apic apic_physflat = { +static int physflat_probe(void) +{ + if (apic == &apic_physflat || num_possible_cpus() > 8) + return 1; + + return 0; +} + +static struct apic apic_physflat = { .name = "physical flat", - .probe = NULL, + .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, @@ -337,8 +350,6 @@ struct apic apic_physflat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -373,3 +384,8 @@ struct apic apic_physflat = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, }; + +/* + * We need to check for physflat first, so this order is important. 
+ */ +apic_drivers(apic_physflat, apic_flat); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e31b9ffe25f5..775b82bc655c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void) return 0; } -static int noop_cpu_to_logical_apicid(int cpu) -{ - return 0; -} - static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; @@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -int noop_apicid_to_node(int logical_apicid) -{ - /* we're always on node 0 */ - return 0; -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE((cpu_has_apic && !disable_apic)); @@ -153,9 +142,7 @@ struct apic apic_noop = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = noop_apicid_to_node, - .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, @@ -197,4 +184,8 @@ struct apic apic_noop = { .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, +#endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cb804c5091b9..efd737e827f4 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit) return 1; } +static int bigsmp_early_logical_apicid(int cpu) +{ + /* on bigsmp, logical apicid is the same as physical */ + return early_per_cpu(x86_cpu_to_apicid, cpu); +} + static inline unsigned long calculate_ldr(int cpu) { unsigned long val, id; @@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void) nr_ioapics); } -static int bigsmp_apicid_to_node(int logical_apicid) -{ - return apicid_2_node[hard_smp_processor_id()]; -} - static int bigsmp_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -/* Mapping from cpu number to logical apicid */ -static inline int bigsmp_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_physical_id(cpu); -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) /* As we are using single CPU as destination, pick only one CPU here */ static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) { - return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); + int cpu = cpumask_first(cpumask); + + if (cpu < nr_cpu_ids) + return cpu_physical_id(cpu); + return BAD_APICID; } static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, @@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, */ for_each_cpu_and(cpu, cpumask, andmask) { if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + return cpu_physical_id(cpu); } - return bigsmp_cpu_to_logical_apicid(cpu); + return BAD_APICID; } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) @@ -196,7 +193,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } 
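The apic_driver()/apic_drivers() registrations added here feed a probe table whose ordering the comments and the Makefile hunk above are careful about; selection reduces to a first-match scan, so table order encodes priority (physflat before flat, es7000 before es7000_cluster). The essence of such a loop, with a stand-in struct:

#include <stddef.h>

struct apic_sketch {
	const char *name;
	int (*probe)(void);
};

/* First probe() that returns nonzero wins. */
static const struct apic_sketch *
probe_apic(const struct apic_sketch *const drivers[], size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (drivers[i]->probe && drivers[i]->probe())
			return drivers[i];

	return NULL;	/* caller falls back to a default */
}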
-struct apic apic_bigsmp = { +static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, @@ -219,8 +216,6 @@ struct apic apic_bigsmp = { .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = bigsmp_apicid_to_node, - .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -256,4 +251,16 @@ struct apic apic_bigsmp = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; + +struct apic * __init generic_bigsmp_probe(void) +{ + if (probe_bigsmp()) + return &apic_bigsmp; + + return NULL; +} + +apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8593582d8022..9536b3fe43f8 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } +static int es7000_early_logical_apicid(int cpu) +{ + /* on es7000, logical apicid is the same as physical */ + return early_per_cpu(x86_bios_cpu_apicid, cpu); +} + static unsigned long calculate_ldr(int cpu) { unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); @@ -504,12 +510,6 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_apicid_to_node(int logical_apicid) -{ - return 0; -} - - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) ++cpu_id; } -/* Mapping from cpu number to logical apicid */ -static int es7000_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -#else - return logical_smp_processor_id(); -#endif -} - static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = es7000_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); @@ -578,7 +566,7 @@ static unsigned int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = es7000_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -632,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, } /* We've been warned by a false positive warning.Use __refdata to keep calm. 
*/ -struct apic __refdata apic_es7000_cluster = { +static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, @@ -655,8 +643,6 @@ struct apic __refdata apic_es7000_cluster = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -695,9 +681,11 @@ struct apic __refdata apic_es7000_cluster = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; -struct apic __refdata apic_es7000 = { +static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, @@ -720,8 +708,6 @@ struct apic __refdata apic_es7000 = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -758,4 +744,12 @@ struct apic __refdata apic_es7000 = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; + +/* + * Need to check for es7000 followed by es7000_cluster, so this order + * in apic_drivers is important. + */ +apic_drivers(apic_es7000, apic_es7000_cluster); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79fd43ca6f96..d5e57db0f7be 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,11 +16,12 @@ #include <linux/kprobes.h> #include <linux/nmi.h> #include <linux/module.h> +#include <linux/delay.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR -u64 hw_nmi_get_sample_period(void) +u64 hw_nmi_get_sample_period(int watchdog_thresh) { - return (u64)(cpu_khz) * 1000 * 60; + return (u64)(cpu_khz) * 1000 * watchdog_thresh; } #endif @@ -83,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, arch_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - dump_stack(); arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NOTIFY_STOP; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ca9e2a3545a9..e5293394b548 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -30,7 +30,7 @@ #include <linux/compiler.h> #include <linux/acpi.h> #include <linux/module.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/msi.h> #include <linux/htirq.h> #include <linux/freezer.h> @@ -76,17 +76,40 @@ int sis_apic_bug = -1; static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_RAW_SPINLOCK(vector_lock); -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; + /* + * Saved state during suspend/resume, or while enabling intr-remap. 
+ */ + struct IO_APIC_route_entry *saved_registers; + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} ioapics[MAX_IO_APICS]; -/* I/O APIC entries */ -struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; +#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver -/* IO APIC gsi routing info */ -struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; +int mpc_ioapic_id(int id) +{ + return ioapics[id].mp_config.apicid; +} + +unsigned int mpc_ioapic_addr(int id) +{ + return ioapics[id].mp_config.apicaddr; +} + +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id) +{ + return &ioapics[id].gsi_config; +} + +int nr_ioapics; /* The one past the highest gsi number used */ u32 gsi_top; @@ -108,7 +131,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; -void arch_disable_smp_support(void) +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) { #ifdef CONFIG_PCI noioapicquirk = 1; @@ -120,11 +146,14 @@ void arch_disable_smp_support(void) static int __init parse_noapic(char *str) { /* disable IO-APIC */ - arch_disable_smp_support(); + disable_ioapic_support(); return 0; } early_param("noapic", parse_noapic); +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { @@ -173,6 +202,14 @@ int __init arch_early_irq_init(void) io_apic_irqs = ~0UL; } + for (i = 0; i < nr_ioapics; i++) { + ioapics[i].saved_registers = + kzalloc(sizeof(struct IO_APIC_route_entry) * + ioapics[i].nr_registers, GFP_KERNEL); + if (!ioapics[i].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); @@ -181,7 +218,7 @@ int __init arch_early_irq_init(void) irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); for (i = 0; i < count; i++) { - set_irq_chip_data(i, &cfg[i]); + irq_set_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* @@ -200,7 +237,7 @@ int __init arch_early_irq_init(void) #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return get_irq_chip_data(irq); + return irq_get_chip_data(irq); } static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) @@ -226,7 +263,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { if (!cfg) return; - set_irq_chip_data(at, NULL); + irq_set_chip_data(at, NULL); free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); kfree(cfg); @@ -256,14 +293,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) if (res < 0) { if (res != -EEXIST) return NULL; - cfg = get_irq_chip_data(at); + cfg = irq_get_chip_data(at); if (cfg) return cfg; } cfg = alloc_irq_cfg(at, node); if (cfg) - set_irq_chip_data(at, cfg); + irq_set_chip_data(at, cfg); else irq_free_desc(at); return cfg; @@ -291,7 +328,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } static inline void io_apic_eoi(unsigned int apic, unsigned int vector) @@ -567,7 +604,7 @@ static void clear_IO_APIC (void) int apic, pin; for (apic = 0; 
apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) clear_IO_APIC_pin(apic, pin); } @@ -609,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -struct IO_APIC_route_entry **alloc_ioapic_entries(void) -{ - int apic; - struct IO_APIC_route_entry **ioapic_entries; - - ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, - GFP_KERNEL); - if (!ioapic_entries) - return 0; - - for (apic = 0; apic < nr_ioapics; apic++) { - ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!ioapic_entries[apic]) - goto nomem; - } - - return ioapic_entries; - -nomem: - while (--apic >= 0) - kfree(ioapic_entries[apic]); - kfree(ioapic_entries); - - return 0; -} - /* * Saves all the IO-APIC RTE's */ -int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +int save_ioapic_entries(void) { int apic, pin; - - if (!ioapic_entries) - return -ENOMEM; + int err = 0; for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - return -ENOMEM; + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_entries[apic][pin] = + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } - return 0; + return err; } /* * Mask all IO APIC entries. */ -void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +void mask_ioapic_entries(void) { int apic, pin; - if (!ioapic_entries) - return; - for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - break; + if (!ioapics[apic].saved_registers) + continue; - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { struct IO_APIC_route_entry entry; - entry = ioapic_entries[apic][pin]; + entry = ioapics[apic].saved_registers[pin]; if (!entry.mask) { entry.mask = 1; ioapic_write_entry(apic, pin, entry); @@ -686,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) } /* - * Restore IO APIC entries which was saved in ioapic_entries. + * Restore IO APIC entries which were saved in the ioapic structure. */ -int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +int restore_ioapic_entries(void) { int apic, pin; - if (!ioapic_entries) - return -ENOMEM; - for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - return -ENOMEM; + if (!ioapics[apic].saved_registers) + continue; - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) ioapic_write_entry(apic, pin, - ioapic_entries[apic][pin]); + ioapics[apic].saved_registers[pin]); } return 0; } -void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) -{ - int apic; - - for (apic = 0; apic < nr_ioapics; apic++) - kfree(ioapic_entries[apic]); - - kfree(ioapic_entries); -} - /* * Find the IRQ entry number of a certain pin.
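The hunks above collapse three parallel per-IOAPIC arrays (nr_ioapic_registers[], mp_ioapics[], mp_gsi_routing[]) into one ioapics[] table reached through small accessors, so callers such as save_ioapic_entries()/restore_ioapic_entries() no longer depend on the array layout. A minimal userspace sketch of that consolidation pattern follows; the struct and accessor names mirror the patch, while the field values and the MAX_IO_APICS bound are illustrative only:

#include <stdio.h>

#define MAX_IO_APICS 128

static struct ioapic {
	int nr_registers;	/* # of IRQ routing registers (pins) */
	struct {
		int apicid;
		unsigned int apicaddr;
	} mp_config;		/* board/firmware supplied config */
	struct {
		unsigned int gsi_base, gsi_end;
	} gsi_config;		/* GSI range served by this IOAPIC */
} ioapics[MAX_IO_APICS];

static int nr_ioapics;

/* Accessors keep every caller independent of the table layout. */
static int mpc_ioapic_id(int id)            { return ioapics[id].mp_config.apicid; }
static unsigned int mpc_ioapic_addr(int id) { return ioapics[id].mp_config.apicaddr; }

int main(void)
{
	ioapics[0].nr_registers       = 24;
	ioapics[0].mp_config.apicid   = 2;
	ioapics[0].mp_config.apicaddr = 0xfec00000u;
	nr_ioapics = 1;

	for (int i = 0; i < nr_ioapics; i++)
		printf("IOAPIC[%d]: id %d at 0x%x, %d pins\n", i,
		       mpc_ioapic_id(i), mpc_ioapic_addr(i),
		       ioapics[i].nr_registers);
	return 0;
}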
*/ @@ -725,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].irqtype == type && - (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || + (mp_irqs[i].dstapic == mpc_ioapic_id(apic) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; @@ -767,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) + if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic) return apic; } } @@ -818,7 +811,7 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) default_ISA_polarity(idx) -static int MPBIOS_polarity(int idx) +static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; int polarity; @@ -860,7 +853,7 @@ static int MPBIOS_polarity(int idx) return polarity; } -static int MPBIOS_trigger(int idx) +static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; int trigger; @@ -932,20 +925,11 @@ static int MPBIOS_trigger(int idx) return trigger; } -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - static int pin_2_irq(int idx, int apic, int pin) { int irq; int bus = mp_irqs[idx].srcbus; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); /* * Debugging check, we are in big trouble if this message pops up! @@ -956,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin) if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; } else { - u32 gsi = mp_gsi_routing[apic].gsi_base + pin; + u32 gsi = gsi_cfg->gsi_base + pin; if (gsi >= NR_IRQS_LEGACY) irq = gsi; @@ -1007,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, int lbus = mp_irqs[i].srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic || mp_irqs[i].dstapic == MP_APIC_ALL) break; @@ -1189,7 +1173,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; /* @@ -1220,17 +1204,13 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { int apic, idx, pin; for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { idx = find_irq_entry(apic, pin, mp_INT); if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) return irq_trigger(idx); @@ -1248,35 +1228,31 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) { + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) + trigger == IOAPIC_LEVEL) { irq_set_status_flags(irq, IRQ_LEVEL); - else + fasteoi = true; + } else { irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(cfg)) { 
irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; + chip = &ir_ioapic_chip; + fasteoi = trigger != 0; } - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); } static int setup_ioapic_entry(int apic_id, int irq, @@ -1362,56 +1338,45 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", - apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, + apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, irq, trigger, polarity); - if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, + if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, dest, trigger, polarity, cfg->vector, pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic_id].apicid, pin); + mpc_ioapic_id(apic_id), pin); __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, cfg, trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); ioapic_write_entry(apic_id, pin, entry); } -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - -static void __init setup_IO_APIC_irqs(void) +static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) { - int apic_id, pin, idx, irq, notcon = 0; - int node = cpu_to_node(0); - struct irq_cfg *cfg; + if (idx != -1) + return false; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(apic_id), pin); + return true; +} - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { +static void __init __io_apic_setup_irqs(unsigned int apic_id) +{ + int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + unsigned int pin, irq; + + for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); + if (io_apic_pin_not_connected(idx, apic_id, pin)) continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } irq = pin_2_irq(idx, apic_id, pin); @@ -1423,25 +1388,24 @@ static void __init setup_IO_APIC_irqs(void) * installed and if it returns 1: */ if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) + apic->multi_timer_check(apic_id, irq)) continue; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - continue; + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - add_pin_to_irq_node(cfg, node, apic_id, pin); - /* - * don't mark it in pin_programmed, so later acpi could - * set it correctly when irq < 16 - */ - setup_ioapic_irq(apic_id, pin, irq, cfg, 
irq_trigger(idx), - irq_polarity(idx)); + io_apic_setup_irq_pin(irq, node, &attr); } +} - if (notcon) - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int apic_id; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) + __io_apic_setup_irqs(apic_id); } /* @@ -1452,7 +1416,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct io_apic_irq_attr attr; /* * Convert 'gsi' to 'ioapic.pin'. @@ -1472,21 +1436,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return; - - add_pin_to_irq_node(cfg, node, apic_id, pin); + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[apic_id].apicid, pin); - return; - } - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); - - setup_ioapic_irq(apic_id, pin, irq, cfg, - irq_trigger(idx), irq_polarity(idx)); + io_apic_setup_irq_pin_once(irq, node, &attr); } /* @@ -1518,7 +1471,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1541,7 +1495,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].apicid, nr_ioapic_registers[i]); + mpc_ioapic_id(i), ioapics[i].nr_registers); /* * We are a bit conservative about what we expect. We have to @@ -1561,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void) raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1625,7 +1579,7 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -1855,7 +1809,7 @@ void __init enable_IO_APIC(void) for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, pin); @@ -1916,7 +1870,7 @@ void disable_IO_APIC(void) * * With interrupt-remapping, for now we will use virtual wire A mode, * as virtual wire B is little complex (need to configure both - * IOAPIC RTE aswell as interrupt-remapping table entry). + * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. 
*/ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { @@ -1979,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(apic_id, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mp_ioapics[apic_id].apicid; + old_id = mpc_ioapic_id(apic_id); - if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { + if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic_id, mp_ioapics[apic_id].apicid); + apic_id, mpc_ioapic_id(apic_id)); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - mp_ioapics[apic_id].apicid = reg_00.bits.ID; + ioapics[apic_id].mp_config.apicid = reg_00.bits.ID; } /* @@ -1995,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (apic->check_apicid_used(&phys_id_present_map, - mp_ioapics[apic_id].apicid)) { + mpc_ioapic_id(apic_id))) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic_id, mp_ioapics[apic_id].apicid); + apic_id, mpc_ioapic_id(apic_id)); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -2006,13 +1960,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - mp_ioapics[apic_id].apicid = i; + ioapics[apic_id].mp_config.apicid = i; } else { physid_mask_t tmp; - apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); + apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id), + &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mp_ioapics[apic_id].apicid); + mpc_ioapic_id(apic_id)); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -2020,24 +1975,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mp_ioapics[apic_id].apicid) + if (old_id != mpc_ioapic_id(apic_id)) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].dstapic == old_id) mp_irqs[i].dstapic - = mp_ioapics[apic_id].apicid; + = mpc_ioapic_id(apic_id); /* * Update the ID register according to the right value * from the MPC table if they are different. 
*/ - if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) + if (mpc_ioapic_id(apic_id) == reg_00.bits.ID) continue; apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic_id].apicid); + mpc_ioapic_id(apic_id)); - reg_00.bits.ID = mp_ioapics[apic_id].apicid; + reg_00.bits.ID = mpc_ioapic_id(apic_id); raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2048,7 +2003,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) + if (reg_00.bits.ID != mpc_ioapic_id(apic_id)) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2391,7 +2346,7 @@ static void irq_complete_move(struct irq_cfg *cfg) void irq_force_complete_move(int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); if (!cfg) return; @@ -2405,7 +2360,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { } static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); - move_native_irq(data->irq); + irq_move_irq(data); ack_APIC_irq(); } @@ -2434,7 +2389,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { - if (mp_ioapics[entry->apic].apicver >= 0x20) { + if (mpc_ioapic_ver(entry->apic) >= 0x20) { /* * Intr-remapping uses pin number as the virtual vector * in the RTE. Actual vector is programmed in @@ -2462,7 +2417,7 @@ static void ack_apic_level(struct irq_data *data) irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(irqd_is_setaffinity_pending(data))) { do_unmask_irq = 1; mask_ioapic(cfg); } @@ -2551,7 +2506,7 @@ static void ack_apic_level(struct irq_data *data) * and you can go talk to the chipset vendor about it. */ if (!io_apic_level_ack_pending(cfg)) - move_masked_irq(irq); + irq_move_masked_irq(data); unmask_ioapic(cfg); } } @@ -2614,7 +2569,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2625,7 +2580,7 @@ static inline void init_IO_APIC_traps(void) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ - set_irq_chip(irq, &no_irq_chip); + irq_set_chip(irq, &no_irq_chip); } } } @@ -2665,7 +2620,7 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2749,7 +2704,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = get_irq_chip_data(0); + struct irq_cfg *cfg = irq_get_chip_data(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2935,7 +2890,7 @@ void __init setup_IO_APIC(void) } /* - * Called after all the initialization is done. If we didnt find any + * Called after all the initialization is done. 
If we didnt find any + * Called after all the initialization is done. If we didn't find any * APIC bugs then we can allow the modify fast path */ @@ -2948,89 +2903,44 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) +static void resume_ioapic_id(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; unsigned long flags; union IO_APIC_reg_00 reg_00; - int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].apicid; - io_apic_write(dev->id, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_id, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_id); + io_apic_write(ioapic_id, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); +} - return 0; +static void ioapic_resume(void) +{ + int ioapic_id; + + for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) + resume_ioapic_id(ioapic_id); + + restore_ioapic_entries(); } -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, +static struct syscore_ops ioapic_syscore_ops = { + .suspend = save_ioapic_entries, .resume = ioapic_resume, }; -static int __init ioapic_init_sysfs(void) +static int __init ioapic_init_ops(void) { - struct sys_device * dev; - int i, size, error; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } + register_syscore_ops(&ioapic_syscore_ops); return 0; } -device_initcall(ioapic_init_sysfs); +device_initcall(ioapic_init_ops); /* * Dynamic irq allocate and deallocation @@ -3060,7 +2970,7 @@ unsigned int create_irq_nr(unsigned int from, int node) raw_spin_unlock_irqrestore(&vector_lock, flags); if (ret) { - set_irq_chip_data(irq, cfg); + irq_set_chip_data(irq, cfg); irq_clear_status_flags(irq, IRQ_NOREQUEST); } else { free_irq_at(irq, cfg); @@ -3085,7 +2995,7 @@ int create_irq(void) void destroy_irq(unsigned int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long flags; irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); @@ -3119,7 +3029,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest
= apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(cfg)) { struct irte irte; int ir_index; u16 sub_handle; @@ -3291,6 +3201,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { + struct irq_chip *chip = &msi_chip; struct msi_msg msg; int ret; @@ -3298,14 +3209,15 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, msidesc); + irq_set_msi_desc(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + chip = &msi_ir_chip; + } + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3423,8 +3335,8 @@ int arch_setup_dmar_msi(unsigned int irq) if (ret < 0) return ret; dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); return 0; } #endif @@ -3482,6 +3394,7 @@ static struct irq_chip hpet_msi_type = { int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { + struct irq_chip *chip = &hpet_msi_type; struct msi_msg msg; int ret; @@ -3501,15 +3414,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) if (ret < 0) return ret; - hpet_msi_write(get_irq_data(irq), &msg); + hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(get_irq_chip_data(irq))) - set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, - handle_edge_irq, "edge"); - else - set_irq_chip_and_handler_name(irq, &hpet_msi_type, - handle_edge_irq, "edge"); + if (irq_remapped(irq_get_chip_data(irq))) + chip = &ir_hpet_msi_type; + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } #endif @@ -3596,7 +3506,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) write_ht_irq_msg(irq, &msg); - set_irq_chip_and_handler_name(irq, &ht_irq_chip, + irq_set_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); @@ -3605,7 +3515,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -int __init io_apic_get_redir_entries (int ioapic) +static int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg, + attr->trigger, attr->polarity); + return ret; +} + +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int id = attr->ioapic, pin = attr->ioapic_pin; + int ret; + + /* Avoid redundant programming */ + if (test_bit(pin, ioapics[id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mpc_ioapic_id(id), pin); + return 0; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, ioapics[id].pin_programmed); + return ret; +} + +static int 
__init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3659,96 +3602,24 @@ int __init arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { - struct irq_cfg *cfg; int node; - int ioapic, pin; - int trigger, polarity; - ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); + irq_attr->ioapic); return -EINVAL; } - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(0); - - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return 0; - - pin = irq_attr->ioapic_pin; - trigger = irq_attr->trigger; - polarity = irq_attr->polarity; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= legacy_pic->nr_legacy_irqs) { - if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { - printk(KERN_INFO "can not add pin %d for irq %d\n", - pin, irq); - return 0; - } - } - - setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); + node = dev ? dev_to_node(dev) : cpu_to_node(0); - return 0; -} - -int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) -{ - int ioapic, pin; - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - ioapic = irq_attr->ioapic; - pin = irq_attr->ioapic_pin; - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, irq, irq_attr); -} - -u8 __init io_apic_unique_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif + return io_apic_setup_irq_pin_once(irq, node, irq_attr); } #ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) +static int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; @@ -3821,9 +3692,32 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } + +static u8 __init io_apic_unique_id(u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + __set_bit(mpc_ioapic_id(i), used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} #endif -int __init io_apic_get_version(int ioapic) +static int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3868,14 +3762,14 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) void __init 
setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; - struct irq_desc *desc; const struct cpumask *mask; + struct irq_data *idata; if (skip_ioapic_setup == 1) return; for (ioapic = 0; ioapic < nr_ioapics; ioapic++) - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; @@ -3884,21 +3778,20 @@ void __init setup_ioapic_dest(void) if ((ioapic > 0) && (irq > 16)) continue; - desc = irq_to_desc(irq); + idata = irq_get_irq_data(irq); /* * Honour affinities which have been set in early boot */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->irq_data.affinity; + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) - ir_ioapic_set_affinity(&desc->irq_data, mask, false); + ir_ioapic_set_affinity(idata, mask, false); else - ioapic_set_affinity(&desc->irq_data, mask, false); + ioapic_set_affinity(idata, mask, false); } } @@ -3947,7 +3840,7 @@ void __init ioapic_and_gsi_init(void) ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].apicaddr; + ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 if (!ioapic_phys) { printk(KERN_ERR @@ -4007,8 +3900,9 @@ int mp_find_ioapic(u32 gsi) /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_gsi_routing[i].gsi_base) - && (gsi <= mp_gsi_routing[i].gsi_end)) + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if ((gsi >= gsi_cfg->gsi_base) + && (gsi <= gsi_cfg->gsi_end)) return i; } @@ -4018,18 +3912,22 @@ int mp_find_ioapic(u32 gsi) int mp_find_ioapic_pin(int ioapic, u32 gsi) { + struct mp_ioapic_gsi *gsi_cfg; + if (WARN_ON(ioapic == -1)) return -1; - if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) return -1; - return gsi - mp_gsi_routing[ioapic].gsi_base; + return gsi - gsi_cfg->gsi_base; } -static int bad_ioapic(unsigned long address) +static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); return 1; } @@ -4045,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; int entries; + struct mp_ioapic_gsi *gsi_cfg; if (bad_ioapic(address)) return; idx = nr_ioapics; - mp_ioapics[idx].type = MP_IOAPIC; - mp_ioapics[idx].flags = MPC_APIC_USABLE; - mp_ioapics[idx].apicaddr = address; + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].apicid = io_apic_unique_id(id); - mp_ioapics[idx].apicver = io_apic_get_version(idx); + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
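mp_register_ioapic() above fills gsi_base/gsi_end per IOAPIC, and mp_find_ioapic()/mp_find_ioapic_pin() then resolve a GSI with a linear range scan plus an offset subtraction. A small self-contained sketch of that lookup, assuming two IOAPICs with made-up 24- and 32-pin ranges:

#include <stdio.h>

struct gsi_range { unsigned int gsi_base, gsi_end; };

static const struct gsi_range ranges[] = {
	{ .gsi_base = 0,  .gsi_end = 23 },	/* IOAPIC 0, 24 pins */
	{ .gsi_base = 24, .gsi_end = 55 },	/* IOAPIC 1, 32 pins */
};
static const int nr_ioapics = 2;

/* Mirrors mp_find_ioapic(): which IOAPIC serves this GSI? */
static int find_ioapic(unsigned int gsi)
{
	for (int i = 0; i < nr_ioapics; i++)
		if (gsi >= ranges[i].gsi_base && gsi <= ranges[i].gsi_end)
			return i;
	return -1;
}

int main(void)
{
	unsigned int gsi = 30;
	int idx = find_ioapic(gsi);

	/* Mirrors mp_find_ioapic_pin(): the pin is the offset in the range. */
	if (idx >= 0)
		printf("GSI %u -> IOAPIC %d, pin %u\n",
		       gsi, idx, gsi - ranges[idx].gsi_base);
	return 0;
}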
*/ entries = io_apic_get_redir_entries(idx); - mp_gsi_routing[idx].gsi_base = gsi_base; - mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_base + entries - 1; /* * The number of IO-APIC IRQ registers (== #pins): */ - nr_ioapic_registers[idx] = entries; + ioapics[idx].nr_registers = entries; - if (mp_gsi_routing[idx].gsi_end >= gsi_top) - gsi_top = mp_gsi_routing[idx].gsi_end + 1; + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, - mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, - mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + "GSI %d-%d\n", idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); nr_ioapics++; } @@ -4086,20 +3986,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) /* Enable IOAPIC early just for system timer */ void __init pre_init_apic_IRQ0(void) { - struct irq_cfg *cfg; + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); #endif - /* Make sure the irq descriptor is set up */ - cfg = alloc_irq_and_cfg_at(0, 0); - setup_local_APIC(); - add_pin_to_irq_node(cfg, 0, 0, 0); - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - setup_ioapic_irq(0, 0, 0, cfg, 0, 0); + io_apic_setup_irq_pin(0, 0, &attr); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e090a6f..cce91bf26676 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +#ifdef CONFIG_X86_32 + void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { @@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, local_irq_save(flags); for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); local_irq_restore(flags); } @@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, if (query_cpu == this_cpu) continue; __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); } local_irq_restore(flags); } -#ifdef CONFIG_X86_32 - /* * This is only used on smaller machines. 
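The ipi.c hunks above drop the per-driver apic->cpu_to_logical_apicid() callback in favour of reading the x86_cpu_to_logical_apicid early-per-cpu table directly, one table lookup per target CPU. A simplified model of that loop, with a plain array standing in for the per-cpu variable and invented logical IDs:

#include <stdio.h>

#define NR_CPUS 4

/* Stand-in for early_per_cpu(x86_cpu_to_logical_apicid, cpu). */
static const unsigned int x86_cpu_to_logical_apicid[NR_CPUS] = {
	0x01, 0x02, 0x04, 0x08,		/* flat logical mode: one bit each */
};

static void send_ipi_dest_field(unsigned int dest, int vector)
{
	printf("IPI vector 0x%x -> logical dest 0x%02x\n", vector, dest);
}

/* Mirrors default_send_IPI_mask_sequence_logical(): one write per CPU. */
static void send_ipi_mask(unsigned long mask, int vector)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1ul << cpu))
			send_ipi_dest_field(x86_cpu_to_logical_apicid[cpu],
					    vector);
}

int main(void)
{
	send_ipi_mask(0x0a, 0x20);	/* CPUs 1 and 3 */
	return 0;
}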
*/ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 960f26ab5c9f..c4a61ca1349a 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -48,8 +48,6 @@ #include <asm/e820.h> #include <asm/ipi.h> -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - int found_numaq; /* @@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4]; static inline void numaq_register_node(int node, struct sys_cfg_data *scd) { struct eachquadmem *eq = scd->eq + node; + u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; + u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; + int ret; - node_set_online(node); - - /* Convert to pages */ - node_start_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); - - node_end_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - - memblock_x86_register_active_regions(node, node_start_pfn[node], - node_end_pfn[node]); - - memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); + node_set(node, numa_nodes_parsed); + ret = numa_add_memblk(node, start, end); + BUG_ON(ret < 0); } /* * Function: smp_dump_qct() * * Description: gets memory layout from the quad config table. This - * function also updates node_online_map with the nodes (quads) present. + * function also updates numa_nodes_parsed with the nodes (quads) present. */ static void __init smp_dump_qct(void) { @@ -112,7 +99,6 @@ static void __init smp_dump_qct(void) scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - nodes_clear(node_online_map); for_each_node(node) { if (scd->quads_present31_0 & (1 << node)) numaq_register_node(node, scd); @@ -282,14 +268,14 @@ static __init void early_check_numaq(void) } } -int __init get_memcfg_numaq(void) +int __init numaq_numa_init(void) { early_check_numaq(); if (!found_numaq) - return 0; + return -ENOENT; smp_dump_qct(); - return 1; + return 0; } #define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) @@ -373,13 +359,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask return physids_promote(0xFUL, retmap); } -static inline int numaq_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -} - /* * Supporting over 60 cpus on NUMA-Q requires a locality-dependent * cpu to APIC ID relation to properly interact with the intelligent @@ -398,6 +377,15 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } +static int numaq_numa_cpu_node(int cpu) +{ + int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + + if (logical_apicid != BAD_APICID) + return numaq_apicid_to_node(logical_apicid); + return NUMA_NO_NODE; +} + static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); @@ -484,8 +472,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -/* Use __refdata to keep false positive warning calm. */ -struct apic __refdata apic_numaq = { +/* Use __refdata to keep false positive warning calm. 
*/ -struct apic __refdata apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, @@ -508,8 +496,6 @@ struct apic __refdata apic_numaq = { .ioapic_phys_id_map = numaq_ioapic_phys_id_map, .setup_apic_routing = numaq_setup_apic_routing, .multi_timer_check = numaq_multi_timer_check, - .apicid_to_node = numaq_apicid_to_node, - .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid, .cpu_present_to_apicid = numaq_cpu_present_to_apicid, .apicid_to_cpu_present = numaq_apicid_to_cpu_present, .setup_portio_remap = numaq_setup_portio_remap, @@ -547,4 +533,9 @@ struct apic __refdata apic_numaq = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = numaq_numa_cpu_node, }; + +apic_driver(apic_numaq); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99d2fe016084..b5254ad044ab 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,29 +52,9 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void __init default_setup_apic_routing(void) +static int default_x86_32_early_logical_apicid(int cpu) { - int version = apic_version[boot_cpu_physical_apicid]; - - if (num_possible_cpus() > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } - -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); + return 1 << cpu; } static void setup_apic_flat_routing(void) @@ -107,7 +87,7 @@ static int probe_default(void) return 1; } -struct apic apic_default = { +static struct apic apic_default = { .name = "default", .probe = probe_default, @@ -130,8 +110,6 @@ struct apic apic_default = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, - .apicid_to_node = default_apicid_to_node, - .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -167,46 +145,26 @@ struct apic apic_default = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, }; -extern struct apic apic_numaq; -extern struct apic apic_summit; -extern struct apic apic_bigsmp; -extern struct apic apic_es7000; -extern struct apic apic_es7000_cluster; +apic_driver(apic_default); struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); -static struct apic *apic_probe[] __initdata = { -#ifdef CONFIG_X86_NUMAQ - &apic_numaq, -#endif -#ifdef CONFIG_X86_SUMMIT - &apic_summit, -#endif -#ifdef CONFIG_X86_BIGSMP - &apic_bigsmp, -#endif -#ifdef CONFIG_X86_ES7000 - &apic_es7000, - &apic_es7000_cluster, -#endif - &apic_default, /* must be last */ - NULL, -}; - static int cmdline_apic __initdata; static int __init parse_apic(char *arg) { - int i; + struct apic **drv; if (!arg) return -EINVAL; - for (i = 0; apic_probe[i]; i++) { - if (!strcmp(apic_probe[i]->name, arg)) { - apic = apic_probe[i]; + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!strcmp((*drv)->name, arg)) { + apic = *drv; cmdline_apic =
1; return 0; } @@ -217,38 +175,58 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init generic_bigsmp_probe(void) +void __init default_setup_apic_routing(void) { + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + #ifdef CONFIG_X86_BIGSMP /* - * This routine is used to switch to bigsmp mode when + * This is used to switch to bigsmp mode when * - There is no apic= option specified by the user * - generic_apic_probe() has chosen apic_default as the sub_arch * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ if (!cmdline_apic && apic == &apic_default) { - if (apic_bigsmp.probe()) { - apic = &apic_bigsmp; + struct apic *bigsmp = generic_bigsmp_probe(); + if (bigsmp) { + apic = bigsmp; printk(KERN_INFO "Overriding APIC driver with %s\n", apic->name); } } #endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); } void __init generic_apic_probe(void) { if (!cmdline_apic) { - int i; - for (i = 0; apic_probe[i]; i++) { - if (apic_probe[i]->probe()) { - apic = apic_probe[i]; + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe()) { + apic = *drv; break; } } /* Not visible without early console */ - if (!apic_probe[i]) + if (drv == __apicdrivers_end) panic("Didn't find an APIC driver"); } printk(KERN_INFO "Using APIC driver %s\n", apic->name); @@ -259,16 +237,16 @@ void __init generic_apic_probe(void) int __init generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (!apic_probe[i]->mps_oem_check) + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!((*drv)->mps_oem_check)) continue; - if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) + if (!(*drv)->mps_oem_check(mpc, oem, productid)) continue; if (!cmdline_apic) { - apic = apic_probe[i]; + apic = *drv; printk(KERN_INFO "Switched to APIC driver `%s'.\n", apic->name); } @@ -279,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (!apic_probe[i]->acpi_madt_oem_check) + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!(*drv)->acpi_madt_oem_check) continue; - if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) + if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) continue; if (!cmdline_apic) { - apic = apic_probe[i]; + apic = *drv; printk(KERN_INFO "Switched to APIC driver `%s'.\n", apic->name); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index d8c4a6feb286..3fe986698929 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,27 +23,6 @@ #include <asm/ipi.h> #include <asm/setup.h> -extern struct apic apic_flat; -extern struct apic apic_physflat; -extern struct apic apic_x2xpic_uv_x; -extern struct apic apic_x2apic_phys; -extern struct apic apic_x2apic_cluster; - -struct apic __read_mostly *apic = &apic_flat; -EXPORT_SYMBOL_GPL(apic); - -static struct apic *apic_probe[] __initdata = { -#ifdef CONFIG_X86_UV - &apic_x2apic_uv_x, -#endif -#ifdef CONFIG_X86_X2APIC - &apic_x2apic_phys, - 
&apic_x2apic_cluster, -#endif - &apic_physflat, - NULL, -}; - static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) { return hard_smp_processor_id() >> index_msb; @@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) */ void __init default_setup_apic_routing(void) { + struct apic **drv; enable_IR_x2apic(); -#ifdef CONFIG_X86_X2APIC - if (x2apic_mode -#ifdef CONFIG_X86_UV - && apic != &apic_x2apic_uv_x -#endif - ) { - if (x2apic_phys) - apic = &apic_x2apic_phys; - else - apic = &apic_x2apic_cluster; + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe && (*drv)->probe()) { + if (apic != *drv) { + apic = *drv; + pr_info("Switched APIC routing to %s.\n", + apic->name); + } + break; + } } -#endif - - if (apic == &apic_flat && num_possible_cpus() > 8) - apic = &apic_physflat; - - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ @@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector) int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { - apic = apic_probe[i]; - printk(KERN_INFO "Setting APIC routing to %s.\n", - apic->name); + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { + if (apic != *drv) { + apic = *drv; + pr_info("Setting APIC routing to %s.\n", + apic->name); + } return 1; } } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9b419263d90d..19114423c58c 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit) return 1; } -static void summit_init_apic_ldr(void) +static int summit_early_logical_apicid(int cpu) { - unsigned long val, id; int count = 0; - u8 my_id = (u8)hard_smp_processor_id(); + u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); u8 my_cluster = APIC_CLUSTER(my_id); #ifdef CONFIG_SMP u8 lid; @@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void) /* Create logical APIC IDs by counting CPUs already in cluster. */ for (count = 0, i = nr_cpu_ids; --i >= 0; ) { - lid = cpu_2_logical_apicid[i]; + lid = early_per_cpu(x86_cpu_to_logical_apicid, i); if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) ++count; } @@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void) /* We only have a 4 wide bitmap in cluster mode. If a deranged * BIOS puts 5 CPUs in one APIC cluster, we're hosed. 
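summit_early_logical_apicid() above derives a CPU's logical ID by keeping the physical APIC cluster and allocating the next free bit of the cluster's 4-wide bitmap, counting how many CPUs of that cluster were numbered already. A compact model of the scheme follows; the table contents are invented and the BUG_ON becomes an assert:

#include <assert.h>
#include <stdio.h>

#define APIC_CLUSTER(id)	((id) & 0xf0u)
#define XAPIC_DEST_CPUS_SHIFT	4
#define BAD_APICID		0xffu
#define NR_CPUS			8

static unsigned int logical_apicid[NR_CPUS];

static unsigned int early_logical_apicid(unsigned int phys_id)
{
	unsigned int cluster = APIC_CLUSTER(phys_id);
	int count = 0;

	/* Count CPUs already placed into this cluster. */
	for (int i = 0; i < NR_CPUS; i++)
		if (logical_apicid[i] != BAD_APICID &&
		    APIC_CLUSTER(logical_apicid[i]) == cluster)
			count++;

	/* Only a 4-wide bitmap exists per cluster; a 5th CPU would break. */
	assert(count < XAPIC_DEST_CPUS_SHIFT);
	return cluster | (1u << count);
}

int main(void)
{
	const unsigned int phys[] = { 0x11, 0x12, 0x21 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		logical_apicid[cpu] = BAD_APICID;

	for (int cpu = 0; cpu < 3; cpu++) {
		logical_apicid[cpu] = early_logical_apicid(phys[cpu]);
		printf("cpu%d: phys 0x%02x -> logical 0x%02x\n",
		       cpu, phys[cpu], logical_apicid[cpu]);
	}
	return 0;
}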
*/ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); - id = my_cluster | (1UL << count); + return my_cluster | (1UL << count); +} + +static void summit_init_apic_ldr(void) +{ + int cpu = smp_processor_id(); + unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + unsigned long val; + apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); @@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void) nr_ioapics); } -static int summit_apicid_to_node(int logical_apicid) -{ -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; -#else - return 0; -#endif -} - -/* Mapping from cpu number to logical apicid */ -static inline int summit_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -#else - return logical_smp_processor_id(); -#endif -} - static int summit_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = summit_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { printk("%s: Not a valid mask!\n", __func__); @@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = summit_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -505,7 +491,7 @@ void setup_summit(void) } #endif -struct apic apic_summit = { +static struct apic apic_summit = { .name = "summit", .probe = probe_summit, @@ -528,8 +514,6 @@ struct apic apic_summit = { .ioapic_phys_id_map = summit_ioapic_phys_id_map, .setup_apic_routing = summit_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = summit_apicid_to_node, - .cpu_to_logical_apicid = summit_cpu_to_logical_apicid, .cpu_present_to_apicid = summit_cpu_present_to_apicid, .apicid_to_cpu_present = summit_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -565,4 +549,8 @@ struct apic apic_summit = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = summit_early_logical_apicid, }; + +apic_driver(apic_summit); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cf69c59f4910..500795875827 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -5,118 +5,95 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/dmar.h> +#include <linux/cpu.h> #include <asm/smp.h> -#include <asm/apic.h> -#include <asm/ipi.h> +#include <asm/x2apic.h> static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); +static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -/* - * need to use more than cpu 0, because we need more vectors when - * MSI-X are used. 
- */ -static const struct cpumask *x2apic_target_cpus(void) +static inline u32 x2apic_cluster(int cpu) { - return cpu_online_mask; -} - -/* - * for now each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } static void - __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { - unsigned long cfg; + struct cpumask *cpus_in_cluster_ptr; + struct cpumask *ipi_mask_ptr; + unsigned int cpu, this_cpu; + unsigned long flags; + u32 dest; + + x2apic_wrmsr_fence(); + + local_irq_save(flags); - cfg = __prepare_ICR(0, vector, dest); + this_cpu = smp_processor_id(); /* - * send the IPI. + * We are going to modify the mask, so we need our own copy + * and must be sure it is manipulated with irqs off. */ - native_x2apic_icr_write(cfg, apicid); -} + ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); + cpumask_copy(ipi_mask_ptr, mask); -/* - * for now, we send the IPI's one by one in the cpumask. - * TBD: Based on the cpu mask, we can send the IPI's to the cluster group - * at once. We have 16 cpu's in a cluster. This will minimize IPI register - * writes. - */ -static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) -{ - unsigned long query_cpu; - unsigned long flags; + /* + * The idea is to send one IPI per cluster. + */ + for_each_cpu(cpu, ipi_mask_ptr) { + unsigned long i; - x2apic_wrmsr_fence(); + cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); + dest = 0; - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + /* Collect cpus in cluster. */ + for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { + if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + if (!dest) + continue; + + __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + /* + * Cluster sibling cpus should be discarded now so
+ */ + cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); } + local_irq_restore(flags); } +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_online_cpu(query_cpu) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_mask, vector); -} - -static int x2apic_apic_id_registered(void) -{ - return 1; + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) @@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_logical_apicid, cpu); } -static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) +static void init_x2apic_ldr(void) { - unsigned int id; + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; - id = x; - return id; + per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } } -static unsigned long set_apic_id(unsigned int id) + /* + * At CPU state changes, update the x2apic cluster sibling info. 
+ */ +static int __cpuinit +update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long x; + unsigned int this_cpu = (unsigned long)hcpu; + unsigned int cpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), + GFP_KERNEL)) { + err = -ENOMEM; + } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), + GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + err = -ENOMEM; + } + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + break; + } - x = id; - return x; + return notifier_from_errno(err); } -static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} +static struct notifier_block __refdata x2apic_cpu_notifier = { + .notifier_call = update_clusterinfo, +}; -static void x2apic_send_IPI_self(int vector) +static int x2apic_init_cpu_notifier(void) { - apic_write(APIC_SELF_IPI, vector); + int cpu = smp_processor_id(); + + zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); + + BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + + __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + register_hotcpu_notifier(&x2apic_cpu_notifier); + return 1; } -static void init_x2apic_ldr(void) +static int x2apic_cluster_probe(void) { - int cpu = smp_processor_id(); - - per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); + if (x2apic_mode) + return x2apic_init_cpu_notifier(); + else + return 0; } -struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", - .probe = NULL, + .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -206,18 +230,16 @@ struct apic apic_x2apic_cluster = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, - .phys_pkg_id = x2apic_cluster_phys_pkg_id, + .phys_pkg_id = x2apic_phys_pkg_id, .mps_oem_check = NULL, - .get_apic_id = x2apic_cluster_phys_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, @@ -242,3 +264,5 @@ struct apic apic_x2apic_cluster = { .wait_icr_idle = native_x2apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; + +apic_driver(apic_x2apic_cluster);
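The notifier above exists so that the send path never has to compute cluster membership: every cpu carries a ready-made cpumask of its cluster siblings, allocated at CPU_UP_PREPARE, filled in symmetrically from init_x2apic_ldr(), and torn down again on CPU_DEAD. A sketch of that bookkeeping with plain bitmasks in place of cpumask_var_t -- at most 64 cpus assumed, cluster taken as cpu / 16:

#include <stdint.h>
#include <stdio.h>

#define NCPUS 64

static uint64_t online;			/* one bit per online cpu */
static uint64_t cpus_in_cluster[NCPUS];	/* per-cpu sibling mask */

static unsigned int cluster_of(int cpu) { return cpu / 16; }

static void cpu_up(int cpu)		/* mirrors init_x2apic_ldr() */
{
	online |= 1ULL << cpu;
	for (int i = 0; i < NCPUS; i++) {
		if (!(online & (1ULL << i)) || cluster_of(i) != cluster_of(cpu))
			continue;
		cpus_in_cluster[i]   |= 1ULL << cpu;	/* siblings learn about us */
		cpus_in_cluster[cpu] |= 1ULL << i;	/* we learn about siblings */
	}
}

static void cpu_down(int cpu)		/* mirrors the CPU_DEAD case */
{
	online &= ~(1ULL << cpu);
	for (int i = 0; i < NCPUS; i++)
		if (cluster_of(i) == cluster_of(cpu))
			cpus_in_cluster[i] &= ~(1ULL << cpu);
	cpus_in_cluster[cpu] = 0;
}

int main(void)
{
	cpu_up(0); cpu_up(1); cpu_up(17);
	printf("cpu1 siblings 0x%llx\n",
	       (unsigned long long)cpus_in_cluster[1]);	/* 0x3: cpus 0 and 1 */
	cpu_down(0);
	printf("cpu1 siblings 0x%llx\n",
	       (unsigned long long)cpus_in_cluster[1]);	/* 0x2: cpu 1 alone */
	return 0;
}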
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8972f38c5ced..f5373dfde21e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -7,11 +7,12 @@ #include <linux/dmar.h> #include <asm/smp.h> -#include <asm/apic.h> -#include <asm/ipi.h> +#include <asm/x2apic.h> int x2apic_phys; +static struct apic apic_x2apic_phys; + static int set_x2apic_phys_mode(char *arg) { x2apic_phys = 1; @@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* - * need to use more than cpu 0, because we need more vectors when - * MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ - return cpu_online_mask; -} - -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - -static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, - unsigned int dest) -{ - unsigned long cfg; - - cfg = __prepare_ICR(0, vector, dest); - - /* - * send the IPI. - */ - native_x2apic_icr_write(cfg, apicid); -} - -static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { unsigned long query_cpu; + unsigned long this_cpu; unsigned long flags; x2apic_wrmsr_fence(); local_irq_save(flags); + + this_cpu = smp_processor_id(); for_each_cpu(query_cpu, mask) { + if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu) + continue; __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu != this_cpu) - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_online_cpu(query_cpu) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_mask, vector); -} - -static int x2apic_apic_id_registered(void) -{ - return 1; + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) @@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_apicid, cpu); } -static unsigned int x2apic_phys_get_apic_id(unsigned long x) -{ - return x; -} - -static unsigned long set_apic_id(unsigned int id) -{ - return id; -} - -static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +static void init_x2apic_ldr(void) { - return initial_apicid >> index_msb; } -static void x2apic_send_IPI_self(int vector) +static int x2apic_phys_probe(void) { - apic_write(APIC_SELF_IPI, vector); -} + if (x2apic_mode && x2apic_phys) + return 1; -static void init_x2apic_ldr(void) -{ + return apic == &apic_x2apic_phys; } -struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys = { .name = "physical x2apic", - .probe = NULL, + .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_registered = x2apic_apic_id_registered, @@ -195,8 +136,6 @@ struct apic apic_x2apic_phys = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -205,8 +144,8 @@ .phys_pkg_id = x2apic_phys_pkg_id, .mps_oem_check = NULL, - .get_apic_id = x2apic_phys_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, @@ -231,3 +170,5 @@ struct apic apic_x2apic_phys = { .wait_icr_idle = native_x2apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; + +apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index bd16b58b8850..b511a011b7d0 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -23,6 +23,8 @@ #include <linux/io.h> #include <linux/pci.h> #include <linux/kdebug.h> +#include <linux/delay.h> +#include <linux/crash_dump.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -34,6 +36,14 @@ #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> +#include <asm/emergency-restart.h> +#include <asm/nmi.h> + +/* The BMC sets a bit in this MMR before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) +DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -48,6 +58,8 @@ unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static struct apic apic_x2apic_uv_x; + static unsigned long __init uv_early_read_mmr(unsigned long addr) { unsigned long val, *mmr; @@ -79,6 +91,10 @@ static int __init early_get_pnodeid(void) m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); uv_min_hub_revision_id = node_id.s.revision; + if (node_id.s.part_number == UV2_HUB_PART_NUMBER) + uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + + uv_hub_info->hub_revision = uv_min_hub_revision_id; pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); return pnode; } @@ -100,17 +116,25 @@ static void __init early_get_apic_pnode_shift(void) */ static void __init uv_set_apicid_hibit(void) { - union uvh_lb_target_physical_apic_id_mask_u apicid_mask; + union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; - apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; + if (is_uv1_hub()) { + apicid_mask.v = + uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = + apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + } } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int pnodeid; + int pnodeid, is_uv1, is_uv2; - if (!strcmp(oem_id, "SGI")) { + is_uv1 = !strcmp(oem_id, "SGI"); + is_uv2 = !strcmp(oem_id, "SGI2"); + if (is_uv1 || is_uv2) { + uv_hub_info->hub_revision = + is_uv1 ? 
UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE; pnodeid = early_get_pnodeid(); early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; @@ -316,10 +340,15 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic __refdata apic_x2apic_uv_x = { +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + +static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", - .probe = NULL, + .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, .apic_id_registered = uv_apic_id_registered, @@ -338,8 +367,6 @@ struct apic __refdata apic_x2apic_uv_x = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -469,12 +496,19 @@ static __init void map_mmr_high(int max_pnode) static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; - int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + int shift; mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); - if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, + if (is_uv1_hub() && mmioh.s1.enable) { + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io, max_pnode, map_uc); + } + if (is_uv2_hub() && mmioh.s2.enable) { + shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io, + max_pnode, map_uc); + } } static __init void map_low_mmrs(void) @@ -641,18 +675,46 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { + unsigned long real_uv_nmi; + int bid; + if (reason != DIE_NMIUNKNOWN) return NOTIFY_OK; if (in_crash_kexec) /* do nothing if entering the crash kernel */ return NOTIFY_OK; + /* - * Use a lock so only one cpu prints at a time - * to prevent intermixed output. + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + + if (unlikely(real_uv_nmi)) { + spin_lock(&uv_blade_info[bid].nmi_lock); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (real_uv_nmi) { + uv_blade_info[bid].nmi_count++; + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + } + spin_unlock(&uv_blade_info[bid].nmi_lock); + } + + if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) + return NOTIFY_DONE; + + __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; + + /* + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. 
*/ spin_lock(&uv_nmi_lock); - pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); dump_stack(); spin_unlock(&uv_nmi_lock); @@ -660,7 +722,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, }; void uv_register_nmi_notifier(void) @@ -692,13 +755,14 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask, pnode_io_mask; + printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2"); map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); - n_io = mmioh.s.n_io; + n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; @@ -719,8 +783,9 @@ void __init uv_system_init(void) printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -746,6 +811,7 @@ void __init uv_system_init(void) uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); max_pnode = max(pnode, max_pnode); blade++; } @@ -765,6 +831,8 @@ void __init uv_system_init(void) */ uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; + uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; @@ -812,4 +880,13 @@ void __init uv_system_init(void) /* register Legacy VGA I/O redirection handler */ pci_register_set_vga_state(uv_set_vga_state); + + /* + * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as + * EFI is not enabled in the kdump kernel. + */ + if (is_kdump_kernel()) + reboot_type = BOOT_ACPI; } + +apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4c4ac32cd126..965a7666c283 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -66,7 +66,7 @@ * 1.5: Fix segment register reloading (in case of bad segments saved * across BIOS call). * Stephen Rothwell - * 1.6: Cope with complier/assembler differences. + * 1.6: Cope with compiler/assembler differences. * Only try to turn off the first display device. 
* Fix OOPS at power off with no APM BIOS by Jan Echternach * <echter@informatik.uni-rostock.de> @@ -227,6 +227,8 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/acpi.h> +#include <linux/syscore_ops.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -977,20 +979,10 @@ recalc: static void apm_power_off(void) { - unsigned char po_bios_call[] = { - 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ - 0x8e, 0xd0, /* movw ax,ss */ - 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ - 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ - 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ - 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ - 0xcd, 0x15 /* int $0x15 */ - }; - /* Some bioses don't like being called from CPU != 0 */ if (apm_info.realmode_power_off) { set_cpus_allowed_ptr(current, cpumask_of(0)); - machine_real_restart(po_bios_call, sizeof(po_bios_call)); + machine_real_restart(MRR_APM); } else { (void)set_system_power_state(APM_STATE_OFF); } @@ -1248,7 +1240,7 @@ static int suspend(int vetoable) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); @@ -1266,7 +1258,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; - sysdev_resume(); + syscore_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); @@ -1290,7 +1282,7 @@ static void standby(void) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1298,7 +1290,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); - sysdev_resume(); + syscore_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); @@ -2333,12 +2325,11 @@ static int __init apm_init(void) apm_info.disabled = 1; return -ENODEV; } - if (pm_flags & PM_ACPI) { + if (!acpi_disabled) { printk(KERN_NOTICE "apm: overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } - pm_flags |= PM_APM; /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2430,7 +2421,6 @@ static void __exit apm_exit(void) kthread_stop(kapmd_task); kapmd_task = NULL; } - pm_flags &= ~PM_APM; } module_init(apm_init); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cfa82c899f47..4f13fafc5264 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -1,5 +1,70 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. 
+ */ +#define COMPILE_OFFSETS + +#include <linux/crypto.h> +#include <linux/sched.h> +#include <linux/stddef.h> +#include <linux/hardirq.h> +#include <linux/suspend.h> +#include <linux/kbuild.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/sigframe.h> +#include <asm/bootparam.h> +#include <asm/suspend.h> + +#ifdef CONFIG_XEN +#include <xen/interface/xen.h> +#endif + #ifdef CONFIG_X86_32 # include "asm-offsets_32.c" #else # include "asm-offsets_64.c" #endif + +void common(void) { + BLANK(); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_preempt_count, thread_info, preempt_count); + + BLANK(); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37a..c29d631af6fc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,26 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed - * to extract and format the required data. 
- */ - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/signal.h> -#include <linux/personality.h> -#include <linux/suspend.h> -#include <linux/kbuild.h> #include <asm/ucontext.h> -#include <asm/sigframe.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/thread_info.h> -#include <asm/bootparam.h> -#include <asm/elf.h> -#include <asm/suspend.h> - -#include <xen/interface/xen.h> #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" @@ -51,21 +29,10 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_task, thread_info, task); - OFFSET(TI_exec_domain, thread_info, exec_domain); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); - OFFSET(TI_preempt_count, thread_info, preempt_count); - OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, desc_ptr, size); - OFFSET(GDS_address, desc_ptr, address); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); @@ -85,42 +52,13 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); - OFFSET(pbe_address, pbe, address); - OFFSET(pbe_orig_address, pbe, orig_address); - OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(THREAD_SIZE_asm, THREAD_SIZE); - - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - -#ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); -#endif - -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#endif - #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); @@ -139,11 +77,4 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif - - BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd965..e72a1194af22 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,27 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. 
- */ -#define COMPILE_OFFSETS - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <linux/kbuild.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> #include <asm/ia32.h> -#include <asm/bootparam.h> -#include <asm/suspend.h> - -#include <xen/interface/xen.h> - -#include <asm/sigframe.h> #define __NO_STUBS 1 #undef __SYSCALL @@ -33,41 +10,19 @@ static char syscalls[] = { int main(void) { -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - ENTRY(status); -#ifdef CONFIG_IA32_EMULATION - ENTRY(sysenter_return); -#endif - BLANK(); -#undef ENTRY #ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); - OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); + BLANK(); #endif - #ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + BLANK(); + +#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) ENTRY(ax); ENTRY(bx); ENTRY(cx); @@ -79,15 +34,12 @@ int main(void) ENTRY(ip); BLANK(); #undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); + + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); BLANK(); #endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - BLANK(); -#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) + +#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(bx); ENTRY(cx); @@ -107,7 +59,8 @@ int main(void) ENTRY(flags); BLANK(); #undef ENTRY -#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) + +#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) ENTRY(cr0); ENTRY(cr2); ENTRY(cr3); @@ -115,26 +68,11 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); - BLANK(); - DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); - BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); - BLANK(); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#undef ENTRY -#endif + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + return 0; }
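For readers who have not met the asm-offsets machinery these hunks keep rearranging: assembly code cannot evaluate offsetof(), so these files are compiled only to assembly, and the OFFSET()/DEFINE() macros from include/linux/kbuild.h plant "->NAME value" markers in the compiler output that a Kbuild sed script rewrites into #defines in include/generated/asm-offsets.h. A stripped-down model of the trick, with a made-up structure; build with "gcc -S offsets-demo.c" and grep the resulting .s for "->" (it is never assembled further):

/* offsets-demo.c */
#include <stddef.h>

struct thread_info {		/* hypothetical stand-in */
	unsigned long flags;
	int preempt_count;
};

/* Same shape as the kernel's kbuild.h macros: emit a marker line
 * carrying the symbol name and the compile-time constant value. */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

void common(void)
{
	OFFSET(TI_flags, thread_info, flags);
	OFFSET(TI_preempt_count, thread_info, preempt_count);
}

The generated assembly contains lines like "->TI_preempt_count $8 ...", which the sed pass turns into "#define TI_preempt_count 8" for entry code to consume.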
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3f0ebe429a01..6042981d0309 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7c7bedb83c5a..b13ed393dfce 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } #endif -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA +/* + * To work around broken NUMA configs. Read the comment in + * srat_detect_node(). + */ static int __cpuinit nearby_node(int apicid) { int i, node; for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid) #ifdef CONFIG_X86_HT static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes; + u32 nodes, cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); @@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* get compute unit information */ smp_num_siblings = ((ebx >> 8) & 3) + 1; c->compute_unit_id = ebx & 0xff; + cores_per_cu += ((ebx >> 8) & 3); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes > 1) { u32 cores_per_node; + u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); cores_per_node = c->x86_max_cores / nodes; + cus_per_node = cores_per_node / cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ per_cpu(cpu_llc_id, cpu) = node_id; - /* core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id has to be in the [0 .. cores_per_node - 1] range */ + c->cpu_core_id %= cores_per_node; + c->compute_unit_id %= cus_per_node; } } #endif
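The two modulo reductions above are easiest to see with concrete numbers. For a hypothetical two-node package exposing 16 cores with 2 cores per compute unit, cores_per_node is 8 and cus_per_node is 4, so a package-wide core id of 11 folds to a per-node id of 3 and a compute unit id of 5 folds to 1:

#include <stdio.h>

int main(void)
{
	/* Made-up topology: two nodes, 16 cores, 2 cores per compute unit. */
	unsigned int x86_max_cores = 16, nodes = 2, cores_per_cu = 2;
	unsigned int cores_per_node = x86_max_cores / nodes;	/* 8 */
	unsigned int cus_per_node = cores_per_node / cores_per_cu;	/* 4 */
	unsigned int cpu_core_id = 11, compute_unit_id = 5;

	printf("core id %u -> %u, compute unit id %u -> %u\n",
	       cpu_core_id, cpu_core_id % cores_per_node,
	       compute_unit_id, compute_unit_id % cus_per_node);
	/* prints: core id 11 -> 3, compute unit id 5 -> 1 */
	return 0;
}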
@@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id); static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; unsigned apicid = c->apicid; - node = per_cpu(cpu_llc_id, cpu); + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming a certain relationship + * between APIC ID, HT node ID and NUMA topology. Since going + * through the CPU mapping may alter the outcome, access + * __apicid_to_node[] directly. + */ int ht_nodeid = c->initial_apicid; if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) node = nearby_node(apicid); @@ -594,6 +611,35 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } #endif + + /* + * Family 0x12 and above processors have the APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * The BIOS should disable GartTlbWlk errors itself. If + * it doesn't, do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + int err; + + err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); + if (err == 0) { + mask |= (1 << 10); + checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 745a602f204f..22a073d7fbff 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif +static int disable_smep __cpuinitdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) if (smp_num_siblings <= 1) goto out; - if (smp_num_siblings > nr_cpu_ids) { - pr_warning("CPU: Unsupported number of siblings %d", - smp_num_siblings); - smp_num_siblings = 1; - return; - } - index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); @@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - if (eax > 0) - c->x86_capability[9] = ebx; + c->x86_capability[9] = ebx; } /* AMD-defined flags: level 0x80000001 */ @@ -668,6 +679,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; #endif filter_cpuid_features(c, false); + + setup_smep(c); }
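setup_smep() keys off X86_FEATURE_SMEP, which corresponds to CPUID leaf 7 (subleaf 0), EBX bit 7. The same bit can be probed from user space; a small sketch assuming a GCC-compatible compiler on an x86 CPU that implements leaf 7:

#include <stdio.h>
#include <cpuid.h>	/* GCC/clang wrapper around the CPUID instruction */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7, subleaf 0: structured extended feature flags. */
	__cpuid_count(7, 0, eax, ebx, ecx, edx);

	/* SMEP is CPUID.(EAX=7,ECX=0):EBX[bit 7]. */
	printf("SMEP %ssupported\n", ebx & (1u << 7) ? "" : "not ");
	return 0;
}

When the bit is present and "nosmep" was not passed on the command line, the patch sets CR4.SMEP so the CPU faults on supervisor-mode execution of user pages.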
 void __init early_cpu_init(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); #endif @@ -687,7 +700,7 @@ void __init early_cpu_init(void) cpu_devs[count] = cpudev; count++; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT { unsigned int j; @@ -753,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) #endif } + setup_smep(c); + get_model_name(c); /* Default name */ detect_nopl(c); @@ -869,7 +884,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig deleted file mode 100644 index 870e6cc6ad28..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ /dev/null @@ -1,266 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_PCC_CPUFREQ - tristate "Processor Clocking Control interface driver" - depends on ACPI && ACPI_PROCESSOR - help - This driver adds support for the PCC interface. - - For details, take a look at: - <file:Documentation/cpu-freq/pcc-cpufreq.txt>. - - To compile this driver as a module, choose M here: the - module will be called pcc-cpufreq. - - If in doubt, say N. - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - This driver also supports Intel Enhanced Speedstep. - - To compile this driver as a module, choose M here: the - module will be called acpi-cpufreq. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config ELAN_CPUFREQ - tristate "AMD Elan SC400 and SC410" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC400 and SC410 - processors. - - You need to specify the processor maximum speed as boot - parameter: elanfreq=maxspeed (in kHz) or as module - parameter "max_freq". - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config SC520_CPUFREQ - tristate "AMD Elan SC520" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC520 processor. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - - -config X86_POWERNOW_K6 - tristate "AMD Mobile K6-2/K6-3 PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K6-2+ and mobile - AMD K6-3+ processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K7 - tristate "AMD Mobile Athlon/Duron PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K7 mobile processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K7_ACPI - bool - depends on X86_POWERNOW_K7 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) - depends on X86_32 - default y - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!"
- select CPU_FREQ_TABLE - depends on ACPI && ACPI_PROCESSOR - help - This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. - - To compile this driver as a module, choose M here: the - module will be called powernow-k8. - - For details, take a look at <file:Documentation/cpu-freq/>. - -config X86_GX_SUSPMOD - tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" - depends on X86_32 && PCI - help - This add the CPUFreq driver for NatSemi Geode processors which - support suspend modulation. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep (deprecated)" - select CPU_FREQ_TABLE - select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 - depends on X86_32 || (X86_64 && ACPI_PROCESSOR) - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - To compile this driver as a module, choose M here: the - module will be called speedstep-centrino. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO_TABLE - bool "Built-in tables for Banias CPUs" - depends on X86_32 && X86_SPEEDSTEP_CENTRINO - default y - help - Use built-in tables for Banias CPUs if ACPI encoding - is not available. - - If in doubt, say N. - -config X86_SPEEDSTEP_ICH - tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all - mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, - ICH3 or ICH4 southbridge. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_SMI - tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) - on systems which have an Intel 440BX/ZX/MX southbridge. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - To compile this driver as a module, choose M here: the - module will be called p4-clockmod. - - For details, take a look at <file:Documentation/cpu-freq/>. - - Unless you are absolutely sure say N. - -config X86_CPUFREQ_NFORCE2 - tristate "nVidia nForce2 FSB changing" - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for FSB changing on nVidia nForce2 - platforms. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_LONGRUN - tristate "Transmeta LongRun" - depends on X86_32 - help - This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors - which support LongRun. 
- - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_LONGHAUL - tristate "VIA Cyrix III Longhaul" - select CPU_FREQ_TABLE - depends on X86_32 && ACPI_PROCESSOR - help - This adds the CPUFreq driver for VIA Samuel/CyrixIII, - VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T - processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for VIA C7 processors. However, this driver - does not have any safeguards to prevent operating the CPU out of spec - and is thus considered dangerous. Please use the regular ACPI cpufreq - driver, enabled by CONFIG_X86_ACPI_CPUFREQ. - - If in doubt, say N. - -comment "shared options" - -config X86_SPEEDSTEP_LIB - tristate - default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) - -config X86_SPEEDSTEP_RELAXED_CAP_CHECK - bool "Relaxed speedstep capability checks" - depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) - help - Don't perform all checks for a speedstep capable system which would - normally be done. Some ancient or strange systems, though speedstep - capable, don't always indicate that they are speedstep capable. This - option lets the probing code bypass some of those checks if the - parameter "relaxed_check=1" is passed to the module. - -endif # CPU_FREQ - -endmenu diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile deleted file mode 100644 index bd54bf67e6fb..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Link order matters. K8 is preferred to ACPI because of firmware bugs in early -# K8 systems. ACPI is preferred to all other hardware-specific drivers. -# speedstep-* is preferred over p4-clockmod. 
- -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o -obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o -obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o -obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o -obj-$(CONFIG_X86_LONGHAUL) += longhaul.o -obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o -obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o -obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o -obj-$(CONFIG_X86_LONGRUN) += longrun.o -obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o -obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o -obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c deleted file mode 100644 index a2baafb2fe6d..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ /dev/null @@ -1,776 +0,0 @@ -/* - * acpi-cpufreq.c - ACPI Processor P-States Driver - * - * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de> - * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/cpufreq.h> -#include <linux/compiler.h> -#include <linux/dmi.h> -#include <linux/slab.h> - -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> -#include <linux/uaccess.h> - -#include <acpi/processor.h> - -#include <asm/msr.h> -#include <asm/processor.h> -#include <asm/cpufeature.h> -#include "mperf.h" - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "acpi-cpufreq", msg) - -MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); -MODULE_DESCRIPTION("ACPI Processor P-States Driver"); -MODULE_LICENSE("GPL"); - -enum { - UNDEFINED_CAPABLE = 0, - SYSTEM_INTEL_MSR_CAPABLE, - SYSTEM_IO_CAPABLE, -}; - -#define INTEL_MSR_RANGE (0xffff) - -struct acpi_cpufreq_data { - struct acpi_processor_performance *acpi_data; - struct cpufreq_frequency_table *freq_table; - unsigned int resume; - unsigned int cpu_feature; -}; - -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); - -/* acpi_perf_data is a pointer to percpu data. 
*/ -static struct acpi_processor_performance __percpu *acpi_perf_data; - -static struct cpufreq_driver acpi_cpufreq_driver; - -static unsigned int acpi_pstate_strict; - -static int check_est_cpu(unsigned int cpuid) -{ - struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - - return cpu_has(cpu, X86_FEATURE_EST); -} - -static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -{ - struct acpi_processor_performance *perf; - int i; - - perf = data->acpi_data; - - for (i = 0; i < perf->state_count; i++) { - if (value == perf->states[i].status) - return data->freq_table[i].frequency; - } - return 0; -} - -static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) -{ - int i; - struct acpi_processor_performance *perf; - - msr &= INTEL_MSR_RANGE; - perf = data->acpi_data; - - for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == perf->states[data->freq_table[i].index].status) - return data->freq_table[i].frequency; - } - return data->freq_table[0].frequency; -} - -static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) -{ - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - return extract_msr(val, data); - case SYSTEM_IO_CAPABLE: - return extract_io(val, data); - default: - return 0; - } -} - -struct msr_addr { - u32 reg; -}; - -struct io_addr { - u16 port; - u8 bit_width; -}; - -struct drv_cmd { - unsigned int type; - const struct cpumask *mask; - union { - struct msr_addr msr; - struct io_addr io; - } addr; - u32 val; -}; - -/* Called via smp_call_function_single(), on the target CPU */ -static void do_drv_read(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 h; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, cmd->val, h); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_read_port((acpi_io_address)cmd->addr.io.port, - &cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -/* Called via smp_call_function_many(), on the target CPUs */ -static void do_drv_write(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 lo, hi; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, lo, hi); - lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); - wrmsr(cmd->addr.msr.reg, lo, hi); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_write_port((acpi_io_address)cmd->addr.io.port, - cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -static void drv_read(struct drv_cmd *cmd) -{ - int err; - cmd->val = 0; - - err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); - WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? 
*/ -} - -static void drv_write(struct drv_cmd *cmd) -{ - int this_cpu; - - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, cmd->mask)) - do_drv_write(cmd); - smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); - put_cpu(); -} - -static u32 get_cur_val(const struct cpumask *mask) -{ - struct acpi_processor_performance *perf; - struct drv_cmd cmd; - - if (unlikely(cpumask_empty(mask))) - return 0; - - switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - break; - default: - return 0; - } - - cmd.mask = mask; - drv_read(&cmd); - - dprintk("get_cur_val = %u\n", cmd.val); - - return cmd.val; -} - -static unsigned int get_cur_freq_on_cpu(unsigned int cpu) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); - unsigned int freq; - unsigned int cached_freq; - - dprintk("get_cur_freq_on_cpu (%d)\n", cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return 0; - } - - cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); - if (freq != cached_freq) { - /* - * The dreaded BIOS frequency change behind our back. - * Force set the frequency on next target call. - */ - data->resume = 1; - } - - dprintk("cur freq = %u\n", freq); - - return freq; -} - -static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, - struct acpi_cpufreq_data *data) -{ - unsigned int cur_freq; - unsigned int i; - - for (i = 0; i < 100; i++) { - cur_freq = extract_freq(get_cur_val(mask), data); - if (cur_freq == freq) - return 1; - udelay(10); - } - return 0; -} - -static int acpi_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - struct acpi_processor_performance *perf; - struct cpufreq_freqs freqs; - struct drv_cmd cmd; - unsigned int next_state = 0; /* Index into freq_table */ - unsigned int next_perf_state = 0; /* Index into perf table */ - unsigned int i; - int result = 0; - - dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return -ENODEV; - } - - perf = data->acpi_data; - result = cpufreq_frequency_table_target(policy, - data->freq_table, - target_freq, - relation, &next_state); - if (unlikely(result)) { - result = -ENODEV; - goto out; - } - - next_perf_state = data->freq_table[next_state].index; - if (perf->state == next_perf_state) { - if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", - next_perf_state); - data->resume = 0; - } else { - dprintk("Already at target state (P%d)\n", - next_perf_state); - goto out; - } - } - - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - cmd.val = (u32) 
perf->states[next_perf_state].control; - break; - default: - result = -ENODEV; - goto out; - } - - /* cpufreq holds the hotplug lock, so we are safe from here on */ - if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) - cmd.mask = policy->cpus; - else - cmd.mask = cpumask_of(policy->cpu); - - freqs.old = perf->states[perf->state].core_frequency * 1000; - freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - drv_write(&cmd); - - if (acpi_pstate_strict) { - if (!check_freqs(cmd.mask, freqs.new, data)) { - dprintk("acpi_cpufreq_target failed (%d)\n", - policy->cpu); - result = -EAGAIN; - goto out; - } - } - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - perf->state = next_perf_state; - -out: - return result; -} - -static int acpi_cpufreq_verify(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_verify\n"); - - return cpufreq_frequency_table_verify(policy, data->freq_table); -} - -static unsigned long -acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) -{ - struct acpi_processor_performance *perf = data->acpi_data; - - if (cpu_khz) { - /* search the closest match to cpu_khz */ - unsigned int i; - unsigned long freq; - unsigned long freqn = perf->states[0].core_frequency * 1000; - - for (i = 0; i < (perf->state_count-1); i++) { - freq = freqn; - freqn = perf->states[i+1].core_frequency * 1000; - if ((2 * cpu_khz) > (freqn + freq)) { - perf->state = i; - return freq; - } - } - perf->state = perf->state_count-1; - return freqn; - } else { - /* assume CPU is at P0... */ - perf->state = 0; - return perf->states[0].core_frequency * 1000; - } -} - -static void free_acpi_perf_data(void) -{ - unsigned int i; - - /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ - for_each_possible_cpu(i) - free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) - ->shared_cpu_map); - free_percpu(acpi_perf_data); -} - -/* - * acpi_cpufreq_early_init - initialize ACPI P-States library - * - * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) - * in order to determine correct frequency and voltage pairings. We can - * do _PDC and _PSD and find out the processor dependency for the - * actual init that will happen later... - */ -static int __init acpi_cpufreq_early_init(void) -{ - unsigned int i; - dprintk("acpi_cpufreq_early_init\n"); - - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) { - dprintk("Memory allocation error for acpi_perf_data.\n"); - return -ENOMEM; - } - for_each_possible_cpu(i) { - if (!zalloc_cpumask_var_node( - &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, - GFP_KERNEL, cpu_to_node(i))) { - - /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ - free_acpi_perf_data(); - return -ENOMEM; - } - } - - /* Do initialization in ACPI core */ - acpi_processor_preregister_performance(acpi_perf_data); - return 0; -} - -#ifdef CONFIG_SMP -/* - * Some BIOSes do SW_ANY coordination internally, either set it up in hw - * or do it in BIOS firmware and won't inform about it to OS. If not - * detected, this has a side effect of making CPU run at a different speed - * than OS intended it to run at. Detect it and handle it cleanly. 
- */ -static int bios_with_sw_any_bug; - -static int sw_any_bug_found(const struct dmi_system_id *d) -{ - bios_with_sw_any_bug = 1; - return 0; -} - -static const struct dmi_system_id sw_any_bug_dmi_table[] = { - { - .callback = sw_any_bug_found, - .ident = "Supermicro Server X6DLP", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), - DMI_MATCH(DMI_BIOS_VERSION, "080010"), - DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), - }, - }, - { } -}; - -static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) -{ - /* Intel Xeon Processor 7100 Series Specification Update - * http://www.intel.com/Assets/PDF/specupdate/314554.pdf - * AL30: A Machine Check Exception (MCE) Occurring during an - * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up. */ - if (c->x86_vendor == X86_VENDOR_INTEL) { - if ((c->x86 == 15) && - (c->x86_model == 6) && - (c->x86_mask == 8)) { - printk(KERN_INFO "acpi-cpufreq: Intel(R) " - "Xeon(R) 7100 Errata AL30, processors may " - "lock up on frequency changes: disabling " - "acpi-cpufreq.\n"); - return -ENODEV; - } - } - return 0; -} -#endif - -static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; - struct acpi_cpufreq_data *data; - unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; -#ifdef CONFIG_SMP - static int blacklisted; -#endif - - dprintk("acpi_cpufreq_cpu_init\n"); - -#ifdef CONFIG_SMP - if (blacklisted) - return blacklisted; - blacklisted = acpi_cpufreq_blacklist(c); - if (blacklisted) - return blacklisted; -#endif - - data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(acfreq_data, cpu) = data; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) - acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; - - result = acpi_processor_register_performance(data->acpi_data, cpu); - if (result) - goto err_free; - - perf = data->acpi_data; - policy->shared_type = perf->shared_type; - - /* - * Will let policy->cpus know about dependency only when software - * coordination is required. 
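For readers unfamiliar with the quirk pattern behind sw_any_bug_dmi_table above, here is a self-contained sketch of the idea with plain string triples. The kernel's dmi_check_system() matches against firmware-provided DMI strings; this stand-in only imitates that lookup.

#include <stdio.h>
#include <string.h>

struct quirk {
        const char *vendor, *bios, *product;
};

/* One entry per known-bad machine, mirroring the DMI table above. */
static const struct quirk quirks[] = {
        { "Supermicro", "080010", "X6DLP" },
};

static int quirky(const char *v, const char *b, const char *p)
{
        size_t i;

        for (i = 0; i < sizeof(quirks) / sizeof(quirks[0]); i++)
                if (!strcmp(v, quirks[i].vendor) &&
                    !strcmp(b, quirks[i].bios) &&
                    !strcmp(p, quirks[i].product))
                        return 1;
        return 0;
}

int main(void)
{
        printf("%d\n", quirky("Supermicro", "080010", "X6DLP")); /* 1 */
        return 0;
}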
- */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || - policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - cpumask_copy(policy->cpus, perf->shared_cpu_map); - } - cpumask_copy(policy->related_cpus, perf->shared_cpu_map); - -#ifdef CONFIG_SMP - dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { - policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - cpumask_copy(policy->cpus, cpu_core_mask(cpu)); - } -#endif - - /* capability check */ - if (perf->state_count <= 1) { - dprintk("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if (perf->control_register.space_id != perf->status_register.space_id) { - result = -ENODEV; - goto err_unreg; - } - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - dprintk("SYSTEM IO addr space\n"); - data->cpu_feature = SYSTEM_IO_CAPABLE; - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - dprintk("HARDWARE addr space\n"); - if (!check_est_cpu(cpu)) { - result = -ENODEV; - goto err_unreg; - } - data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; - break; - default: - dprintk("Unknown addr space %d\n", - (u32) (perf->control_register.space_id)); - result = -ENODEV; - goto err_unreg; - } - - data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * - (perf->state_count+1), GFP_KERNEL); - if (!data->freq_table) { - result = -ENOMEM; - goto err_unreg; - } - - /* detect transition latency */ - policy->cpuinfo.transition_latency = 0; - for (i = 0; i < perf->state_count; i++) { - if ((perf->states[i].transition_latency * 1000) > - policy->cpuinfo.transition_latency) - policy->cpuinfo.transition_latency = - perf->states[i].transition_latency * 1000; - } - - /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ - if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && - policy->cpuinfo.transition_latency > 20 * 1000) { - policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO - "P-state transition latency capped at 20 uS\n"); - } - - /* table init */ - for (i = 0; i < perf->state_count; i++) { - if (i > 0 && perf->states[i].core_frequency >= - data->freq_table[valid_states-1].frequency / 1000) - continue; - - data->freq_table[valid_states].index = i; - data->freq_table[valid_states].frequency = - perf->states[i].core_frequency * 1000; - valid_states++; - } - data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; - perf->state = 0; - - result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); - if (result) - goto err_freqfree; - - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) - printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - /* Current speed is unknown and not detectable by IO port */ - policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - acpi_cpufreq_driver.get = get_cur_freq_on_cpu; - policy->cur = get_cur_freq_on_cpu(cpu); - break; - default: - break; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - - dprintk("CPU%u - ACPI performance management activated.\n", cpu); - for (i = 0; i < perf->state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", - (i == perf->state ? 
'*' : ' '), i, - (u32) perf->states[i].core_frequency, - (u32) perf->states[i].power, - (u32) perf->states[i].transition_latency); - - cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); - - /* - * the first call to ->target() should result in us actually - * writing something to the appropriate registers. - */ - data->resume = 1; - - return result; - -err_freqfree: - kfree(data->freq_table); -err_unreg: - acpi_processor_unregister_performance(perf, cpu); -err_free: - kfree(data); - per_cpu(acfreq_data, cpu) = NULL; - - return result; -} - -static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_cpu_exit\n"); - - if (data) { - cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(acfreq_data, policy->cpu) = NULL; - acpi_processor_unregister_performance(data->acpi_data, - policy->cpu); - kfree(data->freq_table); - kfree(data); - } - - return 0; -} - -static int acpi_cpufreq_resume(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_resume\n"); - - data->resume = 1; - - return 0; -} - -static struct freq_attr *acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, -}; - -static int __init acpi_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - dprintk("acpi_cpufreq_init\n"); - - ret = acpi_cpufreq_early_init(); - if (ret) - return ret; - - ret = cpufreq_register_driver(&acpi_cpufreq_driver); - if (ret) - free_acpi_perf_data(); - - return ret; -} - -static void __exit acpi_cpufreq_exit(void) -{ - dprintk("acpi_cpufreq_exit\n"); - - cpufreq_unregister_driver(&acpi_cpufreq_driver); - - free_percpu(acpi_perf_data); -} - -module_param(acpi_pstate_strict, uint, 0644); -MODULE_PARM_DESC(acpi_pstate_strict, - "value 0 or non-zero. non-zero -> strict ACPI checks are " - "performed during frequency changes."); - -late_initcall(acpi_cpufreq_init); -module_exit(acpi_cpufreq_exit); - -MODULE_ALIAS("acpi"); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c deleted file mode 100644 index 141abebc4516..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information - * - * BIG FAT DISCLAIMER: Work in progress code. 
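The acpi_pstate_strict parameter described just above arms the check_freqs() loop shown earlier: after programming a P-state, poll the hardware up to 100 times at 10 us intervals and let the target call fail with -EAGAIN if the requested frequency never appears. A userspace approximation, with read_freq() as a hypothetical stand-in for the MSR or IO-port read:

#include <stdio.h>
#include <unistd.h>

static unsigned int read_freq(void)
{
        return 1800000;                 /* stand-in for the hardware read */
}

/* Mirror of check_freqs(): up to 100 polls, 10 us apart. */
static int check_freqs(unsigned int want)
{
        int i;

        for (i = 0; i < 100; i++) {
                if (read_freq() == want)
                        return 1;
                usleep(10);             /* udelay(10) in the driver */
        }
        return 0;
}

int main(void)
{
        printf("reached: %d\n", check_freqs(1800000));
        return 0;
}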
Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/delay.h> - -#define NFORCE2_XTAL 25 -#define NFORCE2_BOOTFSB 0x48 -#define NFORCE2_PLLENABLE 0xa8 -#define NFORCE2_PLLREG 0xa4 -#define NFORCE2_PLLADR 0xa0 -#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) - -#define NFORCE2_MIN_FSB 50 -#define NFORCE2_SAFE_DISTANCE 50 - -/* Delay in ms between FSB changes */ -/* #define NFORCE2_DELAY 10 */ - -/* - * nforce2_chipset: - * FSB is changed using the chipset - */ -static struct pci_dev *nforce2_dev; - -/* fid: - * multiplier * 10 - */ -static int fid; - -/* min_fsb, max_fsb: - * minimum and maximum FSB (= FSB at boot time) - */ -static int min_fsb; -static int max_fsb; - -MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); -MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); -MODULE_LICENSE("GPL"); - -module_param(fid, int, 0444); -module_param(min_fsb, int, 0444); - -MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); -MODULE_PARM_DESC(min_fsb, - "Minimum FSB to use, if not defined: current FSB - 50"); - -#define PFX "cpufreq-nforce2: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "cpufreq-nforce2", msg) - -/** - * nforce2_calc_fsb - calculate FSB - * @pll: PLL value - * - * Calculates FSB from PLL value - */ -static int nforce2_calc_fsb(int pll) -{ - unsigned char mul, div; - - mul = (pll >> 8) & 0xff; - div = pll & 0xff; - - if (div > 0) - return NFORCE2_XTAL * mul / div; - - return 0; -} - -/** - * nforce2_calc_pll - calculate PLL value - * @fsb: FSB - * - * Calculate PLL value for given FSB - */ -static int nforce2_calc_pll(unsigned int fsb) -{ - unsigned char xmul, xdiv; - unsigned char mul = 0, div = 0; - int tried = 0; - - /* Try to calculate multiplier and divider up to 4 times */ - while (((mul == 0) || (div == 0)) && (tried <= 3)) { - for (xdiv = 2; xdiv <= 0x80; xdiv++) - for (xmul = 1; xmul <= 0xfe; xmul++) - if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == - fsb + tried) { - mul = xmul; - div = xdiv; - } - tried++; - } - - if ((mul == 0) || (div == 0)) - return -1; - - return NFORCE2_PLL(mul, div); -} - -/** - * nforce2_write_pll - write PLL value to chipset - * @pll: PLL value - * - * Writes new FSB PLL value to chipset - */ -static void nforce2_write_pll(int pll) -{ - int temp; - - /* Set the pll addr. 
to 0x00 */ - pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); - - /* Now write the value in all 64 registers */ - for (temp = 0; temp <= 0x3f; temp++) - pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); - - return; -} - -/** - * nforce2_fsb_read - Read FSB - * - * Read FSB from chipset - * If bootfsb != 0, return FSB at boot-time - */ -static unsigned int nforce2_fsb_read(int bootfsb) -{ - struct pci_dev *nforce2_sub5; - u32 fsb, temp = 0; - - /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ - nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, - PCI_ANY_ID, PCI_ANY_ID, NULL); - if (!nforce2_sub5) - return 0; - - pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); - fsb /= 1000000; - - /* Check if PLL register is already set */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - - if (bootfsb || !temp) - return fsb; - - /* Use PLL register FSB value */ - pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); - fsb = nforce2_calc_fsb(temp); - - return fsb; -} - -/** - * nforce2_set_fsb - set new FSB - * @fsb: New FSB - * - * Sets new FSB - */ -static int nforce2_set_fsb(unsigned int fsb) -{ - u32 temp = 0; - unsigned int tfsb; - int diff; - int pll = 0; - - if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { - printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); - return -EINVAL; - } - - tfsb = nforce2_fsb_read(0); - if (!tfsb) { - printk(KERN_ERR PFX "Error while reading the FSB\n"); - return -EINVAL; - } - - /* First write? Then set actual value */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - if (!temp) { - pll = nforce2_calc_pll(tfsb); - - if (pll < 0) - return -EINVAL; - - nforce2_write_pll(pll); - } - - /* Enable write access */ - temp = 0x01; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); - - diff = tfsb - fsb; - - if (!diff) - return 0; - - while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { - if (diff < 0) - tfsb++; - else - tfsb--; - - /* Calculate the PLL reg. value */ - pll = nforce2_calc_pll(tfsb); - if (pll == -1) - return -EINVAL; - - nforce2_write_pll(pll); -#ifdef NFORCE2_DELAY - mdelay(NFORCE2_DELAY); -#endif - } - - temp = 0x40; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); - - return 0; -} - -/** - * nforce2_get - get the CPU frequency - * @cpu: CPU number - * - * Returns the CPU frequency - */ -static unsigned int nforce2_get(unsigned int cpu) -{ - if (cpu) - return 0; - return nforce2_fsb_read(0) * fid * 100; -} - -/** - * nforce2_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. 
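The PLL word that nforce2_set_fsb() above reprograms uses the encoding from nforce2_calc_fsb() and nforce2_calc_pll(): bits 8-15 hold the multiplier, bits 0-7 the divider, and the resulting FSB is the 25 MHz crystal scaled by that ratio. A standalone sketch with hypothetical values:

#include <stdio.h>

#define NFORCE2_XTAL 25
#define NFORCE2_PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

/* Same arithmetic as nforce2_calc_fsb(). */
static int calc_fsb(int pll)
{
        unsigned char mul = (pll >> 8) & 0xff;
        unsigned char div = pll & 0xff;

        return div ? NFORCE2_XTAL * mul / div : 0;
}

int main(void)
{
        int pll = NFORCE2_PLL(16, 2);   /* 25 * 16 / 2 = 200 MHz */

        printf("FSB %d MHz\n", calc_fsb(pll));
        return 0;
}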
- */ -static int nforce2_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ -/* unsigned long flags; */ - struct cpufreq_freqs freqs; - unsigned int target_fsb; - - if ((target_freq > policy->max) || (target_freq < policy->min)) - return -EINVAL; - - target_fsb = target_freq / (fid * 100); - - freqs.old = nforce2_get(policy->cpu); - freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ - - if (freqs.old == freqs.new) - return 0; - - dprintk("Old CPU frequency %d kHz, new %d kHz\n", - freqs.old, freqs.new); - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Disable IRQs */ - /* local_irq_save(flags); */ - - if (nforce2_set_fsb(target_fsb) < 0) - printk(KERN_ERR PFX "Changing FSB to %d failed\n", - target_fsb); - else - dprintk("Changed FSB successfully to %d\n", - target_fsb); - - /* Enable IRQs */ - /* local_irq_restore(flags); */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - -/** - * nforce2_verify - verifies a new CPUFreq policy - * @policy: new policy - */ -static int nforce2_verify(struct cpufreq_policy *policy) -{ - unsigned int fsb_pol_max; - - fsb_pol_max = policy->max / (fid * 100); - - if (policy->min < (fsb_pol_max * fid * 100)) - policy->max = (fsb_pol_max + 1) * fid * 100; - - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static int nforce2_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int fsb; - unsigned int rfid; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* Get current FSB */ - fsb = nforce2_fsb_read(0); - - if (!fsb) - return -EIO; - - /* FIX: Get FID from CPU */ - if (!fid) { - if (!cpu_khz) { - printk(KERN_WARNING PFX - "cpu_khz not set, can't calculate multiplier!\n"); - return -ENODEV; - } - - fid = cpu_khz / (fsb * 100); - rfid = fid % 5; - - if (rfid) { - if (rfid > 2) - fid += 5 - rfid; - else - fid -= rfid; - } - } - - printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, - fid / 10, fid % 10); - - /* Set maximum FSB to FSB at boot time */ - max_fsb = nforce2_fsb_read(1); - - if (!max_fsb) - return -EIO; - - if (!min_fsb) - min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; - - if (min_fsb < NFORCE2_MIN_FSB) - min_fsb = NFORCE2_MIN_FSB; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = min_fsb * fid * 100; - policy->cpuinfo.max_freq = max_fsb * fid * 100; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = nforce2_get(policy->cpu); - policy->min = policy->cpuinfo.min_freq; - policy->max = policy->cpuinfo.max_freq; - - return 0; -} - -static int nforce2_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver nforce2_driver = { - .name = "nforce2", - .verify = nforce2_verify, - .target = nforce2_target, - .get = nforce2_get, - .init = nforce2_cpu_init, - .exit = nforce2_cpu_exit, - .owner = THIS_MODULE, -}; - -/** - * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic - * - * Detects nForce2 A2 and C1 stepping - * - */ -static int nforce2_detect_chipset(void) -{ - nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, - PCI_DEVICE_ID_NVIDIA_NFORCE2, - PCI_ANY_ID, PCI_ANY_ID, NULL); - - if (nforce2_dev == NULL) - return -ENODEV; - - printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", - nforce2_dev->revision); - printk(KERN_INFO PFX - "FSB changing is maybe unstable and can lead to " - "crashes and data loss.\n"); - - return 0; -} - 
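When no fid parameter is given, nforce2_cpu_init() above derives the multiplier from cpu_khz and the current FSB; fid is the multiplier times ten, and the % 5 adjustment rounds it to the nearest half step. A worked example, assuming a hypothetical 2.2 GHz CPU on a 166 MHz FSB:

#include <stdio.h>

int main(void)
{
        int cpu_khz = 2200000, fsb = 166;       /* hypothetical values */
        int fid = cpu_khz / (fsb * 100);        /* 132, i.e. 13.2x */
        int rfid = fid % 5;

        if (rfid)                               /* round to nearest 0.5x */
                fid += (rfid > 2) ? 5 - rfid : -rfid;
        printf("FID %d.%d\n", fid / 10, fid % 10);      /* FID 13.0 */
        return 0;
}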
-/** - * nforce2_init - initializes the nForce2 CPUFreq driver - * - * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init nforce2_init(void) -{ - /* TODO: do we need to detect the processor? */ - - /* detect chipset */ - if (nforce2_detect_chipset()) { - printk(KERN_INFO PFX "No nForce2 chipset.\n"); - return -ENODEV; - } - - return cpufreq_register_driver(&nforce2_driver); -} - -/** - * nforce2_exit - unregisters cpufreq module - * - * Unregisters nForce2 FSB change support. - */ -static void __exit nforce2_exit(void) -{ - cpufreq_unregister_driver(&nforce2_driver); -} - -module_init(nforce2_init); -module_exit(nforce2_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c deleted file mode 100644 index 35a257dd4bb7..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Based on documentation provided by Dave Jones. Thanks! - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/ioport.h> -#include <linux/slab.h> -#include <linux/timex.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/msr.h> -#include <asm/tsc.h> - -#define EPS_BRAND_C7M 0 -#define EPS_BRAND_C7 1 -#define EPS_BRAND_EDEN 2 -#define EPS_BRAND_C3 3 -#define EPS_BRAND_C7D 4 - -struct eps_cpu_data { - u32 fsb; - struct cpufreq_frequency_table freq_table[]; -}; - -static struct eps_cpu_data *eps_cpu[NR_CPUS]; - - -static unsigned int eps_get(unsigned int cpu) -{ - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (cpu) - return 0; - centaur = eps_cpu[cpu]; - if (centaur == NULL) - return 0; - - /* Return current frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - return centaur->fsb * ((lo >> 8) & 0xff); -} - -static int eps_set_state(struct eps_cpu_data *centaur, - unsigned int cpu, - u32 dest_state) -{ - struct cpufreq_freqs freqs; - u32 lo, hi; - int err = 0; - int i; - - freqs.old = eps_get(cpu); - freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Wait while CPU is busy */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i = 0; - while (lo & ((1 << 16) | (1 << 17))) { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } - /* Set new multiplier and voltage */ - wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); - /* Wait until transition end */ - i = 0; - do { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } while (lo & ((1 << 16) | (1 << 17))); - - /* Return current frequency */ -postchange: - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - freqs.new = centaur->fsb * ((lo >> 8) & 0xff); - -#ifdef DEBUG - { - u8 current_multiplier, current_voltage; - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", - current_multiplier); - } -#endif - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - return err; -} - -static int 
eps_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct eps_cpu_data *centaur; - unsigned int newstate = 0; - unsigned int cpu = policy->cpu; - unsigned int dest_state; - int ret; - - if (unlikely(eps_cpu[cpu] == NULL)) - return -ENODEV; - centaur = eps_cpu[cpu]; - - if (unlikely(cpufreq_frequency_table_target(policy, - &eps_cpu[cpu]->freq_table[0], - target_freq, - relation, - &newstate))) { - return -EINVAL; - } - - /* Make frequency transition */ - dest_state = centaur->freq_table[newstate].index & 0xffff; - ret = eps_set_state(centaur, cpu, dest_state); - if (ret) - printk(KERN_ERR "eps: Timeout!\n"); - return ret; -} - -static int eps_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - &eps_cpu[policy->cpu]->freq_table[0]); -} - -static int eps_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - u32 lo, hi; - u64 val; - u8 current_multiplier, current_voltage; - u8 max_multiplier, max_voltage; - u8 min_multiplier, min_voltage; - u8 brand = 0; - u32 fsb; - struct eps_cpu_data *centaur; - struct cpuinfo_x86 *c = &cpu_data(0); - struct cpufreq_frequency_table *f_table; - int k, step, voltage; - int ret; - int states; - - if (policy->cpu != 0) - return -ENODEV; - - /* Check brand */ - printk(KERN_INFO "eps: Detected VIA "); - - switch (c->x86_model) { - case 10: - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; - printk(KERN_CONT "Model A "); - break; - case 13: - rdmsr(0x1154, lo, hi); - brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; - printk(KERN_CONT "Model D "); - break; - } - - switch (brand) { - case EPS_BRAND_C7M: - printk(KERN_CONT "C7-M\n"); - break; - case EPS_BRAND_C7: - printk(KERN_CONT "C7\n"); - break; - case EPS_BRAND_EDEN: - printk(KERN_CONT "Eden\n"); - break; - case EPS_BRAND_C7D: - printk(KERN_CONT "C7-D\n"); - break; - case EPS_BRAND_C3: - printk(KERN_CONT "C3\n"); - return -ENODEV; - break; - } - /* Enable Enhanced PowerSaver */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - wrmsrl(MSR_IA32_MISC_ENABLE, val); - /* Can be locked at 0 */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); - return -ENODEV; - } - } - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); - - /* Print limits */ - max_voltage = hi & 0xff; - printk(KERN_INFO "eps: Highest voltage = %dmV\n", - max_voltage * 16 + 700); - max_multiplier = (hi >> 8) & 0xff; - printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); - min_voltage = (hi >> 16) & 0xff; - printk(KERN_INFO "eps: Lowest voltage = %dmV\n", - min_voltage * 16 + 700); - min_multiplier = (hi >> 24) & 0xff; - printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); - - /* Sanity checks */ - if (current_multiplier == 0 || max_multiplier == 0 - || min_multiplier == 0) - return -EINVAL; - if (current_multiplier > max_multiplier - || max_multiplier <= min_multiplier) - return -EINVAL; - if (current_voltage > 0x1f || max_voltage > 0x1f) - return -EINVAL; - if (max_voltage < min_voltage) - return -EINVAL; - - /* Calc FSB speed */ - fsb = cpu_khz / 
current_multiplier; - /* Calc number of p-states supported */ - if (brand == EPS_BRAND_C7M) - states = max_multiplier - min_multiplier + 1; - else - states = 2; - - /* Allocate private data and frequency table for current cpu */ - centaur = kzalloc(sizeof(struct eps_cpu_data) - + (states + 1) * sizeof(struct cpufreq_frequency_table), - GFP_KERNEL); - if (!centaur) - return -ENOMEM; - eps_cpu[0] = centaur; - - /* Copy basic values */ - centaur->fsb = fsb; - - /* Fill frequency and MSR value table */ - f_table = ¢aur->freq_table[0]; - if (brand != EPS_BRAND_C7M) { - f_table[0].frequency = fsb * min_multiplier; - f_table[0].index = (min_multiplier << 8) | min_voltage; - f_table[1].frequency = fsb * max_multiplier; - f_table[1].index = (max_multiplier << 8) | max_voltage; - f_table[2].frequency = CPUFREQ_TABLE_END; - } else { - k = 0; - step = ((max_voltage - min_voltage) * 256) - / (max_multiplier - min_multiplier); - for (i = min_multiplier; i <= max_multiplier; i++) { - voltage = (k * step) / 256 + min_voltage; - f_table[k].frequency = fsb * i; - f_table[k].index = (i << 8) | voltage; - k++; - } - f_table[k].frequency = CPUFREQ_TABLE_END; - } - - policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ - policy->cur = fsb * current_multiplier; - - ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); - if (ret) { - kfree(centaur); - return ret; - } - - cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); - return 0; -} - -static int eps_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (eps_cpu[cpu] == NULL) - return -ENODEV; - centaur = eps_cpu[cpu]; - - /* Get max frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - /* Set max frequency */ - eps_set_state(centaur, cpu, hi & 0xffff); - /* Bye */ - cpufreq_frequency_table_put_attr(policy->cpu); - kfree(eps_cpu[cpu]); - eps_cpu[cpu] = NULL; - return 0; -} - -static struct freq_attr *eps_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver eps_driver = { - .verify = eps_verify, - .target = eps_target, - .init = eps_cpu_init, - .exit = eps_cpu_exit, - .get = eps_get, - .name = "e_powersaver", - .owner = THIS_MODULE, - .attr = eps_attr, -}; - -static int __init eps_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* This driver will work only on Centaur C7 processors with - * Enhanced SpeedStep/PowerSaver registers */ - if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model < 10) - return -ENODEV; - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - - if (cpufreq_register_driver(&eps_driver)) - return -EINVAL; - return 0; -} - -static void __exit eps_exit(void) -{ - cpufreq_unregister_driver(&eps_driver); -} - -MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>"); -MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); -MODULE_LICENSE("GPL"); - -module_init(eps_init); -module_exit(eps_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c deleted file mode 100644 index c587db472a75..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ /dev/null @@ -1,309 +0,0 @@ -/* - * elanfreq: cpufreq driver for the AMD ELAN family - * - * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de> - * - * Parts of this code are (c) Sven Geggus <sven@geggus.net> - * - * All Rights Reserved. 
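The C7-M branch of eps_cpu_init() a little further up fills its frequency table by interpolating the VID linearly between the minimum and maximum multiplier, with the step scaled by 256 so the arithmetic stays integral; each VID unit is 16 mV on top of a 700 mV base. A sketch with hypothetical limits:

#include <stdio.h>

int main(void)
{
        int min_mult = 6, max_mult = 16;        /* hypothetical limits */
        int min_v = 4, max_v = 29;              /* hypothetical VID codes */
        int step = ((max_v - min_v) * 256) / (max_mult - min_mult);
        int i, k;

        for (i = min_mult, k = 0; i <= max_mult; i++, k++) {
                int vid = (k * step) / 256 + min_v;     /* as in the driver */

                printf("mult %2d -> %4d mV\n", i, vid * 16 + 700);
        }
        return 0;
}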
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <linux/delay.h> -#include <linux/cpufreq.h> - -#include <asm/msr.h> -#include <linux/timex.h> -#include <linux/io.h> - -#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ -#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ - -/* Module parameter */ -static int max_freq; - -struct s_elan_multiplier { - int clock; /* frequency in kHz */ - int val40h; /* PMU Force Mode register */ - int val80h; /* CPU Clock Speed Register */ -}; - -/* - * It is important that the frequencies - * are listed in ascending order here! - */ -static struct s_elan_multiplier elan_multiplier[] = { - {1000, 0x02, 0x18}, - {2000, 0x02, 0x10}, - {4000, 0x02, 0x08}, - {8000, 0x00, 0x00}, - {16000, 0x00, 0x02}, - {33000, 0x00, 0x04}, - {66000, 0x01, 0x04}, - {99000, 0x01, 0x05} -}; - -static struct cpufreq_frequency_table elanfreq_table[] = { - {0, 1000}, - {1, 2000}, - {2, 4000}, - {3, 8000}, - {4, 16000}, - {5, 33000}, - {6, 66000}, - {7, 99000}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * elanfreq_get_cpu_frequency: determine current cpu speed - * - * Finds out at which frequency the CPU of the Elan SOC runs - * at the moment. Frequencies from 1 to 33 MHz are generated - * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" - * and have the rest of the chip running with 33 MHz. - */ - -static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg; /* Clock Speed Register */ - - local_irq_disable(); - outb_p(0x80, REG_CSCIR); - clockspeed_reg = inb_p(REG_CSCDR); - local_irq_enable(); - - if ((clockspeed_reg & 0xE0) == 0xE0) - return 0; - - /* Are we in CPU clock multiplied mode (66/99 MHz)? */ - if ((clockspeed_reg & 0xE0) == 0xC0) { - if ((clockspeed_reg & 0x01) == 0) - return 66000; - else - return 99000; - } - - /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0) == 0xA0) - return 33000; - - return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; -} - - -/** - * elanfreq_set_cpu_frequency: Change the CPU core frequency - * @cpu: cpu number - * @freq: frequency in kHz - * - * This function takes a frequency value and changes the CPU frequency - * according to this. Note that the frequency has to be checked by - * elanfreq_validatespeed() for correctness! - * - * There is no return value. - */ - -static void elanfreq_set_cpu_state(unsigned int state) -{ - struct cpufreq_freqs freqs; - - freqs.old = elanfreq_get_cpu_frequency(0); - freqs.new = elan_multiplier[state].clock; - freqs.cpu = 0; /* elanfreq.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", - elan_multiplier[state].clock); - - - /* - * Access to the Elan's internal registers is indexed via - * 0x22: Chip Setup & Control Register Index Register (CSCI) - * 0x23: Chip Setup & Control Register Data Register (CSCD) - * - */ - - /* - * 0x40 is the Power Management Unit's Force Mode Register. 
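elanfreq_get_cpu_frequency() above decodes the CPU Clock Speed Register (CSCIR index 0x80): bits 7-5 normally select a power-of-two frequency in MHz, 0xA0 is special-cased to 33 MHz, and 0xC0 selects the 66/99 MHz Hyperspeed modes via bit 0. A standalone decoder for illustration:

#include <stdio.h>

/* Same decoding as elanfreq_get_cpu_frequency(), results in kHz. */
static unsigned int decode_khz(unsigned char reg)
{
        if ((reg & 0xE0) == 0xE0)
                return 0;                       /* invalid */
        if ((reg & 0xE0) == 0xC0)               /* hyperspeed modes */
                return (reg & 0x01) ? 99000 : 66000;
        if ((reg & 0xE0) == 0xA0)               /* 33 MHz is not 32 MHz */
                return 33000;
        return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
        printf("%u %u %u\n", decode_khz(0x60), decode_khz(0xA0),
               decode_khz(0xC1));               /* 8000 33000 99000 */
        return 0;
}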
- * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) - */ - - local_irq_disable(); - outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00, REG_CSCDR); - local_irq_enable(); /* wait till internal pipelines and */ - udelay(1000); /* buffers have cleaned up */ - - local_irq_disable(); - - /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80, REG_CSCIR); - outb_p(elan_multiplier[state].val80h, REG_CSCDR); - - /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40, REG_CSCIR); - outb_p(elan_multiplier[state].val40h, REG_CSCDR); - udelay(10000); - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - - -/** - * elanfreq_validatespeed: test if frequency range is valid - * @policy: the policy to validate - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int elanfreq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); -} - -static int elanfreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - elanfreq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int elanfreq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int i; - int result; - - /* capability check */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) - return -ENODEV; - - /* max freq */ - if (!max_freq) - max_freq = elanfreq_get_cpu_frequency(0); - - /* table init */ - for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if (elanfreq_table[i].frequency > max_freq) - elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = elanfreq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); - return 0; -} - - -static int elanfreq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -#ifndef MODULE -/** - * elanfreq_setup - elanfreq command line parameter parsing - * - * elanfreq command line parameter. Use: - * elanfreq=66000 - * to set the maximum CPU frequency to 66 MHz. Note that in - * case you do not give this boot parameter, the maximum - * frequency will fall back to _current_ CPU frequency which - * might be lower. If you build this as a module, use the - * max_freq module parameter instead. - */ -static int __init elanfreq_setup(char *str) -{ - max_freq = simple_strtoul(str, &str, 0); - printk(KERN_WARNING "You're using the deprecated elanfreq command line option. 
Use elanfreq.max_freq instead, please!\n"); - return 1; -} -__setup("elanfreq=", elanfreq_setup); -#endif - - -static struct freq_attr *elanfreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver elanfreq_driver = { - .get = elanfreq_get_cpu_frequency, - .verify = elanfreq_verify, - .target = elanfreq_target, - .init = elanfreq_cpu_init, - .exit = elanfreq_cpu_exit, - .name = "elanfreq", - .owner = THIS_MODULE, - .attr = elanfreq_attr, -}; - - -static int __init elanfreq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* Test if we have the right hardware */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) { - printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; - } - return cpufreq_register_driver(&elanfreq_driver); -} - - -static void __exit elanfreq_exit(void) -{ - cpufreq_unregister_driver(&elanfreq_driver); -} - - -module_param(max_freq, int, 0444); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, " - "Sven Geggus <sven@geggus.net>"); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); - -module_init(elanfreq_init); -module_exit(elanfreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c deleted file mode 100644 index 32974cf84232..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * Cyrix MediaGX and NatSemi Geode Suspend Modulation - * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * (C) 2002 Hiroshi Miura <miura@da-cha.org> - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Theoretical note: - * - * (see Geode(tm) CS5530 manual (rev.4.1) page.56) - * - * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 - * are based on Suspend Modulation. - * - * Suspend Modulation works by asserting and de-asserting the SUSP# pin - * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# - * the CPU enters an idle state. GX1 stops its core clock when SUSP# is - * asserted then power consumption is reduced. - * - * Suspend Modulation's OFF/ON duration are configurable - * with 'Suspend Modulation OFF Count Register' - * and 'Suspend Modulation ON Count Register'. - * These registers are 8bit counters that represent the number of - * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) - * to the processor. - * - * These counters define a ratio which is the effective frequency - * of operation of the system. - * - * OFF Count - * F_eff = Fgx * ---------------------- - * OFF Count + ON Count - * - * 0 <= On Count, Off Count <= 255 - * - * From these limits, we can get register values - * - * off_duration + on_duration <= MAX_DURATION - * on_duration = off_duration * (stock_freq - freq) / freq - * - * off_duration = (freq * DURATION) / stock_freq - * on_duration = DURATION - off_duration - * - * - *--------------------------------------------------------------------------- - * - * ChangeLog: - * Dec. 
12, 2003 Hiroshi Miura <miura@da-cha.org> - * - fix on/off register mistake - * - fix cpu_khz calc when it stops cpu modulation. - * - * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org> - * - rewrite for Cyrix MediaGX Cx5510/5520 and - * NatSemi Geode Cs5530(A). - * - * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * - cs5530_mod patch for 2.4.19-rc1. - * - *--------------------------------------------------------------------------- - * - * Todo - * Test on machines with 5510, 5530, 5530A - */ - -/************************************************************************ - * Suspend Modulation - Definitions * - ************************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/errno.h> -#include <linux/slab.h> - -#include <asm/processor-cyrix.h> - -/* PCI config registers, all at F0 */ -#define PCI_PMER1 0x80 /* power management enable register 1 */ -#define PCI_PMER2 0x81 /* power management enable register 2 */ -#define PCI_PMER3 0x82 /* power management enable register 3 */ -#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ -#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ -#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ -#define PCI_MODON 0x95 /* suspend modulation ON counter register */ -#define PCI_SUSCFG 0x96 /* suspend configuration register */ - -/* PMER1 bits */ -#define GPM (1<<0) /* global power management */ -#define GIT (1<<1) /* globally enable PM device idle timers */ -#define GTR (1<<2) /* globally enable IO traps */ -#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ -#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ - -/* SUSCFG bits */ -#define SUSMOD (1<<0) /* enable/disable suspend modulation */ -/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ -#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ - /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ -#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ -/* the below is supported only with cs5530A */ -#define PWRSVE_ISA (1<<3) /* stop ISA clock */ -#define PWRSVE (1<<4) /* active idle */ - -struct gxfreq_params { - u8 on_duration; - u8 off_duration; - u8 pci_suscfg; - u8 pci_pmer1; - u8 pci_pmer2; - struct pci_dev *cs55x0; -}; - -static struct gxfreq_params *gx_params; -static int stock_freq; - -/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ -static int pci_busclk; -module_param(pci_busclk, int, 0444); - -/* maximum duration for which the cpu may be suspended - * (32us * MAX_DURATION). If no parameter is given, this defaults - * to 255. - * Note that this leads to a maximum of 8 ms(!) where the CPU clock - * is suspended -- processing power is just 0.39% of what it used to be, - * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ -static int max_duration = 255; -module_param(max_duration, int, 0444); - -/* For the default policy, we want at least some processing power - * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) - */ -#define POLICY_MIN_DIV 20 - - -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "gx-suspmod", msg) - -/** - * we can detect a core multipiler from dir0_lsb - * from GX1 datasheet p.56, - * MULT[3:0]: - * 0000 = SYSCLK multiplied by 4 (test only) - * 0001 = SYSCLK multiplied by 10 - * 0010 = SYSCLK multiplied by 4 - * 0011 = SYSCLK multiplied by 6 - * 0100 = SYSCLK multiplied by 9 - * 0101 = SYSCLK multiplied by 5 - * 0110 = SYSCLK multiplied by 7 - * 0111 = SYSCLK multiplied by 8 - * of 33.3MHz - **/ -static int gx_freq_mult[16] = { - 4, 10, 4, 6, 9, 5, 7, 8, - 0, 0, 0, 0, 0, 0, 0, 0 -}; - - -/**************************************************************** - * Low Level chipset interface * - ****************************************************************/ -static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, - { 0, }, -}; - -static void gx_write_byte(int reg, int value) -{ - pci_write_config_byte(gx_params->cs55x0, reg, value); -} - -/** - * gx_detect_chipset: - * - **/ -static __init struct pci_dev *gx_detect_chipset(void) -{ - struct pci_dev *gx_pci = NULL; - - /* check if CPU is a MediaGX or a Geode. */ - if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - dprintk("error: no MediaGX/Geode processor found!\n"); - return NULL; - } - - /* detect which companion chip is used */ - for_each_pci_dev(gx_pci) { - if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) - return gx_pci; - } - - dprintk("error: no supported chipset found!\n"); - return NULL; -} - -/** - * gx_get_cpuspeed: - * - * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi - * Geode CPU runs. - */ -static unsigned int gx_get_cpuspeed(unsigned int cpu) -{ - if ((gx_params->pci_suscfg & SUSMOD) == 0) - return stock_freq; - - return (stock_freq * gx_params->off_duration) - / (gx_params->on_duration + gx_params->off_duration); -} - -/** - * gx_validate_speed: - * determine current cpu speed - * - **/ - -static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, - u8 *off_duration) -{ - unsigned int i; - u8 tmp_on, tmp_off; - int old_tmp_freq = stock_freq; - int tmp_freq; - - *off_duration = 1; - *on_duration = 0; - - for (i = max_duration; i > 0; i--) { - tmp_off = ((khz * i) / stock_freq) & 0xff; - tmp_on = i - tmp_off; - tmp_freq = (stock_freq * tmp_off) / i; - /* if this relation is closer to khz, use this. If it's equal, - * prefer it, too - lower latency */ - if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { - *on_duration = tmp_on; - *off_duration = tmp_off; - old_tmp_freq = tmp_freq; - } - } - - return old_tmp_freq; -} - - -/** - * gx_set_cpuspeed: - * set cpu speed in khz. 
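gx_validate_speed() above scans every permissible duration for the ON/OFF counts whose effective frequency stock_freq * off / (on + off) comes closest to the request, preferring shorter durations on ties for lower latency. A userspace rendering with a hypothetical stock_freq:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        int stock_freq = 200000;                /* kHz, hypothetical GX1 */
        int khz = 100000, max_duration = 255;
        int best = stock_freq, on = 0, off = 1;
        int i;

        for (i = max_duration; i > 0; i--) {
                int tmp_off = ((khz * i) / stock_freq) & 0xff;
                int tmp_on = i - tmp_off;
                int tmp = (stock_freq * tmp_off) / i;

                /* <= prefers later (shorter) durations on ties */
                if (abs(tmp - khz) <= abs(best - khz)) {
                        best = tmp;
                        on = tmp_on;
                        off = tmp_off;
                }
        }
        printf("%d kHz with ON=%d OFF=%d\n", best, on, off);
        return 0;
}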
- **/ - -static void gx_set_cpuspeed(unsigned int khz) -{ - u8 suscfg, pmer1; - unsigned int new_khz; - unsigned long flags; - struct cpufreq_freqs freqs; - - freqs.cpu = 0; - freqs.old = gx_get_cpuspeed(0); - - new_khz = gx_validate_speed(khz, &gx_params->on_duration, - &gx_params->off_duration); - - freqs.new = new_khz; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - local_irq_save(flags); - - - - if (new_khz != stock_freq) { - /* if new khz == 100% of CPU speed, it is special case */ - switch (gx_params->cs55x0->device) { - case PCI_DEVICE_ID_CYRIX_5530_LEGACY: - pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; - /* FIXME: need to test other values -- Zwane,Miura */ - /* typical 2 to 4ms */ - gx_write_byte(PCI_IRQTC, 4); - /* typical 50 to 100ms */ - gx_write_byte(PCI_VIDTC, 100); - gx_write_byte(PCI_PMER1, pmer1); - - if (gx_params->cs55x0->revision < 0x10) { - /* CS5530(rev 1.2, 1.3) */ - suscfg = gx_params->pci_suscfg|SUSMOD; - } else { - /* CS5530A,B.. */ - suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; - } - break; - case PCI_DEVICE_ID_CYRIX_5520: - case PCI_DEVICE_ID_CYRIX_5510: - suscfg = gx_params->pci_suscfg | SUSMOD; - break; - default: - local_irq_restore(flags); - dprintk("fatal: try to set unknown chipset.\n"); - return; - } - } else { - suscfg = gx_params->pci_suscfg & ~(SUSMOD); - gx_params->off_duration = 0; - gx_params->on_duration = 0; - dprintk("suspend modulation disabled: cpu runs 100%% speed.\n"); - } - - gx_write_byte(PCI_MODOFF, gx_params->off_duration); - gx_write_byte(PCI_MODON, gx_params->on_duration); - - gx_write_byte(PCI_SUSCFG, suscfg); - pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); - - local_irq_restore(flags); - - gx_params->pci_suscfg = suscfg; - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", - gx_params->on_duration * 32, gx_params->off_duration * 32); - dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); -} - -/**************************************************************** - * High level functions * - ****************************************************************/ - -/* - * cpufreq_gx_verify: test if frequency range is valid - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int cpufreq_gx_verify(struct cpufreq_policy *policy) -{ - unsigned int tmp_freq = 0; - u8 tmp1, tmp2; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - /* it needs to be assured that at least one supported frequency is - * within policy->min and policy->max. If it is not, policy->max - * needs to be increased until one freuqency is supported. - * policy->min may not be decreased, though. This way we guarantee a - * specific processing capacity. 
- */ - tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); - if (tmp_freq < policy->min) - tmp_freq += stock_freq / max_duration; - policy->min = tmp_freq; - if (policy->min > policy->max) - policy->max = tmp_freq; - tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); - if (tmp_freq > policy->max) - tmp_freq -= stock_freq / max_duration; - policy->max = tmp_freq; - if (policy->max < policy->min) - policy->max = policy->min; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - return 0; -} - -/* - * cpufreq_gx_target: - * - */ -static int cpufreq_gx_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - u8 tmp1, tmp2; - unsigned int tmp_freq; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - - tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); - while (tmp_freq < policy->min) { - tmp_freq += stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - while (tmp_freq > policy->max) { - tmp_freq -= stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - - gx_set_cpuspeed(tmp_freq); - - return 0; -} - -static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int maxfreq, curfreq; - - if (!policy || policy->cpu != 0) - return -ENODEV; - - /* determine maximum frequency */ - if (pci_busclk) - maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - else if (cpu_khz) - maxfreq = cpu_khz; - else - maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - - stock_freq = maxfreq; - curfreq = gx_get_cpuspeed(0); - - dprintk("cpu max frequency is %d.\n", maxfreq); - dprintk("cpu current frequency is %dkHz.\n", curfreq); - - /* setup basic struct for cpufreq API */ - policy->cpu = 0; - - if (max_duration < POLICY_MIN_DIV) - policy->min = maxfreq / max_duration; - else - policy->min = maxfreq / POLICY_MIN_DIV; - policy->max = maxfreq; - policy->cur = curfreq; - policy->cpuinfo.min_freq = maxfreq / max_duration; - policy->cpuinfo.max_freq = maxfreq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - - return 0; -} - -/* - * cpufreq_gx_init: - * MediaGX/Geode GX initialize cpufreq driver - */ -static struct cpufreq_driver gx_suspmod_driver = { - .get = gx_get_cpuspeed, - .verify = cpufreq_gx_verify, - .target = cpufreq_gx_target, - .init = cpufreq_gx_cpu_init, - .name = "gx-suspmod", - .owner = THIS_MODULE, -}; - -static int __init cpufreq_gx_init(void) -{ - int ret; - struct gxfreq_params *params; - struct pci_dev *gx_pci; - - /* Test if we have the right hardware */ - gx_pci = gx_detect_chipset(); - if (gx_pci == NULL) - return -ENODEV; - - /* check whether module parameters are sane */ - if (max_duration > 0xff) - max_duration = 0xff; - - dprintk("geode suspend modulation available.\n"); - - params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); - if (params == NULL) - return -ENOMEM; - - params->cs55x0 = gx_pci; - gx_params = params; - - /* keep cs55x0 configurations */ - pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); - pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); - pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); - pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); - pci_read_config_byte(params->cs55x0, PCI_MODOFF, - &(params->off_duration)); - - ret = cpufreq_register_driver(&gx_suspmod_driver); - if (ret) { - kfree(params); - return ret; /* register error! 
*/ - } - - return 0; -} - -static void __exit cpufreq_gx_exit(void) -{ - cpufreq_unregister_driver(&gx_suspmod_driver); - pci_dev_put(gx_params->cs55x0); - kfree(gx_params); -} - -MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>"); -MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); -MODULE_LICENSE("GPL"); - -module_init(cpufreq_gx_init); -module_exit(cpufreq_gx_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c deleted file mode 100644 index 03162dac6271..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * (C) 2001-2004 Dave Jones. <davej@redhat.com> - * (C) 2002 Padraig Brady. <padraig@antefacto.com> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by VIA. - * - * VIA have currently 3 different versions of Longhaul. - * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. - * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is backward compatible with v1, but adds - * LONGHAUL MSR for purpose of both frequency and voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). - * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use only the POWERSAVER MSR at 0x110a. - * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. - * It's pretty much the same feature wise to longhaul v2, though - * there is provision for scaling FSB too, but this doesn't work - * too well in practice so we don't even try to use this. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/timex.h> -#include <linux/io.h> -#include <linux/acpi.h> - -#include <asm/msr.h> -#include <acpi/processor.h> - -#include "longhaul.h" - -#define PFX "longhaul: " - -#define TYPE_LONGHAUL_V1 1 -#define TYPE_LONGHAUL_V2 2 -#define TYPE_POWERSAVER 3 - -#define CPU_SAMUEL 1 -#define CPU_SAMUEL2 2 -#define CPU_EZRA 3 -#define CPU_EZRA_T 4 -#define CPU_NEHEMIAH 5 -#define CPU_NEHEMIAH_C 6 - -/* Flags */ -#define USE_ACPI_C3 (1 << 1) -#define USE_NORTHBRIDGE (1 << 2) - -static int cpu_model; -static unsigned int numscales = 16; -static unsigned int fsb; - -static const struct mV_pos *vrm_mV_table; -static const unsigned char *mV_vrm_table; - -static unsigned int highest_speed, lowest_speed; /* kHz */ -static unsigned int minmult, maxmult; -static int can_scale_voltage; -static struct acpi_processor *pr; -static struct acpi_processor_cx *cx; -static u32 acpi_regs_addr; -static u8 longhaul_flags; -static unsigned int longhaul_index; - -/* Module parameters */ -static int scale_voltage; -static int disable_acpi_c3; -static int revid_errata; - -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longhaul", msg) - - -/* Clock ratios multiplied by 10 */ -static int mults[32]; -static int eblcr[32]; -static int longhaul_version; -static struct cpufreq_frequency_table *longhaul_table; - -#ifdef CONFIG_CPU_FREQ_DEBUG -static char speedbuffer[8]; - -static char *print_speed(int speed) -{ - if (speed < 1000) { - snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); - return speedbuffer; - } - - if (speed%1000 == 0) - snprintf(speedbuffer, sizeof(speedbuffer), - "%dGHz", speed/1000); - else - snprintf(speedbuffer, sizeof(speedbuffer), - "%d.%dGHz", speed/1000, (speed%1000)/100); - - return speedbuffer; -} -#endif - - -static unsigned int calc_speed(int mult) -{ - int khz; - khz = (mult/10)*fsb; - if (mult%10) - khz += fsb/2; - khz *= 1000; - return khz; -} - - -static int longhaul_get_cpu_mult(void) -{ - unsigned long invalue = 0, lo, hi; - - rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi); - invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22; - if (longhaul_version == TYPE_LONGHAUL_V2 || - longhaul_version == TYPE_POWERSAVER) { - if (lo & (1<<27)) - invalue += 16; - } - return eblcr[invalue]; -} - -/* For processor with BCR2 MSR */ - -static void do_longhaul1(unsigned int mults_index) -{ - union msr_bcr2 bcr2; - - rdmsrl(MSR_VIA_BCR2, bcr2.val); - /* Enable software clock multiplier */ - bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = mults_index & 0xff; - - /* Sync to timer tick */ - safe_halt(); - /* Change frequency on next halt or sleep */ - wrmsrl(MSR_VIA_BCR2, bcr2.val); - /* Invoke transition */ - ACPI_FLUSH_CPU_CACHE(); - halt(); - - /* Disable software clock multiplier */ - local_irq_disable(); - rdmsrl(MSR_VIA_BCR2, bcr2.val); - bcr2.bits.ESOFTBF = 0; - wrmsrl(MSR_VIA_BCR2, bcr2.val); -} - -/* For processor with Longhaul MSR */ - -static void do_powersaver(int cx_address, unsigned int mults_index, - unsigned int dir) -{ - union msr_longhaul longhaul; - u32 t; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Setup new frequency */ - if (!revid_errata) - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; - else - longhaul.bits.RevisionKey = 0; - longhaul.bits.SoftBusRatio = mults_index & 0xf; - longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; - /* Setup new voltage */ - if (can_scale_voltage) - longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; - /* Sync to timer tick */ - safe_halt(); - /* Raise voltage if necessary */ - if (can_scale_voltage && dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } - - /* Change frequency on next halt or sleep */ - longhaul.bits.EnableSoftBusRatio = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - /* Disable bus ratio bit */ - longhaul.bits.EnableSoftBusRatio = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - - /* Reduce voltage if necessary */ - if (can_scale_voltage && !dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - 
if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } -} - -/** - * longhaul_set_cpu_frequency() - * @mults_index : bitpattern of the new multiplier. - * - * Sets a new clock ratio. - */ - -static void longhaul_setstate(unsigned int table_index) -{ - unsigned int mults_index; - int speed, mult; - struct cpufreq_freqs freqs; - unsigned long flags; - unsigned int pic1_mask, pic2_mask; - u16 bm_status = 0; - u32 bm_timeout = 1000; - unsigned int dir = 0; - - mults_index = longhaul_table[table_index].index; - /* Safety precautions */ - mult = mults[mults_index & 0x1f]; - if (mult == -1) - return; - speed = calc_speed(mult); - if ((speed > highest_speed) || (speed < lowest_speed)) - return; - /* Voltage transition before frequency transition? */ - if (can_scale_voltage && longhaul_index < table_index) - dir = 1; - - freqs.old = calc_speed(longhaul_get_cpu_mult()); - freqs.new = speed; - freqs.cpu = 0; /* longhaul.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", - fsb, mult/10, mult%10, print_speed(speed/1000)); -retry_loop: - preempt_disable(); - local_irq_save(flags); - - pic2_mask = inb(0xA1); - pic1_mask = inb(0x21); /* works on C3. save mask. */ - outb(0xFF, 0xA1); /* Overkill */ - outb(0xFE, 0x21); /* TMR0 only */ - - /* Wait while PCI bus is busy. */ - if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE - || ((pr != NULL) && pr->flags.bm_control))) { - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - while (bm_status && bm_timeout) { - outw(1 << 4, acpi_regs_addr); - bm_timeout--; - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - } - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Disable AGP and PCI arbiters */ - outb(3, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Disable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - } - switch (longhaul_version) { - - /* - * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) - * Software controlled multipliers only. - */ - case TYPE_LONGHAUL_V1: - do_longhaul1(mults_index); - break; - - /* - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] - * - * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * Nehemiah can do FSB scaling too, but this has never been proven - * to work in practice. - */ - case TYPE_LONGHAUL_V2: - case TYPE_POWERSAVER: - if (longhaul_flags & USE_ACPI_C3) { - /* Don't allow wakeup */ - acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - do_powersaver(cx->address, mults_index, dir); - } else { - do_powersaver(0, mults_index, dir); - } - break; - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Enable arbiters */ - outb(0, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Enable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); - } - outb(pic2_mask, 0xA1); /* restore mask */ - outb(pic1_mask, 0x21); - - local_irq_restore(flags); - preempt_enable(); - - freqs.new = calc_speed(longhaul_get_cpu_mult()); - /* Check if requested frequency is set. 
*/ - if (unlikely(freqs.new != speed)) { - printk(KERN_INFO PFX "Failed to set requested frequency!\n"); - /* Revision ID = 1 but processor is expecting revision key - * equal to 0. Jumpers at the bottom of processor will change - * multiplier and FSB, but will not change bits in Longhaul - * MSR nor enable voltage scaling. */ - if (!revid_errata) { - printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " - "option.\n"); - revid_errata = 1; - msleep(200); - goto retry_loop; - } - /* Why ACPI C3 sometimes doesn't work is a mystery for me. - * But it does happen. Processor is entering ACPI C3 state, - * but it doesn't change frequency. I tried poking various - * bits in northbridge registers, but without success. */ - if (longhaul_flags & USE_ACPI_C3) { - printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); - longhaul_flags &= ~USE_ACPI_C3; - if (revid_errata) { - printk(KERN_INFO PFX "Disabling \"Ignore " - "Revision ID\" option.\n"); - revid_errata = 0; - } - msleep(200); - goto retry_loop; - } - /* This shouldn't happen. Longhaul ver. 2 was reported not - * working on processors without voltage scaling, but with - * RevID = 1. RevID errata will make things right. Just - * to be 100% sure. */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); - longhaul_version = TYPE_LONGHAUL_V1; - msleep(200); - goto retry_loop; - } - } - /* Report true CPU frequency */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - if (!bm_timeout) - printk(KERN_INFO PFX "Warning: Timeout while waiting for " - "idle PCI bus.\n"); -} - -/* - * Centaur decided to make life a little more tricky. - * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. - * Samuel2 and above have to try and guess what the FSB is. - * We do this by assuming we booted at maximum multiplier, and interpolate - * between that value multiplied by possible FSBs and cpu_mhz which - * was calculated at boot time. Really ugly, but no other way to do this. - */ - -#define ROUNDING 0xf - -static int guess_fsb(int mult) -{ - int speed = cpu_khz / 1000; - int i; - int speeds[] = { 666, 1000, 1333, 2000 }; - int f_max, f_min; - - for (i = 0; i < 4; i++) { - f_max = ((speeds[i] * mult) + 50) / 100; - f_max += (ROUNDING / 2); - f_min = f_max - ROUNDING; - if ((speed <= f_max) && (speed >= f_min)) - return speeds[i] / 10; - } - return 0; -} - - -static int __cpuinit longhaul_get_ranges(void) -{ - unsigned int i, j, k = 0; - unsigned int ratio; - int mult; - - /* Get current frequency */ - mult = longhaul_get_cpu_mult(); - if (mult == -1) { - printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); - return -EINVAL; - } - fsb = guess_fsb(mult); - if (fsb == 0) { - printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - /* Get max multiplier - as we always did. - * Longhaul MSR is usefull only when voltage scaling is enabled. - * C3 is booting at max anyway. 
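
To make the interpolation in guess_fsb() above concrete, here is a self-contained sketch of its arithmetic with hypothetical numbers (a 1200 MHz part running at a 9.0x multiplier); the function name and the sample values are illustrative only, not part of the kernel source:

	#include <stdio.h>

	#define ROUNDING 0xf

	/* Hypothetical standalone version of the guess_fsb() window test. */
	static int guess_fsb_example(int speed_mhz, int mult)
	{
		/* FSB candidates, premultiplied by 10: 66.6, 100, 133.3, 200 MHz */
		int speeds[] = { 666, 1000, 1333, 2000 };
		int i;

		for (i = 0; i < 4; i++) {
			int f_max = ((speeds[i] * mult) + 50) / 100 + ROUNDING / 2;
			int f_min = f_max - ROUNDING;
			if (speed_mhz <= f_max && speed_mhz >= f_min)
				return speeds[i] / 10;
		}
		return 0;
	}

	int main(void)
	{
		/* (1333 * 90 + 50) / 100 = 1200, so 1200 MHz at 9.0x falls in the
		 * 133 MHz bus window [1192, 1207] MHz and 133 is returned. */
		printf("FSB: %d MHz\n", guess_fsb_example(1200, 90));
		return 0;
	}
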
*/ - maxmult = mult; - /* Get min multiplier */ - switch (cpu_model) { - case CPU_NEHEMIAH: - minmult = 50; - break; - case CPU_NEHEMIAH_C: - minmult = 40; - break; - default: - minmult = 30; - break; - } - - dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n", - minmult/10, minmult%10, maxmult/10, maxmult%10); - - highest_speed = calc_speed(maxmult); - lowest_speed = calc_speed(minmult); - dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, - print_speed(lowest_speed/1000), - print_speed(highest_speed/1000)); - - if (lowest_speed == highest_speed) { - printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); - return -EINVAL; - } - if (lowest_speed > highest_speed) { - printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n", - lowest_speed, highest_speed); - return -EINVAL; - } - - longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), - GFP_KERNEL); - if (!longhaul_table) - return -ENOMEM; - - for (j = 0; j < numscales; j++) { - ratio = mults[j]; - if (ratio == -1) - continue; - if (ratio > maxmult || ratio < minmult) - continue; - longhaul_table[k].frequency = calc_speed(ratio); - longhaul_table[k].index = j; - k++; - } - if (k <= 1) { - kfree(longhaul_table); - return -ENODEV; - } - /* Sort */ - for (j = 0; j < k - 1; j++) { - unsigned int min_f, min_i; - min_f = longhaul_table[j].frequency; - min_i = j; - for (i = j + 1; i < k; i++) { - if (longhaul_table[i].frequency < min_f) { - min_f = longhaul_table[i].frequency; - min_i = i; - } - } - if (min_i != j) { - swap(longhaul_table[j].frequency, - longhaul_table[min_i].frequency); - swap(longhaul_table[j].index, - longhaul_table[min_i].index); - } - } - - longhaul_table[k].frequency = CPUFREQ_TABLE_END; - - /* Find index we are running on */ - for (j = 0; j < k; j++) { - if (mults[longhaul_table[j].index & 0x1f] == mult) { - longhaul_index = j; - break; - } - } - return 0; -} - - -static void __cpuinit longhaul_setup_voltagescaling(void) -{ - union msr_longhaul longhaul; - struct mV_pos minvid, maxvid, vid; - unsigned int j, speed, pos, kHz_step, numvscales; - int min_vid_speed; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!(longhaul.bits.RevisionID & 1)) { - printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); - return; - } - - if (!longhaul.bits.VRMRev) { - printk(KERN_INFO PFX "VRM 8.5\n"); - vrm_mV_table = &vrm85_mV[0]; - mV_vrm_table = &mV_vrm85[0]; - } else { - printk(KERN_INFO PFX "Mobile VRM\n"); - if (cpu_model < CPU_NEHEMIAH) - return; - vrm_mV_table = &mobilevrm_mV[0]; - mV_vrm_table = &mV_mobilevrm[0]; - } - - minvid = vrm_mV_table[longhaul.bits.MinimumVID]; - maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - - if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { - printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " - "Voltage scaling disabled.\n", - minvid.mV/1000, minvid.mV%1000, - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - if (minvid.mV == maxvid.mV) { - printk(KERN_INFO PFX "Claims to support voltage scaling but " - "min & max are both %d.%03d. 
" - "Voltage scaling disabled\n", - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - /* How many voltage steps*/ - numvscales = maxvid.pos - minvid.pos + 1; - printk(KERN_INFO PFX - "Max VID=%d.%03d " - "Min VID=%d.%03d, " - "%d possible voltage scales\n", - maxvid.mV/1000, maxvid.mV%1000, - minvid.mV/1000, minvid.mV%1000, - numvscales); - - /* Calculate max frequency at min voltage */ - j = longhaul.bits.MinMHzBR; - if (longhaul.bits.MinMHzBR4) - j += 16; - min_vid_speed = eblcr[j]; - if (min_vid_speed == -1) - return; - switch (longhaul.bits.MinMHzFSB) { - case 0: - min_vid_speed *= 13333; - break; - case 1: - min_vid_speed *= 10000; - break; - case 3: - min_vid_speed *= 6666; - break; - default: - return; - break; - } - if (min_vid_speed >= highest_speed) - return; - /* Calculate kHz for one voltage step */ - kHz_step = (highest_speed - min_vid_speed) / numvscales; - - j = 0; - while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { - speed = longhaul_table[j].frequency; - if (speed > min_vid_speed) - pos = (speed - min_vid_speed) / kHz_step + minvid.pos; - else - pos = minvid.pos; - longhaul_table[j].index |= mV_vrm_table[pos] << 8; - vid = vrm_mV_table[mV_vrm_table[pos]]; - printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", - speed, j, vid.mV); - j++; - } - - can_scale_voltage = 1; - printk(KERN_INFO PFX "Voltage scaling enabled.\n"); -} - - -static int longhaul_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, longhaul_table); -} - - -static int longhaul_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int table_index = 0; - unsigned int i; - unsigned int dir = 0; - u8 vid, current_vid; - - if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, - relation, &table_index)) - return -EINVAL; - - /* Don't set same frequency again */ - if (longhaul_index == table_index) - return 0; - - if (!can_scale_voltage) - longhaul_setstate(table_index); - else { - /* On test system voltage transitions exceeding single - * step up or down were turning motherboard off. Both - * "ondemand" and "userspace" are unsafe. C7 is doing - * this in hardware, C3 is old and we need to do this - * in software. 
*/ - i = longhaul_index; - current_vid = (longhaul_table[longhaul_index].index >> 8); - current_vid &= 0x1f; - if (table_index > longhaul_index) - dir = 1; - while (i != table_index) { - vid = (longhaul_table[i].index >> 8) & 0x1f; - if (vid != current_vid) { - longhaul_setstate(i); - current_vid = vid; - msleep(200); - } - if (dir) - i++; - else - i--; - } - longhaul_setstate(table_index); - } - longhaul_index = table_index; - return 0; -} - - -static unsigned int longhaul_get(unsigned int cpu) -{ - if (cpu) - return 0; - return calc_speed(longhaul_get_cpu_mult()); -} - -static acpi_status longhaul_walk_callback(acpi_handle obj_handle, - u32 nesting_level, - void *context, void **return_value) -{ - struct acpi_device *d; - - if (acpi_bus_get_device(obj_handle, &d)) - return 0; - - *return_value = acpi_driver_data(d); - return 1; -} - -/* VIA don't support PM2 reg, but have something similar */ -static int enable_arbiter_disable(void) -{ - struct pci_dev *dev; - int status = 1; - int reg; - u8 pci_cmd; - - /* Find PLE133 host bridge */ - reg = 0x78; - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, - NULL); - /* Find PM133/VT8605 host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8605_0, NULL); - /* Find CLE266 host bridge */ - if (dev == NULL) { - reg = 0x76; - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_862X_0, NULL); - /* Find CN400 V-Link host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); - } - if (dev != NULL) { - /* Enable access to port 0x22 */ - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - pci_cmd |= 1<<7; - pci_write_config_byte(dev, reg, pci_cmd); - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - printk(KERN_ERR PFX - "Can't enable access to port 0x22.\n"); - status = 0; - } - } - pci_dev_put(dev); - return status; - } - return 0; -} - -static int longhaul_setup_southbridge(void) -{ - struct pci_dev *dev; - u8 pci_cmd; - - /* Find VT8235 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); - if (dev == NULL) - /* Find VT8237 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8237, NULL); - if (dev != NULL) { - /* Set transition time to max */ - pci_read_config_byte(dev, 0xec, &pci_cmd); - pci_cmd &= ~(1 << 2); - pci_write_config_byte(dev, 0xec, pci_cmd); - pci_read_config_byte(dev, 0xe4, &pci_cmd); - pci_cmd &= ~(1 << 7); - pci_write_config_byte(dev, 0xe4, pci_cmd); - pci_read_config_byte(dev, 0xe5, &pci_cmd); - pci_cmd |= 1 << 7; - pci_write_config_byte(dev, 0xe5, pci_cmd); - /* Get address of ACPI registers block*/ - pci_read_config_byte(dev, 0x81, &pci_cmd); - if (pci_cmd & 1 << 7) { - pci_read_config_dword(dev, 0x88, &acpi_regs_addr); - acpi_regs_addr &= 0xff00; - printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", - acpi_regs_addr); - } - - pci_dev_put(dev); - return 1; - } - return 0; -} - -static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - char *cpuname = NULL; - int ret; - u32 lo, hi; - - /* Check what we have on this motherboard */ - switch (c->x86_model) { - case 6: - cpu_model = CPU_SAMUEL; - cpuname = "C3 'Samuel' [C5A]"; - longhaul_version = TYPE_LONGHAUL_V1; - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); - break; - - case 7: - switch (c->x86_mask) { - case 0: - longhaul_version = TYPE_LONGHAUL_V1; - cpu_model = CPU_SAMUEL2; - 
cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had - * Samuel1 ratios. */ - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); - break; - case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V2; - if (c->x86_mask < 8) { - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - } else { - cpu_model = CPU_EZRA; - cpuname = "C3 'Ezra' [C5C]"; - } - memcpy(mults, ezra_mults, sizeof(ezra_mults)); - memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); - break; - } - break; - - case 8: - cpu_model = CPU_EZRA_T; - cpuname = "C3 'Ezra-T' [C5M]"; - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); - memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); - break; - - case 9: - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); - memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); - switch (c->x86_mask) { - case 0 ... 1: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah A' [C5XLOE]"; - break; - case 2 ... 4: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah B' [C5XLOH]"; - break; - case 5 ... 15: - cpu_model = CPU_NEHEMIAH_C; - cpuname = "C3 'Nehemiah C' [C5P]"; - break; - } - break; - - default: - cpuname = "Unknown"; - break; - } - /* Check Longhaul ver. 2 */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - rdmsr(MSR_VIA_LONGHAUL, lo, hi); - if (lo == 0 && hi == 0) - /* Looks like MSR isn't present */ - longhaul_version = TYPE_LONGHAUL_V1; - } - - printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); - break; - case TYPE_POWERSAVER: - printk(KERN_CONT "Powersaver supported.\n"); - break; - }; - - /* Doesn't hurt */ - longhaul_setup_southbridge(); - - /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, - NULL, (void *)&pr); - - /* Check ACPI support for C3 state */ - if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { - cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && cx->latency <= 1000) - longhaul_flags |= USE_ACPI_C3; - } - /* Disable if it isn't working */ - if (disable_acpi_c3) - longhaul_flags &= ~USE_ACPI_C3; - /* Check if northbridge is friendly */ - if (enable_arbiter_disable()) - longhaul_flags |= USE_NORTHBRIDGE; - - /* Check ACPI support for bus master arbiter disable */ - if (!(longhaul_flags & USE_ACPI_C3 - || longhaul_flags & USE_NORTHBRIDGE) - && ((pr == NULL) || !(pr->flags.bm_control))) { - printk(KERN_ERR PFX - "No ACPI support. 
Unsupported northbridge.\n"); - return -ENODEV; - } - - if (longhaul_flags & USE_NORTHBRIDGE) - printk(KERN_INFO PFX "Using northbridge support.\n"); - if (longhaul_flags & USE_ACPI_C3) - printk(KERN_INFO PFX "Using ACPI support.\n"); - - ret = longhaul_get_ranges(); - if (ret != 0) - return ret; - - if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) - longhaul_setup_voltagescaling(); - - policy->cpuinfo.transition_latency = 200000; /* nsec */ - policy->cur = calc_speed(longhaul_get_cpu_mult()); - - ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); - if (ret) - return ret; - - cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); - - return 0; -} - -static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *longhaul_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver longhaul_driver = { - .verify = longhaul_verify, - .target = longhaul_target, - .get = longhaul_get, - .init = longhaul_cpu_init, - .exit = __devexit_p(longhaul_cpu_exit), - .name = "longhaul", - .owner = THIS_MODULE, - .attr = longhaul_attr, -}; - - -static int __init longhaul_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) - return -ENODEV; - -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - printk(KERN_ERR PFX "More than 1 CPU detected, " - "longhaul disabled.\n"); - return -ENODEV; - } -#endif -#ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { - printk(KERN_ERR PFX "APIC detected. Longhaul is currently " - "broken in this configuration.\n"); - return -ENODEV; - } -#endif - switch (c->x86_model) { - case 6 ... 9: - return cpufreq_register_driver(&longhaul_driver); - case 10: - printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); - default: - ; - } - - return -ENODEV; -} - - -static void __exit longhaul_exit(void) -{ - int i; - - for (i = 0; i < numscales; i++) { - if (mults[i] == maxmult) { - longhaul_setstate(i); - break; - } - } - - cpufreq_unregister_driver(&longhaul_driver); - kfree(longhaul_table); -} - -/* Even if the BIOS exports an ACPI C3 state, and it is used - * successfully when the CPU is idle, this state doesn't - * trigger a frequency transition in some cases. */ -module_param(disable_acpi_c3, int, 0644); -MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); -/* Change CPU voltage with frequency. Very useful for saving - * power, but most VIA C3 processors don't support it. */ -module_param(scale_voltage, int, 0644); -MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -/* Force the revision key to 0 for processors which don't - * support voltage scaling, but present themselves as - * such. */ -module_param(revid_errata, int, 0644); -MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); -MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); -MODULE_LICENSE("GPL"); - -late_initcall(longhaul_init); -module_exit(longhaul_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h deleted file mode 100644 index cbf48fbca881..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * longhaul.h - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2.
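
Before the register layouts below, it may help to see the overlay idiom they rely on in isolation: a union lets the driver read an MSR once and then pick apart individual bit ranges by field name. A minimal, self-contained sketch (the values are hypothetical; only the field layout mirrors union msr_bcr2 below, and bitfield packing is strictly implementation-defined):

	#include <stdio.h>

	/* Same overlay pattern as union msr_bcr2: bits 26:23 hold the
	 * software clock multiplier index, bit 19 the enable flag. */
	union example_bcr2 {
		struct {
			unsigned reserved1:19;	/* 18:0  */
			unsigned ESOFTBF:1;	/* 19    */
			unsigned reserved2:3;	/* 22:20 */
			unsigned CLOCKMUL:4;	/* 26:23 */
			unsigned reserved3:5;	/* 31:27 */
		} bits;
		unsigned int val;
	};

	int main(void)
	{
		union example_bcr2 bcr2;

		bcr2.val = 0;
		bcr2.bits.ESOFTBF = 1;		/* enable software multiplier */
		bcr2.bits.CLOCKMUL = 0x9;	/* index 9 = 7.0x in samuel1_mults */
		printf("raw MSR image: 0x%08x\n", bcr2.val);
		return 0;
	}
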
- * - * VIA-specific information - */ - -union msr_bcr2 { - struct { - unsigned Reseved:19, // 18:0 - ESOFTBF:1, // 19 - Reserved2:3, // 22:20 - CLOCKMUL:4, // 26:23 - Reserved3:5; // 31:27 - } bits; - unsigned long val; -}; - -union msr_longhaul { - struct { - unsigned RevisionID:4, // 3:0 - RevisionKey:4, // 7:4 - EnableSoftBusRatio:1, // 8 - EnableSoftVID:1, // 9 - EnableSoftBSEL:1, // 10 - Reserved:3, // 11:13 - SoftBusRatio4:1, // 14 - VRMRev:1, // 15 - SoftBusRatio:4, // 19:16 - SoftVID:5, // 24:20 - Reserved2:3, // 27:25 - SoftBSEL:2, // 29:28 - Reserved3:2, // 31:30 - MaxMHzBR:4, // 35:32 - MaximumVID:5, // 40:36 - MaxMHzFSB:2, // 42:41 - MaxMHzBR4:1, // 43 - Reserved4:4, // 47:44 - MinMHzBR:4, // 51:48 - MinimumVID:5, // 56:52 - MinMHzFSB:2, // 58:57 - MinMHzBR4:1, // 59 - Reserved5:4; // 63:60 - } bits; - unsigned long long val; -}; - -/* - * Clock ratio tables. Div/Mod by 10 to get ratio. - * The eblcr values specify the ratio read from the CPU. - * The mults values specify what to write to the CPU. - */ - -/* - * VIA C3 Samuel 1 & Samuel 2 (stepping 0) - */ -static const int __cpuinitdata samuel1_mults[16] = { - -1, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - -1, /* 0100 -> RESERVED */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - -1, /* 1111 -> RESERVED */ -}; - -static const int __cpuinitdata samuel1_eblcr[16] = { - 50, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - -1, /* 0111 -> RESERVED */ - -1, /* 1000 -> RESERVED */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - -1, /* 1100 -> RESERVED */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Samuel2 Stepping 1->15 - */ -static const int __cpuinitdata samuel2_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 110, /* 0111 -> 11.0x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 130, /* 1110 -> 13.0x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Ezra - */ -static const int __cpuinitdata ezra_mults[16] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata ezra_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 (Ezra-T) [C5M]. 
- */ -static const int __cpuinitdata ezrat_mults[32] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - - -1, /* 0000 -> RESERVED (10.0x) */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (9.0x)*/ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> RESERVED (12.0x) */ -}; - -static const int __cpuinitdata ezrat_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - - -1, /* 0000 -> RESERVED (9.0x) */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (10.0x)*/ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - -1, /* 1100 -> RESERVED (12.0x) */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145, /* 1111 -> 14.5x */ -}; - -/* - * VIA C3 Nehemiah */ - -static const int __cpuinitdata nehemiah_mults[32] = { - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - -1, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata nehemiah_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 
1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ -}; - -/* - * Voltage scales. Div/Mod by 1000 to get actual voltage. - * Which scale to use depends on the VRM type in use. - */ - -struct mV_pos { - unsigned short mV; - unsigned short pos; -}; - -static const struct mV_pos __cpuinitdata vrm85_mV[32] = { - {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, - {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, - {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, - {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, - {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, - {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, - {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, - {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} -}; - -static const unsigned char __cpuinitdata mV_vrm85[32] = { - 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, - 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, - 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, - 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 -}; - -static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { - {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, - {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, - {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, - {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, - {975, 15}, {950, 14}, {925, 13}, {900, 12}, - {875, 11}, {850, 10}, {825, 9}, {800, 8}, - {775, 7}, {750, 6}, {725, 5}, {700, 4}, - {675, 3}, {650, 2}, {625, 1}, {600, 0} -}; - -static const unsigned char __cpuinitdata mV_mobilevrm[32] = { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, - 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 -}; - diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c deleted file mode 100644 index d9f51367666b..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/timex.h> - -#include <asm/msr.h> -#include <asm/processor.h> - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longrun", msg) - -static struct cpufreq_driver longrun_driver; - -/** - * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz - * values into per cent values. 
In TMTA microcode, the following is valid: - * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - */ -static unsigned int longrun_low_freq, longrun_high_freq; - - -/** - * longrun_get_policy - get the current LongRun policy - * @policy: struct cpufreq_policy where current policy is written into - * - * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS - * and MSR_TMTA_LONGRUN_CTRL - */ -static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi); - if (msr_lo & 0x01) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; - - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi); - msr_lo &= 0x0000007F; - msr_hi &= 0x0000007F; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - policy->min = policy->max = longrun_high_freq; - } else { - policy->min = longrun_low_freq + msr_lo * - ((longrun_high_freq - longrun_low_freq) / 100); - policy->max = longrun_low_freq + msr_hi * - ((longrun_high_freq - longrun_low_freq) / 100); - } - policy->cpu = 0; -} - - -/** - * longrun_set_policy - sets a new CPUFreq policy - * @policy: new policy - * - * Sets a new CPUFreq policy on LongRun-capable processors. This function - * has to be called with cpufreq_driver locked. - */ -static int longrun_set_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - u32 pctg_lo, pctg_hi; - - if (!policy) - return -EINVAL; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - pctg_lo = pctg_hi = 100; - } else { - pctg_lo = (policy->min - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - pctg_hi = (policy->max - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - } - - if (pctg_hi > 100) - pctg_hi = 100; - if (pctg_lo > pctg_hi) - pctg_lo = pctg_hi; - - /* performance or economy mode */ - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - msr_lo &= 0xFFFFFFFE; - switch (policy->policy) { - case CPUFREQ_POLICY_PERFORMANCE: - msr_lo |= 0x00000001; - break; - case CPUFREQ_POLICY_POWERSAVE: - break; - } - wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - - /* lower and upper boundary */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_lo |= pctg_lo; - msr_hi |= pctg_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - return 0; -} - - -/** - * longrun_verify_poliy - verifies a new CPUFreq policy - * @policy: the policy to verify - * - * Validates a new CPUFreq policy. This function has to be called with - * cpufreq_driver locked. 
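
The kHz-to-percentage conversion that longrun_set_policy() above performs is easier to follow pulled out into a standalone sketch; the frequencies here are hypothetical, and the real driver writes the resulting percentages into MSR_TMTA_LONGRUN_CTRL rather than returning them:

	#include <stdio.h>

	/* performance_pctg = (freq - low_freq) / (high_freq - low_freq),
	 * expressed in whole per cent as in longrun_set_policy(). */
	static unsigned int khz_to_pctg(unsigned int freq,
					unsigned int low, unsigned int high)
	{
		return (freq - low) / ((high - low) / 100);
	}

	int main(void)
	{
		unsigned int low = 300000, high = 933000;	/* kHz, hypothetical */

		/* (616500 - 300000) / 6330 = 50: the midpoint maps to 50% */
		printf("616500 kHz -> %u%%\n", khz_to_pctg(616500, low, high));
		return 0;
	}
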
- */ -static int longrun_verify_policy(struct cpufreq_policy *policy) -{ - if (!policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - - if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && - (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) - return -EINVAL; - - return 0; -} - -static unsigned int longrun_get(unsigned int cpu) -{ - u32 eax, ebx, ecx, edx; - - if (cpu) - return 0; - - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - dprintk("cpuid eax is %u\n", eax); - - return eax * 1000; -} - -/** - * longrun_determine_freqs - determines the lowest and highest possible core frequency - * @low_freq: an int to put the lowest frequency into - * @high_freq: an int to put the highest frequency into - * - * Determines the lowest and highest possible core frequencies on this CPU. - * This is necessary to calculate the performance percentage according to - * TMTA rules: - * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) - */ -static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) -{ - u32 msr_lo, msr_hi; - u32 save_lo, save_hi; - u32 eax, ebx, ecx, edx; - u32 try_hi; - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!low_freq || !high_freq) - return -EINVAL; - - if (cpu_has(c, X86_FEATURE_LRTI)) { - /* if the LongRun Table Interface is present, the - * detection is a bit easier: - * For minimum frequency, read out the maximum - * level (msr_hi), write that into "currently - * selected level", and read out the frequency. - * For maximum frequency, read out level zero. - */ - /* minimum */ - rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); - wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *low_freq = msr_lo * 1000; /* to kHz */ - - /* maximum */ - wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *high_freq = msr_lo * 1000; /* to kHz */ - - dprintk("longrun table interface told %u - %u kHz\n", - *low_freq, *high_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - return 0; - } - - /* set the upper border to the value determined during TSC init */ - *high_freq = (cpu_khz / 1000); - *high_freq = *high_freq * 1000; - dprintk("high frequency is %u kHz\n", *high_freq); - - /* get current borders */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - save_lo = msr_lo & 0x0000007F; - save_hi = msr_hi & 0x0000007F; - - /* if current perf_pctg is larger than 90%, we need to decrease the - * upper limit to make the calculation more accurate. - */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - /* try decreasing in 10% steps, some processors react only - * at certain boundary values */ - for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { - /* set window to 0..try_hi perf_pctg */ - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_hi |= try_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - /* read out current core MHz and current perf_pctg */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - - /* restore values */ - wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); - } - dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax); - - /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - * equals - * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) - * - * high_freq * perf_pctg is stored temporarily into "ebx".
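
Rearranged for low_freq, the identity in that comment gives low_freq = (cur_freq - high_freq * perf_pctg) / (1 - perf_pctg), which is what the integer arithmetic after the comment computes in MHz. A worked standalone example with hypothetical CPUID readings:

	#include <stdio.h>

	int main(void)
	{
		unsigned int high_mhz = 1000;	/* from cpu_khz / 1000 */
		unsigned int eax = 667;		/* current core MHz (CPUID 0x80860007) */
		unsigned int ecx = 50;		/* current perf_pctg, 0..100 */
		unsigned int ebx, low_mhz;

		ebx = (high_mhz * ecx) / 100;			/* high * pctg = 500 */
		low_mhz = ((eax - ebx) * 100) / (100 - ecx);	/* (667-500)*2 = 334 */

		/* Cross-check: (667 - 334) / (1000 - 334) = 333/666 = 50% */
		printf("low frequency: %u MHz\n", low_mhz);
		return 0;
	}
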
- */ - ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ - - if ((ecx > 95) || (ecx == 0) || (eax < ebx)) - return -EIO; - - edx = ((eax - ebx) * 100) / (100 - ecx); - *low_freq = edx * 1000; /* back to kHz */ - - dprintk("low frequency is %u kHz\n", *low_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - - return 0; -} - - -static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) -{ - int result = 0; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* detect low and high frequency */ - result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); - if (result) - return result; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = longrun_low_freq; - policy->cpuinfo.max_freq = longrun_high_freq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - longrun_get_policy(policy); - - return 0; -} - - -static struct cpufreq_driver longrun_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .verify = longrun_verify_policy, - .setpolicy = longrun_set_policy, - .get = longrun_get, - .init = longrun_cpu_init, - .name = "longrun", - .owner = THIS_MODULE, -}; - - -/** - * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver - * - * Initializes the LongRun support. - */ -static int __init longrun_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_TRANSMETA || - !cpu_has(c, X86_FEATURE_LONGRUN)) - return -ENODEV; - - return cpufreq_register_driver(&longrun_driver); -} - - -/** - * longrun_exit - unregisters LongRun support - */ -static void __exit longrun_exit(void) -{ - cpufreq_unregister_driver(&longrun_driver); -} - - -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " - "Efficeon processors."); -MODULE_LICENSE("GPL"); - -module_init(longrun_init); -module_exit(longrun_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c deleted file mode 100644 index 911e193018ae..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.c +++ /dev/null @@ -1,51 +0,0 @@ -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> - -#include "mperf.h" - -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. 
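
The APERF/MPERF ratio described above is easiest to see with concrete deltas. A standalone sketch with hypothetical counter snapshots (the driver itself uses get_aperfmperf() and a shift-based fixed-point ratio instead of plain division):

	#include <stdio.h>

	int main(void)
	{
		/* Counter deltas over the sampling window, hypothetical values */
		unsigned long long mperf_delta = 2000000;  /* ticks at max freq  */
		unsigned long long aperf_delta = 1500000;  /* ticks at real freq */
		unsigned int max_freq_khz = 2400000;

		/* average C0 frequency = max_freq * APERF/MPERF = 1800000 kHz */
		unsigned int avg = (unsigned int)(max_freq_khz * aperf_delta
						  / mperf_delta);
		printf("measured frequency: %u kHz\n", avg);
		return 0;
	}
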
- */ -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h deleted file mode 100644 index 5dbf2950dc22..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * (c) 2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c deleted file mode 100644 index 52c93648e492..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Pentium 4/Xeon CPU on demand clock modulation/speed scaling - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * (C) 2002 Arjan van de Ven <arjanv@redhat.com> - * (C) 2002 Tora T. Engstad - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Date Errata Description - * 20020525 N44, O17 12.5% or 25% DC causes lockup - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpufreq.h> -#include <linux/cpumask.h> -#include <linux/timex.h> - -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/timer.h> - -#include "speedstep-lib.h" - -#define PFX "p4-clockmod: " -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "p4-clockmod", msg) - -/* - * Duty Cycle (3bits), note DC_DISABLE is not specified in - * intel docs i just use it to mean disable - */ -enum { - DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, - DC_64PT, DC_75PT, DC_88PT, DC_DISABLE -}; - -#define DC_ENTRIES 8 - - -static int has_N44_O17_errata[NR_CPUS]; -static unsigned int stock_freq; -static struct cpufreq_driver p4clockmod_driver; -static unsigned int cpufreq_p4_get(unsigned int cpu); - -static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) -{ - u32 l, h; - - if (!cpu_online(cpu) || - (newstate > DC_DISABLE) || (newstate == DC_RESV)) - return -EINVAL; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); - - if (l & 0x01) - dprintk("CPU#%d currently thermal throttled\n", cpu); - - if (has_N44_O17_errata[cpu] && - (newstate == DC_25PT || newstate == DC_DFLT)) - newstate = DC_38PT; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - if (newstate == DC_DISABLE) { - dprintk("CPU#%d disabling modulation\n", cpu); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); - } else { - dprintk("CPU#%d setting duty cycle to %d%%\n", - cpu, ((125 * newstate) / 10)); - /* bits 63 - 5 : reserved - * bit 4 : enable/disable - * bits 3-1 : duty cycle - * bit 0 : reserved - */ - l = (l & ~14); - l = l | (1<<4) | ((newstate & 0x7)<<1); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); - } - - return 0; -} - - -static struct cpufreq_frequency_table p4clockmod_table[] = { - {DC_RESV, CPUFREQ_ENTRY_INVALID}, - {DC_DFLT, 0}, - {DC_25PT, 0}, - {DC_38PT, 0}, - {DC_50PT, 0}, - {DC_64PT, 0}, - {DC_75PT, 0}, - {DC_88PT, 0}, - {DC_DISABLE, 0}, - {DC_RESV, CPUFREQ_TABLE_END}, -}; - - -static int cpufreq_p4_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = DC_RESV; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = cpufreq_p4_get(policy->cpu); - freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; - - if (freqs.new == freqs.old) - return 0; - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - /* run on each logical CPU, - * see section 13.15.3 of IA32 Intel Architecture Software - * Developer's Manual, Volume 3 - */ - for_each_cpu(i, policy->cpus) - cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -static int cpufreq_p4_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); -} - - -static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x06) { - if (cpu_has(c, X86_FEATURE_EST)) - printk_once(KERN_WARNING PFX "Warning: EST-capable " - "CPU detected. The acpi-cpufreq module offers " - "voltage scaling in addition to frequency " - "scaling. 
You should use that instead of " - "p4-clockmod, if possible.\n"); - switch (c->x86_model) { - case 0x0E: /* Core */ - case 0x0F: /* Core Duo */ - case 0x16: /* Celeron Core */ - case 0x1C: /* Atom */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); - case 0x0D: /* Pentium M (Dothan) */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - /* fall through */ - case 0x09: /* Pentium M (Banias) */ - return speedstep_get_frequency(SPEEDSTEP_CPU_PM); - } - } - - if (c->x86 != 0xF) - return 0; - - /* on P-4s, the TSC runs with constant frequency independent of whether - * throttling is active or not. */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { - printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " - "The speedstep-ich or acpi cpufreq modules offer " - "voltage scaling in addition to frequency scaling. " - "You should use either one instead of p4-clockmod, " - "if possible.\n"); - return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); - } - - return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); -} - - - -static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - int cpuid = 0; - unsigned int i; - -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - - /* Errata workaround */ - cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; - switch (cpuid) { - case 0x0f07: - case 0x0f0a: - case 0x0f11: - case 0x0f12: - has_N44_O17_errata[policy->cpu] = 1; - dprintk("has errata -- disabling low frequencies\n"); - } - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && - c->x86_model < 2) { - /* switch to maximum frequency and measure result */ - cpufreq_p4_setdc(policy->cpu, DC_DISABLE); - recalibrate_cpu_khz(); - } - /* get max frequency */ - stock_freq = cpufreq_p4_get_frequency(c); - if (!stock_freq) - return -EINVAL; - - /* table init */ - for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if ((i < 2) && (has_N44_O17_errata[policy->cpu])) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; - else - p4clockmod_table[i].frequency = (stock_freq * i)/8; - } - cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); - - /* cpuinfo and default policy values */ - - /* the transition latency is set to be 1 higher than the maximum - * transition latency of the ondemand governor */ - policy->cpuinfo.transition_latency = 10000001; - policy->cur = stock_freq; - - return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); -} - - -static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int cpufreq_p4_get(unsigned int cpu) -{ - u32 l, h; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - - if (l & 0x10) { - l = l >> 1; - l &= 0x7; - } else - l = DC_DISABLE; - - if (l != DC_DISABLE) - return stock_freq * l / 8; - - return stock_freq; -} - -static struct freq_attr *p4clockmod_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver p4clockmod_driver = { - .verify = cpufreq_p4_verify, - .target = cpufreq_p4_target, - .init = cpufreq_p4_cpu_init, - .exit = cpufreq_p4_cpu_exit, - .get = cpufreq_p4_get, - .name = "p4-clockmod", - .owner = THIS_MODULE, - .attr = p4clockmod_attr, -}; - - -static int __init cpufreq_p4_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int ret; - - /* - * THERM_CONTROL is
architectural for IA32 now, so - * we can rely on the capability checks - */ - if (c->x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (!test_cpu_cap(c, X86_FEATURE_ACPI) || - !test_cpu_cap(c, X86_FEATURE_ACC)) - return -ENODEV; - - ret = cpufreq_register_driver(&p4clockmod_driver); - if (!ret) - printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " - "Modulation available\n"); - - return ret; -} - - -static void __exit cpufreq_p4_exit(void) -{ - cpufreq_unregister_driver(&p4clockmod_driver); -} - - -MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>"); -MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); -MODULE_LICENSE("GPL"); - -late_initcall(cpufreq_p4_init); -module_exit(cpufreq_p4_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c deleted file mode 100644 index 4a5a42b842ad..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ /dev/null @@ -1,626 +0,0 @@ -/* - * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface - * - * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> - * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. - * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON - * INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/cpufreq.h> -#include <linux/compiler.h> -#include <linux/slab.h> - -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/spinlock.h> -#include <linux/uaccess.h> - -#include <acpi/processor.h> - -#define PCC_VERSION "1.00.00" -#define POLL_LOOPS 300 - -#define CMD_COMPLETE 0x1 -#define CMD_GET_FREQ 0x0 -#define CMD_SET_FREQ 0x1 - -#define BUF_SZ 4 - -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "pcc-cpufreq", msg) - -struct pcc_register_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -} __attribute__ ((packed)); - -struct pcc_memory_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 resource_usage; - u8 type_specific; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; -} __attribute__ ((packed)); - -static struct cpufreq_driver pcc_cpufreq_driver; - -struct pcc_header { - u32 signature; - u16 length; - u8 major; - u8 minor; - u32 features; - u16 command; - u16 status; - u32 latency; - u32 minimum_time; - u32 maximum_time; - u32 nominal; - u32 throttled_frequency; - u32 minimum_frequency; -}; - -static void __iomem *pcch_virt_addr; -static struct pcc_header __iomem *pcch_hdr; - -static DEFINE_SPINLOCK(pcc_lock); - -static struct acpi_generic_address doorbell; - -static u64 doorbell_preserve; -static u64 doorbell_write; - -static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, - 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; - -struct pcc_cpu { - u32 input_offset; - u32 output_offset; -}; - -static struct pcc_cpu __percpu *pcc_cpu_info; - -static int pcc_cpufreq_verify(struct cpufreq_policy *policy) -{ - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static inline void pcc_cmd(void) -{ - u64 doorbell_value; - int i; - - acpi_read(&doorbell_value, &doorbell); - acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, - &doorbell); - - for (i = 0; i < POLL_LOOPS; i++) { - if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) - break; - } -} - -static inline void pcc_clear_mapping(void) -{ - if (pcch_virt_addr) - iounmap(pcch_virt_addr); - pcch_virt_addr = NULL; -} - -static unsigned int pcc_get_freq(unsigned int cpu) -{ - struct pcc_cpu *pcc_cpu_data; - unsigned int curr_freq; - unsigned int freq_limit; - u16 status; - u32 input_buffer; - u32 output_buffer; - - spin_lock(&pcc_lock); - - dprintk("get: get_freq for CPU %d\n", cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - input_buffer = 0x1; - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_GET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - output_buffer = - ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - dprintk("get: FAILED: for CPU %d, status is %d\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) - / 100) * 1000); - - dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%x, contains a value of: 0x%x. 
Speed is: %d MHz\n", - cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), - output_buffer, curr_freq); - - freq_limit = (output_buffer >> 8) & 0xff; - if (freq_limit != 0xff) { - dprintk("get: frequency for cpu %d is being temporarily" - " capped at %d\n", cpu, curr_freq); - } - - spin_unlock(&pcc_lock); - return curr_freq; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return 0; -} - -static int pcc_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct pcc_cpu *pcc_cpu_data; - struct cpufreq_freqs freqs; - u16 status; - u32 input_buffer; - int cpu; - - spin_lock(&pcc_lock); - cpu = policy->cpu; - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - dprintk("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%x\n", - cpu, target_freq, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - - freqs.new = target_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - input_buffer = 0x1 | (((target_freq * 100) - / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_SET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - dprintk("target: FAILED for cpu %d, with status: 0x%x\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); - spin_unlock(&pcc_lock); - - return 0; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_get_offset(int cpu) -{ - acpi_status status; - struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *pccp, *offset; - struct pcc_cpu *pcc_cpu_data; - struct acpi_processor *pr; - int ret = 0; - - pr = per_cpu(processors, cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - pccp = buffer.pointer; - if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - }; - - offset = &(pccp->package.elements[0]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->input_offset = offset->integer.value; - - offset = &(pccp->package.elements[1]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->output_offset = offset->integer.value; - - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - - dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " - "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", - cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); -out_free: - kfree(buffer.pointer); - return ret; -} - -static int __init pcc_cpufreq_do_osc(acpi_handle *handle) -{ - acpi_status status; - struct acpi_object_list input; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object in_params[4]; - union acpi_object *out_obj; - u32 capabilities[2]; - u32 errors; - u32 supported; - int ret = 0; - - input.count = 4; - input.pointer = in_params; - input.count = 
4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 2; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 8; - in_params[3].buffer.pointer = (u8 *)&capabilities; - - capabilities[0] = OSC_QUERY_ENABLE; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - - kfree(output.pointer); - capabilities[0] = 0x0; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - -out_free: - kfree(output.pointer); - return ret; -} - -static int __init pcc_cpufreq_probe(void) -{ - acpi_status status; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - struct pcc_memory_resource *mem_resource; - struct pcc_register_resource *reg_resource; - union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle, pcch_handle; - int ret = 0; - - status = acpi_get_handle(NULL, "\\_SB", &handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "PCCH", &pcch_handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "_OSC", &osc_handle); - if (ACPI_SUCCESS(status)) { - ret = pcc_cpufreq_do_osc(&osc_handle); - if (ret) - dprintk("probe: _OSC evaluation did not succeed\n"); - /* Firmware's use of _OSC is optional */ - ret = 0; - } - - status = acpi_evaluate_object(handle, "PCCH", NULL, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - } - - member = &out_obj->package.elements[0]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - - dprintk("probe: mem_resource descriptor: 0x%x," - " length: %d, space_id: %d, resource_usage: %d," - " type_specific: %d, granularity: 0x%llx," - " minimum: 0x%llx, maximum: 0x%llx," - " translation_offset: 0x%llx, address_length: 0x%llx\n", - mem_resource->descriptor, mem_resource->length, - mem_resource->space_id, mem_resource->resource_usage, - mem_resource->type_specific, mem_resource->granularity, - mem_resource->minimum, mem_resource->maximum, - mem_resource->translation_offset, - mem_resource->address_length); - - if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { - ret = -ENODEV; - goto out_free; - } - - pcch_virt_addr = ioremap_nocache(mem_resource->minimum, - 
mem_resource->address_length); - if (pcch_virt_addr == NULL) { - dprintk("probe: could not map shared mem region\n"); - goto out_free; - } - pcch_hdr = pcch_virt_addr; - - dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - dprintk("probe: PCCH header is at physical address: 0x%llx," - " signature: 0x%x, length: %d bytes, major: %d, minor: %d," - " supported features: 0x%x, command field: 0x%x," - " status field: 0x%x, nominal latency: %d us\n", - mem_resource->minimum, ioread32(&pcch_hdr->signature), - ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), - ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), - ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), - ioread32(&pcch_hdr->latency)); - - dprintk("probe: min time between commands: %d us," - " max time between commands: %d us," - " nominal CPU frequency: %d MHz," - " minimum CPU frequency: %d MHz," - " minimum CPU frequency without throttling: %d MHz\n", - ioread32(&pcch_hdr->minimum_time), - ioread32(&pcch_hdr->maximum_time), - ioread32(&pcch_hdr->nominal), - ioread32(&pcch_hdr->throttled_frequency), - ioread32(&pcch_hdr->minimum_frequency)); - - member = &out_obj->package.elements[1]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto pcch_free; - } - - reg_resource = (struct pcc_register_resource *)member->buffer.pointer; - - doorbell.space_id = reg_resource->space_id; - doorbell.bit_width = reg_resource->bit_width; - doorbell.bit_offset = reg_resource->bit_offset; - doorbell.access_width = 64; - doorbell.address = reg_resource->address; - - dprintk("probe: doorbell: space_id is %d, bit_width is %d, " - "bit_offset is %d, access_width is %d, address is 0x%llx\n", - doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, - doorbell.access_width, reg_resource->address); - - member = &out_obj->package.elements[2]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_preserve = member->integer.value; - - member = &out_obj->package.elements[3]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_write = member->integer.value; - - dprintk("probe: doorbell_preserve: 0x%llx," - " doorbell_write: 0x%llx\n", - doorbell_preserve, doorbell_write); - - pcc_cpu_info = alloc_percpu(struct pcc_cpu); - if (!pcc_cpu_info) { - ret = -ENOMEM; - goto pcch_free; - } - - printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" - " limits: %d MHz, %d MHz\n", PCC_VERSION, - ioread32(&pcch_hdr->minimum_frequency), - ioread32(&pcch_hdr->nominal)); - kfree(output.pointer); - return ret; -pcch_free: - pcc_clear_mapping(); -out_free: - kfree(output.pointer); - return ret; -} - -static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - unsigned int result = 0; - - if (!pcch_virt_addr) { - result = -1; - goto out; - } - - result = pcc_get_offset(cpu); - if (result) { - dprintk("init: PCCP evaluation failed\n"); - goto out; - } - - policy->max = policy->cpuinfo.max_freq = - ioread32(&pcch_hdr->nominal) * 1000; - policy->min = policy->cpuinfo.min_freq = - ioread32(&pcch_hdr->minimum_frequency) * 1000; - policy->cur = pcc_get_freq(cpu); - - if (!policy->cur) { - dprintk("init: Unable to get current CPU frequency\n"); - result = -EINVAL; - goto out; - } - - dprintk("init: policy->max is %d, policy->min is %d\n", - policy->max, policy->min); -out: - return result; -} - -static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct 
cpufreq_driver pcc_cpufreq_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .get = pcc_get_freq, - .verify = pcc_cpufreq_verify, - .target = pcc_cpufreq_target, - .init = pcc_cpufreq_cpu_init, - .exit = pcc_cpufreq_cpu_exit, - .name = "pcc-cpufreq", - .owner = THIS_MODULE, -}; - -static int __init pcc_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - ret = pcc_cpufreq_probe(); - if (ret) { - dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); - return ret; - } - - ret = cpufreq_register_driver(&pcc_cpufreq_driver); - - return ret; -} - -static void __exit pcc_cpufreq_exit(void) -{ - cpufreq_unregister_driver(&pcc_cpufreq_driver); - - pcc_clear_mapping(); - - free_percpu(pcc_cpu_info); -} - -MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); -MODULE_VERSION(PCC_VERSION); -MODULE_DESCRIPTION("Processor Clocking Control interface driver"); -MODULE_LICENSE("GPL"); - -late_initcall(pcc_cpufreq_init); -module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c deleted file mode 100644 index b3379d6a5c57..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) - * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, - * Dominik Brodowski. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/ioport.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/msr.h> - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ - -#define PFX "powernow-k6: " -static unsigned int busfreq; /* FSB, in 10 kHz */ -static unsigned int max_multiplier; - - -/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ -static struct cpufreq_frequency_table clock_ratio[] = { - {45, /* 000 -> 4.5x */ 0}, - {50, /* 001 -> 5.0x */ 0}, - {40, /* 010 -> 4.0x */ 0}, - {55, /* 011 -> 5.5x */ 0}, - {20, /* 100 -> 2.0x */ 0}, - {30, /* 101 -> 3.0x */ 0}, - {60, /* 110 -> 6.0x */ 0}, - {35, /* 111 -> 3.5x */ 0}, - {0, CPUFREQ_TABLE_END} -}; - - -/** - * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier - * - * Returns the current setting of the frequency multiplier. Core clock - * speed is frequency of the Front-Side Bus multiplied with this value. - */ -static int powernow_k6_get_cpu_multiplier(void) -{ - u64 invalue = 0; - u32 msrval; - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - return clock_ratio[(invalue >> 5)&7].index; -} - - -/** - * powernow_k6_set_state - set the PowerNow! multiplier - * @best_i: clock_ratio[best_i] is the target multiplier - * - * Tries to change the PowerNow! 
multiplier - */ -static void powernow_k6_set_state(unsigned int best_i) -{ - unsigned long outvalue = 0, invalue = 0; - unsigned long msrval; - struct cpufreq_freqs freqs; - - if (clock_ratio[best_i].index > max_multiplier) { - printk(KERN_ERR PFX "invalid target frequency\n"); - return; - } - - freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); - freqs.new = busfreq * clock_ratio[best_i].index; - freqs.cpu = 0; /* powernow-k6.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* we now need to transform best_i to the BVC format, see AMD#23446 */ - - outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - invalue = invalue & 0xf; - outvalue = outvalue | invalue; - outl(outvalue , (POWERNOW_IOPORT + 0x8)); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return; -} - - -/** - * powernow_k6_verify - verifies a new CPUfreq policy - * @policy: new policy - * - * Policy must be within lowest and highest possible CPU Frequency, - * and at least one possible state must be within min and max. - */ -static int powernow_k6_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); -} - - -/** - * powernow_k6_setpolicy - sets a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * sets a new CPUFreq policy - */ -static int powernow_k6_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &clock_ratio[0], - target_freq, relation, &newstate)) - return -EINVAL; - - powernow_k6_set_state(newstate); - - return 0; -} - - -static int powernow_k6_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i, f; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - /* get frequencies */ - max_multiplier = powernow_k6_get_cpu_multiplier(); - busfreq = cpu_khz / max_multiplier; - - /* table init */ - for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { - f = clock_ratio[i].index; - if (f > max_multiplier) - clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; - else - clock_ratio[i].frequency = busfreq * f; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 200000; - policy->cur = busfreq * max_multiplier; - - result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); - if (result) - return result; - - cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); - - return 0; -} - - -static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int i; - for (i = 0; i < 8; i++) { - if (i == max_multiplier) - powernow_k6_set_state(i); - } - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int powernow_k6_get(unsigned int cpu) -{ - unsigned int ret; - ret = (busfreq * powernow_k6_get_cpu_multiplier()); - return ret; -} - -static struct freq_attr *powernow_k6_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_k6_driver = { - .verify = powernow_k6_verify, - .target = powernow_k6_target, - .init = powernow_k6_cpu_init, - .exit = powernow_k6_cpu_exit, - .get = powernow_k6_get, 
- .name = "powernow-k6", - .owner = THIS_MODULE, - .attr = powernow_k6_attr, -}; - - -/** - * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver - * - * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported - * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero - * on success. - */ -static int __init powernow_k6_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || - ((c->x86_model != 12) && (c->x86_model != 13))) - return -ENODEV; - - if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { - printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); - return -EIO; - } - - if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region(POWERNOW_IOPORT, 16); - return -EINVAL; - } - - return 0; -} - - -/** - * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support - * - * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. - */ -static void __exit powernow_k6_exit(void) -{ - cpufreq_unregister_driver(&powernow_k6_driver); - release_region(POWERNOW_IOPORT, 16); -} - - -MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, " - "Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE("GPL"); - -module_init(powernow_k6_init); -module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c deleted file mode 100644 index 4a45fd6e41ba..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. - * (C) 2003-2004 Dave Jones <davej@redhat.com> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Errata 5: - * CPU may fail to execute a FID/VID change in presence of interrupt. - * - We cli/sti on stepping A0 CPUs around the FID/VID transition. - * Errata 15: - * CPU with half frequency multipliers may hang upon wakeup from disconnect. - * - We disable half multipliers if ACPI is used on A0 stepping CPUs. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/dmi.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */ -#include <asm/msr.h> -#include <asm/system.h> - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -#include <linux/acpi.h> -#include <acpi/processor.h> -#endif - -#include "powernow-k7.h" - -#define PFX "powernow: " - - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags; - u16 settlingtime; - u8 reserved1; - u8 numpst; -}; - -struct pst_s { - u32 cpuid; - u8 fsbspeed; - u8 maxfid; - u8 startvid; - u8 numpstates; -}; - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -union powernow_acpi_control_t { - struct { - unsigned long fid:5, - vid:5, - sgtc:20, - res1:2; - } bits; - unsigned long val; -}; -#endif - -#ifdef CONFIG_CPU_FREQ_DEBUG -/* divide by 1000 to get VCore voltage in V. */ -static const int mobile_vid_table[32] = { - 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, - 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, - 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, - 1075, 1050, 1025, 1000, 975, 950, 925, 0, -}; -#endif - -/* divide by 10 to get FID. 
*/ -static const int fid_codes[32] = { - 110, 115, 120, 125, 50, 55, 60, 65, - 70, 75, 80, 85, 90, 95, 100, 105, - 30, 190, 40, 200, 130, 135, 140, 210, - 150, 225, 160, 165, 170, 180, -1, -1, -}; - -/* This parameter is used in order to force ACPI instead of legacy method for - * configuration purpose. - */ - -static int acpi_force; - -static struct cpufreq_frequency_table *powernow_table; - -static unsigned int can_scale_bus; -static unsigned int can_scale_vid; -static unsigned int minimum_speed = -1; -static unsigned int maximum_speed; -static unsigned int number_scales; -static unsigned int fsb; -static unsigned int latency; -static char have_a0; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "powernow-k7", msg) - -static int check_fsb(unsigned int fsbspeed) -{ - int delta; - unsigned int f = fsb / 1000; - - delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; - return delta < 5; -} - -static int check_powernow(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int maxei, eax, ebx, ecx, edx; - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { -#ifdef MODULE - printk(KERN_INFO PFX "This module only works with " - "AMD K7 CPUs\n"); -#endif - return 0; - } - - /* Get maximum capabilities */ - maxei = cpuid_eax(0x80000000); - if (maxei < 0x80000007) { /* Any powernow info ? */ -#ifdef MODULE - printk(KERN_INFO PFX "No powernow capabilities detected\n"); -#endif - return 0; - } - - if ((c->x86_model == 6) && (c->x86_mask == 0)) { - printk(KERN_INFO PFX "K7 660[A0] core detected, " - "enabling errata workarounds\n"); - have_a0 = 1; - } - - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - /* Check we can actually do something before we say anything.*/ - if (!(edx & (1 << 1 | 1 << 2))) - return 0; - - printk(KERN_INFO PFX "PowerNOW! Technology present. 
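
check_powernow() above gates the whole driver on CPUID: extended leaf 0x80000007 must exist, and EDX bit 1 (frequency control) and/or bit 2 (voltage control) must be set. The same probe compiles in userspace with GCC's __get_cpuid(), a rough equivalent of the kernel cpuid() helpers used here:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Fails when leaf 0x80000007 is not implemented -- the
         * maxei < 0x80000007 check in check_powernow() above. */
        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
            puts("no power-management capability leaf");
            return 1;
        }
        if (edx & (1 << 1))
            puts("can scale frequency (FID)");
        if (edx & (1 << 2))
            puts("can scale voltage (VID)");
        return 0;
    }
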
Can scale: "); - - if (edx & 1 << 1) { - printk("frequency"); - can_scale_bus = 1; - } - - if ((edx & (1 << 1 | 1 << 2)) == 0x6) - printk(" and "); - - if (edx & 1 << 2) { - printk("voltage"); - can_scale_vid = 1; - } - - printk(".\n"); - return 1; -} - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -static void invalidate_entry(unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} -#endif - -static int get_ranges(unsigned char *pst) -{ - unsigned int j; - unsigned int speed; - u8 fid, vid; - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) - return -ENOMEM; - - for (j = 0 ; j < number_scales; j++) { - fid = *pst++; - - powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; - powernow_table[j].index = fid; /* lower 8 bits */ - - speed = powernow_table[j].frequency; - - if ((fid_codes[fid] % 10) == 5) { -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (have_a0 == 1) - invalidate_entry(j); -#endif - } - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - - vid = *pst++; - powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed/1000, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - } - powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; - powernow_table[number_scales].index = 0; - - return 0; -} - - -static void change_FID(int fid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.FID != fid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.FID = fid; - fidvidctl.bits.VIDC = 0; - fidvidctl.bits.FIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_VID(int vid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.VID != vid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.VID = vid; - fidvidctl.bits.FIDC = 0; - fidvidctl.bits.VIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_speed(unsigned int index) -{ - u8 fid, vid; - struct cpufreq_freqs freqs; - union msr_fidvidstatus fidvidstatus; - int cfid; - - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in powernow_decode_bios, - * vid are the upper 8 bits. - */ - - fid = powernow_table[index].index & 0xFF; - vid = (powernow_table[index].index & 0xFF00) >> 8; - - freqs.cpu = 0; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - freqs.old = fsb * fid_codes[cfid] / 10; - - freqs.new = powernow_table[index].frequency; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Now do the magic poking into the MSRs. 
*/ - - if (have_a0 == 1) /* A0 errata 5 */ - local_irq_disable(); - - if (freqs.old > freqs.new) { - /* Going down, so change FID first */ - change_FID(fid); - change_VID(vid); - } else { - /* Going up, so change VID first */ - change_VID(vid); - change_FID(fid); - } - - - if (have_a0 == 1) - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -} - - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - -static struct acpi_processor_performance *acpi_processor_perf; - -static int powernow_acpi_init(void) -{ - int i; - int retval = 0; - union powernow_acpi_control_t pc; - - if (acpi_processor_perf != NULL && powernow_table != NULL) { - retval = -EINVAL; - goto err0; - } - - acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), - GFP_KERNEL); - if (!acpi_processor_perf) { - retval = -ENOMEM; - goto err0; - } - - if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, - GFP_KERNEL)) { - retval = -ENOMEM; - goto err05; - } - - if (acpi_processor_register_performance(acpi_processor_perf, 0)) { - retval = -EIO; - goto err1; - } - - if (acpi_processor_perf->control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - if (acpi_processor_perf->status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - number_scales = acpi_processor_perf->state_count; - - if (number_scales < 2) { - retval = -ENODEV; - goto err2; - } - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) { - retval = -ENOMEM; - goto err2; - } - - pc.val = (unsigned long) acpi_processor_perf->states[0].control; - for (i = 0; i < number_scales; i++) { - u8 fid, vid; - struct acpi_processor_px *state = - &acpi_processor_perf->states[i]; - unsigned int speed, speed_mhz; - - pc.val = (unsigned long) state->control; - dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", - i, - (u32) state->core_frequency, - (u32) state->power, - (u32) state->transition_latency, - (u32) state->control, - pc.bits.sgtc); - - vid = pc.bits.vid; - fid = pc.bits.fid; - - powernow_table[i].frequency = fsb * fid_codes[fid] / 10; - powernow_table[i].index = fid; /* lower 8 bits */ - powernow_table[i].index |= (vid << 8); /* upper 8 bits */ - - speed = powernow_table[i].frequency; - speed_mhz = speed / 1000; - - /* processor_perflib will multiply the MHz value by 1000 to - * get a KHz value (e.g. 1266000). However, powernow-k7 works - * with true KHz values (e.g. 1266768). To ensure that all - * powernow frequencies are available, we must ensure that - * ACPI doesn't restrict them, so we round up the MHz value - * to ensure that perflib's computed KHz value is greater than - * or equal to powernow's KHz value. 
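
The rounding rationale in the comment above is easiest to check with concrete numbers, say a hypothetical 1266768 kHz P-state:

    #include <assert.h>

    int main(void)
    {
        /* 1266768 kHz truncates to 1266 MHz; perflib would then cap
         * the policy at 1266 * 1000 = 1266000 kHz and reject the true
         * 1266768 kHz table entry.  Rounding up keeps it reachable. */
        unsigned int speed = 1266768;           /* kHz, hypothetical */
        unsigned int speed_mhz = speed / 1000;  /* 1266 */

        if (speed % 1000 > 0)
            speed_mhz++;                        /* 1267 */

        assert(speed_mhz * 1000 >= speed);
        return 0;
    }
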
- */ - if (speed % 1000 > 0) - speed_mhz++; - - if ((fid_codes[fid] % 10) == 5) { - if (have_a0 == 1) - invalidate_entry(i); - } - - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed_mhz, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - - if (state->core_frequency != speed_mhz) { - state->core_frequency = speed_mhz; - dprintk(" Corrected ACPI frequency to %d\n", - speed_mhz); - } - - if (latency < pc.bits.sgtc) - latency = pc.bits.sgtc; - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - } - - powernow_table[i].frequency = CPUFREQ_TABLE_END; - powernow_table[i].index = 0; - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - return 0; - -err2: - acpi_processor_unregister_performance(acpi_processor_perf, 0); -err1: - free_cpumask_var(acpi_processor_perf->shared_cpu_map); -err05: - kfree(acpi_processor_perf); -err0: - printk(KERN_WARNING PFX "ACPI perflib can not be used on " - "this platform\n"); - acpi_processor_perf = NULL; - return retval; -} -#else -static int powernow_acpi_init(void) -{ - printk(KERN_INFO PFX "no support for ACPI processor found." - " Please recompile your kernel with ACPI processor\n"); - return -EINVAL; -} -#endif - -static void print_pst_entry(struct pst_s *pst, unsigned int j) -{ - dprintk("PST:%d (@%p)\n", j, pst); - dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", - pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); -} - -static int powernow_decode_bios(int maxfid, int startvid) -{ - struct psb_s *psb; - struct pst_s *pst; - unsigned int i, j; - unsigned char *p; - unsigned int etuple; - unsigned int ret; - - etuple = cpuid_eax(0x80000001); - - for (i = 0xC0000; i < 0xffff0 ; i += 16) { - - p = phys_to_virt(i); - - if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - dprintk("Found PSB header at %p\n", p); - psb = (struct psb_s *) p; - dprintk("Table version: 0x%x\n", psb->tableversion); - if (psb->tableversion != 0x12) { - printk(KERN_INFO PFX "Sorry, only v1.2 tables" - " supported right now\n"); - return -ENODEV; - } - - dprintk("Flags: 0x%x\n", psb->flags); - if ((psb->flags & 1) == 0) - dprintk("Mobile voltage regulator\n"); - else - dprintk("Desktop voltage regulator\n"); - - latency = psb->settlingtime; - if (latency < 100) { - printk(KERN_INFO PFX "BIOS set settling time " - "to %d microseconds. " - "Should be at least 100. " - "Correcting.\n", latency); - latency = 100; - } - dprintk("Settling Time: %d microseconds.\n", - psb->settlingtime); - dprintk("Has %d PST tables. 
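
Both table builders in this hunk (get_ranges() and powernow_acpi_init() above) smuggle two hardware settings through the single 32-bit index field of the cpufreq frequency table: FID in the low byte, VID in the byte above it, exactly as the "lower 8 bits"/"upper 8 bits" comments say. A minimal pack/unpack round-trip:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t pack_fid_vid(uint8_t fid, uint8_t vid)
    {
        return (uint32_t)fid | ((uint32_t)vid << 8);
    }

    int main(void)
    {
        uint32_t index = pack_fid_vid(0x0e, 0x12);  /* arbitrary example */

        assert((index & 0xff) == 0x0e);             /* FID: lower 8 bits */
        assert(((index >> 8) & 0xff) == 0x12);      /* VID: upper 8 bits */
        return 0;
    }
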
(Only dumping ones " - "relevant to this CPU).\n", - psb->numpst); - - p += sizeof(struct psb_s); - - pst = (struct pst_s *) p; - - for (j = 0; j < psb->numpst; j++) { - pst = (struct pst_s *) p; - number_scales = pst->numpstates; - - if ((etuple == pst->cpuid) && - check_fsb(pst->fsbspeed) && - (maxfid == pst->maxfid) && - (startvid == pst->startvid)) { - print_pst_entry(pst, j); - p = (char *)pst + sizeof(struct pst_s); - ret = get_ranges(p); - return ret; - } else { - unsigned int k; - p = (char *)pst + sizeof(struct pst_s); - for (k = 0; k < number_scales; k++) - p += 2; - } - } - printk(KERN_INFO PFX "No PST tables match this cpuid " - "(0x%x)\n", etuple); - printk(KERN_INFO PFX "This is indicative of a broken " - "BIOS.\n"); - - return -EINVAL; - } - p++; - } - - return -ENODEV; -} - - -static int powernow_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate; - - if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, - relation, &newstate)) - return -EINVAL; - - change_speed(newstate); - - return 0; -} - - -static int powernow_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, powernow_table); -} - -/* - * We use the fact that the bus frequency is somehow - * a multiple of 100000/3 khz, then we compute sgtc according - * to this multiple. - * That way, we match more how AMD thinks all of that work. - * We will then get the same kind of behaviour already tested under - * the "well-known" other OS. - */ -static int __cpuinit fixup_sgtc(void) -{ - unsigned int sgtc; - unsigned int m; - - m = fsb / 3333; - if ((m % 10) >= 5) - m += 5; - - m /= 10; - - sgtc = 100 * m * latency; - sgtc = sgtc / 3; - if (sgtc > 0xfffff) { - printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); - sgtc = 0xfffff; - } - return sgtc; -} - -static unsigned int powernow_get(unsigned int cpu) -{ - union msr_fidvidstatus fidvidstatus; - unsigned int cfid; - - if (cpu) - return 0; - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - - return fsb * fid_codes[cfid] / 10; -} - - -static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) -{ - printk(KERN_WARNING PFX - "%s laptop with broken PST tables in BIOS detected.\n", - d->ident); - printk(KERN_WARNING PFX - "You need to downgrade to 3A21 (09/09/2002), or try a newer " - "BIOS than 3A71 (01/20/2003)\n"); - printk(KERN_WARNING PFX - "cpufreq scaling has been disabled as a result of this.\n"); - return 0; -} - -/* - * Some Athlon laptops have really fucked PST tables. - * A BIOS update is all that can save them. - * Mention this, and disable cpufreq. - */ -static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { - { - .callback = acer_cpufreq_pst, - .ident = "Acer Aspire", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), - DMI_MATCH(DMI_BIOS_VERSION, "3A71"), - }, - }, - { } -}; - -static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) -{ - union msr_fidvidstatus fidvidstatus; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - - recalibrate_cpu_khz(); - - fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; - if (!fsb) { - printk(KERN_WARNING PFX "can not determine bus frequency\n"); - return -EINVAL; - } - dprintk("FSB: %3dMHz\n", fsb/1000); - - if (dmi_check_system(powernow_dmi_table) || acpi_force) { - printk(KERN_INFO PFX "PSB/PST known to be broken. 
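
fixup_sgtc() above leans on the observation that the FSB is approximately a multiple of 100000/3 kHz; it recovers that multiple m, then scales the BIOS settling time into bus clocks. Worked through for a 100 MHz FSB and a 100-unit settling time:

    #include <stdio.h>

    int main(void)
    {
        unsigned int fsb = 100000;      /* kHz: 100 MHz = 3 x ~33.3 MHz */
        unsigned int latency = 100;     /* BIOS settling time */
        unsigned int m, sgtc;

        m = fsb / 3333;                 /* 30 */
        if ((m % 10) >= 5)
            m += 5;                     /* round to the nearest multiple */
        m /= 10;                        /* 3 */

        sgtc = 100 * m * latency / 3;   /* 10000 */
        if (sgtc > 0xfffff)             /* SGTC is a 20-bit field */
            sgtc = 0xfffff;

        printf("m = %u, sgtc = %u\n", m, sgtc);
        return 0;
    }
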
" - "Trying ACPI instead\n"); - result = powernow_acpi_init(); - } else { - result = powernow_decode_bios(fidvidstatus.bits.MFID, - fidvidstatus.bits.SVID); - if (result) { - printk(KERN_INFO PFX "Trying ACPI perflib\n"); - maximum_speed = 0; - minimum_speed = -1; - latency = 0; - result = powernow_acpi_init(); - if (result) { - printk(KERN_INFO PFX - "ACPI and legacy methods failed\n"); - } - } else { - /* SGTC use the bus clock as timer */ - latency = fixup_sgtc(); - printk(KERN_INFO PFX "SGTC: %d\n", latency); - } - } - - if (result) - return result; - - printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", - minimum_speed/1000, maximum_speed/1000); - - policy->cpuinfo.transition_latency = - cpufreq_scale(2000000UL, fsb, latency); - - policy->cur = powernow_get(0); - - cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); - - return cpufreq_frequency_table_cpuinfo(policy, powernow_table); -} - -static int powernow_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (acpi_processor_perf) { - acpi_processor_unregister_performance(acpi_processor_perf, 0); - free_cpumask_var(acpi_processor_perf->shared_cpu_map); - kfree(acpi_processor_perf); - } -#endif - - kfree(powernow_table); - return 0; -} - -static struct freq_attr *powernow_table_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - .bios_limit = acpi_processor_get_bios_limit, -#endif - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, -}; - -static int __init powernow_init(void) -{ - if (check_powernow() == 0) - return -ENODEV; - return cpufreq_register_driver(&powernow_driver); -} - - -static void __exit powernow_exit(void) -{ - cpufreq_unregister_driver(&powernow_driver); -} - -module_param(acpi_force, int, 0444); -MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); -MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); -MODULE_LICENSE("GPL"); - -late_initcall(powernow_init); -module_exit(powernow_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h deleted file mode 100644 index 35fb4eaf6e1c..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. 
- * - * AMD-specific information - * - */ - -union msr_fidvidctl { - struct { - unsigned FID:5, // 4:0 - reserved1:3, // 7:5 - VID:5, // 12:8 - reserved2:3, // 15:13 - FIDC:1, // 16 - VIDC:1, // 17 - reserved3:2, // 19:18 - FIDCHGRATIO:1, // 20 - reserved4:11, // 31-21 - SGTC:20, // 32:51 - reserved5:12; // 63:52 - } bits; - unsigned long long val; -}; - -union msr_fidvidstatus { - struct { - unsigned CFID:5, // 4:0 - reserved1:3, // 7:5 - SFID:5, // 12:8 - reserved2:3, // 15:13 - MFID:5, // 20:16 - reserved3:11, // 31:21 - CVID:5, // 36:32 - reserved4:3, // 39:37 - SVID:5, // 44:40 - reserved5:3, // 47:45 - MVID:5, // 52:48 - reserved6:11; // 63:53 - } bits; - unsigned long long val; -}; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c deleted file mode 100644 index c567dec854f6..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ /dev/null @@ -1,1608 +0,0 @@ -/* - * (c) 2003-2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Support : mark.langsdorf@amd.com - * - * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs - * (C) 2004 Dominik Brodowski <linux@brodo.de> - * (C) 2004 Pavel Machek <pavel@ucw.cz> - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Valuable input gratefully received from Dave Jones, Pavel Machek, - * Dominik Brodowski, Jacob Shin, and others. - * Originally developed by Paul Devriendt. - * Processor information obtained from Chapter 9 (Power and Thermal Management) - * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD - * Opteron Processors" available for download from www.amd.com - * - * Tables for specific CPUs can be inferred from - * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/cpumask.h> -#include <linux/sched.h> /* for current / set_cpus_allowed() */ -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/msr.h> - -#include <linux/acpi.h> -#include <linux/mutex.h> -#include <acpi/processor.h> - -#define PFX "powernow-k8: " -#define VERSION "version 2.20.00" -#include "powernow-k8.h" -#include "mperf.h" - -/* serialize freq changes */ -static DEFINE_MUTEX(fidvid_mutex); - -static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); - -static int cpu_family = CPU_OPTERON; - -/* core performance boost */ -static bool cpb_capable, cpb_enabled; -static struct msr __percpu *msrs; - -static struct cpufreq_driver cpufreq_amd64_driver; - -#ifndef CONFIG_SMP -static inline const struct cpumask *cpu_core_mask(int cpu) -{ - return cpumask_of(0); -} -#endif - -/* Return a frequency in MHz, given an input fid */ -static u32 find_freq_from_fid(u32 fid) -{ - return 800 + (fid * 100); -} - -/* Return a frequency in KHz, given an input fid */ -static u32 find_khz_freq_from_fid(u32 fid) -{ - return 1000 * find_freq_from_fid(fid); -} - -static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, - u32 pstate) -{ - return data[pstate].frequency; -} - -/* Return the vco fid for an input fid - * - * Each "low" fid has corresponding 
"high" fid, and you can get to "low" fids - * only from corresponding high fids. This returns "high" fid corresponding to - * "low" one. - */ -static u32 convert_fid_to_vco_fid(u32 fid) -{ - if (fid < HI_FID_TABLE_BOTTOM) - return 8 + (2 * fid); - else - return fid; -} - -/* - * Return 1 if the pending bit is set. Unless we just instructed the processor - * to transition to a new state, seeing this bit set is really bad news. - */ -static int pending_bit_stuck(void) -{ - u32 lo, hi; - - if (cpu_family == CPU_HW_PSTATE) - return 0; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; -} - -/* - * Update the global current fid / vid values from the status msr. - * Returns 1 on error. - */ -static int query_current_values_with_pending_wait(struct powernow_k8_data *data) -{ - u32 lo, hi; - u32 i = 0; - - if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - data->currpstate = i; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) - data->currpstate = HW_PSTATE_0; - - return 0; - } - do { - if (i++ > 10000) { - dprintk("detected change pending stuck\n"); - return 1; - } - rdmsr(MSR_FIDVID_STATUS, lo, hi); - } while (lo & MSR_S_LO_CHANGE_PENDING); - - data->currvid = hi & MSR_S_HI_CURRENT_VID; - data->currfid = lo & MSR_S_LO_CURRENT_FID; - - return 0; -} - -/* the isochronous relief time */ -static void count_off_irt(struct powernow_k8_data *data) -{ - udelay((1 << data->irt) * 10); - return; -} - -/* the voltage stabilization time */ -static void count_off_vst(struct powernow_k8_data *data) -{ - udelay(data->vstable * VST_UNITS_20US); - return; -} - -/* need to init the control msr to a safe value (for each cpu) */ -static void fidvid_msr_init(void) -{ - u32 lo, hi; - u8 fid, vid; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - vid = hi & MSR_S_HI_CURRENT_VID; - fid = lo & MSR_S_LO_CURRENT_FID; - lo = fid | (vid << MSR_C_LO_VID_SHIFT); - hi = MSR_C_HI_STP_GNT_BENIGN; - dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); - wrmsr(MSR_FIDVID_CTL, lo, hi); -} - -/* write the new fid value along with the other control fields to the msr */ -static int write_new_fid(struct powernow_k8_data *data, u32 fid) -{ - u32 lo; - u32 savevid = data->currvid; - u32 i = 0; - - if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on fid write\n"); - return 1; - } - - lo = fid; - lo |= (data->currvid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", - fid, lo, data->plllock * PLL_LOCK_CONVERSION); - - do { - wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); - if (i++ > 100) { - printk(KERN_ERR PFX - "Hardware error - pending bit very stuck - " - "no further pstate changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - count_off_irt(data); - - if (savevid != data->currvid) { - printk(KERN_ERR PFX - "vid change on fid trans, old 0x%x, new 0x%x\n", - savevid, data->currvid); - return 1; - } - - if (fid != data->currfid) { - printk(KERN_ERR PFX - "fid trans failed, fid 0x%x, curr 0x%x\n", fid, - data->currfid); - return 1; - } - - return 0; -} - -/* Write a new vid to the hardware */ -static int write_new_vid(struct powernow_k8_data *data, u32 vid) -{ - u32 lo; - u32 savefid = data->currfid; - int i = 0; - - if ((data->currfid 
& INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on vid write\n"); - return 1; - } - - lo = data->currfid; - lo |= (vid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", - vid, lo, STOP_GRANT_5NS); - - do { - wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); - if (i++ > 100) { - printk(KERN_ERR PFX "internal error - pending bit " - "very stuck - no further pstate " - "changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "fid changed on vid trans, old " - "0x%x new 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (vid != data->currvid) { - printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " - "curr 0x%x\n", - vid, data->currvid); - return 1; - } - - return 0; -} - -/* - * Reduce the vid by the max of step or reqvid. - * Decreasing vid codes represent increasing voltages: - * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. - */ -static int decrease_vid_code_by_step(struct powernow_k8_data *data, - u32 reqvid, u32 step) -{ - if ((data->currvid - reqvid) > step) - reqvid = data->currvid - step; - - if (write_new_vid(data, reqvid)) - return 1; - - count_off_vst(data); - - return 0; -} - -/* Change hardware pstate by single MSR write */ -static int transition_pstate(struct powernow_k8_data *data, u32 pstate) -{ - wrmsr(MSR_PSTATE_CTRL, pstate, 0); - data->currpstate = pstate; - return 0; -} - -/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ -static int transition_fid_vid(struct powernow_k8_data *data, - u32 reqfid, u32 reqvid) -{ - if (core_voltage_pre_transition(data, reqvid, reqfid)) - return 1; - - if (core_frequency_transition(data, reqfid)) - return 1; - - if (core_voltage_post_transition(data, reqvid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((reqfid != data->currfid) || (reqvid != data->currvid)) { - printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " - "curr 0x%x 0x%x\n", - smp_processor_id(), - reqfid, reqvid, data->currfid, data->currvid); - return 1; - } - - dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", - smp_processor_id(), data->currfid, data->currvid); - - return 0; -} - -/* Phase 1 - core voltage transition ... 
setup voltage */ -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 reqfid) -{ - u32 rvosteps = data->rvo; - u32 savefid = data->currfid; - u32 maxvid, lo, rvomult = 1; - - dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " - "reqvid 0x%x, rvo 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqvid, data->rvo); - - if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) - rvomult = 2; - rvosteps *= rvomult; - rdmsr(MSR_FIDVID_STATUS, lo, maxvid); - maxvid = 0x1f & (maxvid >> 16); - dprintk("ph1 maxvid=0x%x\n", maxvid); - if (reqvid < maxvid) /* lower numbers are higher voltages */ - reqvid = maxvid; - - while (data->currvid > reqvid) { - dprintk("ph1: curr 0x%x, req vid 0x%x\n", - data->currvid, reqvid); - if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) - return 1; - } - - while ((rvosteps > 0) && - ((rvomult * data->rvo + data->currvid) > reqvid)) { - if (data->currvid == maxvid) { - rvosteps = 0; - } else { - dprintk("ph1: changing vid for rvo, req 0x%x\n", - data->currvid - 1); - if (decrease_vid_code_by_step(data, data->currvid-1, 1)) - return 1; - rvosteps--; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 2 - core frequency transition */ -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) -{ - u32 vcoreqfid, vcocurrfid, vcofiddiff; - u32 fid_interval, savevid = data->currvid; - - if (data->currfid == reqfid) { - printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", - data->currfid); - return 0; - } - - dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " - "reqfid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqfid); - - vcoreqfid = convert_fid_to_vco_fid(reqfid); - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - - if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) - vcofiddiff = 0; - - while (vcofiddiff > 2) { - (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); - - if (reqfid > data->currfid) { - if (data->currfid > LO_FID_TABLE_TOP) { - if (write_new_fid(data, - data->currfid + fid_interval)) - return 1; - } else { - if (write_new_fid - (data, - 2 + convert_fid_to_vco_fid(data->currfid))) - return 1; - } - } else { - if (write_new_fid(data, data->currfid - fid_interval)) - return 1; - } - - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - } - - if (write_new_fid(data, reqfid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (data->currfid != reqfid) { - printk(KERN_ERR PFX - "ph2: mismatch, failed fid transition, " - "curr 0x%x, req 0x%x\n", - data->currfid, reqfid); - return 1; - } - - if (savevid != data->currvid) { - printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", - savevid, data->currvid); - return 1; - } - - dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 3 - core voltage transition flow ... jump to the final vid. 
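
ph2 above never slams the PLL straight to a distant target; it steps through "VCO fid" space, where convert_fid_to_vco_fid() (earlier in this file) lifts a low fid to its high-fid alias as 8 + 2 * fid. A toy table of that mapping; HI_FID_TABLE_BOTTOM lives in powernow-k8.h outside this hunk, so the value below is a placeholder only:

    #include <stdint.h>
    #include <stdio.h>

    #define HI_FID_TABLE_BOTTOM 8   /* placeholder; see powernow-k8.h */

    /* convert_fid_to_vco_fid(): low fids are reachable only via their
     * corresponding high fid. */
    static uint32_t to_vco_fid(uint32_t fid)
    {
        return fid < HI_FID_TABLE_BOTTOM ? 8 + 2 * fid : fid;
    }

    int main(void)
    {
        uint32_t fid;

        for (fid = 0; fid < 12; fid++)
            printf("fid %2u -> vco fid %2u\n", fid, to_vco_fid(fid));
        return 0;
    }
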
*/ -static int core_voltage_post_transition(struct powernow_k8_data *data, - u32 reqvid) -{ - u32 savefid = data->currfid; - u32 savereqvid = reqvid; - - dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid); - - if (reqvid != data->currvid) { - if (write_new_vid(data, reqvid)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX - "ph3: bad fid change, save 0x%x, curr 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (data->currvid != reqvid) { - printk(KERN_ERR PFX - "ph3: failed vid transition\n, " - "req 0x%x, curr 0x%x", - reqvid, data->currvid); - return 1; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savereqvid != data->currvid) { - dprintk("ph3 failed, currvid 0x%x\n", data->currvid); - return 1; - } - - if (savefid != data->currfid) { - dprintk("ph3 failed, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -static void check_supported_cpu(void *_rc) -{ - u32 eax, ebx, ecx, edx; - int *rc = _rc; - - *rc = -ENODEV; - - if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) - return; - - eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && - ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - return; - - if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { - if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { - printk(KERN_INFO PFX - "Processor cpuid %x not supported\n", eax); - return; - } - - eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); - if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { - printk(KERN_INFO PFX - "No frequency change capabilities detected\n"); - return; - } - - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & P_STATE_TRANSITION_CAPABLE) - != P_STATE_TRANSITION_CAPABLE) { - printk(KERN_INFO PFX - "Power state transitions not supported\n"); - return; - } - } else { /* must be a HW Pstate capable processor */ - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) - cpu_family = CPU_HW_PSTATE; - else - return; - } - - *rc = 0; -} - -static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, - u8 maxvid) -{ - unsigned int j; - u8 lastfid = 0xff; - - for (j = 0; j < data->numps; j++) { - if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", - j, pst[j].vid); - return -EINVAL; - } - if (pst[j].vid < data->rvo) { - /* vid + rvo >= 0 */ - printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].vid < maxvid + data->rvo) { - /* vid + rvo >= maxvid */ - printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].fid > MAX_FID) { - printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { - /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR FW_BUG PFX "two low fids - %d : " - "0x%x\n", j, pst[j].fid); - return -EINVAL; - } - if (pst[j].fid < lastfid) - lastfid = pst[j].fid; - } - if (lastfid & 1) { - printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); - return -EINVAL; - } - if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO FW_BUG PFX - "first fid not from lo freq table\n"); - - return 0; -} - -static void invalidate_entry(struct 
cpufreq_frequency_table *powernow_table, - unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} - -static void print_basics(struct powernow_k8_data *data) -{ - int j; - for (j = 0; j < data->numps; j++) { - if (data->powernow_table[j].frequency != - CPUFREQ_ENTRY_INVALID) { - if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX - " %d : pstate %d (%d MHz)\n", j, - data->powernow_table[j].index, - data->powernow_table[j].frequency/1000); - } else { - printk(KERN_INFO PFX - " %d : fid 0x%x (%d MHz), vid 0x%x\n", - j, - data->powernow_table[j].index & 0xff, - data->powernow_table[j].frequency/1000, - data->powernow_table[j].index >> 8); - } - } - } - if (data->batps) - printk(KERN_INFO PFX "Only %d pstates on battery\n", - data->batps); -} - -static u32 freq_from_fid_did(u32 fid, u32 did) -{ - u32 mhz = 0; - - if (boot_cpu_data.x86 == 0x10) - mhz = (100 * (fid + 0x10)) >> did; - else if (boot_cpu_data.x86 == 0x11) - mhz = (100 * (fid + 8)) >> did; - else - BUG(); - - return mhz * 1000; -} - -static int fill_powernow_table(struct powernow_k8_data *data, - struct pst_s *pst, u8 maxvid) -{ - struct cpufreq_frequency_table *powernow_table; - unsigned int j; - - if (data->batps) { - /* use ACPI support to get full speed on mains power */ - printk(KERN_WARNING PFX - "Only %d pstates usable (use ACPI driver for full " - "range\n", data->batps); - data->numps = data->batps; - } - - for (j = 1; j < data->numps; j++) { - if (pst[j-1].fid >= pst[j].fid) { - printk(KERN_ERR PFX "PST out of sequence\n"); - return -EINVAL; - } - } - - if (data->numps < 2) { - printk(KERN_ERR PFX "no p states to transition\n"); - return -ENODEV; - } - - if (check_pst_table(data, pst, maxvid)) - return -EINVAL; - - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->numps + 1)), GFP_KERNEL); - if (!powernow_table) { - printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); - return -ENOMEM; - } - - for (j = 0; j < data->numps; j++) { - int freq; - powernow_table[j].index = pst[j].fid; /* lower 8 bits */ - powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - freq = find_khz_freq_from_fid(pst[j].fid); - powernow_table[j].frequency = freq; - } - powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; - powernow_table[data->numps].index = 0; - - if (query_current_values_with_pending_wait(data)) { - kfree(powernow_table); - return -EIO; - } - - dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); - data->powernow_table = powernow_table; - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - for (j = 0; j < data->numps; j++) - if ((pst[j].fid == data->currfid) && - (pst[j].vid == data->currvid)) - return 0; - - dprintk("currfid/vid do not match PST, ignoring\n"); - return 0; -} - -/* Find and validate the PSB/PST table in BIOS. */ -static int find_psb_table(struct powernow_k8_data *data) -{ - struct psb_s *psb; - unsigned int i; - u32 mvs; - u8 maxvid; - u32 cpst = 0; - u32 thiscpuid; - - for (i = 0xc0000; i < 0xffff0; i += 0x10) { - /* Scan BIOS looking for the signature. */ - /* It can not be at ffff0 - it is too big. 
*/ - - psb = phys_to_virt(i); - if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) - continue; - - dprintk("found PSB header at 0x%p\n", psb); - - dprintk("table vers: 0x%x\n", psb->tableversion); - if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); - return -ENODEV; - } - - dprintk("flags: 0x%x\n", psb->flags1); - if (psb->flags1) { - printk(KERN_ERR FW_BUG PFX "unknown flags\n"); - return -ENODEV; - } - - data->vstable = psb->vstable; - dprintk("voltage stabilization time: %d(*20us)\n", - data->vstable); - - dprintk("flags2: 0x%x\n", psb->flags2); - data->rvo = psb->flags2 & 3; - data->irt = ((psb->flags2) >> 2) & 3; - mvs = ((psb->flags2) >> 4) & 3; - data->vidmvs = 1 << mvs; - data->batps = ((psb->flags2) >> 6) & 3; - - dprintk("ramp voltage offset: %d\n", data->rvo); - dprintk("isochronous relief time: %d\n", data->irt); - dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - - dprintk("numpst: 0x%x\n", psb->num_tables); - cpst = psb->num_tables; - if ((psb->cpuid == 0x00000fc0) || - (psb->cpuid == 0x00000fe0)) { - thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if ((thiscpuid == 0x00000fc0) || - (thiscpuid == 0x00000fe0)) - cpst = 1; - } - if (cpst != 1) { - printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); - return -ENODEV; - } - - data->plllock = psb->plllocktime; - dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - dprintk("maxfid: 0x%x\n", psb->maxfid); - dprintk("maxvid: 0x%x\n", psb->maxvid); - maxvid = psb->maxvid; - - data->numps = psb->numps; - dprintk("numpstates: 0x%x\n", data->numps); - return fill_powernow_table(data, - (struct pst_s *)(psb+1), maxvid); - } - /* - * If you see this message, complain to BIOS manufacturer. If - * he tells you "we do not support Linux" or some similar - * nonsense, remember that Windows 2000 uses the same legacy - * mechanism that the old Linux PSB driver uses. Tell them it - * is broken with Windows 2000. 
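
find_psb_table() above (like powernow_decode_bios() in the k7 hunk) hunts for its BIOS table by stepping through the legacy BIOS window in 16-byte paragraphs and comparing a signature at each boundary. The skeleton of that scan, with the window handed in as a plain buffer instead of a phys_to_virt() mapping of 0xc0000..0xffff0:

    #include <stdio.h>
    #include <string.h>

    static const char *find_sig(const char *win, size_t len,
                                const char *sig, size_t siglen)
    {
        size_t off;

        /* 16-byte stride, same as the kernel loop's i += 0x10 */
        for (off = 0; off + siglen <= len; off += 16)
            if (!memcmp(win + off, sig, siglen))
                return win + off;
        return NULL;    /* no PSB header: fall back to ACPI _PSS */
    }

    int main(void)
    {
        char window[64] = { 0 };

        memcpy(window + 32, "AMDK7PNOW!", 10);  /* plant a fake signature */
        printf("found at offset %td\n",
               find_sig(window, sizeof(window), "AMDK7PNOW!", 10) - window);
        return 0;
    }
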
- * - * The reference to the AMD documentation is chapter 9 in the - * BIOS and Kernel Developer's Guide, which is available on - * www.amd.com - */ - printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); - printk(KERN_ERR PFX "Make sure that your BIOS is up to date" - " and Cool'N'Quiet support is enabled in BIOS setup\n"); - return -ENODEV; -} - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, - unsigned int index) -{ - u64 control; - - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) - return; - - control = data->acpi_data.states[index].control; - data->irt = (control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (control >> VST_SHIFT) & VST_MASK; -} - -static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) -{ - struct cpufreq_frequency_table *powernow_table; - int ret_val = -ENODEV; - u64 control, status; - - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - dprintk("register performance failed: bad ACPI data\n"); - return -EIO; - } - - /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { - dprintk("No ACPI P-States\n"); - goto err_out; - } - - control = data->acpi_data.control_register.space_id; - status = data->acpi_data.status_register.space_id; - - if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", - control, status); - goto err_out; - } - - /* fill in data->powernow_table */ - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); - if (!powernow_table) { - dprintk("powernow_table memory alloc failure\n"); - goto err_out; - } - - /* fill in data */ - data->numps = data->acpi_data.state_count; - powernow_k8_acpi_pst_values(data, 0); - - if (cpu_family == CPU_HW_PSTATE) - ret_val = fill_powernow_table_pstate(data, powernow_table); - else - ret_val = fill_powernow_table_fidvid(data, powernow_table); - if (ret_val) - goto err_out_mem; - - powernow_table[data->acpi_data.state_count].frequency = - CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; - data->powernow_table = powernow_table; - - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { - printk(KERN_ERR PFX - "unable to alloc powernow_k8_data cpumask\n"); - ret_val = -ENOMEM; - goto err_out_mem; - } - - return 0; - -err_out_mem: - kfree(powernow_table); - -err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); - - /* data->acpi_data.state_count informs us at ->exit() - * whether ACPI was used */ - data->acpi_data.state_count = 0; - - return ret_val; -} - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); - data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 index; - - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; - if (index > data->max_hw_pstate) { - 
printk(KERN_ERR PFX "invalid pstate %d - " - "bad value %d.\n", i, index); - printk(KERN_ERR PFX "Please report to BIOS " - "manufacturer\n"); - invalidate_entry(powernow_table, i); - continue; - } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); - if (!(hi & HW_PSTATE_VALID_MASK)) { - dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(powernow_table, i); - continue; - } - - powernow_table[i].index = index; - - /* Frequency may be rounded for these */ - if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) - || boot_cpu_data.x86 == 0x11) { - powernow_table[i].frequency = - freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); - } else - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; - } - return 0; -} - -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 fid; - u32 vid; - u32 freq, index; - u64 status, control; - - if (data->exttype) { - status = data->acpi_data.states[i].status; - fid = status & EXT_FID_MASK; - vid = (status >> VID_SHIFT) & EXT_VID_MASK; - } else { - control = data->acpi_data.states[i].control; - fid = control & FID_MASK; - vid = (control >> VID_SHIFT) & VID_MASK; - } - - dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); - - index = fid | (vid<<8); - powernow_table[i].index = index; - - freq = find_khz_freq_from_fid(fid); - powernow_table[i].frequency = freq; - - /* verify frequency is OK */ - if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - dprintk("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(powernow_table, i); - continue; - } - - /* verify voltage is OK - - * BIOSs are using "off" to indicate invalid */ - if (vid == VID_OFF) { - dprintk("invalid vid %u, ignoring\n", vid); - invalidate_entry(powernow_table, i); - continue; - } - - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { - printk(KERN_INFO PFX "invalid freq entries " - "%u kHz vs. %u kHz\n", freq, - (unsigned int) - (data->acpi_data.states[i].core_frequency - * 1000)); - invalidate_entry(powernow_table, i); - continue; - } - } - return 0; -} - -static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) -{ - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, - data->cpu); - free_cpumask_var(data->acpi_data.shared_cpu_map); -} - -static int get_transition_latency(struct powernow_k8_data *data) -{ - int max_latency = 0; - int i; - for (i = 0; i < data->acpi_data.state_count; i++) { - int cur_latency = data->acpi_data.states[i].transition_latency - + data->acpi_data.states[i].bus_master_latency; - if (cur_latency > max_latency) - max_latency = cur_latency; - } - if (max_latency == 0) { - /* - * Fam 11h and later may return 0 as transition latency. This - * is intended and means "very fast". While cpufreq core and - * governors currently can handle that gracefully, better set it - * to 1 to avoid problems in the future. 
- */ - if (boot_cpu_data.x86 < 0x11) - printk(KERN_ERR FW_WARN PFX "Invalid zero transition " - "latency\n"); - max_latency = 1; - } - /* value in usecs, needs to be in nanoseconds */ - return 1000 * max_latency; -} - -/* Take a frequency, and issue the fid/vid transition command */ -static int transition_frequency_fidvid(struct powernow_k8_data *data, - unsigned int index) -{ - u32 fid = 0; - u32 vid = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* fid/vid correctness check for k8 */ - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in find_psb_table, vid - * are the upper 8 bits. - */ - fid = data->powernow_table[index].index & 0xFF; - vid = (data->powernow_table[index].index & 0xFF00) >> 8; - - dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((data->currvid == vid) && (data->currfid == fid)) { - dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", - fid, vid); - return 0; - } - - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", - smp_processor_id(), fid, vid); - freqs.old = find_khz_freq_from_fid(data->currfid); - freqs.new = find_khz_freq_from_fid(fid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_fid_vid(data, fid, vid); - freqs.new = find_khz_freq_from_fid(data->currfid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Take a frequency, and issue the hardware pstate transition command */ -static int transition_frequency_pstate(struct powernow_k8_data *data, - unsigned int index) -{ - u32 pstate = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* get MSR index for hardware pstate transition */ - pstate = index & HW_PSTATE_MASK; - if (pstate > data->max_hw_pstate) - return 0; - freqs.old = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_pstate(data, pstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Driver entry point to switch to the target frequency */ -static int powernowk8_target(struct cpufreq_policy *pol, - unsigned targfreq, unsigned relation) -{ - cpumask_var_t oldmask; - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - u32 checkfid; - u32 checkvid; - unsigned int newstate; - int ret = -EIO; - - if (!data) - return -EINVAL; - - checkfid = data->currfid; - checkvid = data->currvid; - - /* only run on specific CPU from here on. 
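The table "index" used by the fidvid code above is a plain 16-bit packing: fid in the low byte, vid in the high byte. A standalone round-trip of that encoding, with invented values:

#include <stdio.h>

int main(void)
{
	unsigned fid = 0x0e, vid = 0x12;       /* hypothetical pair */
	unsigned index = fid | (vid << 8);     /* pack, as fill_powernow_table_fidvid() does */

	/* unpack, as transition_frequency_fidvid() does */
	printf("index 0x%04x -> fid 0x%02x, vid 0x%02x\n",
	       index, index & 0xff, (index & 0xff00) >> 8);
	return 0;
}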
*/ - /* This is poor form: use a workqueue or smp_call_function_single */ - if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(oldmask, tsk_cpus_allowed(current)); - set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing targ, change pending bit set\n"); - goto err_out; - } - - dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", - pol->cpu, targfreq, pol->min, pol->max, relation); - - if (query_current_values_with_pending_wait(data)) - goto err_out; - - if (cpu_family != CPU_HW_PSTATE) { - dprintk("targ: curr fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - if ((checkvid != data->currvid) || - (checkfid != data->currfid)) { - printk(KERN_INFO PFX - "error - out of sync, fid 0x%x 0x%x, " - "vid 0x%x 0x%x\n", - checkfid, data->currfid, - checkvid, data->currvid); - } - } - - if (cpufreq_frequency_table_target(pol, data->powernow_table, - targfreq, relation, &newstate)) - goto err_out; - - mutex_lock(&fidvid_mutex); - - powernow_k8_acpi_pst_values(data, newstate); - - if (cpu_family == CPU_HW_PSTATE) - ret = transition_frequency_pstate(data, newstate); - else - ret = transition_frequency_fidvid(data, newstate); - if (ret) { - printk(KERN_ERR PFX "transition frequency failed\n"); - ret = 1; - mutex_unlock(&fidvid_mutex); - goto err_out; - } - mutex_unlock(&fidvid_mutex); - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - newstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - ret = 0; - -err_out: - set_cpus_allowed_ptr(current, oldmask); - free_cpumask_var(oldmask); - return ret; -} - -/* Driver entry point to verify the policy and range of frequencies */ -static int powernowk8_verify(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - return cpufreq_frequency_table_verify(pol, data->powernow_table); -} - -struct init_on_cpu { - struct powernow_k8_data *data; - int rc; -}; - -static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) -{ - struct init_on_cpu *init_on_cpu = _init_on_cpu; - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - init_on_cpu->rc = -ENODEV; - return; - } - - if (query_current_values_with_pending_wait(init_on_cpu->data)) { - init_on_cpu->rc = -ENODEV; - return; - } - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - init_on_cpu->rc = 0; -} - -/* per CPU init entry point to the driver */ -static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) -{ - static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - FW_BUG PFX "Try again with latest BIOS.\n"; - struct powernow_k8_data *data; - struct init_on_cpu init_on_cpu; - int rc; - struct cpuinfo_x86 *c = &cpu_data(pol->cpu); - - if (!cpu_online(pol->cpu)) - return -ENODEV; - - smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); - if (rc) - return -ENODEV; - - data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); - if (!data) { - printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); - return -ENOMEM; - } - - data->cpu = pol->cpu; - data->currpstate = HW_PSTATE_INVALID; - - if (powernow_k8_cpu_init_acpi(data)) { - /* - * Use the PSB BIOS structure.
This is only available on - * a UP version, and is deprecated by AMD. - */ - if (num_online_cpus() != 1) { - printk_once(ACPI_PSS_BIOS_BUG_MSG); - goto err_out; - } - if (pol->cpu != 0) { - printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " - "CPU other than CPU0. Complain to your BIOS " - "vendor.\n"); - goto err_out; - } - rc = find_psb_table(data); - if (rc) - goto err_out; - - /* Take a crude guess here. - * That guess was in microseconds, so multiply with 1000 */ - pol->cpuinfo.transition_latency = ( - ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + - ((1 << data->irt) * 30)) * 1000; - } else /* ACPI _PSS objects available */ - pol->cpuinfo.transition_latency = get_transition_latency(data); - - /* only run on specific CPU from here on */ - init_on_cpu.data = data; - smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, - &init_on_cpu, 1); - rc = init_on_cpu.rc; - if (rc != 0) - goto err_out_exit_acpi; - - if (cpu_family == CPU_HW_PSTATE) - cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); - else - cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); - data->available_cores = pol->cpus; - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - dprintk("policy current frequency %d kHz\n", pol->cur); - - /* min/max the cpu is capable of */ - if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); - powernow_k8_cpu_exit_acpi(data); - kfree(data->powernow_table); - kfree(data); - return -EINVAL; - } - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; - - cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - - if (cpu_family == CPU_HW_PSTATE) - dprintk("cpu_init done, current pstate 0x%x\n", - data->currpstate); - else - dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - per_cpu(powernow_data, pol->cpu) = data; - - return 0; - -err_out_exit_acpi: - powernow_k8_cpu_exit_acpi(data); - -err_out: - kfree(data); - return -ENODEV; -} - -static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - powernow_k8_cpu_exit_acpi(data); - - cpufreq_frequency_table_put_attr(pol->cpu); - - kfree(data->powernow_table); - kfree(data); - per_cpu(powernow_data, pol->cpu) = NULL; - - return 0; -} - -static void query_values_on_cpu(void *_err) -{ - int *err = _err; - struct powernow_k8_data *data = __this_cpu_read(powernow_data); - - *err = query_current_values_with_pending_wait(data); -} - -static unsigned int powernowk8_get(unsigned int cpu) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, cpu); - unsigned int khz = 0; - int err; - - if (!data) - return 0; - - smp_call_function_single(cpu, query_values_on_cpu, &err, true); - if (err) - goto out; - - if (cpu_family == CPU_HW_PSTATE) - khz = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - khz = find_khz_freq_from_fid(data->currfid); - - -out: - return khz; -} - -static void _cpb_toggle_msrs(bool t) -{ - int cpu; - - get_online_cpus(); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (t) - reg->l &= ~BIT(25); - else - reg->l |= BIT(25); - } -
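/* Note: bit 25 of MSR_K7_HWCR is a boost-*disable* flag, hence the
 * inverted sense of 't' above - clearing the bit (re-)enables core
 * performance boost. The wrmsr_on_cpus() that follows pushes the
 * updated per-CPU values back out. */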
wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - put_online_cpus(); -} - -/* - * Switch on/off core performance boosting. - * - * 0=disable - * 1=enable. - */ -static void cpb_toggle(bool t) -{ - if (!cpb_capable) - return; - - if (t && !cpb_enabled) { - cpb_enabled = true; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting enabled.\n"); - } else if (!t && cpb_enabled) { - cpb_enabled = false; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting disabled.\n"); - } -} - -static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, - size_t count) -{ - int ret = -EINVAL; - unsigned long val = 0; - - ret = strict_strtoul(buf, 10, &val); - if (!ret && (val == 0 || val == 1) && cpb_capable) - cpb_toggle(val); - else - return -EINVAL; - - return count; -} - -static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) -{ - return sprintf(buf, "%u\n", cpb_enabled); -} - -#define define_one_rw(_name) \ -static struct freq_attr _name = \ -__ATTR(_name, 0644, show_##_name, store_##_name) - -define_one_rw(cpb); - -static struct freq_attr *powernow_k8_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - &cpb, - NULL, -}; - -static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, -}; - -/* - * Clear the boost-disable flag on the CPU_DOWN path so that this cpu - * cannot block the remaining ones from boosting. On the CPU_UP path we - * simply keep the boost-disable flag in sync with the current global - * state. - */ -static int cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) -{ - unsigned cpu = (long)hcpu; - u32 lo, hi; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - - if (!cpb_enabled) { - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo |= BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - } - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo &= ~BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block cpb_nb = { - .notifier_call = cpb_notify, -}; - -/* driver entry point for init */ -static int __cpuinit powernowk8_init(void) -{ - unsigned int i, supported_cpus = 0, cpu; - int rv; - - for_each_online_cpu(i) { - int rc; - smp_call_function_single(i, check_supported_cpu, &rc, 1); - if (rc == 0) - supported_cpus++; - } - - if (supported_cpus != num_online_cpus()) - return -ENODEV; - - printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - - cpb_capable = true; - - msrs = msrs_alloc(); - if (!msrs) { - printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); - return -ENOMEM; - } - - register_cpu_notifier(&cpb_nb); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - cpb_enabled |= !(!!(reg->l & BIT(25))); - } - - printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", - (cpb_enabled ? 
"on" : "off")); - } - - rv = cpufreq_register_driver(&cpufreq_amd64_driver); - if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) { - unregister_cpu_notifier(&cpb_nb); - msrs_free(msrs); - msrs = NULL; - } - return rv; -} - -/* driver entry point for term */ -static void __exit powernowk8_exit(void) -{ - dprintk("exit\n"); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - msrs_free(msrs); - msrs = NULL; - - unregister_cpu_notifier(&cpb_nb); - } - - cpufreq_unregister_driver(&cpufreq_amd64_driver); -} - -MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and " - "Mark Langsdorf <mark.langsdorf@amd.com>"); -MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); -MODULE_LICENSE("GPL"); - -late_initcall(powernowk8_init); -module_exit(powernowk8_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h deleted file mode 100644 index df3529b1c02d..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * (c) 2003-2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -enum pstate { - HW_PSTATE_INVALID = 0xff, - HW_PSTATE_0 = 0, - HW_PSTATE_1 = 1, - HW_PSTATE_2 = 2, - HW_PSTATE_3 = 3, - HW_PSTATE_4 = 4, - HW_PSTATE_5 = 5, - HW_PSTATE_6 = 6, - HW_PSTATE_7 = 7, -}; - -struct powernow_k8_data { - unsigned int cpu; - - u32 numps; /* number of p-states */ - u32 batps; /* number of p-states supported on battery */ - u32 max_hw_pstate; /* maximum legal hardware pstate */ - - /* these values are constant when the PSB is used to determine - * vid/fid pairings, but are modified during the ->target() call - * when ACPI is used */ - u32 rvo; /* ramp voltage offset */ - u32 irt; /* isochronous relief time */ - u32 vidmvs; /* usable value calculated from mvs */ - u32 vstable; /* voltage stabilization time, units 20 us */ - u32 plllock; /* pll lock time, units 1 us */ - u32 exttype; /* extended interface = 1 */ - - /* keep track of the current fid / vid or pstate */ - u32 currvid; - u32 currfid; - enum pstate currpstate; - - /* the powernow_table includes all frequency and vid/fid pairings: - * fid are the lower 8 bits of the index, vid are the upper 8 bits. - * frequency is in kHz */ - struct cpufreq_frequency_table *powernow_table; - - /* the acpi table needs to be kept. it's only available if ACPI was - * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; - - /* we need to keep track of associated cores, but let cpufreq - * handle hotplug events - so just point at cpufreq pol->cpus - * structure */ - struct cpumask *available_cores; -}; - -/* processor's cpuid instruction support */ -#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ -#define CPUID_XFAM 0x0ff00000 /* extended family */ -#define CPUID_XFAM_K8 0 -#define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x000c0000 -#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ -#define CPUID_USE_XFAM_XMOD 0x00000f00 -#define CPUID_GET_MAX_CAPABILITIES 0x80000000 -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define P_STATE_TRANSITION_CAPABLE 6 - -/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ -/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ -/* the value to write is placed in edx:eax. 
For reads (rdmsr - opcode 0f 32), */ -/* the register number is placed in ecx, and the data is returned in edx:eax. */ - -#define MSR_FIDVID_CTL 0xc0010041 -#define MSR_FIDVID_STATUS 0xc0010042 - -/* Field definitions within the FID VID Low Control MSR : */ -#define MSR_C_LO_INIT_FID_VID 0x00010000 -#define MSR_C_LO_NEW_VID 0x00003f00 -#define MSR_C_LO_NEW_FID 0x0000003f -#define MSR_C_LO_VID_SHIFT 8 - -/* Field definitions within the FID VID High Control MSR : */ -#define MSR_C_HI_STP_GNT_TO 0x000fffff - -/* Field definitions within the FID VID Low Status MSR : */ -#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ -#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 -#define MSR_S_LO_MAX_FID 0x003f0000 -#define MSR_S_LO_START_FID 0x00003f00 -#define MSR_S_LO_CURRENT_FID 0x0000003f - -/* Field definitions within the FID VID High Status MSR : */ -#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 -#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 -#define MSR_S_HI_START_VID 0x00003f00 -#define MSR_S_HI_CURRENT_VID 0x0000003f -#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 - - -/* Hardware Pstate _PSS and MSR definitions */ -#define USE_HW_PSTATE 0x00000080 -#define HW_PSTATE_MASK 0x00000007 -#define HW_PSTATE_VALID_MASK 0x80000000 -#define HW_PSTATE_MAX_MASK 0x000000f0 -#define HW_PSTATE_MAX_SHIFT 4 -#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ -#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ -#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ -#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ - -/* define the two driver architectures */ -#define CPU_OPTERON 0 -#define CPU_HW_PSTATE 1 - - -/* - * There are restrictions that frequencies have to follow: - * - only 1 entry in the low fid table ( <=1.4GHz ) - * - lowest entry in the high fid table must be >= 2 * the entry in the - * low fid table - * - lowest entry in the high fid table must be <= 200MHz + 2 * the entry - * in the low fid table - * - the parts can only step at <= 200 MHz intervals, odd fid values are - * supported in revision G and later revisions. - * - lowest frequency must be >= interprocessor hypertransport link speed - * (only applies to MP systems obviously) - */ - -/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ -#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ -#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ - -#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ -#define HI_VCOFREQ_TABLE_BOTTOM 1600 - -#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ - -#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ -#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ - -#define MIN_FREQ 800 /* Min and max freqs, per spec */ -#define MAX_FREQ 5000 - -#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ -#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ - -#define VID_OFF 0x3f - -#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ - -#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ - -#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ -#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ - -/* - * Most values of interest are encoded in a single field of the _PSS - * entries: the "control" value.
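A standalone sketch of pulling those fields out of one _PSS control dword, mirroring powernow_k8_acpi_pst_values() above and using the shift/mask constants defined just below (the sample value is invented):

#include <stdio.h>

#define IRT_SHIFT 30
#define RVO_SHIFT 28
#define EXT_TYPE_SHIFT 27
#define PLL_L_SHIFT 20
#define MVS_SHIFT 18
#define VST_SHIFT 11
#define VID_SHIFT 6
#define IRT_MASK 3
#define RVO_MASK 3
#define EXT_TYPE_MASK 1
#define PLL_L_MASK 0x7f
#define MVS_MASK 3
#define VST_MASK 0x7f
#define VID_MASK 0x1f
#define FID_MASK 0x1f

int main(void)
{
	unsigned control = 0x9035160e;	/* hypothetical _PSS control value */

	printf("irt=%u rvo=%u ext=%u pll=%u mvs=%u vst=%u vid=0x%x fid=0x%x\n",
	       (control >> IRT_SHIFT) & IRT_MASK,
	       (control >> RVO_SHIFT) & RVO_MASK,
	       (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK,
	       (control >> PLL_L_SHIFT) & PLL_L_MASK,
	       (control >> MVS_SHIFT) & MVS_MASK,
	       (control >> VST_SHIFT) & VST_MASK,
	       (control >> VID_SHIFT) & VID_MASK,
	       control & FID_MASK);
	return 0;
}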
- */ - -#define IRT_SHIFT 30 -#define RVO_SHIFT 28 -#define EXT_TYPE_SHIFT 27 -#define PLL_L_SHIFT 20 -#define MVS_SHIFT 18 -#define VST_SHIFT 11 -#define VID_SHIFT 6 -#define IRT_MASK 3 -#define RVO_MASK 3 -#define EXT_TYPE_MASK 1 -#define PLL_L_MASK 0x7f -#define MVS_MASK 3 -#define VST_MASK 0x7f -#define VID_MASK 0x1f -#define FID_MASK 0x1f -#define EXT_VID_MASK 0x3f -#define EXT_FID_MASK 0x3f - - -/* - * Version 1.4 of the PSB table. This table is constructed by BIOS and is - * to tell the OS's power management driver which VIDs and FIDs are - * supported by this particular processor. - * If the data in the PSB / PST is wrong, then this driver will program the - * wrong values into hardware, which is very likely to lead to a crash. - */ - -#define PSB_ID_STRING "AMDK7PNOW!" -#define PSB_ID_STRING_LEN 10 - -#define PSB_VERSION_1_4 0x14 - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags1; - u16 vstable; - u8 flags2; - u8 num_tables; - u32 cpuid; - u8 plllocktime; - u8 maxfid; - u8 maxvid; - u8 numps; -}; - -/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */ -struct pst_s { - u8 fid; - u8 vid; -}; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) - -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 regfid); -static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c deleted file mode 100644 index 435a996a613a..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * sc520_freq.c: cpufreq driver for the AMD Elan sc520 - * - * Copyright (C) 2005 Sean Young <sean@mess.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Based on elanfreq.c - * - * 2005-03-30: - initial revision - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <linux/delay.h> -#include <linux/cpufreq.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/msr.h> - -#define MMCR_BASE 0xfffef000 /* The default base address */ -#define OFFS_CPUCTL 0x2 /* CPU Control Register */ - -static __u8 __iomem *cpuctl; - -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "sc520_freq", msg) -#define PFX "sc520_freq: " - -static struct cpufreq_frequency_table sc520_freq_table[] = { - {0x01, 100000}, - {0x02, 133000}, - {0, CPUFREQ_TABLE_END}, -}; - -static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg = *cpuctl; - - switch (clockspeed_reg & 0x03) { - default: - printk(KERN_ERR PFX "error: cpuctl register has unexpected " - "value %02x\n", clockspeed_reg); - case 0x01: - return 100000; - case 0x02: - return 133000; - } -} - -static void sc520_freq_set_cpu_state(unsigned int state) -{ - - struct cpufreq_freqs freqs; - u8 clockspeed_reg; - - freqs.old = sc520_freq_get_cpu_frequency(0); - freqs.new = sc520_freq_table[state].frequency; - freqs.cpu = 0; /* AMD Elan is UP */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - dprintk("attempting to set frequency to %i kHz\n", - sc520_freq_table[state].frequency); - - local_irq_disable(); - - clockspeed_reg = *cpuctl & ~0x03; - *cpuctl = clockspeed_reg | sc520_freq_table[state].index; - - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - -static int sc520_freq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); -} - -static int sc520_freq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, sc520_freq_table, - target_freq, relation, &newstate)) - return -EINVAL; - - sc520_freq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int sc520_freq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int result; - - /* capability check */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) - return -ENODEV; - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 1000000; /* 1ms */ - policy->cur = sc520_freq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); - - return 0; -} - - -static int sc520_freq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -static struct freq_attr *sc520_freq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver sc520_freq_driver = { - .get = sc520_freq_get_cpu_frequency, - .verify = sc520_freq_verify, - .target = sc520_freq_target, - .init = sc520_freq_cpu_init, - .exit = sc520_freq_cpu_exit, - .name = "sc520_freq", - .owner = THIS_MODULE, - .attr = sc520_freq_attr, -}; - - -static int __init sc520_freq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int err; - - /* Test if we have the right hardware */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) { - dprintk("no Elan SC520 processor found!\n"); - return -ENODEV; - } - cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); - if (!cpuctl) { - printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); - return -ENOMEM; - } - - err = cpufreq_register_driver(&sc520_freq_driver); - if (err) - iounmap(cpuctl); - - return err; -} - - -static void __exit sc520_freq_exit(void) -{ - cpufreq_unregister_driver(&sc520_freq_driver); - iounmap(cpuctl); -} - - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Sean Young <sean@mess.org>"); 
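The entire driver above reduces to a two-bit field in the SC520's memory-mapped CPUCTL register: 0x01 selects 100 MHz, 0x02 selects 133 MHz, and a speed change is a read-modify-write of those bits. A minimal illustration of that update, with an ordinary variable standing in for the ioremapped register:

#include <stdio.h>

int main(void)
{
	unsigned char cpuctl = 0x41;           /* hypothetical CPUCTL: low bits 01 = 100 MHz */
	unsigned char want = 0x02;             /* frequency-table index for 133 MHz */

	cpuctl = (cpuctl & ~0x03) | want;      /* touch only the two speed bits */
	printf("CPUCTL now 0x%02x -> %u kHz\n", cpuctl,
	       (cpuctl & 0x03) == 0x02 ? 133000u : 100000u);
	return 0;
}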
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU"); - -module_init(sc520_freq_init); -module_exit(sc520_freq_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c deleted file mode 100644 index 9b1ff37de46a..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium - * M (part of the Centrino chipset). - * - * Since the original Pentium M, most new Intel CPUs support Enhanced - * SpeedStep. - * - * Despite the "SpeedStep" in the name, this is almost entirely unlike - * traditional SpeedStep. - * - * Modelled on speedstep.c - * - * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/sched.h> /* current */ -#include <linux/delay.h> -#include <linux/compiler.h> -#include <linux/gfp.h> - -#include <asm/msr.h> -#include <asm/processor.h> -#include <asm/cpufeature.h> - -#define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@vger.kernel.org" - -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) - -#define INTEL_MSR_RANGE (0xffff) - -struct cpu_id -{ - __u8 x86; /* CPU family */ - __u8 x86_model; /* model */ - __u8 x86_mask; /* stepping */ -}; - -enum { - CPU_BANIAS, - CPU_DOTHAN_A1, - CPU_DOTHAN_A2, - CPU_DOTHAN_B0, - CPU_MP4HT_D0, - CPU_MP4HT_E0, -}; - -static const struct cpu_id cpu_ids[] = { - [CPU_BANIAS] = { 6, 9, 5 }, - [CPU_DOTHAN_A1] = { 6, 13, 1 }, - [CPU_DOTHAN_A2] = { 6, 13, 2 }, - [CPU_DOTHAN_B0] = { 6, 13, 6 }, - [CPU_MP4HT_D0] = {15, 3, 4 }, - [CPU_MP4HT_E0] = {15, 4, 1 }, -}; -#define N_IDS ARRAY_SIZE(cpu_ids) - -struct cpu_model -{ - const struct cpu_id *cpu_id; - const char *model_name; - unsigned max_freq; /* max clock in kHz */ - - struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ -}; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x); - -/* Operating points for current CPU */ -static DEFINE_PER_CPU(struct cpu_model *, centrino_model); -static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); - -static struct cpufreq_driver centrino_driver; - -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE - -/* Computes the correct form for IA32_PERF_CTL MSR for a particular - frequency/voltage operating point; frequency in MHz, volts in mV. - This is stored as "index" in the structure. */ -#define OP(mhz, mv) \ - { \ - .frequency = (mhz) * 1000, \ - .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ - } - -/* - * These voltage tables were derived from the Intel Pentium M - * datasheet, document 25261202.pdf, Table 5. I have verified they - * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium - * M. 
- */ - -/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ -static struct cpufreq_frequency_table banias_900[] = -{ - OP(600, 844), - OP(800, 988), - OP(900, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ -static struct cpufreq_frequency_table banias_1000[] = -{ - OP(600, 844), - OP(800, 972), - OP(900, 988), - OP(1000, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ -static struct cpufreq_frequency_table banias_1100[] = -{ - OP( 600, 956), - OP( 800, 1020), - OP( 900, 1100), - OP(1000, 1164), - OP(1100, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - - -/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ -static struct cpufreq_frequency_table banias_1200[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP( 900, 1020), - OP(1000, 1100), - OP(1100, 1164), - OP(1200, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.30GHz (Banias) */ -static struct cpufreq_frequency_table banias_1300[] = -{ - OP( 600, 956), - OP( 800, 1260), - OP(1000, 1292), - OP(1200, 1356), - OP(1300, 1388), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.40GHz (Banias) */ -static struct cpufreq_frequency_table banias_1400[] = -{ - OP( 600, 956), - OP( 800, 1180), - OP(1000, 1308), - OP(1200, 1436), - OP(1400, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.50GHz (Banias) */ -static struct cpufreq_frequency_table banias_1500[] = -{ - OP( 600, 956), - OP( 800, 1116), - OP(1000, 1228), - OP(1200, 1356), - OP(1400, 1452), - OP(1500, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.60GHz (Banias) */ -static struct cpufreq_frequency_table banias_1600[] = -{ - OP( 600, 956), - OP( 800, 1036), - OP(1000, 1164), - OP(1200, 1276), - OP(1400, 1420), - OP(1600, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.70GHz (Banias) */ -static struct cpufreq_frequency_table banias_1700[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP(1000, 1116), - OP(1200, 1228), - OP(1400, 1308), - OP(1700, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; -#undef OP - -#define _BANIAS(cpuid, max, name) \ -{ .cpu_id = cpuid, \ - .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ - .max_freq = (max)*1000, \ - .op_points = banias_##max, \ -} -#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) - -/* CPU models, their operating frequency range, and freq/voltage - operating points */ -static struct cpu_model models[] = -{ - _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), - BANIAS(1000), - BANIAS(1100), - BANIAS(1200), - BANIAS(1300), - BANIAS(1400), - BANIAS(1500), - BANIAS(1600), - BANIAS(1700), - - /* NULL model_name is a wildcard */ - { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, - - { NULL, } -}; -#undef _BANIAS -#undef BANIAS - -static int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - struct cpu_model *model; - - for(model = models; model->cpu_id != NULL; model++) - if (centrino_verify_cpu_id(cpu, model->cpu_id) && - (model->model_name == NULL || - strcmp(cpu->x86_model_id, model->model_name) == 0)) - break; - - if (model->cpu_id == NULL) { - /* No match at all */ - dprintk("no support for 
CPU model \"%s\": " - "send /proc/cpuinfo to " MAINTAINER "\n", - cpu->x86_model_id); - return -ENOENT; - } - - if (model->op_points == NULL) { - /* Matched a non-match */ - dprintk("no table support for CPU model \"%s\"\n", - cpu->x86_model_id); - dprintk("try using the acpi-cpufreq driver\n"); - return -ENOENT; - } - - per_cpu(centrino_model, policy->cpu) = model; - - dprintk("found \"%s\": max frequency: %dkHz\n", - model->model_name, model->max_freq); - - return 0; -} - -#else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - return -ENODEV; -} -#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ - -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x) -{ - if ((c->x86 == x->x86) && - (c->x86_model == x->x86_model) && - (c->x86_mask == x->x86_mask)) - return 1; - return 0; -} - -/* To be called only after centrino_model is initialized */ -static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) -{ - int i; - - /* - * Extract clock in kHz from PERF_CTL value - * for centrino, as some DSDTs are buggy. - * Ideally, this can be done using the acpi_data structure. - */ - if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { - msr = (msr >> 8) & 0xff; - return msr * 100000; - } - - if ((!per_cpu(centrino_model, cpu)) || - (!per_cpu(centrino_model, cpu)->op_points)) - return 0; - - msr &= 0xffff; - for (i = 0; - per_cpu(centrino_model, cpu)->op_points[i].frequency - != CPUFREQ_TABLE_END; - i++) { - if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) - return per_cpu(centrino_model, cpu)-> - op_points[i].frequency; - } - if (failsafe) - return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; - else - return 0; -} - -/* Return the current CPU frequency in kHz */ -static unsigned int get_cur_freq(unsigned int cpu) -{ - unsigned l, h; - unsigned clock_freq; - - rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); - clock_freq = extract_clock(l, cpu, 0); - - if (unlikely(clock_freq == 0)) { - /* - * On some CPUs, we can see transient MSR values (which are - * not present in _PSS), while CPU is doing some automatic - * P-state transition (like TM2). Get the last freq set - * in PERF_CTL. - */ - rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); - clock_freq = extract_clock(l, cpu, 1); - } - return clock_freq; -} - - -static int centrino_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - unsigned freq; - unsigned l, h; - int ret; - int i; - - /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) - centrino_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (policy->cpu != 0) - return -ENODEV; - - for (i = 0; i < N_IDS; i++) - if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) - break; - - if (i != N_IDS) - per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - - if (!per_cpu(centrino_cpu, policy->cpu)) { - dprintk("found unsupported CPU with " - "Enhanced SpeedStep: send /proc/cpuinfo to " - MAINTAINER "\n"); - return -ENODEV; - } - - if (centrino_cpu_init_table(policy)) { - return -ENODEV; - } - - /* Check to see if Enhanced SpeedStep is enabled, and try to - enable it if not. 
*/ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); - - /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO PFX - "couldn't enable Enhanced SpeedStep\n"); - return -ENODEV; - } - } - - freq = get_cur_freq(policy->cpu); - policy->cpuinfo.transition_latency = 10000; - /* 10uS transition latency */ - policy->cur = freq; - - dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - - ret = cpufreq_frequency_table_cpuinfo(policy, - per_cpu(centrino_model, policy->cpu)->op_points); - if (ret) - return (ret); - - cpufreq_frequency_table_get_attr( - per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); - - return 0; -} - -static int centrino_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - - if (!per_cpu(centrino_model, cpu)) - return -ENODEV; - - cpufreq_frequency_table_put_attr(cpu); - - per_cpu(centrino_model, cpu) = NULL; - - return 0; -} - -/** - * centrino_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within this model's frequency range at least one - * border included. - */ -static int centrino_verify (struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - per_cpu(centrino_model, policy->cpu)->op_points); -} - -/** - * centrino_setpolicy - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int centrino_target (struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; - struct cpufreq_freqs freqs; - int retval = 0; - unsigned int j, k, first_cpu, tmp; - cpumask_var_t covered_cpus; - - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) - return -ENOMEM; - - if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { - retval = -ENODEV; - goto out; - } - - if (unlikely(cpufreq_frequency_table_target(policy, - per_cpu(centrino_model, cpu)->op_points, - target_freq, - relation, - &newstate))) { - retval = -EINVAL; - goto out; - } - - first_cpu = 1; - for_each_cpu(j, policy->cpus) { - int good_cpu; - - /* cpufreq holds the hotplug lock, so we are safe here */ - if (!cpu_online(j)) - continue; - - /* - * Support for SMP systems. - * Make sure we are running on CPU that wants to change freq - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - good_cpu = cpumask_any_and(policy->cpus, - cpu_online_mask); - else - good_cpu = j; - - if (good_cpu >= nr_cpu_ids) { - dprintk("couldn't limit to CPUs in this domain\n"); - retval = -EAGAIN; - if (first_cpu) { - /* We haven't started the transition yet. 
*/ - goto out; - } - break; - } - - msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; - - if (first_cpu) { - rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); - if (msr == (oldmsr & 0xffff)) { - dprintk("no change needed - msr was and needs " - "to be %x\n", oldmsr); - retval = 0; - goto out; - } - - freqs.old = extract_clock(oldmsr, cpu, 0); - freqs.new = extract_clock(msr, cpu, 0); - - dprintk("target=%dkHz old=%d new=%d msr=%04x\n", - target_freq, freqs.old, freqs.new, msr); - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, - CPUFREQ_PRECHANGE); - } - - first_cpu = 0; - /* all but 16 LSB are reserved, treat them with care */ - oldmsr &= ~0xffff; - msr &= 0xffff; - oldmsr |= msr; - } - - wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - break; - - cpumask_set_cpu(j, covered_cpus); - } - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - if (unlikely(retval)) { - /* - * We have failed halfway through the frequency change. - * We have sent callbacks to policy->cpus and - * MSRs have already been written on covered_cpus. - * Best effort undo. - */ - - for_each_cpu(j, covered_cpus) - wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); - - tmp = freqs.new; - freqs.new = freqs.old; - freqs.old = tmp; - for_each_cpu(j, policy->cpus) { - if (!cpu_online(j)) - continue; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - } - retval = 0; - -out: - free_cpumask_var(covered_cpus); - return retval; -} - -static struct freq_attr* centrino_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver centrino_driver = { - .name = "centrino", /* should be speedstep-centrino, - but there's a 16 char limit */ - .init = centrino_cpu_init, - .exit = centrino_cpu_exit, - .verify = centrino_verify, - .target = centrino_target, - .get = get_cur_freq, - .attr = centrino_attr, - .owner = THIS_MODULE, -}; - - -/** - * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver - * - * Initializes the Enhanced SpeedStep support. Returns -ENODEV on - * unsupported devices, -ENOENT if there's no voltage table for this - * particular CPU model, -EINVAL on problems during initialization, - * and zero on success. - * - * This is quite picky. Not only does the CPU have to advertise the - * "est" flag in the cpuid capability flags, we look for a specific - * CPU model and stepping, and we need to have the exact model name in - * our voltage tables. That is, be paranoid about not releasing - * someone's valuable magic smoke.
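centrino_target() above treats everything outside the low 16 bits of IA32_PERF_CTL as reserved, so the new operating point is spliced into the value previously read rather than written outright. A standalone illustration of that merge, values invented:

#include <stdio.h>

int main(void)
{
	unsigned oldmsr = 0x1a2b0d28;          /* hypothetical PERF_CTL readout */
	unsigned opidx  = 0x0609;              /* desired operating point, low 16 bits */

	oldmsr &= ~0xffffu;                    /* keep the reserved upper bits intact */
	oldmsr |= opidx & 0xffffu;             /* splice in the new operating point */
	printf("PERF_CTL to write: 0x%08x\n", oldmsr);
	return 0;
}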
- */ -static int __init centrino_init(void) -{ - struct cpuinfo_x86 *cpu = &cpu_data(0); - - if (!cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - return cpufreq_register_driver(¢rino_driver); -} - -static void __exit centrino_exit(void) -{ - cpufreq_unregister_driver(¢rino_driver); -} - -MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>"); -MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); -MODULE_LICENSE ("GPL"); - -late_initcall(centrino_init); -module_exit(centrino_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c deleted file mode 100644 index 561758e95180..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * (C) 2001 Dave Jones, Arjan van de ven. - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information, and on Intel documentation - * for chipsets ICH2-M and ICH3-M. - * - * Many thanks to Ducrot Bruno for finding and fixing the last - * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler - * for extensive testing. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/sched.h> - -#include "speedstep-lib.h" - - -/* speedstep_chipset: - * It is necessary to know which chipset is used. As accesses to - * this device occur at various places in this module, we need a - * static struct pci_dev * pointing to that device. - */ -static struct pci_dev *speedstep_chipset_dev; - - -/* speedstep_processor - */ -static enum speedstep_processor speedstep_processor; - -static u32 pmbase; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-ich", msg) - - -/** - * speedstep_find_register - read the PMBASE address - * - * Returns: -ENODEV if no register could be found - */ -static int speedstep_find_register(void) -{ - if (!speedstep_chipset_dev) - return -ENODEV; - - /* get PMBASE */ - pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); - if (!(pmbase & 0x01)) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pmbase &= 0xFFFFFFFE; - if (!pmbase) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - dprintk("pmbase is 0x%x\n", pmbase); - return 0; -} - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - * Tries to change the SpeedStep state. Can be called from - * smp_call_function_single. 
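speedstep_find_register() above depends on the PM base register at PCI config offset 0x40 reporting an I/O address with bit 0 set, the usual PCI I/O-space indicator, which is then masked off. A sketch of that decode with an invented raw value:

#include <stdio.h>

int main(void)
{
	unsigned raw = 0x00001001;             /* hypothetical config dword at 0x40 */

	if (!(raw & 0x01)) {                   /* bit 0 must flag an I/O resource */
		printf("no speedstep register\n");
		return 1;
	}
	printf("pmbase = 0x%x\n", raw & 0xFFFFFFFE);   /* strip the indicator bit */
	return 0;
}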
- */ -static void speedstep_set_state(unsigned int state) -{ - u8 pm2_blk; - u8 value; - unsigned long flags; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - /* read state */ - value = inb(pmbase + 0x50); - - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - /* write new state */ - value &= 0xFE; - value |= state; - - dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); - - /* Disable bus master arbitration */ - pm2_blk = inb(pmbase + 0x20); - pm2_blk |= 0x01; - outb(pm2_blk, (pmbase + 0x20)); - - /* Actual transition */ - outb(value, (pmbase + 0x50)); - - /* Restore bus master arbitration */ - pm2_blk &= 0xfe; - outb(pm2_blk, (pmbase + 0x20)); - - /* check if transition was successful */ - value = inb(pmbase + 0x50); - - /* Enable IRQs */ - local_irq_restore(flags); - - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - if (state == (value & 0x1)) - dprintk("change to %u MHz succeeded\n", - speedstep_get_frequency(speedstep_processor) / 1000); - else - printk(KERN_ERR "cpufreq: change failed - I/O error\n"); - - return; -} - -/* Wrapper for smp_call_function_single. */ -static void _speedstep_set_state(void *_state) -{ - speedstep_set_state(*(unsigned int *)_state); -} - -/** - * speedstep_activate - activate SpeedStep control in the chipset - * - * Tries to activate the SpeedStep status and control registers. - * Returns -EINVAL on an unsupported chipset, and zero on success. - */ -static int speedstep_activate(void) -{ - u16 value = 0; - - if (!speedstep_chipset_dev) - return -EINVAL; - - pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); - if (!(value & 0x08)) { - value |= 0x08; - dprintk("activating SpeedStep (TM) registers\n"); - pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); - } - - return 0; -} - - -/** - * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic - * - * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to - * the LPC bridge / PM module which contains all power-management - * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected - * chipset, or zero on failure. - */ -static unsigned int speedstep_detect_chipset(void) -{ - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 4; /* 4-M */ - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801CA_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 3; /* 3-M */ - - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_10, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) { - /* speedstep.c causes lockups on Dell Inspirons 8000 and - * 8100 which use a pretty old revision of the 82815 - * host bridge. Abort on these systems.
- */ - static struct pci_dev *hostbridge; - - hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82815_MC, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - - if (!hostbridge) - return 2; /* 2-M */ - - if (hostbridge->revision < 5) { - dprintk("hostbridge does not support speedstep\n"); - speedstep_chipset_dev = NULL; - pci_dev_put(hostbridge); - return 0; - } - - pci_dev_put(hostbridge); - return 2; /* 2-M */ - } - - return 0; -} - -static void get_freq_data(void *_speed) -{ - unsigned int *speed = _speed; - - *speed = speedstep_get_frequency(speedstep_processor); -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - unsigned int speed; - - /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) - BUG(); - - dprintk("detected %u kHz as current frequency\n", speed); - return speed; -} - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0, policy_cpu; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - freqs.old = speedstep_get(policy_cpu); - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = policy->cpu; - - dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new); - - /* no transition necessary */ - if (freqs.old == freqs.new) - return 0; - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, - true); - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. 
- */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - -struct get_freqs { - struct cpufreq_policy *policy; - int ret; -}; - -static void get_freqs_on_cpu(void *_get_freqs) -{ - struct get_freqs *get_freqs = _get_freqs; - - get_freqs->ret = - speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &get_freqs->policy->cpuinfo.transition_latency, - &speedstep_set_state); -} - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int policy_cpu, speed; - struct get_freqs gf; - - /* only run on CPU to be set, or on its sibling */ -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - - /* detect low and high frequency and transition latency */ - gf.policy = policy; - smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); - if (gf.ret) - return gf.ret; - - /* get current speed setting */ - speed = speedstep_get(policy_cpu); - if (!speed) - return -EIO; - - dprintk("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-ich", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initialization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - /* detect processor */ - speedstep_processor = speedstep_detect_processor(); - if (!speedstep_processor) { - dprintk("Intel(R) SpeedStep(TM) capable processor " - "not found\n"); - return -ENODEV; - } - - /* detect chipset */ - if (!speedstep_detect_chipset()) { - dprintk("Intel(R) SpeedStep(TM) for this chipset not " - "(yet) available.\n"); - return -ENODEV; - } - - /* activate speedstep support */ - if (speedstep_activate()) { - pci_dev_put(speedstep_chipset_dev); - return -EINVAL; - } - - if (speedstep_find_register()) - return -ENODEV; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support.
- */ -static void __exit speedstep_exit(void) -{ - pci_dev_put(speedstep_chipset_dev); - cpufreq_unregister_driver(&speedstep_driver); -} - - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>, " - "Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets " - "with ICH-M southbridges."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c deleted file mode 100644 index a94ec6be69fa..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> - -#include <asm/msr.h> -#include <asm/tsc.h> -#include "speedstep-lib.h" - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-lib", msg) - -#define PFX "speedstep-lib: " - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -static int relaxed_check; -#else -#define relaxed_check 0 -#endif - -/********************************************************************* - * GET PROCESSOR CORE SPEED IN KHZ * - *********************************************************************/ - -static unsigned int pentium3_get_frequency(enum speedstep_processor processor) -{ - /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ - struct { - unsigned int ratio; /* Frequency Multiplier (x10) */ - u8 bitmap; /* power on configuration bits - [27, 25:22] (in MSR 0x2a) */ - } msr_decode_mult[] = { - { 30, 0x01 }, - { 35, 0x05 }, - { 40, 0x02 }, - { 45, 0x06 }, - { 50, 0x00 }, - { 55, 0x04 }, - { 60, 0x0b }, - { 65, 0x0f }, - { 70, 0x09 }, - { 75, 0x0d }, - { 80, 0x0a }, - { 85, 0x26 }, - { 90, 0x20 }, - { 100, 0x2b }, - { 0, 0xff } /* error or unknown value */ - }; - - /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ - struct { - unsigned int value; /* Front Side Bus speed in MHz */ - u8 bitmap; /* power on configuration bits [18: 19] - (in MSR 0x2a) */ - } msr_decode_fsb[] = { - { 66, 0x0 }, - { 100, 0x2 }, - { 133, 0x1 }, - { 0, 0xff} - }; - - u32 msr_lo, msr_tmp; - int i = 0, j = 0; - - /* read MSR 0x2a - we only need the low 32 bits */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - msr_tmp = msr_lo; - - /* decode the FSB */ - msr_tmp &= 0x00c0000; - msr_tmp >>= 18; - while (msr_tmp != msr_decode_fsb[i].bitmap) { - if (msr_decode_fsb[i].bitmap == 0xff) - return 0; - i++; - } - - /* decode the multiplier */ - if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - dprintk("workaround for early PIIIs\n"); - msr_lo &= 0x03c00000; - } else - msr_lo &= 0x0bc00000; - msr_lo >>= 22; - while (msr_lo != msr_decode_mult[j].bitmap) { - if (msr_decode_mult[j].bitmap == 0xff) - return 0; - j++; - } - - dprintk("speed is %u\n", - (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); - - return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; -} - - -static unsigned int pentiumM_get_frequency(void) -{ - u32 msr_lo, msr_tmp; - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 
0x%x\n", msr_lo, msr_tmp); - - /* see table B-2 of 24547212.pdf */ - if (msr_lo & 0x00040000) { - printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n", - msr_lo, msr_tmp); - return 0; - } - - msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * 100 * 1000)); - - return msr_tmp * 100 * 1000; -} - -static unsigned int pentium_core_get_frequency(void) -{ - u32 fsb = 0; - u32 msr_lo, msr_tmp; - int ret; - - rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); - /* see table B-2 of 25366920.pdf */ - switch (msr_lo & 0x07) { - case 5: - fsb = 100000; - break; - case 1: - fsb = 133333; - break; - case 3: - fsb = 166667; - break; - case 2: - fsb = 200000; - break; - case 0: - fsb = 266667; - break; - case 4: - fsb = 333333; - break; - default: - printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); - } - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", - msr_lo, msr_tmp); - - msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * fsb)); - - ret = (msr_tmp * fsb); - return ret; -} - - -static unsigned int pentium4_get_frequency(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 msr_lo, msr_hi, mult; - unsigned int fsb = 0; - unsigned int ret; - u8 fsb_code; - - /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency - * to System Bus Frequency Ratio Field in the Processor Frequency - * Configuration Register of the MSR. Therefore the current - * frequency cannot be calculated and has to be measured. - */ - if (c->x86_model < 2) - return cpu_khz; - - rdmsr(0x2c, msr_lo, msr_hi); - - dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); - - /* decode the FSB: see IA-32 Intel (C) Architecture Software - * Developer's Manual, Volume 3: System Prgramming Guide, - * revision #12 in Table B-1: MSRs in the Pentium 4 and - * Intel Xeon Processors, on page B-4 and B-5. - */ - fsb_code = (msr_lo >> 16) & 0x7; - switch (fsb_code) { - case 0: - fsb = 100 * 1000; - break; - case 1: - fsb = 13333 * 10; - break; - case 2: - fsb = 200 * 1000; - break; - } - - if (!fsb) - printk(KERN_DEBUG PFX "couldn't detect FSB speed. " - "Please send an e-mail to <linux@brodo.de>\n"); - - /* Multiplier. */ - mult = msr_lo >> 24; - - dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", - fsb, mult, (fsb * mult)); - - ret = (fsb * mult); - return ret; -} - - -/* Warning: may get called from smp_call_function_single. 
*/ -unsigned int speedstep_get_frequency(enum speedstep_processor processor) -{ - switch (processor) { - case SPEEDSTEP_CPU_PCORE: - return pentium_core_get_frequency(); - case SPEEDSTEP_CPU_PM: - return pentiumM_get_frequency(); - case SPEEDSTEP_CPU_P4D: - case SPEEDSTEP_CPU_P4M: - return pentium4_get_frequency(); - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - return pentium3_get_frequency(processor); - default: - return 0; - }; - return 0; -} -EXPORT_SYMBOL_GPL(speedstep_get_frequency); - - -/********************************************************************* - * DETECT SPEEDSTEP-CAPABLE PROCESSOR * - *********************************************************************/ - -unsigned int speedstep_detect_processor(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - u32 ebx, msr_lo, msr_hi; - - dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - ((c->x86 != 6) && (c->x86 != 0xF))) - return 0; - - if (c->x86 == 0xF) { - /* Intel Mobile Pentium 4-M - * or Intel Mobile Pentium 4 with 533 MHz FSB */ - if (c->x86_model != 2) - return 0; - - ebx = cpuid_ebx(0x00000001); - ebx &= 0x000000FF; - - dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); - - switch (c->x86_mask) { - case 4: - /* - * B-stepping [M-P4-M] - * sample has ebx = 0x0f, production has 0x0e. - */ - if ((ebx == 0x0e) || (ebx == 0x0f)) - return SPEEDSTEP_CPU_P4M; - break; - case 7: - /* - * C-stepping [M-P4-M] - * needs to have ebx=0x0e, else it's a celeron: - * cf. 25130917.pdf / page 7, footnote 5 even - * though 25072120.pdf / page 7 doesn't say - * samples are only of B-stepping... - */ - if (ebx == 0x0e) - return SPEEDSTEP_CPU_P4M; - break; - case 9: - /* - * D-stepping [M-P4-M or M-P4/533] - * - * this is totally strange: CPUID 0x0F29 is - * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. - * The latter need to be sorted out as they don't - * support speedstep. - * Celerons with CPUID 0x0F29 may have either - * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything - * specific. - * M-P4-Ms may have either ebx=0xe or 0xf [see above] - * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] - * also, M-P4M HTs have ebx=0x8, too - * For now, they are distinguished by the model_id - * string - */ - if ((ebx == 0x0e) || - (strstr(c->x86_model_id, - "Mobile Intel(R) Pentium(R) 4") != NULL)) - return SPEEDSTEP_CPU_P4M; - break; - default: - break; - } - return 0; - } - - switch (c->x86_model) { - case 0x0B: /* Intel PIII [Tualatin] */ - /* cpuid_ebx(1) is 0x04 for desktop PIII, - * 0x06 for mobile PIII-M */ - ebx = cpuid_ebx(0x00000001); - dprintk("ebx is %x\n", ebx); - - ebx &= 0x000000FF; - - if (ebx != 0x06) - return 0; - - /* So far all PIII-M processors support SpeedStep. See - * Intel's 24540640.pdf of June 2003 - */ - return SPEEDSTEP_CPU_PIII_T; - - case 0x08: /* Intel PIII [Coppermine] */ - - /* all mobile PIII Coppermines have FSB 100 MHz - * ==> sort out a few desktop PIIIs. */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", - msr_lo, msr_hi); - msr_lo &= 0x00c0000; - if (msr_lo != 0x0080000) - return 0; - - /* - * If the processor is a mobile version, - * platform ID has bit 50 set - * it has SpeedStep technology if either - * bit 56 or 57 is set - */ - rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", - msr_lo, msr_hi); - if ((msr_hi & (1<<18)) && - (relaxed_check ? 
1 : (msr_hi & (3<<24)))) { - if (c->x86_mask == 0x01) { - dprintk("early PIII version\n"); - return SPEEDSTEP_CPU_PIII_C_EARLY; - } else - return SPEEDSTEP_CPU_PIII_C; - } - - default: - return 0; - } -} -EXPORT_SYMBOL_GPL(speedstep_detect_processor); - - -/********************************************************************* - * DETECT SPEEDSTEP SPEEDS * - *********************************************************************/ - -unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)) -{ - unsigned int prev_speed; - unsigned int ret = 0; - unsigned long flags; - struct timeval tv1, tv2; - - if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) - return -EINVAL; - - dprintk("trying to determine both speeds\n"); - - /* get current speed */ - prev_speed = speedstep_get_frequency(processor); - if (!prev_speed) - return -EIO; - - dprintk("previous speed is %u\n", prev_speed); - - local_irq_save(flags); - - /* switch to low state */ - set_state(SPEEDSTEP_LOW); - *low_speed = speedstep_get_frequency(processor); - if (!*low_speed) { - ret = -EIO; - goto out; - } - - dprintk("low speed is %u\n", *low_speed); - - /* start latency measurement */ - if (transition_latency) - do_gettimeofday(&tv1); - - /* switch to high state */ - set_state(SPEEDSTEP_HIGH); - - /* end latency measurement */ - if (transition_latency) - do_gettimeofday(&tv2); - - *high_speed = speedstep_get_frequency(processor); - if (!*high_speed) { - ret = -EIO; - goto out; - } - - dprintk("high speed is %u\n", *high_speed); - - if (*low_speed == *high_speed) { - ret = -ENODEV; - goto out; - } - - /* switch to previous state, if necessary */ - if (*high_speed != prev_speed) - set_state(SPEEDSTEP_LOW); - - if (transition_latency) { - *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + - tv2.tv_usec - tv1.tv_usec; - dprintk("transition latency is %u uSec\n", *transition_latency); - - /* convert uSec to nSec and add 20% for safety reasons */ - *transition_latency *= 1200; - - /* check if the latency measurement is too high or too low - * and set it to a safe value (500uSec) in that case - */ - if (*transition_latency > 10000000 || - *transition_latency < 50000) { - printk(KERN_WARNING PFX "frequency transition " - "measured seems out of range (%u " - "nSec), falling back to a safe one of" - "%u nSec.\n", - *transition_latency, 500000); - *transition_latency = 500000; - } - } - -out: - local_irq_restore(flags); - return ret; -} -EXPORT_SYMBOL_GPL(speedstep_get_freqs); - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -module_param(relaxed_check, int, 0444); -MODULE_PARM_DESC(relaxed_check, - "Don't do all checks for speedstep capability."); -#endif - -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h deleted file mode 100644 index 70d9cea1219d..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. 
Possibly *dangerous* - */ - - - -/* processors */ -enum speedstep_processor { - SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ - SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ -/* the following processors are not speedstep-capable and are not auto-detected - * in speedstep_detect_processor(). However, their speed can be detected using - * the speedstep_get_frequency() call. */ - SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ - SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ - SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ -}; - -/* speedstep states -- only two of them */ - -#define SPEEDSTEP_HIGH 0x00000000 -#define SPEEDSTEP_LOW 0x00000001 - - -/* detect a speedstep-capable processor */ -extern enum speedstep_processor speedstep_detect_processor(void); - -/* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); - - -/* detect the low and high speeds of the processor. The callback - * set_state"'s first argument is either SPEEDSTEP_HIGH or - * SPEEDSTEP_LOW; the second argument is zero so that no - * cpufreq_notify_transition calls are initiated. - */ -extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c deleted file mode 100644 index 8abd869baabf..000000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Intel SpeedStep SMI driver. - * - * (C) 2003 Hiroshi Miura <miura@da-cha.org> - * - * Licensed under the terms of the GNU GPL License version 2. - * - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/delay.h> -#include <linux/io.h> -#include <asm/ist.h> - -#include "speedstep-lib.h" - -/* speedstep system management interface port/command. - * - * These parameters are got from IST-SMI BIOS call. - * If user gives it, these are used. - * - */ -static int smi_port; -static int smi_cmd; -static unsigned int smi_sig; - -/* info about the processor */ -static enum speedstep_processor speedstep_processor; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - -#define GET_SPEEDSTEP_OWNER 0 -#define GET_SPEEDSTEP_STATE 1 -#define SET_SPEEDSTEP_STATE 2 -#define GET_SPEEDSTEP_FREQS 4 - -/* how often shall the SMI call be tried if it failed, e.g. because - * of DMA activity going on? */ -#define SMI_TRIES 5 - -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-smi", msg) - -/** - * speedstep_smi_ownership - */ -static int speedstep_smi_ownership(void) -{ - u32 command, result, magic, dummy; - u32 function = GET_SPEEDSTEP_OWNER; - unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - magic = virt_to_phys(magic_data); - - dprintk("trying to obtain ownership with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=D" (result), - "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), - "=S" (dummy) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), - "D" (0), "S" (magic) - : "memory" - ); - - dprintk("result is %x\n", result); - - return result; -} - -/** - * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. - * @low: the low frequency value is placed here - * @high: the high frequency value is placed here - * - * Only available on later SpeedStep-enabled systems, returns false results or - * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing - * shows that the latter occurs if !(ist_info.event & 0xFFFF). - */ -static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) -{ - u32 command, result = 0, edi, high_mhz, low_mhz, dummy; - u32 state = 0; - u32 function = GET_SPEEDSTEP_FREQS; - - if (!(ist_info.event & 0xFFFF)) { - dprintk("bug #1422 -- can't read freqs from BIOS\n"); - return -ENODEV; - } - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to determine frequencies with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=a" (result), - "=b" (high_mhz), - "=c" (low_mhz), - "=d" (state), "=D" (edi), "=S" (dummy) - : "a" (command), - "b" (function), - "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - - dprintk("result %x, low_freq %u, high_freq %u\n", - result, low_mhz, high_mhz); - - /* abort if results are obviously incorrect... 
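
Both asm blocks in this file follow the same IST SMI calling convention, readable straight from their constraint lists: EAX carries the command word (signature high bytes OR'd with the command low byte), EBX the function code, ECX a state argument, EDX the SMI port, and ESI/EDI act as extra in/out parameters; EBP is saved around the "out" because the SMM handler may clobber it. A condensed restatement of the ownership call's shape (not a new API, just the shared pattern; which register carries the result differs per function, EDI here, EAX in the frequency query):

u32 result, scratch1, scratch2, scratch3, scratch4, scratch5;

__asm__ __volatile__(
	"push %%ebp\n"			/* SMM handler may clobber EBP */
	"out %%al, (%%dx)\n"		/* the I/O write traps into SMM */
	"pop %%ebp\n"
	: "=D" (result),
	  "=a" (scratch1), "=b" (scratch2), "=c" (scratch3),
	  "=d" (scratch4), "=S" (scratch5)
	: "a" (command), "b" (function), "c" (0),
	  "d" (smi_port), "D" (0), "S" (magic)
	: "memory");
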
*/ - if ((high_mhz + low_mhz) < 600) - return -EINVAL; - - *high = high_mhz * 1000; - *low = low_mhz * 1000; - - return result; -} - -/** - * speedstep_get_state - set the SpeedStep state - * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static int speedstep_get_state(void) -{ - u32 function = GET_SPEEDSTEP_STATE; - u32 result, state, edi, command, dummy; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to determine current setting with command %x " - "at port %x\n", command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=a" (result), - "=b" (state), "=D" (edi), - "=c" (dummy), "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (0), - "d" (smi_port), "S" (0), "D" (0) - ); - - dprintk("state is %x, result is %x\n", state, result); - - return state & 1; -} - - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static void speedstep_set_state(unsigned int state) -{ - unsigned int result = 0, command, new_state, dummy; - unsigned long flags; - unsigned int function = SET_SPEEDSTEP_STATE; - unsigned int retry = 0; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to set frequency to state %u " - "with command %x at port %x\n", - state, command, smi_port); - - do { - if (retry) { - dprintk("retry %u, previous result %u, waiting...\n", - retry, result); - mdelay(retry * 50); - } - retry++; - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=b" (new_state), "=D" (result), - "=c" (dummy), "=a" (dummy), - "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - } while ((new_state != state) && (retry <= SMI_TRIES)); - - /* enable IRQs */ - local_irq_restore(flags); - - if (new_state == state) - dprintk("change to %u MHz succeeded after %u tries " - "with result %u\n", - (speedstep_freqs[new_state].frequency / 1000), - retry, result); - else - printk(KERN_ERR "cpufreq: change to state %u " - "failed with new_state %u and result %u\n", - state, new_state, result); - - return; -} - - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: new freq - * @relation: - * - * Sets a new CPUFreq policy/freq. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int newstate = 0; - struct cpufreq_freqs freqs; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = speedstep_freqs[speedstep_get_state()].frequency; - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = 0; /* speedstep.c is UP only driver */ - - if (freqs.old == freqs.new) - return 0; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - speedstep_set_state(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. 
- */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int speed, state; - unsigned int *low, *high; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - result = speedstep_smi_ownership(); - if (result) { - dprintk("fails in aquiring ownership of a SMI interface.\n"); - return -EINVAL; - } - - /* detect low and high frequency */ - low = &speedstep_freqs[SPEEDSTEP_LOW].frequency; - high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency; - - result = speedstep_smi_get_freqs(low, high); - if (result) { - /* fall back to speedstep_lib.c dection mechanism: - * try both states out */ - dprintk("could not detect low and high frequencies " - "by SMI call.\n"); - result = speedstep_get_freqs(speedstep_processor, - low, high, - NULL, - &speedstep_set_state); - - if (result) { - dprintk("could not detect two different speeds" - " -- aborting.\n"); - return result; - } else - dprintk("workaround worked.\n"); - } - - /* get current speed setting */ - state = speedstep_get_state(); - speed = speedstep_freqs[state].frequency; - - dprintk("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - if (cpu) - return -ENODEV; - return speedstep_get_frequency(speedstep_processor); -} - - -static int speedstep_resume(struct cpufreq_policy *policy) -{ - int result = speedstep_smi_ownership(); - - if (result) - dprintk("fails in re-aquiring ownership of a SMI interface.\n"); - - return result; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-smi", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .resume = speedstep_resume, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * BIOS, -EINVAL on problems during initiatization, and zero on - * success. 
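
speedstep_init() below derives both I/O parameters from the single ist_info.command word handed over by the BIOS: the SMI port lives in the low byte and the command code in bits 16-23. Assuming a BIOS that reports Intel's documented defaults (port 0xb2, command 0x82, per the MODULE_PARM_DESC text further down), a value of ist_info.command = 0x008200b2 would decode as:

	smi_port = ist_info.command & 0xff;		/* 0x008200b2 -> 0xb2 */
	smi_cmd  = (ist_info.command >> 16) & 0xff;	/* 0x008200b2 -> 0x82 */
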
- */ -static int __init speedstep_init(void) -{ - speedstep_processor = speedstep_detect_processor(); - - switch (speedstep_processor) { - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - break; - default: - speedstep_processor = 0; - } - - if (!speedstep_processor) { - dprintk("No supported Intel CPU detected.\n"); - return -ENODEV; - } - - dprintk("signature:0x%.8lx, command:0x%.8lx, " - "event:0x%.8lx, perf_level:0x%.8lx.\n", - ist_info.signature, ist_info.command, - ist_info.event, ist_info.perf_level); - - /* Error if no IST-SMI BIOS or no PARM - sig= 'ISGE' aka 'Intel Speedstep Gate E' */ - if ((ist_info.signature != 0x47534943) && ( - (smi_port == 0) || (smi_cmd == 0))) - return -ENODEV; - - if (smi_sig == 1) - smi_sig = 0x47534943; - else - smi_sig = ist_info.signature; - - /* setup smi_port from MODLULE_PARM or BIOS */ - if ((smi_port > 0xff) || (smi_port < 0)) - return -EINVAL; - else if (smi_port == 0) - smi_port = ist_info.command & 0xff; - - if ((smi_cmd > 0xff) || (smi_cmd < 0)) - return -EINVAL; - else if (smi_cmd == 0) - smi_cmd = (ist_info.command >> 16) & 0xff; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - cpufreq_unregister_driver(&speedstep_driver); -} - -module_param(smi_port, int, 0444); -module_param(smi_cmd, int, 0444); -module_param(smi_sig, uint, 0444); - -MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value " - "-- Intel's default setting is 0xb2"); -MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value " - "-- Intel's default setting is 0x82"); -MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the " - "SMI interface."); - -MODULE_AUTHOR("Hiroshi Miura"); -MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d16c2c53d6bf..1edf5ba4fb2b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -29,10 +29,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { + u64 misc_enable; + /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { @@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * (model 2) with the same problem. */ if (c->x86 == 15) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { @@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) } } #endif + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. 
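
Clearing X86_FEATURE_REP_GOOD and X86_FEATURE_ERMS this early matters because later code selects its string-copy implementation from those capability bits. A hypothetical consumer, with the copy routines invented purely for illustration:

	void (*copy)(void *, const void *, size_t);

	if (boot_cpu_has(X86_FEATURE_ERMS))
		copy = copy_erms;		/* hypothetical 'rep movsb' path */
	else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
		copy = copy_rep_movsq;		/* hypothetical 'rep movsq' path */
	else
		copy = copy_unrolled;		/* hypothetical open-coded loop */
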
+ */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } } #ifdef CONFIG_X86_32 @@ -276,14 +287,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[apicid]; + node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); @@ -401,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 5: - if (c->x86_mask == 0) { - if (l2 == 0) - p = "Celeron (Covington)"; - else if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - } + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; break; case 6: diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index ec2c19a7b8ef..c105c533ed94 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -304,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, struct _cache_attr { struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); + ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, + unsigned int); }; #ifdef CONFIG_AMD_NB @@ -326,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } @@ -400,7 +400,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, #define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return show_cache_disable(this_leaf, buf, slot); \ } @@ -452,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, { int ret = 0; -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - /* - * check whether this slot is already used or - * the index is already disabled - */ + /* check if @slot is already used or the index is already disabled */ ret = amd_get_l3_disable_slot(l3, slot); if (ret >= 0) return -EINVAL; - /* - * check whether the other slot has disabled the - * same index already - */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index > l3->indices) return -EINVAL; - /* do not allow writes outside of allowed bits */ - if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((index & SUBCACHE_INDEX) > l3->indices)) + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(l3, !slot)) return -EINVAL; amd_l3_disable_index(l3, cpu, slot, index); @@ 
-512,7 +502,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count, \ + unsigned int cpu) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -524,6 +515,39 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); +static ssize_t +show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) +{ + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t +store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, + unsigned int cpu) +{ + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + if (strict_strtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static struct _cache_attr subcaches = + __ATTR(subcaches, 0644, show_subcaches, store_subcaches); + #else /* CONFIG_AMD_NB */ #define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ @@ -532,9 +556,9 @@ static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { @@ -732,11 +756,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - for_each_cpu(i, c->llc_shared_map) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, c->llc_shared_map) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); @@ -870,8 +894,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name \ - (struct _cpuid4_info *this_leaf, char *buf) \ +static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } @@ -882,7 +906,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { return sprintf(buf, "%luK\n", this_leaf->size / 1024); } @@ -906,17 +931,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, return n; } -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return 
show_shared_cpu_map_func(leaf, 0, buf); } -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { switch (this_leaf->eax.split.type) { case CACHE_TYPE_DATA: @@ -974,6 +1002,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); if (attrs == NULL) return attrs = default_attrs; @@ -986,6 +1017,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) attrs[n++] = &cache_disable_1.attr; } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + attrs[n++] = &subcaches.attr; + return attrs; } #endif @@ -998,7 +1032,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf) : + buf, this_leaf->cpu) : 0; return ret; } @@ -1012,7 +1046,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, ret = fattr->store ? fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count) : + buf, count, this_leaf->cpu) : 0; return ret; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a5..83930deec3c6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a77971979564..0ed633c5048b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -32,7 +32,7 @@ static void inject_mce(struct mce *m) { struct mce *i = &per_cpu(injectm, m->extcpu); - /* Make sure noone reads partially written injectm */ + /* Make sure no one reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c 
b/arch/x86/kernel/cpu/mcheck/mce.c index d916183b7f9c..ff1ae9b6464d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/string.h> #include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -104,20 +105,6 @@ static int cpu_missing; ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -211,6 +198,8 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -238,7 +227,11 @@ static void print_mce(struct mce *m) * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -589,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); } /* @@ -881,7 +873,7 @@ reset: * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses upto page granuality for now. + * parser). So only support physical addresses up to page granuality for now. 
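
The reworked print_mce() above treats the notifier chain's return value as a claim of ownership: a registered hardware decoder (EDAC, for instance) returns NOTIFY_STOP once it has printed a human-readable translation, and only if nothing claims the event does the generic 'mcelog --ascii' hint appear. A decoder would register along these lines (sketch; the handler body is illustrative):

static int my_decode_mce(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	struct mce *m = data;

	pr_emerg("decoded MCE: bank %d status %016llx\n",
		 m->bank, (unsigned long long)m->status);
	return NOTIFY_STOP;	/* suppress the generic fallback hint */
}

static struct notifier_block my_mce_dec_nb = {
	.notifier_call	= my_decode_mce,
};

/* registration, e.g. from a module init function: */
atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec_nb);
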
*/ static int mce_usable_address(struct mce *m) { @@ -1625,7 +1617,7 @@ out: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference_check_mce(mcelog.next)) + if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; @@ -1721,8 +1713,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; @@ -1749,14 +1739,14 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(struct sys_device *dev, pm_message_t state) +static int mce_suspend(void) { return mce_disable_error_reporting(); } -static int mce_shutdown(struct sys_device *dev) +static void mce_shutdown(void) { - return mce_disable_error_reporting(); + mce_disable_error_reporting(); } /* @@ -1764,14 +1754,18 @@ static int mce_shutdown(struct sys_device *dev) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static int mce_resume(struct sys_device *dev) +static void mce_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); - - return 0; } +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, +}; + static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); @@ -1808,9 +1802,6 @@ static void mce_enable_ce(void *all) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, .name = "machinecheck", }; @@ -2139,6 +2130,7 @@ static __init int mcheck_init_device(void) return err; } + register_syscore_ops(&mce_syscore_ops); register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5bf2fac52aca..bb0adad35143 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -509,6 +509,7 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; @@ -527,15 +528,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; -#ifdef CONFIG_SMP - struct cpuinfo_x86 *c = &cpu_data(cpu); -#endif sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(c->llc_shared_map); + i = cpumask_first(cpu_llc_shared_mask(cpu)); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -555,7 +553,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; goto out; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97f..27c625178bf1 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? 
"Core" : "Package", state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val) static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); @@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void) CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, @@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; @@ -446,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9f27228ceffd..a71efcdbb092 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,6 +1,6 @@ /* * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - * because MTRRs can span upto 40 bits (36bits on most modern x86) + * because MTRRs can span up to 40 bits (36bits on most modern x86) */ #define DEBUG diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index bebabec5b448..929739a653d1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -45,6 +45,7 @@ #include <linux/cpu.h> #include <linux/pci.h> #include <linux/smp.h> +#include <linux/syscore_ops.h> #include <asm/processor.h> #include <asm/e820.h> @@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ /* * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. 
- * If we're doing that @reg is set to something special... + * + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * this cpu we still do mtrr_if->set_all(). During boot/resume, this + * is unnecessary if at this point we are still on the cpu that started + * the boot/resume sequence. But there is no guarantee that we are still + * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be + * sure that we are in sync with everyone else. */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) + else mtrr_if->set_all(); /* Wait for the others */ @@ -630,7 +641,7 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device *sysdev, pm_message_t state) +static int mtrr_save(void) { int i; @@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state) return 0; } -static int mtrr_restore(struct sys_device *sysdev) +static void mtrr_restore(void) { int i; @@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev) mtrr_value[i].ltype); } } - return 0; } -static struct sysdev_driver mtrr_sysdev_driver = { +static struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; @@ -839,7 +849,7 @@ static int __init mtrr_init_finialize(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + register_syscore_ops(&mtrr_syscore_ops); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9d977a2ea693..3a0338b4b179 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -30,6 +30,8 @@ #include <asm/stacktrace.h> #include <asm/nmi.h> #include <asm/compat.h> +#include <asm/smp.h> +#include <asm/alternative.h> #if 0 #undef wrmsrl @@ -93,6 +95,8 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; +struct intel_percore; + #define MAX_LBR_ENTRIES 16 struct cpu_hw_events { @@ -128,6 +132,13 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* + * Intel percore register state. + * Coordinate shared resources between HT threads. + */ + int percore_used; /* Used by this CPU? */ + struct intel_percore *per_core; + + /* * AMD specific bits */ struct amd_nb *amd_nb; @@ -166,7 +177,7 @@ struct cpu_hw_events { /* * Constraint on the Event code + UMask */ -#define PEBS_EVENT_CONSTRAINT(c, n) \ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) #define EVENT_CONSTRAINT_END \ @@ -175,6 +186,28 @@ struct cpu_hw_events { #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->weight; (e)++) +/* + * Extra registers for specific events. + * Some events need large masks and require external MSRs. + * Define a mapping to these extra registers. 
+ */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + } +#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) + union perf_capabilities { struct { u64 lbr_format : 6; @@ -219,6 +252,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + struct event_constraint *percore_constraints; void (*quirks)(void); int perfctr_second_write; @@ -247,6 +281,11 @@ struct x86_pmu { */ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ int lbr_nr; /* hardware stack size */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; }; static struct x86_pmu x86_pmu __read_mostly; @@ -271,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +static u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. @@ -298,7 +341,7 @@ x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base + idx, new_raw_count); + rdmsrl(hwc->event_base, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -321,6 +364,55 @@ again: return new_raw_count; } +static inline int x86_pmu_addr_offset(int index) +{ + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; +} + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + x86_pmu_addr_offset(index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + x86_pmu_addr_offset(index); +} + +/* + * Find and validate any extra registers to set up. 
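
x86_pmu_addr_offset() above turns a counter index into an MSR stride: plain 'index' on the traditional layout, 'index << 1' once X86_FEATURE_PERFCTR_CORE is patched in, because AMD family 15h interleaves its control and count registers. Assuming the AMD driver points .eventsel/.perfctr at the family-15h base MSRs on such parts, the addresses work out as:

/*
 * legacy AMD core PMU (illustrative):
 *   config_addr(3) = MSR_K7_EVNTSEL0 (0xc0010000) + 3 = 0xc0010003
 *   event_addr(3)  = MSR_K7_PERFCTR0 (0xc0010004) + 3 = 0xc0010007
 * family 15h, PERFCTR_CORE set, offset = 3 << 1 = 6:
 *   config_addr(3) = 0xc0010200 + 6 = 0xc0010206
 *   event_addr(3)  = 0xc0010201 + 6 = 0xc0010207
 */
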
+ */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct extra_reg *er; + + event->hw.extra_reg = 0; + event->hw.extra_config = 0; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + event->hw.extra_reg = er->msr; + event->hw.extra_config = event->attr.config1; + break; + } + return 0; +} + static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); @@ -331,12 +423,12 @@ static bool reserve_pmc_hardware(void) int i; for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -344,13 +436,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); + release_evntsel_nmi(x86_pmu_config_addr(i)); i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); + release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } @@ -360,8 +452,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); } } @@ -382,7 +474,7 @@ static bool check_hw_exists(void) * complain and bail. */ for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu.eventsel + i; + reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; @@ -407,20 +499,25 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. 
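
The extra_regs lookup above exists for events whose full selection mask does not fit in the event-select MSR; attr->config1 supplies the value for the companion MSR. The canonical user in this series is the Intel offcore-response event. An illustrative table entry in the style of the macros defined earlier (the real entry lives in the Intel-specific file, so treat the numbers as an example):

static struct extra_reg example_extra_regs[] __read_mostly = {
	/* event 0xb7 (OFFCORE_RESPONSE) takes its mask from MSR_OFFCORE_RSP_0 */
	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
	EVENT_EXTRA_END
};
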
*/ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu.perfctr, val); - ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); + ret = checking_wrmsrl(x86_pmu_event_addr(0), val); + ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); if (ret || val != val_new) goto msr_fail; return true; bios_fail: - printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); + /* + * We still allow the PMU driver to operate: + */ + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); - return false; + + return true; msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + return false; } @@ -442,8 +539,9 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -470,8 +568,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return -EINVAL; hwc->config |= val; - - return 0; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); } static int x86_setup_perfctr(struct perf_event *event) @@ -495,11 +593,15 @@ static int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } + /* + * Do not allow config1 (extended registers) to propagate, + * there's no sane user-space generalization yet: + */ if (attr->type == PERF_TYPE_RAW) return 0; if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); + return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; @@ -518,8 +620,8 @@ static int x86_setup_perfctr(struct perf_event *event) /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. 
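
The tightened condition above ("!attr->freq && hwc->sample_period == 1") means branch tracing (BTS) is only selected when user space explicitly asks for every branch at a fixed period of one, not when a frequency-driven period merely happens to pass through 1. From user space the BTS path is requested roughly like this (sketch):

struct perf_event_attr attr = {
	.size		= sizeof(struct perf_event_attr),
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
	.sample_period	= 1,	/* every branch instruction */
	.freq		= 0,	/* fixed period, not a target frequency */
};
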
*/ if (!x86_pmu.bts_active) return -EOPNOTSUPP; @@ -617,11 +719,11 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(x86_pmu.eventsel + idx, val); + rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + wrmsrl(x86_pmu_config_addr(idx), val); } } @@ -642,21 +744,26 @@ static void x86_pmu_disable(struct pmu *pmu) x86_pmu.disable_all(); } +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + if (hwc->extra_reg) + wrmsrl(hwc->extra_reg, hwc->extra_config); + wrmsrl(hwc->config_base, hwc->config | enable_mask); +} + static void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } @@ -821,15 +928,10 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); } else { - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); } } @@ -915,17 +1017,11 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu.enable_all(added); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); -} - static inline void x86_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base + hwc->idx, hwc->config); + wrmsrl(hwc->config_base, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -978,7 +1074,7 @@ x86_perf_event_set_period(struct perf_event *event) */ local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Due to erratum on certan cpu we need @@ -986,7 +1082,7 @@ x86_perf_event_set_period(struct perf_event *event) * is updated properly */ if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); } @@ -1029,7 +1125,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, - * skip the schedulability test here, it will be peformed + * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) @@ -1113,8 +1209,8 @@ void perf_event_print_debug(void) pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); + 
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); @@ -1199,6 +1295,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) { /* @@ -1285,8 +1391,6 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_DONE; } - apic_write(APIC_LVTPC, APIC_DM_NMI); - handled = x86_pmu.handle_irq(args->regs); if (!handled) return NOTIFY_DONE; @@ -1389,7 +1493,7 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } -int __init init_hw_perf_events(void) +static int __init init_hw_perf_events(void) { struct event_constraint *c; int err; @@ -1608,7 +1712,7 @@ out: return ret; } -int x86_pmu_event_init(struct perf_event *event) +static int x86_pmu_event_init(struct perf_event *event) { struct pmu *tmp; int err; @@ -1669,17 +1773,6 @@ static struct pmu pmu = { * callchain support */ -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { return 0; @@ -1693,8 +1786,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack_bp, @@ -1710,7 +1801,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 67e2202a6039..fe29c1d2219e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(L1D) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ @@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + 
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) @@ -127,6 +129,11 @@ static int amd_pmu_hw_config(struct perf_event *event) /* * AMD64 events are detected based on their event codes. */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + static inline int amd_is_nb_event(struct hw_perf_event *hwc) { return (hwc->config & 0xe0) == 0xe0; @@ -385,13 +392,195 @@ static __initconst const struct x86_pmu amd_pmu = { .cpu_dead = amd_pmu_cpu_dead, }; +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 
0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* not yet implemented */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static __initconst const struct x86_pmu amd_pmu_f15h = { + .name = "AMD Family 15h", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_F15H_PERF_CTL, + .perfctr = MSR_F15H_PERF_CTR, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 6, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints_f15h, + /* northbridge counters not yet implemented: */ +#if 0 + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +#endif +}; + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - x86_pmu = amd_pmu; + /* + * If core performance counter extensions exist, it must be + * family 15h, otherwise fail. See x86_pmu_addr_offset(). + */ + switch (boot_cpu_data.x86) { + case 0x15: + if (!cpu_has_perfctr_core) + return -ENODEV; + x86_pmu = amd_pmu_f15h; + break; + default: + if (cpu_has_perfctr_core) + return -ENODEV; + x86_pmu = amd_pmu; + break; + } /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 008835c1d79c..41178c826c48 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,9 +1,31 @@ #ifdef CONFIG_CPU_SUP_INTEL +#define MAX_EXTRA_REGS 2 + +/* + * Per register state. + */ +struct er_account { + int ref; /* reference count */ + unsigned int extra_reg; /* extra MSR number */ + u64 extra_config; /* extra MSR config */ +}; + +/* + * Per core state + * This is used to coordinate shared registers for HT threads. + */ +struct intel_percore { + raw_spinlock_t lock; /* protect structure */ + struct er_account regs[MAX_EXTRA_REGS]; + int refcnt; /* number of threads */ + unsigned core_id; +}; + /* * Intel PerfMon, used on Core and later.
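 * (Editor's note on the er_account/intel_percore structures above: the
 * OFFCORE_RESPONSE MSRs are physically shared by both HT siblings of a
 * core, so the refcounted per-core table lets a second thread attach to
 * an extra register only when it programs an identical extra_config;
 * a conflicting request fails event scheduling instead of silently
 * clobbering the sibling's setting.)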
*/ -static const u64 intel_perfmon_event_map[] = +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -14,7 +36,7 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -25,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -48,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -64,7 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = +{ + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0xb7, 0), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -76,7 +110,34 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ + INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0xb7, 0), + INTEL_EVENT_CONSTRAINT(0xbb, 0), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -89,6 +150,103 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 
snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -124,16 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. 
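+ * (Editor's note: each 0x01b7 entry in these LL tables encodes event
+ * 0xb7, OFFCORE_RESPONSE_0, umask 1; the event select alone no longer
+ * says which access type is meant. The demand/prefetch and hit/miss
+ * qualifiers are supplied separately through MSR_OFFCORE_RSP_0, filled
+ * in from the nehalem_hw_cache_extra_regs table added further down.)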
+ */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -180,6 +348,59 @@ static __initconst const u64 westmere_hw_cache_event_ids }, }; +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + } +}; + static __initconst const u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -187,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ @@ -215,16 +436,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * 
on RFO. + */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -691,8 +922,8 @@ static void intel_pmu_reset(void) printk("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); + checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); @@ -719,6 +950,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This handler doesn't seem to have any issues with the unmasking + * so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); @@ -784,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; + if (event->attr.freq) + return NULL; + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); @@ -794,6 +1038,67 @@ intel_bts_constraints(struct perf_event *event) } static struct event_constraint * +intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; + struct event_constraint *c; + struct intel_percore *pc; + struct er_account *era; + int i; + int free_slot; + int found; + + if (!x86_pmu.percore_constraints || hwc->extra_alloc) + return NULL; + + for (c = x86_pmu.percore_constraints; c->cmask; c++) { + if (e != c->code) + continue; + + /* + * Allocate resource per core. 
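+ * (Editor's note: the scan below has three possible outcomes for a
+ * matching constraint: reuse an er_account whose extra_config is
+ * identical, sharing it and bumping ->ref; claim the first free slot;
+ * or, if the sibling thread holds the register with a different
+ * config, return the empty constraint so the event cannot be
+ * scheduled.)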
+ */ + pc = cpuc->per_core; + if (!pc) + break; + c = &emptyconstraint; + raw_spin_lock(&pc->lock); + free_slot = -1; + found = 0; + for (i = 0; i < MAX_EXTRA_REGS; i++) { + era = &pc->regs[i]; + if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { + /* Allow sharing same config */ + if (hwc->extra_config == era->extra_config) { + era->ref++; + cpuc->percore_used = 1; + hwc->extra_alloc = 1; + c = NULL; + } + /* else conflict */ + found = 1; + break; + } else if (era->ref == 0 && free_slot == -1) + free_slot = i; + } + if (!found && free_slot != -1) { + era = &pc->regs[free_slot]; + era->ref = 1; + era->extra_reg = hwc->extra_reg; + era->extra_config = hwc->extra_config; + cpuc->percore_used = 1; + hwc->extra_alloc = 1; + c = NULL; + } + raw_spin_unlock(&pc->lock); + return c; + } + + return NULL; +} + +static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c; @@ -806,9 +1111,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; + c = intel_percore_constraints(cpuc, event); + if (c) + return c; + return x86_get_event_constraints(cpuc, event); } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct extra_reg *er; + struct intel_percore *pc; + struct er_account *era; + struct hw_perf_event *hwc = &event->hw; + int i, allref; + + if (!cpuc->percore_used) + return; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (hwc->config & er->config_mask)) + continue; + + pc = cpuc->per_core; + raw_spin_lock(&pc->lock); + for (i = 0; i < MAX_EXTRA_REGS; i++) { + era = &pc->regs[i]; + if (era->ref > 0 && + era->extra_config == hwc->extra_config && + era->extra_reg == er->msr) { + era->ref--; + hwc->extra_alloc = 0; + break; + } + } + allref = 0; + for (i = 0; i < MAX_EXTRA_REGS; i++) + allref += pc->regs[i].ref; + if (allref == 0) + cpuc->percore_used = 0; + raw_spin_unlock(&pc->lock); + break; + } +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -880,20 +1227,67 @@ static __initconst const struct x86_pmu core_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, }; +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (!cpu_has_ht_siblings()) + return NOTIFY_OK; + + cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpuc->per_core) + return NOTIFY_BAD; + + raw_spin_lock_init(&cpuc->per_core->lock); + cpuc->per_core->core_id = -1; + return NOTIFY_OK; +} + static void intel_pmu_cpu_starting(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up. 
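 * (Editor's note: below, the HT-sibling walk lets the second thread of
 * a core adopt the intel_percore block already published by its
 * sibling, freeing the copy it allocated in intel_pmu_cpu_prepare()
 * and bumping ->refcnt; intel_pmu_cpu_dying() frees the block again
 * once the last sibling goes away.)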
*/ intel_pmu_lbr_reset(); + + if (!cpu_has_ht_siblings()) + return; + + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + + if (pc && pc->core_id == core_id) { + kfree(cpuc->per_core); + cpuc->per_core = pc; + break; + } + } + + cpuc->per_core->core_id = core_id; + cpuc->per_core->refcnt++; } static void intel_pmu_cpu_dying(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_percore *pc = cpuc->per_core; + + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->per_core = NULL; + } + fini_debug_store_on_cpu(cpu); } @@ -918,7 +1312,9 @@ static __initconst const struct x86_pmu intel_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, }; @@ -939,7 +1335,7 @@ static void intel_clovertown_quirks(void) * AJ106 could possibly be worked around by not allowing LBR * usage from PEBS, including the fixup. * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. + * a pebs_event_reset[0] value and coping with the lost events. * * But taken together it might just make sense to not enable PEBS on * these chips. @@ -1024,6 +1420,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_core(); x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); break; @@ -1032,11 +1429,33 @@ static __init int intel_pmu_init(void) case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + + if (ebx & 0x40) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. 
This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + + pr_cont("erratum AAJ80 worked around, "); + } pr_cont("Nehalem events, "); break; @@ -1047,21 +1466,51 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_atom(); x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; pr_cont("Atom events, "); break; case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + pr_cont("Westmere events, "); break; + case 42: /* SandyBridge */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_events; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + + pr_cont("SandyBridge events, "); + break; + default: /* * default constraints for v2 and up diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index b7dcd9f2b8a0..bab491b8ee25 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -361,30 +361,70 @@ static int intel_pmu_drain_bts_buffer(void) /* * PEBS */ +static struct event_constraint intel_core2_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_atom_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; -static struct event_constraint intel_core_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS
*/ - PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ +static struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ +static struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_snb_pebs_events[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + 
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ EVENT_CONSTRAINT_END }; @@ -695,20 +735,17 @@ static void intel_ds_init(void) printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - x86_pmu.pebs_constraints = intel_core_pebs_events; break; case 1: printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.pebs_constraints = intel_nehalem_pebs_events; break; default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; - break; } } } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ff751a9f182b..ead584fb6a7d 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1,5 +1,5 @@ /* - * Netburst Perfomance Events (P4, old Xeon) + * Netburst Performance Events (P4, old Xeon) * * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> @@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { @@ -679,7 +679,7 @@ static int p4_validate_raw_event(struct perf_event *event) */ /* - * if an event is shared accross the logical threads + * if an event is shared across the logical threads * the user needs special permissions to be able to use it */ if (p4_ht_active() && p4_event_bind_map[v].shared) { @@ -764,9 +764,9 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) u64 v; /* an official way for overflow indication */ - rdmsrl(hwc->config_base + hwc->idx, v); + rdmsrl(hwc->config_base, v); if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); return 1; } @@ -777,6 +777,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) * the counter has reached zero value and continued counting before * real NMI signal was received: */ + rdmsrl(hwc->event_base, v); if (!(v & ARCH_P4_UNFLAGGED_BIT)) return 1; @@ -790,13 +791,13 @@ static void p4_pmu_disable_pebs(void) * * It's still allowed that two threads setup same cache * events so we can't simply clear metrics until we knew - * noone is depending on us, so we need kind of counter + * no one is depending on us, so we need kind of counter * for "ReplayEvent" users. * * What is more complex -- RAW events, if user (for some * reason) will pass some cache event metric with improper * event opcode -- it's fine from hardware point of view - * but completely nonsence from "meaning" of such action. + * but completely nonsense from "meaning" of such action. * * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! 
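(Editor's sketch, not part of the patch: p4_pmu_clear_cccr_ovf() as it reads
after the two hunks above, reconstructed for clarity. The added rdmsrl() of
hwc->event_base is the actual fix: previously the ARCH_P4_UNFLAGGED_BIT test
inspected v while it still held the CCCR value read at the top.)

static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
	u64 v;

	/* an official way for overflow indication */
	rdmsrl(hwc->config_base, v);
	if (v & P4_CCCR_OVF) {
		wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
		return 1;
	}

	/*
	 * A counter holds a negative value, so checking the high bit tells
	 * us whether it reached zero and kept counting before the NMI was
	 * actually delivered:
	 */
	rdmsrl(hwc->event_base, v);	/* the read this patch adds */
	if (!(v & ARCH_P4_UNFLAGGED_BIT))
		return 1;

	return 0;
}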
@@ -815,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)checking_wrmsrl(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -885,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event) p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)checking_wrmsrl(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -911,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); @@ -946,14 +946,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - p4_pmu_disable_event(event); + x86_pmu_stop(event, 0); } - if (handled) { - /* p4 quirk: unmask it again */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + if (handled) inc_irq_stat(apic_perf_irqs); - } + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to be generated if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -1187,7 +1196,7 @@ static __init int p4_pmu_init(void) { unsigned int low, high; - /* If we get stripped -- indexig fails */ + /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 34ba07be2cda..20c097e33860 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)checking_wrmsrl(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)checking_wrmsrl(hwc->config_base, val); } static __initconst const struct x86_pmu p6_pmu = { diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d5a236615501..966512b2cacf 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event
selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 227b0448960d..d22d0c4edcfd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void) } /* - * While checking the dmi string infomation, just checking the product + * While checking the dmi string information, just checking the product * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index d5cd13945d5a..642f75a68cd5 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e09..afa64adb75ee 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include <linux/uaccess.h> #include <linux/io.h> -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 000000000000..690bc8461835 --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,441 @@ +/* + * Architecture specific OF callbacks. 
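+ * (Editor's note: this file is new in the patch; it provides the x86
+ * glue for booting from a flattened device tree on CE4100: IRQ-domain
+ * bookkeeping for the IO-APIC, HPET and local APIC discovery, and
+ * binding of OF nodes to PCI buses and devices.)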
+ */ +#include <linux/bootmem.h> +#include <linux/io.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/of_irq.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/of_pci.h> + +#include <asm/hpet.h> +#include <asm/irq_controller.h> +#include <asm/apic.h> +#include <asm/pci_x86.h> + +__initdata u64 initial_dtb; +char __initdata cmd_line[COMMAND_LINE_SIZE]; +static LIST_HEAD(irq_domains); +static DEFINE_RAW_SPINLOCK(big_irq_lock); + +int __initdata of_ioapic; + +#ifdef CONFIG_X86_IO_APIC +static void add_interrupt_host(struct irq_domain *ih) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_add(&ih->l, &irq_domains); + raw_spin_unlock_irqrestore(&big_irq_lock, flags); +} +#endif + +static struct irq_domain *get_ih_from_node(struct device_node *controller) +{ + struct irq_domain *ih, *found = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_for_each_entry(ih, &irq_domains, l) { + if (ih->controller == controller) { + found = ih; + break; + } + } + raw_spin_unlock_irqrestore(&big_irq_lock, flags); + return found; +} + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *ih; + u32 virq, type; + int ret; + + ih = get_ih_from_node(controller); + if (!ih) + return 0; + ret = ih->xlate(ih, intspec, intsize, &virq, &type); + if (ret) + return 0; + if (type == IRQ_TYPE_NONE) + return virq; + irq_set_irq_type(virq, type); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +unsigned long pci_address_to_pio(phys_addr_t address) +{ + /* + * The ioport address can be directly used by inX / outX + */ + BUG_ON(address >= (1 << 16)); + return (unsigned long)address; +} +EXPORT_SYMBOL_GPL(pci_address_to_pio); + +void __init early_init_dt_scan_chosen_arch(unsigned long node) +{ + BUG(); +} + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + BUG(); +} + +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +{ + return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); +} + +void __init add_dtb(u64 data) +{ + initial_dtb = data + offsetof(struct setup_data, data); +} + +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. 
+ */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!of_have_populated_dt()) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +module_init(add_bus_probe); + +#ifdef CONFIG_PCI +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + struct of_irq oirq; + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + ret = of_irq_map_pci(dev, &oirq); + if (ret) + return ret; + + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void __cpuinit x86_of_pci_init(void) +{ + struct device_node *np; + + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; + + for_each_node_by_type(np, "pci") { + const void *prop; + struct pci_bus *bus; + unsigned int bus_min; + struct device_node *child; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + + bus = pci_find_bus(0, bus_min); + if (!bus) { + printk(KERN_ERR "Can't find a node for bus %s.\n", + np->full_name); + continue; + } + + if (bus->self) + bus->self->dev.of_node = np; + else + bus->dev.of_node = np; + + for_each_child_of_node(np, child) { + struct pci_dev *dev; + u32 devfn; + + prop = of_get_property(child, "reg", NULL); + if (!prop) + continue; + + devfn = (be32_to_cpup(prop) >> 8) & 0xff; + dev = pci_get_slot(bus, devfn); + if (!dev) + continue; + dev->dev.of_node = child; + pci_dev_put(dev); + } + } +} +#endif + +static void __init dtb_setup_hpet(void) +{ +#ifdef CONFIG_HPET_TIMER + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +#endif +} + +static void __init dtb_lapic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); + if (!dn) + return; + + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + + /* Did the boot loader setup the local APIC ? 
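+ * (Editor's note: if it did not, apic_force_enable() below attempts to
+ * enable the local APIC at the address given by the device tree node
+ * before registering it and recording the boot CPU.)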
*/ + if (!cpu_has_apic) { + if (apic_force_enable(r.start)) + return; + } + smp_found_config = 1; + pic_mode = 1; + register_lapic_address(r.start); + generic_processor_info(boot_cpu_physical_apicid, + GET_APIC_VERSION(apic_read(APIC_LVR))); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); +} + +#ifdef CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) +{ + u32 size, map_len; + void *new_dtb; + + if (!initial_dtb) + return; + + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), + (u64)sizeof(struct boot_param_header)); + + initial_boot_params = early_memremap(initial_dtb, map_len); + size = be32_to_cpu(initial_boot_params->totalsize); + if (map_len < size) { + early_iounmap(initial_boot_params, map_len); + initial_boot_params = early_memremap(initial_dtb, size); + map_len = size; + } + + new_dtb = alloc_bootmem(size); + memcpy(new_dtb, initial_boot_params, size); + early_iounmap(initial_boot_params, map_len); + + initial_boot_params = new_dtb; + + /* root level address cells */ + of_scan_flat_dt(early_init_dt_scan_root, NULL); + + unflatten_device_tree(); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} + +#ifdef CONFIG_X86_IO_APIC + +struct of_ioapic_type { + u32 out_type; + u32 trigger; + u32 polarity; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_RISING, + .trigger = IOAPIC_EDGE, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .trigger = IOAPIC_LEVEL, + .polarity = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .trigger = IOAPIC_LEVEL, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .trigger = IOAPIC_EDGE, + .polarity = 0, + }, +}; + +static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type) +{ + struct mp_ioapic_gsi *gsi_cfg; + struct io_apic_irq_attr attr; + struct of_ioapic_type *it; + u32 line, idx, type; + + if (intsize < 2) + return -EINVAL; + + line = *intspec; + idx = (u32) id->priv; + gsi_cfg = mp_ioapic_gsi_routing(idx); + *out_hwirq = line + gsi_cfg->gsi_base; + + intspec++; + type = *intspec; + + if (type >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = of_ioapic_type + type; + *out_type = it->out_type; + + set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); + + return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); +} + +static void __init ioapic_add_ofnode(struct device_node *np) +{ + struct resource r; + int i, ret; + + ret = of_address_to_resource(np, 0, &r); + if (ret) { + printk(KERN_ERR "Failed to obtain address for %s\n", + np->full_name); + return; + } + + for (i = 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723a6a1b..1aae78f775fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
 void printk_address(unsigned long address, int reliable)
 {
-	printk(" [<%p>] %s%pS\n", (void *) address,
+	printk(" [<%p>] %s%pB\n", (void *) address,
 			reliable ? "" : "? ", (void *) address);
 }
 
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
 }
 EXPORT_SYMBOL_GPL(print_context_stack_bp);
 
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	printk(data);
-	print_symbol(msg, symbol);
-	printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-	printk("%s%s\n", (char *)data, msg);
-}
-
 static int print_trace_stack(void *data, char *name)
 {
 	printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
 }
 
 static const struct stacktrace_ops print_trace_ops = {
-	.warning		= print_trace_warning,
-	.warning_symbol		= print_trace_warning_symbol,
 	.stack			= print_trace_stack,
 	.address		= print_trace_address,
 	.walk_stack		= print_context_stack,
@@ -175,21 +159,21 @@ static const struct stacktrace_ops print_trace_ops = {
 
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, char *log_lvl)
+		unsigned long *stack, unsigned long bp, char *log_lvl)
 {
 	printk("%sCall Trace:\n", log_lvl);
-	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 }
 
 void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack)
+		unsigned long *stack, unsigned long bp)
 {
-	show_trace_log_lvl(task, regs, stack, "");
+	show_trace_log_lvl(task, regs, stack, bp, "");
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-	show_stack_log_lvl(task, NULL, sp, "");
+	show_stack_log_lvl(task, NULL, sp, 0, "");
 }
 
 /*
@@ -197,14 +181,16 @@ void show_stack(struct task_struct *task, unsigned long *sp)
 */
 void dump_stack(void)
 {
+	unsigned long bp;
 	unsigned long stack;
 
+	bp = stack_frame(current, NULL);
 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
 		current->pid, current->comm, print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	show_trace(NULL, NULL, &stack);
+	show_trace(NULL, NULL, &stack, bp);
 }
 EXPORT_SYMBOL(dump_stack);
 
@@ -277,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	printk("DEBUG_PAGEALLOC");
 #endif
 	printk("\n");
-	sysfs_printk_last_file();
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
 		return 1;
@@ -320,41 +305,6 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-	unsigned long flags;
-
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	/*
-	 * We are in trouble anyway, lets at least try
-	 * to get a message out.
-	 */
-	flags = oops_begin();
-	printk(KERN_EMERG "%s", str);
-	printk(" on CPU%d, ip %08lx, registers:\n",
-		smp_processor_id(), regs->ip);
-	show_registers(regs);
-	oops_end(flags, regs, 0);
-	if (do_panic || panic_on_oops)
-		panic("Non maskable interrupt");
-	nmi_exit();
-	local_irq_enable();
-	do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	if (!strcmp(s, "panic"))
-		panic_on_oops = 1;
-	return 0;
-}
-early_param("oops", oops_setup);
-
 static int __init kstack_setup(char *s)
 {
 	if (!s)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 74cc1eda384b..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,12 +17,11 @@
 
 #include <asm/stacktrace.h>
 
-void dump_trace(struct task_struct *task,
-		struct pt_regs *regs, unsigned long *stack,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
 	int graph = 0;
-	unsigned long bp;
 
 	if (!task)
 		task = current;
@@ -35,7 +34,9 @@ void dump_trace(struct task_struct *task,
 		stack = (unsigned long *)task->thread.sp;
 	}
 
-	bp = stack_frame(task, regs);
+	if (!bp)
+		bp = stack_frame(task, regs);
+
 	for (;;) {
 		struct thread_info *context;
 
@@ -55,7 +56,7 @@ EXPORT_SYMBOL(dump_trace);
 
 void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *sp, char *log_lvl)
+		unsigned long *sp, unsigned long bp, char *log_lvl)
 {
 	unsigned long *stack;
 	int i;
@@ -77,7 +78,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		touch_nmi_watchdog();
 	}
 	printk(KERN_CONT "\n");
-	show_trace_log_lvl(task, regs, sp, log_lvl);
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
 
@@ -102,7 +103,7 @@ void show_registers(struct pt_regs *regs)
 		u8 *ip;
 
 		printk(KERN_EMERG "Stack:\n");
-		show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
+		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
 
 		printk(KERN_EMERG "Code: ");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a6b6fcf7f0ae..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
 */
 
-void dump_trace(struct task_struct *task,
-		struct pt_regs *regs, unsigned long *stack,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
@@ -150,7 +150,6 @@ void dump_trace(struct task_struct *task,
 	struct thread_info *tinfo;
 	int graph = 0;
 	unsigned long dummy;
-	unsigned long bp;
 
 	if (!task)
 		task = current;
@@ -161,7 +160,8 @@ void dump_trace(struct task_struct *task,
 		stack = (unsigned long *)task->thread.sp;
 	}
 
-	bp = stack_frame(task, regs);
+	if (!bp)
+		bp = stack_frame(task, regs);
 	/*
 	 * Print function call entries in all stacks, starting at the
 	 * current stack address. If the stacks consist of nested
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
 
 void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *sp, char *log_lvl)
+		unsigned long *sp, unsigned long bp, char *log_lvl)
 {
 	unsigned long *irq_stack_end;
 	unsigned long *irq_stack;
@@ -269,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	preempt_enable();
 
 	printk(KERN_CONT "\n");
-	show_trace_log_lvl(task, regs, sp, log_lvl);
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
 void show_registers(struct pt_regs *regs)
@@ -298,7 +298,7 @@ void show_registers(struct pt_regs *regs)
 
 		printk(KERN_EMERG "Stack:\n");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-				KERN_EMERG);
+				0, KERN_EMERG);
 
 		printk(KERN_EMERG "Code: ");
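The dump_trace() rework threads an explicit frame pointer through the stack-dump call chain: callers that already hold one (e.g. an oops path) pass it down, everyone else passes 0 and the walker derives a default. A self-contained toy of that "0 means derive it" convention (illustrative only, not the kernel code):

#include <stdio.h>

static unsigned long stack_frame_fallback(void) { return 0x1234; }

static void walk(unsigned long bp)
{
	if (!bp)
		bp = stack_frame_fallback();	/* old behaviour, now just a default */
	printf("walking from bp=%#lx\n", bp);
}

int main(void)
{
	walk(0);	/* e.g. show_stack(): no bp available */
	walk(0xbeef);	/* e.g. a bp captured by dump_stack() itself */
	return 0;
}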
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26da0c0c..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/crash_dump.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <linux/suspend.h>
@@ -667,21 +668,15 @@ __init void e820_setup_gap(void)
 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
 * linked list of struct setup_data, which is parsed here.
 */
-void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+void __init parse_e820_ext(struct setup_data *sdata)
 {
-	u32 map_len;
 	int entries;
 	struct e820entry *extmap;
 
 	entries = sdata->len / sizeof(struct e820entry);
-	map_len = sdata->len + sizeof(struct setup_data);
-	if (map_len > PAGE_SIZE)
-		sdata = early_ioremap(pa_data, map_len);
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	if (map_len > PAGE_SIZE)
-		early_iounmap(sdata, map_len);
 	printk(KERN_INFO "extended physical RAM map:\n");
 	e820_print_map("extended");
 }
@@ -847,15 +842,21 @@ static int __init parse_memopt(char *p)
 	if (!p)
 		return -EINVAL;
 
-#ifdef CONFIG_X86_32
 	if (!strcmp(p, "nopentium")) {
+#ifdef CONFIG_X86_32
 		setup_clear_cpu_cap(X86_FEATURE_PSE);
 		return 0;
-	}
+#else
+		printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
+		return -EINVAL;
 #endif
+	}
 
 	userdef = 1;
 	mem_size = memparse(p, &p);
+	/* don't remove all of memory when handling "mem={invalid}" param */
+	if (mem_size == 0)
+		return -EINVAL;
+
 	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
 	return 0;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9efbdcc56425..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -159,7 +159,12 @@ static void __init ati_bugs_contd(int num, int slot, int func)
 	if (rev >= 0x40)
 		acpi_fix_pin2_polarity = 1;
 
-	if (rev > 0x13)
+	/*
+	 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
+	 * SB700: revisions 0x39, 0x3a, ...
+	 * SB800: revisions 0x40, 0x41, ...
+	 */
+	if (rev >= 0x39)
 		return;
 
 	if (acpi_use_timer_override)
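The parse_memopt() hunk closes a nasty failure mode: with a malformed "mem=" argument, memparse() yields 0, and the old code would then remove every RAM range above address 0. A hedged, userspace illustration of the guard (strtoull stands in for memparse; the argument is hypothetical):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *arg = "junk";	/* hypothetical bad "mem=" value */
	char *end;
	unsigned long long mem_size = strtoull(arg, &end, 0);

	if (mem_size == 0) {		/* the new guard: reject, keep all RAM */
		fprintf(stderr, "mem=%s rejected\n", arg);
		return 1;
	}
	printf("limiting RAM to %llu bytes\n", mem_size);
	return 0;
}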
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efad7ebb..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
 * We use macros for low-level operations which need to be overridden
 * for paravirtualization. The following will never clobber any registers:
@@ -395,7 +397,7 @@ sysenter_past_esp:
	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
 */
.section .init.rodata,"a"
ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
      .endif
      .previous
	.long 1b
-      .text
+      .section .entry.text, "ax"
vector=vector+1
   .endif
 .endr
@@ -1409,11 +1411,10 @@ END(general_protection)
 #ifdef CONFIG_KVM_GUEST
 ENTRY(async_page_fault)
	RING0_EC_FRAME
-	pushl $do_async_page_fault
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_async_page_fault
	jmp error_code
	CFI_ENDPROC
-END(apf_page_fault)
+END(async_page_fault)
 #endif
 
 /*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffbeb0c9..8a445a0c989e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
 * A note on terminology:
 * - top of stack: Architecture defined interrupt frame from SS to RIP
 * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers upto R11.
+ * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: Like partial stack frame, but all register saved.
 *
 * Some macro usage:
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -420,7 +422,7 @@ ENTRY(ret_from_fork)
 END(ret_from_fork)
 
 /*
- * System call entry. Upto 6 arguments in registers are supported.
+ * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 */
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
 */
	.section .init.rodata,"a"
ENTRY(interrupt)
-	.text
+	.section .entry.text
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
      .endif
      .previous
	.quad 1b
-      .text
+      .section .entry.text
vector=vector+1
   .endif
 .endr
@@ -975,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
	x86_platform_ipi smp_x86_platform_ipi
 
 #ifdef CONFIG_SMP
-.irpc idx, "01234567"
+.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
 apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
	invalidate_interrupt\idx smp_invalidate_interrupt
+.endif
 .endr
 #endif
@@ -1248,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
	decl PER_CPU_VAR(irq_count)
	jmp  error_exit
	CFI_ENDPROC
-END(do_hypervisor_callback)
+END(xen_do_hypervisor_callback)
 
 /*
 * Hypervisor uses this for application faults while it executes.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 382eb2936d4d..c9a281f272fd 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -123,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 static atomic_t nmi_running = ATOMIC_INIT(0);
 static int mod_code_status;		/* holds return value of text write */
 static void *mod_code_ip;		/* holds the IP to write to */
-static void *mod_code_newcode;		/* holds the text to write to the IP */
+static const void *mod_code_newcode;	/* holds the text to write to the IP */
 
 static unsigned nmi_wait_count;
 static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -225,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
 }
 
 static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
+do_ftrace_mod_code(unsigned long ip, const void *new_code)
 {
	/*
	 * On x86_64, kernel text mappings are mapped read-only with
@@ -260,14 +260,14 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
	return mod_code_status;
 }
 
-static unsigned char *ftrace_nop_replace(void)
+static const unsigned char *ftrace_nop_replace(void)
 {
-	return ideal_nop5;
+	return ideal_nops[NOP_ATOMIC5];
 }
 
 static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-		   unsigned char *new_code)
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+		   unsigned const char *new_code)
 {
	unsigned char replaced[MCOUNT_INSN_SIZE];
 
@@ -301,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 int ftrace_make_nop(struct module *mod,
		    struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned char *new, *old;
+	unsigned const char *new, *old;
	unsigned long ip = rec->ip;
 
	old = ftrace_call_replace(ip, addr);
@@ -312,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned char *new, *old;
+	unsigned const char *new, *old;
	unsigned long ip = rec->ip;
 
	old = ftrace_nop_replace();
@@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
		return;
	}
 
-	if (ftrace_push_return_trace(old, self_addr, &trace.depth,
-		    frame_pointer) == -EBUSY) {
-		*parent = old;
-		return;
-	}
-
	trace.func = self_addr;
+	trace.depth = current->curr_ret_stack + 1;
 
	/* Only trace if the calling function expects to */
	if (!ftrace_graph_entry(&trace)) {
-		current->curr_ret_stack--;
		*parent = old;
+		return;
+	}
+
+	if (ftrace_push_return_trace(old, self_addr, &trace.depth,
+		    frame_pointer) == -EBUSY) {
+		*parent = old;
+		return;
	}
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
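The prepare_ftrace_return() hunk reverses the order of operations: the entry filter now runs before anything is pushed on the return stack, so a rejected entry no longer needs the curr_ret_stack-- undo. A self-contained toy of why check-before-commit is simpler than commit-then-undo (illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static int ret_stack[8], depth;

static bool filter_accepts(int func) { return func % 2 == 0; }

/* After the reorder: ask the filter first; only an accepted entry is
 * pushed, so rejection never mutates the stack. */
static bool trace_entry(int func)
{
	if (!filter_accepts(func))
		return false;		/* nothing to roll back */
	if (depth >= 8)
		return false;		/* mirrors the -EBUSY overflow case */
	ret_stack[depth++] = func;
	return true;
}

int main(void)
{
	printf("%d %d\n", trace_entry(4), trace_entry(5));	/* 1 0 */
	return 0;
}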
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 7f138b3c3c52..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
 static void __init i386_default_early_setup(void)
 {
	/* Initialize 32bit specific setup functions */
-	x86_init.resources.probe_roms = probe_roms;
	x86_init.resources.reserve_resources = i386_reserve_resources;
	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
 
@@ -34,15 +33,6 @@ void __init i386_start_kernel(void)
 {
	memblock_init();
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
-#endif
-
	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d2673c28aff..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
	/* Make NULL pointers segfault */
	zap_identity_mappings();
 
-	/* Cleanup the over mapped high alias */
-	cleanup_highmap();
-
	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 767d6c43de37..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
 */
KERNEL_PAGES = LOWMEM_PAGES
-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
@@ -137,7 +137,7 @@ ENTRY(startup_32)
	movsl
1:
 
-#ifdef CONFIG_OLPC_OPENFIRMWARE
+#ifdef CONFIG_OLPC
	/* save OFW's pgdir table for later use when calling into OFW */
	movl %cr3, %eax
	movl %eax, pa(olpc_ofw_pgd)
@@ -623,7 +623,7 @@ ENTRY(initial_code)
 * BSS section
 */
__PAGE_ALIGNED_BSS
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
#ifdef CONFIG_X86_PAE
initial_pg_pmd:
	.fill 1024*KPMDS,4,0
@@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir)
#ifdef CONFIG_X86_PAE
__PAGE_ALIGNED_DATA
	/* Page-aligned for the benefit of paravirt? */
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
ENTRY(initial_page_table)
	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
# if KPMDS == 3
@@ -662,7 +662,7 @@ ENTRY(initial_page_table)
# else
#  error "Kernel PMDs should be 1, 2 or 3"
# endif
-	.align PAGE_SIZE_asm		/* needs to be page-sized too */
+	.align PAGE_SIZE		/* needs to be page-sized too */
#endif
 
.data
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
	/* Fixup phys_base */
	addq	%rbp, phys_base(%rip)
 
-#ifdef CONFIG_X86_TRAMPOLINE
+	/* Fixup trampoline */
	addq	%rbp, trampoline_level4_pgt + 0(%rip)
	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
-#endif
 
	/* Due to ENTRY(), sometimes the empty space gets filled with
	 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968f12d2..6781765b3a0d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -217,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
 /*
 * Common hpet info
 */
-static unsigned long hpet_period;
+static unsigned long hpet_freq;
 
 static void hpet_legacy_set_mode(enum clock_event_mode mode,
			  struct clock_event_device *evt);
@@ -232,7 +232,6 @@ static struct clock_event_device hpet_clockevent = {
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_mode	= hpet_legacy_set_mode,
	.set_next_event = hpet_legacy_next_event,
-	.shift		= 32,
	.irq		= 0,
	.rating		= 50,
 };
@@ -290,28 +289,12 @@ static void hpet_legacy_clockevent_register(void)
	hpet_enable_legacy_int();
 
	/*
-	 * The mult factor is defined as (include/linux/clockchips.h)
-	 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
-	 * hpet_period is in units of femtoseconds (per cycle), so
-	 * mult/2^shift = cyc/ns = 10^6/hpet_period
-	 * mult = (10^6 * 2^shift)/hpet_period
-	 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
-	 */
-	hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
-				      hpet_period, hpet_clockevent.shift);
-	/* Calculate the min / max delta */
-	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
-							   &hpet_clockevent);
-	/* Setup minimum reprogramming delta. */
-	hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
-							   &hpet_clockevent);
-
-	/*
	 * Start hpet with the boot cpu mask and make it
	 * global after the IO_APIC has been initialized.
	 */
	hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
-	clockevents_register_device(&hpet_clockevent);
+	clockevents_config_and_register(&hpet_clockevent, hpet_freq,
+					HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
	global_clock_event = &hpet_clockevent;
	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
@@ -503,7 +486,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
	if (!irq)
		return -EINVAL;
 
-	set_irq_data(irq, dev);
+	irq_set_handler_data(irq, dev);
 
	if (hpet_setup_msi_irq(irq))
		return -EINVAL;
@@ -549,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
 static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 {
	struct clock_event_device *evt = &hdev->evt;
-	uint64_t hpet_freq;
 
	WARN_ON(cpu != smp_processor_id());
	if (!(hdev->flags & HPET_DEV_VALID))
@@ -571,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 
	evt->set_mode = hpet_msi_set_mode;
	evt->set_next_event = hpet_msi_next_event;
-	evt->shift = 32;
-
-	/*
-	 * The period is a femto seconds value. We need to calculate the
-	 * scaled math multiplication factor for nanosecond to hpet tick
-	 * conversion.
-	 */
-	hpet_freq = FSEC_PER_SEC;
-	do_div(hpet_freq, hpet_period);
-	evt->mult = div_sc((unsigned long) hpet_freq,
-				      NSEC_PER_SEC, evt->shift);
-	/* Calculate the max delta */
-	evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
-	/* 5 usec minimum reprogramming delta. */
-	evt->min_delta_ns = 5000;
-
	evt->cpumask = cpumask_of(hdev->cpu);
-	clockevents_register_device(evt);
+
+	clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+					0x7FFFFFFF);
 }
 
 #ifdef CONFIG_HPET
@@ -792,7 +760,6 @@ static struct clocksource clocksource_hpet = {
 static int hpet_clocksource_register(void)
 {
	u64 start, now;
-	u64 hpet_freq;
	cycle_t t1;
 
	/* Start the counter */
@@ -819,24 +786,7 @@ static int hpet_clocksource_register(void)
		return -ENODEV;
	}
 
-	/*
-	 * The definition of mult is (include/linux/clocksource.h)
-	 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
-	 * so we first need to convert hpet_period to ns/cyc units:
-	 * mult/2^shift = ns/cyc = hpet_period/10^6
-	 * mult = (hpet_period * 2^shift)/10^6
-	 * mult = (hpet_period << shift)/FSEC_PER_NSEC
-	 */
-
-	/* Need to convert hpet_period (fsec/cyc) to cyc/sec:
-	 *
-	 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
-	 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
-	 */
-	hpet_freq = FSEC_PER_SEC;
-	do_div(hpet_freq, hpet_period);
	clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
-
	return 0;
 }
@@ -845,7 +795,9 @@ static int hpet_clocksource_register(void)
 */
 int __init hpet_enable(void)
 {
+	unsigned long hpet_period;
	unsigned int id;
+	u64 freq;
	int i;
 
	if (!is_hpet_capable())
@@ -884,6 +836,14 @@ int __init hpet_enable(void)
		goto out_nohpet;
 
	/*
+	 * The period is a femto seconds value. Convert it to a
+	 * frequency.
+	 */
+	freq = FSEC_PER_SEC;
+	do_div(freq, hpet_period);
+	hpet_freq = freq;
+
+	/*
	 * Read the HPET ID register to retrieve the IRQ routing
	 * information and the number of channels
	 */
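The HPET conversion (and the PIT one below) replaces hand-rolled mult/shift arithmetic with clockevents_config_and_register(), which derives the scaling from a frequency in Hz. The only arithmetic left to the driver is turning the hardware's femtosecond period into that frequency; a small standalone check of the conversion (the period value is a typical 14.318 MHz HPET, used here for illustration):

#include <stdio.h>
#include <stdint.h>

#define FSEC_PER_SEC 1000000000000000ULL	/* 10^15, as in the kernel */

int main(void)
{
	uint64_t hpet_period = 69841279;	/* femtoseconds per cycle */
	uint64_t freq = FSEC_PER_SEC / hpet_period;	/* do_div() equivalent */

	printf("HPET frequency: %llu Hz\n", (unsigned long long)freq);
	return 0;	/* prints ~14318179 Hz */
}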
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e60c38cc0eed..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
 * The _current_ task is using the FPU for the first time
 * so initialize it and set the mxcsr to its default
 * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
+ * remember the current task has used the FPU.
 */
 int init_fpu(struct task_struct *tsk)
 {
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
 */
 
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/dma.h>
 
@@ -21,7 +21,7 @@
 * in asm/dma.h.
 */
 
-static int i8237A_resume(struct sys_device *dev)
+static void i8237A_resume(void)
 {
	unsigned long flags;
	int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
	enable_dma(4);
 
	release_dma_lock(flags);
-
-	return 0;
 }
 
-static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	return 0;
-}
-
-static struct sysdev_class i8237_sysdev_class = {
-	.name		= "i8237",
-	.suspend	= i8237A_suspend,
+static struct syscore_ops i8237_syscore_ops = {
	.resume		= i8237A_resume,
 };
 
-static struct sys_device device_i8237A = {
-	.id	= 0,
-	.cls	= &i8237_sysdev_class,
-};
-
-static int __init i8237A_init_sysfs(void)
+static int __init i8237A_init_ops(void)
 {
-	int error = sysdev_class_register(&i8237_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8237A);
-	return error;
+	register_syscore_ops(&i8237_syscore_ops);
+	return 0;
 }
-device_initcall(i8237A_init_sysfs);
+device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..fb66dc9e36cb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = {
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_mode	= init_pit_timer,
	.set_next_event = pit_next_event,
-	.shift		= 32,
	.irq		= 0,
 };
 
@@ -108,90 +107,12 @@ void __init setup_pit_timer(void)
	 * IO_APIC has been initialized.
	 */
	pit_ce.cpumask = cpumask_of(smp_processor_id());
-	pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
-	pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
-	pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
 
-	clockevents_register_device(&pit_ce);
+	clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
	global_clock_event = &pit_ce;
 }
 
 #ifndef CONFIG_X86_64
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(struct clocksource *cs)
-{
-	static int old_count;
-	static u32 old_jifs;
-	unsigned long flags;
-	int count;
-	u32 jifs;
-
-	raw_spin_lock_irqsave(&i8253_lock, flags);
-	/*
-	 * Although our caller may have the read side of xtime_lock,
-	 * this is now a seqlock, and we are cheating in this routine
-	 * by having side effects on state that we cannot undo if
-	 * there is a collision on the seqlock and our caller has to
-	 * retry. (Namely, old_jifs and old_count.) So we must treat
-	 * jiffies as volatile despite the lock. We read jiffies
-	 * before latching the timer count to guarantee that although
-	 * the jiffies value might be older than the count (that is,
-	 * the counter may underflow between the last point where
-	 * jiffies was incremented and the point where we latch the
-	 * count), it cannot be newer.
-	 */
-	jifs = jiffies;
-	outb_pit(0x00, PIT_MODE);	/* latch the count ASAP */
-	count = inb_pit(PIT_CH0);	/* read the latched count */
-	count |= inb_pit(PIT_CH0) << 8;
-
-	/* VIA686a test code... reset the latch if count > max + 1 */
-	if (count > LATCH) {
-		outb_pit(0x34, PIT_MODE);
-		outb_pit(LATCH & 0xff, PIT_CH0);
-		outb_pit(LATCH >> 8, PIT_CH0);
-		count = LATCH - 1;
-	}
-
-	/*
-	 * It's possible for count to appear to go the wrong way for a
-	 * couple of reasons:
-	 *
-	 *  1. The timer counter underflows, but we haven't handled the
-	 *     resulting interrupt and incremented jiffies yet.
-	 *  2. Hardware problem with the timer, not giving us continuous time,
-	 *     the counter does small "jumps" upwards on some Pentium systems,
-	 *     (see c't 95/10 page 335 for Neptun bug.)
-	 *
-	 * Previous attempts to handle these cases intelligently were
-	 * buggy, so we just do the simple thing now.
-	 */
-	if (count > old_count && jifs == old_jifs)
-		count = old_count;
-
-	old_count = count;
-	old_jifs = jifs;
-
-	raw_spin_unlock_irqrestore(&i8253_lock, flags);
-
-	count = (LATCH - 1) - count;
-
-	return (cycle_t)(jifs * LATCH) + count;
-}
-
-static struct clocksource pit_cs = {
-	.name		= "pit",
-	.rating		= 110,
-	.read		= pit_read,
-	.mask		= CLOCKSOURCE_MASK(32),
-	.mult		= 0,
-	.shift		= 20,
-};
-
 static int __init init_pit_clocksource(void)
 {
	/*
@@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void)
	    pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
		return 0;
 
-	pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
-	return clocksource_register(&pit_cs);
+	return clocksource_i8253_init();
 }
 arch_initcall(init_pit_clocksource);
-
 #endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 20757cb2efa3..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
 #include <linux/random.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/bitops.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
@@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq)
 {
	disable_irq_nosync(irq);
	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+	irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
				      i8259A_chip.name);
	enable_irq(irq);
 }
@@ -245,20 +245,19 @@ static void save_ELCR(char *trigger)
	trigger[1] = inb(0x4d1) & 0xDE;
 }
 
-static int i8259A_resume(struct sys_device *dev)
+static void i8259A_resume(void)
 {
	init_8259A(i8259A_auto_eoi);
	restore_ELCR(irq_trigger);
-	return 0;
 }
 
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+static int i8259A_suspend(void)
 {
	save_ELCR(irq_trigger);
	return 0;
 }
 
-static int i8259A_shutdown(struct sys_device *dev)
+static void i8259A_shutdown(void)
 {
	/* Put the i8259A into a quiescent state that
	 * the kernel initialization code can get it
@@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
	 */
	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
 }
 
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
+static struct syscore_ops i8259_syscore_ops = {
	.suspend = i8259A_suspend,
	.resume = i8259A_resume,
	.shutdown = i8259A_shutdown,
 };
 
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
 static void mask_8259A(void)
 {
	unsigned long flags;
@@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
 
 struct legacy_pic *legacy_pic = &default_legacy_pic;
 
-static int __init i8259A_init_sysfs(void)
+static int __init i8259A_init_ops(void)
 {
-	int error;
-
-	if (legacy_pic != &default_legacy_pic)
-		return 0;
+	if (legacy_pic == &default_legacy_pic)
+		register_syscore_ops(&i8259_syscore_ops);
 
-	error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
+	return 0;
 }
-device_initcall(i8259A_init_sysfs);
+device_initcall(i8259A_init_ops);
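The i8237, i8259, and (later) microcode hunks all follow the same sysdev-to-syscore conversion: one global struct syscore_ops with plain void callbacks replaces a sysdev class plus device pair. The boiled-down pattern, with a placeholder driver name ("foo" is not from the diff):

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Hedged sketch of the conversion pattern used throughout this series. */
static void foo_resume(void)
{
	/* reprogram hardware state lost across suspend */
}

static struct syscore_ops foo_syscore_ops = {
	.resume = foo_resume,	/* .suspend/.shutdown are optional */
};

static int __init foo_init_ops(void)
{
	register_syscore_ops(&foo_syscore_ops);
	return 0;
}
device_initcall(foo_init_ops);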
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
 #include <linux/slab.h>
 #include <linux/thread_info.h>
 #include <linux/syscalls.h>
+#include <linux/bitmap.h>
 #include <asm/syscalls.h>
 
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base,
-		       unsigned int extent, int new_value)
-{
-	unsigned int i;
-
-	for (i = base; i < base + extent; i++) {
-		if (new_value)
-			__set_bit(i, bitmap);
-		else
-			__clear_bit(i, bitmap);
-	}
-}
-
 /*
 * this changes the io permissions bitmap in the current task.
 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
	 */
	tss = &per_cpu(init_tss, get_cpu());
 
-	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+	if (turn_on)
+		bitmap_clear(t->io_bitmap_ptr, from, num);
+	else
+		bitmap_set(t->io_bitmap_ptr, from, num);
 
	/*
	 * Search for a (possibly new) maximum. This is simple and stupid,
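Note the inversion in the ioport.c hunk: in the TSS I/O permission bitmap a cleared bit grants access, so turn_on maps to bitmap_clear(). A toy, single-word stand-in for the generic helpers that makes the polarity visible (illustrative only; the real bitmap_set/bitmap_clear span multiple words):

#include <stdio.h>

static void bitmap_set(unsigned long *map, unsigned s, unsigned n)
{
	while (n--)
		*map |= 1UL << s++;
}

static void bitmap_clear(unsigned long *map, unsigned s, unsigned n)
{
	while (n--)
		*map &= ~(1UL << s++);
}

int main(void)
{
	unsigned long io_bitmap = ~0UL;		/* all ports denied */

	bitmap_clear(&io_bitmap, 4, 3);		/* "turn_on" ports 4..6 */
	printf("%#lx\n", io_bitmap);
	return 0;
}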
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 387b6a0c9e81..6c0802eb2f7f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -8,6 +8,7 @@
 #include <linux/seq_file.h>
 #include <linux/smp.h>
 #include <linux/ftrace.h>
+#include <linux/delay.h>
 
 #include <asm/apic.h>
 #include <asm/io_apic.h>
@@ -44,9 +45,9 @@ void ack_bad_irq(unsigned int irq)
 #define irq_stats(x)		(&per_cpu(irq_stat, x))
 /*
- * /proc/interrupts printing:
+ * /proc/interrupts printing for arch specific interrupts
 */
-static int show_other_interrupts(struct seq_file *p, int prec)
+int arch_show_interrupts(struct seq_file *p, int prec)
 {
	int j;
 
@@ -122,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
	return 0;
 }
 
-int show_interrupts(struct seq_file *p, void *v)
-{
-	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j, prec;
-	struct irqaction *action;
-	struct irq_desc *desc;
-
-	if (i > nr_irqs)
-		return 0;
-
-	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
-		j *= 10;
-
-	if (i == nr_irqs)
-		return show_other_interrupts(p, prec);
-
-	/* print header */
-	if (i == 0) {
-		seq_printf(p, "%*s", prec + 8, "");
-		for_each_online_cpu(j)
-			seq_printf(p, "CPU%-8d", j);
-		seq_putc(p, '\n');
-	}
-
-	desc = irq_to_desc(i);
-	if (!desc)
-		return 0;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	for_each_online_cpu(j)
-		any_count |= kstat_irqs_cpu(i, j);
-	action = desc->action;
-	if (!action && !any_count)
-		goto out;
-
-	seq_printf(p, "%*d: ", prec, i);
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-	seq_printf(p, " %8s", desc->irq_data.chip->name);
-	seq_printf(p, "-%-8s", desc->name);
-
-	if (action) {
-		seq_printf(p, " %s", action->name);
-		while ((action = action->next) != NULL)
-			seq_printf(p, ", %s", action->name);
-	}
-
-	seq_putc(p, '\n');
-out:
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	return 0;
-}
-
 /*
 * /proc/stat helpers
 */
@@ -276,15 +224,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
 
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
-#ifdef CONFIG_OF
-unsigned int irq_create_of_mapping(struct device_node *controller,
-		const u32 *intspec, unsigned int intsize)
-{
-	return intspec[0];
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-#endif
-
 #ifdef CONFIG_HOTPLUG_CPU
 /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
 void fixup_irqs(void)
@@ -293,6 +232,7 @@ void fixup_irqs(void)
	static int warned;
	struct irq_desc *desc;
	struct irq_data *data;
+	struct irq_chip *chip;
 
	for_each_irq_desc(irq, desc) {
		int break_affinity = 0;
@@ -307,10 +247,10 @@ void fixup_irqs(void)
		/* interrupt's are disabled at this point */
		raw_spin_lock(&desc->lock);
 
-		data = &desc->irq_data;
+		data = irq_desc_get_irq_data(desc);
		affinity = data->affinity;
-		if (!irq_has_action(irq) ||
-		    cpumask_equal(affinity, cpu_online_mask)) {
+		if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
+		    cpumask_subset(affinity, cpu_online_mask)) {
			raw_spin_unlock(&desc->lock);
			continue;
		}
@@ -327,16 +267,18 @@ void fixup_irqs(void)
			affinity = cpu_all_mask;
		}
 
-		if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
-			data->chip->irq_mask(data);
+		chip = irq_data_get_irq_chip(data);
+		if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
+			chip->irq_mask(data);
 
-		if (data->chip->irq_set_affinity)
-			data->chip->irq_set_affinity(data, affinity, true);
+		if (chip->irq_set_affinity)
+			chip->irq_set_affinity(data, affinity, true);
		else if (!(warned++))
			set_affinity = 0;
 
-		if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
-			data->chip->irq_unmask(data);
+		if (!irqd_can_move_in_process_context(data) &&
+		    !irqd_irq_disabled(data) && chip->irq_unmask)
+			chip->irq_unmask(data);
 
		raw_spin_unlock(&desc->lock);
 
@@ -368,10 +310,11 @@ void fixup_irqs(void)
		irq = __this_cpu_read(vector_irq[vector]);
 
		desc = irq_to_desc(irq);
-		data = &desc->irq_data;
+		data = irq_desc_get_irq_data(desc);
+		chip = irq_data_get_irq_chip(data);
		raw_spin_lock(&desc->lock);
-		if (data->chip->irq_retrigger)
-			data->chip->irq_retrigger(data);
+		if (chip->irq_retrigger)
+			chip->irq_retrigger(data);
		raw_spin_unlock(&desc->lock);
	}
 }
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9974d21048fd..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -172,7 +172,7 @@ asmlinkage void do_softirq(void)
		call_on_stack(__do_softirq, isp);
		/*
-		 * Shouldnt happen, we returned above if in_interrupt():
+		 * Shouldn't happen, we returned above if in_interrupt():
		 */
		WARN_ON_ONCE(softirq_count());
	}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e973958d..f470e4ef993e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
 #include <asm/setup.h>
 #include <asm/i8259.h>
 #include <asm/traps.h>
+#include <asm/prom.h>
 
 /*
 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 static struct irqaction fpu_irq = {
	.handler = math_error_irq,
	.name = "fpu",
+	.flags = IRQF_NO_THREAD,
 };
 #endif
 
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
 static struct irqaction irq2 = {
	.handler = no_action,
	.name = "cascade",
+	.flags = IRQF_NO_THREAD,
 };
 
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -110,7 +113,7 @@ void __init init_ISA_irqs(void)
	legacy_pic->init(0);
 
	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
-		set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
+		irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
 }
 
 void __init init_IRQ(void)
@@ -118,6 +121,12 @@ void __init init_IRQ(void)
	int i;
 
	/*
+	 * We probably need a better place for this, but it works for
+	 * now ...
+	 */
+	x86_add_irq_domains();
+
+	/*
	 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
	 * then this configuration will likely be static after the boot. If
@@ -164,14 +173,77 @@ static void __init smp_intr_init(void)
	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+#define ALLOC_INVTLB_VEC(NR) \
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
+		invalidate_interrupt##NR)
+
+	switch (NUM_INVALIDATE_TLB_VECTORS) {
+	default:
+		ALLOC_INVTLB_VEC(31);
+	case 31:
+		ALLOC_INVTLB_VEC(30);
+	case 30:
+		ALLOC_INVTLB_VEC(29);
+	case 29:
+		ALLOC_INVTLB_VEC(28);
+	case 28:
+		ALLOC_INVTLB_VEC(27);
+	case 27:
+		ALLOC_INVTLB_VEC(26);
+	case 26:
+		ALLOC_INVTLB_VEC(25);
+	case 25:
+		ALLOC_INVTLB_VEC(24);
+	case 24:
+		ALLOC_INVTLB_VEC(23);
+	case 23:
+		ALLOC_INVTLB_VEC(22);
+	case 22:
+		ALLOC_INVTLB_VEC(21);
+	case 21:
+		ALLOC_INVTLB_VEC(20);
+	case 20:
+		ALLOC_INVTLB_VEC(19);
+	case 19:
+		ALLOC_INVTLB_VEC(18);
+	case 18:
+		ALLOC_INVTLB_VEC(17);
+	case 17:
+		ALLOC_INVTLB_VEC(16);
+	case 16:
+		ALLOC_INVTLB_VEC(15);
+	case 15:
+		ALLOC_INVTLB_VEC(14);
+	case 14:
+		ALLOC_INVTLB_VEC(13);
+	case 13:
+		ALLOC_INVTLB_VEC(12);
+	case 12:
+		ALLOC_INVTLB_VEC(11);
+	case 11:
+		ALLOC_INVTLB_VEC(10);
+	case 10:
+		ALLOC_INVTLB_VEC(9);
+	case 9:
+		ALLOC_INVTLB_VEC(8);
+	case 8:
+		ALLOC_INVTLB_VEC(7);
+	case 7:
+		ALLOC_INVTLB_VEC(6);
+	case 6:
+		ALLOC_INVTLB_VEC(5);
+	case 5:
+		ALLOC_INVTLB_VEC(4);
+	case 4:
+		ALLOC_INVTLB_VEC(3);
+	case 3:
+		ALLOC_INVTLB_VEC(2);
+	case 2:
+		ALLOC_INVTLB_VEC(1);
+	case 1:
+		ALLOC_INVTLB_VEC(0);
+		break;
+	}
 
	/* IPI for generic function call */
	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -243,7 +315,7 @@ void __init native_init_IRQ(void)
		set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
	}
 
-	if (!acpi_ioapic)
+	if (!acpi_ioapic && !of_ioapic)
		setup_irq(2, &irq2);
 
 #ifdef CONFIG_X86_32
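The ALLOC_INVTLB_VEC switch above uses case fallthrough so that exactly NUM_INVALIDATE_TLB_VECTORS gates are allocated at compile-configured count, mirroring the .irp/.if emission in entry_64.S. The idiom in miniature (self-contained; the count and names are illustrative):

#include <stdio.h>

#define NUM_VECTORS 3	/* stand-in for NUM_INVALIDATE_TLB_VECTORS */

static void alloc_vec(int nr) { printf("alloc vector %d\n", nr); }

int main(void)
{
	/* Duff's-device-style: enter at the matching case and fall
	 * through to 0, so vectors NUM_VECTORS-1 .. 0 all get set up. */
	switch (NUM_VECTORS) {
	default:
		alloc_vec(3);
	case 3:
		alloc_vec(2);
	case 2:
		alloc_vec(1);
	case 1:
		alloc_vec(0);
		break;
	}
	return 0;
}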
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 961b6b30ba90..3fee346ef545 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
		code.offset = entry->target -
				(entry->code + JUMP_LABEL_NOP_SIZE);
	} else
-		memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+		memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
	get_online_cpus();
	mutex_lock(&text_mutex);
	text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
@@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry,
 
 void arch_jump_label_text_poke_early(jump_label_t addr)
 {
-	text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+	text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
+			JUMP_LABEL_NOP_SIZE);
 }
 #endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a4130005028a..5f9ecff328b5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -121,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
				dbg_reg_def[regno].size);
 
-	switch (regno) {
 #ifdef CONFIG_X86_32
+	switch (regno) {
	case GDB_SS:
		if (!user_mode_vm(regs))
			*(unsigned long *)mem = __KERNEL_DS;
@@ -135,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
	case GDB_FS:
		*(unsigned long *)mem = 0xFFFF;
		break;
-#endif
	}
+#endif
	return dbg_reg_def[regno].name;
 }
 
@@ -278,7 +278,7 @@ static int hw_break_release_slot(int breakno)
		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
		if (dbg_release_bp_slot(*pevent))
			/*
-			 * The debugger is responisble for handing the retry on
+			 * The debugger is responsible for handing the retry on
			 * remove failure.
			 */
			return -1;
@@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
		}
		return NOTIFY_DONE;
 
-	case DIE_NMIWATCHDOG:
-		if (atomic_read(&kgdb_active) != -1) {
-			/* KGDB CPU roundup: */
-			kgdb_nmicallback(raw_smp_processor_id(), regs);
-			return NOTIFY_STOP;
-		}
-		/* Enter debugger: */
-		break;
-
	case DIE_DEBUG:
		if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
			if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477b3f62..f1a6244d7d93 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
					 struct pt_regs *regs)
 {
	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	unsigned long flags;
 
	/* This is possible if op is under delayed unoptimizing */
	if (kprobe_disabled(&op->kp))
		return;
 
-	preempt_disable();
+	local_irq_save(flags);
	if (kprobe_running()) {
		kprobes_inc_nmissed_count(&op->kp);
	} else {
@@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
		opt_pre_handler(&op->kp, regs);
		__this_cpu_write(current_kprobe, NULL);
	}
-	preempt_enable_no_resched();
+	local_irq_restore(flags);
 }
 
 static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
@@ -1276,6 +1277,14 @@ static int __kprobes can_optimize(unsigned long paddr)
	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
		return 0;
 
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long )__entry_text_start) &&
+	    (paddr <  (unsigned long )__entry_text_end))
+		return 0;
+
	/* Check there is enough space for a relative jump. */
	if (size - offset < RELATIVEJUMP_SIZE)
		return 0;
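Two points in the kprobes hunk tie into the new .entry.text section introduced earlier in this series: optimized probes now run their callback with interrupts disabled (local_irq_save instead of preempt_disable), and can_optimize() refuses any probe address inside the entry code, whose stack layout the jump optimizer cannot track. A standalone sketch of that bounds test (the section limits are illustrative stand-ins for the linker symbols):

#include <stdbool.h>
#include <stdio.h>

static unsigned long entry_text_start = 0x1000, entry_text_end = 0x2000;

static bool can_optimize(unsigned long paddr)
{
	/* entry code has unstable stack handling: never jump-optimize it */
	if (paddr >= entry_text_start && paddr < entry_text_end)
		return false;
	return true;	/* further size/offset checks elided in this sketch */
}

int main(void)
{
	printf("%d %d\n", can_optimize(0x1800), can_optimize(0x3000)); /* 0 1 */
	return 0;
}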
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8dc44662394b..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
	native_smp_prepare_boot_cpu();
 }
 
-static void kvm_guest_cpu_online(void *dummy)
+static void __cpuinit kvm_guest_cpu_online(void *dummy)
 {
	kvm_guest_cpu_init();
 }
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07a..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
 
-#define KVM_SCALE 22
-
 static int kvmclock = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
	.read = kvm_clock_get_cycles,
	.rating = 400,
	.mask = CLOCKSOURCE_MASK(64),
-	.mult = 1 << KVM_SCALE,
-	.shift = KVM_SCALE,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -203,7 +199,7 @@ void __init kvmclock_init(void)
	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
	kvm_get_preset_lpj();
-	clocksource_register(&kvm_clock);
+	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
	pv_info.paravirt_enabled = 1;
	pv_info.name = "KVM";
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
	/*
	 * WARNING: Be careful when making changes here. Putting an adapter
	 * and the motherboard simultaneously into setup mode may result in
-	 * damage to chips (according to The Indispensible PC Hardware Book
+	 * damage to chips (according to The Indispensable PC Hardware Book
	 * by Hans-Peter Messmer). Also, we disable system interrupts (so
	 * that we are not disturbed in the middle of this).
	 */
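The kvmclock change above is the same clean-up as the HPET clocksource: instead of hard-coding mult = 1 << 22 and shift = 22, it hands clocksource_register_hz() the counter frequency (NSEC_PER_SEC, since kvmclock counts nanoseconds) and lets the core derive the scaling. The relationship being delegated, checked in plain C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* For a 1 GHz (nanosecond) counter, the old hard-coded pair
	 * mult = 1 << 22, shift = 22 is exactly the identity ns == cycles: */
	uint32_t shift = 22, mult = 1u << 22;
	uint64_t cycles = 123456789;
	uint64_t ns = (cycles * mult) >> shift;	/* clocksource math */

	printf("%llu cycles -> %llu ns\n",
	       (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}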
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 0fe6d1a66c38..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
	unsigned int mpb[0];
 };
 
-#define UCODE_MAX_SIZE			2048
 #define UCODE_CONTAINER_SECTION_HDR	8
 #define UCODE_CONTAINER_HEADER_SIZE	12
 
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
	struct cpuinfo_x86 *c = &cpu_data(cpu);
	u32 dummy;
 
-	memset(csig, 0, sizeof(*csig));
	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-		pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
-			   "supported\n", cpu, c->x86);
+		pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
		return -1;
	}
+
	rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
-	pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
+	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
+
	return 0;
 }
 
-static int get_matching_microcode(int cpu, void *mc, int rev)
+static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
+				  int rev)
 {
-	struct microcode_header_amd *mc_header = mc;
	unsigned int current_cpu_id;
	u16 equiv_cpu_id = 0;
	unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
	if (!equiv_cpu_id)
		return 0;
 
-	if (mc_header->processor_rev_id != equiv_cpu_id)
+	if (mc_hdr->processor_rev_id != equiv_cpu_id)
		return 0;
 
	/* ucode might be chipset specific -- currently we don't support this */
-	if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
-		pr_err("CPU%d: loading of chipset specific code not yet supported\n",
+	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+		pr_err("CPU%d: chipset specific code not yet supported\n",
		       cpu);
		return 0;
	}
 
-	if (mc_header->patch_id <= rev)
+	if (mc_hdr->patch_id <= rev)
		return 0;
 
	return 1;
@@ -144,71 +143,93 @@ static int apply_microcode_amd(int cpu)
 
	/* check current patch id and patch's id for match */
	if (rev != mc_amd->hdr.patch_id) {
-		pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
+		pr_err("CPU%d: update failed for patch_level=0x%08x\n",
		       cpu, mc_amd->hdr.patch_id);
		return -1;
	}
 
-	pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
+	pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
	uci->cpu_sig.rev = rev;
 
	return 0;
 }
 
-static void *
-get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
+static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
 {
-	unsigned int total_size;
-	u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
-	void *mc;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned int max_size, actual_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+
+	switch (c->x86) {
+	case 0x14:
+		max_size = F14H_MPB_MAX_SIZE;
+		break;
+	case 0x15:
+		max_size = F15H_MPB_MAX_SIZE;
+		break;
+	default:
+		max_size = F1XH_MPB_MAX_SIZE;
+		break;
+	}
 
-	get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
+	actual_size = buf[4] + (buf[5] << 8);
 
-	if (section_hdr[0] != UCODE_UCODE_TYPE) {
-		pr_err("error: invalid type field in container file section header\n");
-		return NULL;
+	if (actual_size > size || actual_size > max_size) {
+		pr_err("section size mismatch\n");
+		return 0;
	}
 
-	total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
+	return actual_size;
+}
 
-	if (total_size > size || total_size > UCODE_MAX_SIZE) {
-		pr_err("error: size mismatch\n");
-		return NULL;
+static struct microcode_header_amd *
+get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
+{
+	struct microcode_header_amd *mc = NULL;
+	unsigned int actual_size = 0;
+
+	if (buf[0] != UCODE_UCODE_TYPE) {
+		pr_err("invalid type field in container file section header\n");
+		goto out;
	}
 
-	mc = vzalloc(UCODE_MAX_SIZE);
+	actual_size = verify_ucode_size(cpu, buf, size);
+	if (!actual_size)
+		goto out;
+
+	mc = vzalloc(actual_size);
	if (!mc)
-		return NULL;
+		goto out;
 
-	get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
-	*mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+	get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
+	*mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
 
+out:
	return mc;
 }
 
 static int install_equiv_cpu_table(const u8 *buf)
 {
-	u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
-	unsigned int *buf_pos = (unsigned int *)container_hdr;
-	unsigned long size;
-
-	get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
-
-	size = buf_pos[2];
-
-	if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
-		pr_err("error: invalid type field in container file section header\n");
-		return 0;
+	unsigned int *ibuf = (unsigned int *)buf;
+	unsigned int type = ibuf[1];
+	unsigned int size = ibuf[2];
+
+	if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+		pr_err("empty section/"
+		       "invalid type field in container file section header\n");
+		return -EINVAL;
	}
 
	equiv_cpu_table = vmalloc(size);
	if (!equiv_cpu_table) {
		pr_err("failed to allocate equivalent CPU table\n");
-		return 0;
+		return -ENOMEM;
	}
 
-	buf += UCODE_CONTAINER_HEADER_SIZE;
-	get_ucode_data(equiv_cpu_table, buf, size);
+	get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
 
	return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
 }
@@ -223,16 +244,16 @@ static enum ucode_state
 generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct microcode_header_amd *mc_hdr = NULL;
+	unsigned int mc_size, leftover;
+	int offset;
	const u8 *ucode_ptr = data;
	void *new_mc = NULL;
-	void *mc;
-	int new_rev = uci->cpu_sig.rev;
-	unsigned int leftover;
-	unsigned long offset;
+	unsigned int new_rev = uci->cpu_sig.rev;
	enum ucode_state state = UCODE_OK;
 
	offset = install_equiv_cpu_table(ucode_ptr);
-	if (!offset) {
+	if (offset < 0) {
		pr_err("failed to create equivalent cpu table\n");
		return UCODE_ERROR;
	}
@@ -241,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
	leftover = size - offset;
 
	while (leftover) {
-		unsigned int uninitialized_var(mc_size);
-		struct microcode_header_amd *mc_header;
-
-		mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
-		if (!mc)
+		mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
+		if (!mc_hdr)
			break;
 
-		mc_header = (struct microcode_header_amd *)mc;
-		if (get_matching_microcode(cpu, mc, new_rev)) {
+		if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
			vfree(new_mc);
-			new_rev = mc_header->patch_id;
-			new_mc  = mc;
+			new_rev = mc_hdr->patch_id;
+			new_mc  = mc_hdr;
		} else
-			vfree(mc);
+			vfree(mc_hdr);
 
		ucode_ptr += mc_size;
		leftover  -= mc_size;
	}
 
-	if (new_mc) {
-		if (!leftover) {
-			vfree(uci->mc);
-			uci->mc = new_mc;
-			pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
-				 cpu, new_rev, uci->cpu_sig.rev);
-		} else {
-			vfree(new_mc);
-			state = UCODE_ERROR;
-		}
-	} else
+	if (!new_mc) {
		state = UCODE_NFOUND;
+		goto free_table;
+	}
 
+	if (!leftover) {
+		vfree(uci->mc);
+		uci->mc = new_mc;
+		pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
+			 cpu, uci->cpu_sig.rev, new_rev);
+	} else {
+		vfree(new_mc);
+		state = UCODE_ERROR;
+	}
 
+free_table:
	free_equiv_cpu_table();
 
	return state;
 }
 
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
 {
	const char *fw_name = "amd-ucode/microcode_amd.bin";
-	const struct firmware *firmware;
-	enum ucode_state ret;
+	const struct firmware *fw;
+	enum ucode_state ret = UCODE_NFOUND;
 
-	if (request_firmware(&firmware, fw_name, device)) {
-		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-		return UCODE_NFOUND;
+	if (request_firmware(&fw, fw_name, device)) {
+		pr_err("failed to load file %s\n", fw_name);
+		goto out;
	}
 
-	if (*(u32 *)firmware->data != UCODE_MAGIC) {
-		pr_err("invalid UCODE_MAGIC (0x%08x)\n",
-		       *(u32 *)firmware->data);
-		return UCODE_ERROR;
+	ret = UCODE_ERROR;
+	if (*(u32 *)fw->data != UCODE_MAGIC) {
+		pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
+		goto fw_release;
	}
 
-	ret = generic_load_microcode(cpu, firmware->data, firmware->size);
+	ret = generic_load_microcode(cpu, fw->data, fw->size);
 
-	release_firmware(firmware);
+fw_release:
+	release_firmware(fw);
 
+out:
	return ret;
 }
@@ -319,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
 
 static struct microcode_ops microcode_amd_ops = {
	.request_microcode_user           = request_microcode_user,
-	.request_microcode_fw             = request_microcode_fw,
+	.request_microcode_fw             = request_microcode_amd,
	.collect_cpu_info                 = collect_cpu_info_amd,
	.apply_microcode                  = apply_microcode_amd,
	.microcode_fini_cpu               = microcode_fini_cpu_amd,
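verify_ucode_size() above bounds each microcode patch by a per-family maximum instead of the old single UCODE_MAX_SIZE, reading the section length from two little-endian bytes at offsets 4 and 5. A self-contained version of that check (family limits copied from the diff; everything else is illustrative):

#include <stdio.h>

static unsigned int max_mpb_size(unsigned int family)
{
	switch (family) {
	case 0x14: return 1824;		/* F14H_MPB_MAX_SIZE */
	case 0x15: return 4096;		/* F15H_MPB_MAX_SIZE */
	default:   return 2048;		/* F1XH_MPB_MAX_SIZE */
	}
}

static unsigned int verify_ucode_size(unsigned int family,
				      const unsigned char *buf,
				      unsigned int size)
{
	/* section length lives in two little-endian bytes at offset 4/5 */
	unsigned int actual = buf[4] + (buf[5] << 8);

	if (actual > size || actual > max_mpb_size(family))
		return 0;	/* caller treats 0 as "size mismatch" */
	return actual;
}

int main(void)
{
	unsigned char hdr[8] = { 0, 0, 0, 0, 0x00, 0x08, 0, 0 };	/* 2048 */

	printf("%u %u\n", verify_ucode_size(0x15, hdr, 4096),
			  verify_ucode_size(0x14, hdr, 4096));	/* 2048 0 */
	return 0;
}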
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 1cca374a2bac..f9242800bc84 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -82,6 +82,7 @@
 #include <linux/cpu.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -417,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
	if (err)
		return err;
 
-	if (microcode_init_cpu(cpu) == UCODE_ERROR)
-		err = -EINVAL;
+	if (microcode_init_cpu(cpu) == UCODE_ERROR) {
+		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		return -EINVAL;
+	}
 
	return err;
 }
@@ -436,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
	return 0;
 }
 
-static int mc_sysdev_resume(struct sys_device *dev)
+static struct sysdev_driver mc_sysdev_driver = {
+	.add			= mc_sysdev_add,
+	.remove			= mc_sysdev_remove,
+};
+
+/**
+ * mc_bp_resume - Update boot CPU microcode during resume.
+ */
+static void mc_bp_resume(void)
 {
-	int cpu = dev->id;
+	int cpu = smp_processor_id();
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-	if (!cpu_online(cpu))
-		return 0;
-
-	/*
-	 * All non-bootup cpus are still disabled,
-	 * so only CPU 0 will apply ucode here.
-	 *
-	 * Moreover, there can be no concurrent
-	 * updates from any other places at this point.
-	 */
-	WARN_ON(cpu != 0);
-
	if (uci->valid && uci->mc)
		microcode_ops->apply_microcode(cpu);
-
-	return 0;
 }
 
-static struct sysdev_driver mc_sysdev_driver = {
-	.add			= mc_sysdev_add,
-	.remove			= mc_sysdev_remove,
-	.resume			= mc_sysdev_resume,
+static struct syscore_ops mc_syscore_ops = {
+	.resume			= mc_bp_resume,
 };
 
 static __cpuinit int
@@ -540,6 +535,7 @@ static int __init microcode_init(void)
	if (error)
		return error;
 
+	register_syscore_ops(&mc_syscore_ops);
	register_hotcpu_notifier(&mc_cpu_notifier);
 
	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
@@ -554,6 +550,7 @@ static void __exit microcode_exit(void)
	microcode_dev_exit();
 
	unregister_hotcpu_notifier(&mc_cpu_notifier);
+	unregister_syscore_ops(&mc_syscore_ops);
 
	get_online_cpus();
	mutex_lock(&microcode_mutex);
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index ab23f1ad4bf1..52f256f2cc81 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
+#include <linux/jump_label.h>
 
 #include <asm/system.h>
 #include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 01b0f6d06451..9103b89c145a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
	intsrc.type = MP_INTSRC;
	intsrc.irqflag = 0;	/* conforming */
	intsrc.srcbus = 0;
-	intsrc.dstapic = mp_ioapics[0].apicid;
+	intsrc.dstapic = mpc_ioapic_id(0);
 
	intsrc.irqtype = mp_INT;
@@ -714,23 +714,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
			*nr_m_spare += 1;
		}
	}
-#else /* CONFIG_X86_IO_APIC */
-static
-inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
-#endif /* CONFIG_X86_IO_APIC */
 
-static int
+static int __init
 check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 {
-	int ret = 0;
-
	if (!mpc_new_phys || count <= mpc_new_length) {
		WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
		return -1;
	}
 
-	return ret;
+	return 0;
 }
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
					unsigned long mpc_new_phys,
@@ -883,7 +881,7 @@ static int __init update_mp_table(void)
	if (!mpc_new_phys) {
		unsigned char old, new;
-		/* check if we can change the postion */
+		/* check if we can change the position */
		mpc->checksum = 0;
		old = mpf_checksum((unsigned char *)mpc, mpc->length);
		mpc->checksum = 0xff;
@@ -892,7 +890,7 @@ static int __init update_mp_table(void)
			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
			return 0;
		}
-		printk(KERN_INFO "use in-positon replacing\n");
+		printk(KERN_INFO "use in-position replacing\n");
	} else {
		mpf->physptr = mpc_new_phys;
		mpc_new = phys_to_virt(mpc_new_phys);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index f56a117cef68..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
 
	if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
		/*
-		 * FIXME: properly scan for devices accross the
+		 * FIXME: properly scan for devices across the
		 * PCI-to-PCI bridge on every CalIOC2 port.
		 */
		return 1;
@@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
 
 /*
 * calgary_init_bitmap_from_tce_table():
- * Funtion for kdump case. In the second/kdump kernel initialize
+ * Function for kdump case. In the second/kdump kernel initialize
 * the bitmap based on the tce table entries obtained from first kernel
 */
 static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9ea999a4dcc1..b49d00da2aed 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_mask);
 
-#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
-static __initdata void *dma32_bootmem_ptr;
-static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-
-static int __init parse_dma32_size_opt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	dma32_bootmem_size = memparse(p, &p);
-	return 0;
-}
-early_param("dma32_size", parse_dma32_size_opt);
-
-void __init dma32_reserve_bootmem(void)
-{
-	unsigned long size, align;
-
-	if (max_pfn <= MAX_DMA32_PFN)
-		return;
-
-	/*
-	 * check aperture_64.c allocate_aperture() for reason about
-	 * using 512M as goal
-	 */
-	align = 64ULL<<20;
-	size = roundup(dma32_bootmem_size, align);
-	dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
-				 512ULL<<20);
-
-	/*
-	 * Kmemleak should not scan this block as it may not be mapped via the
-	 * kernel direct mapping.
-	 */
-	kmemleak_ignore(dma32_bootmem_ptr);
-	if (dma32_bootmem_ptr)
-		dma32_bootmem_size = size;
-	else
-		dma32_bootmem_size = 0;
-}
-static void __init dma32_free_bootmem(void)
-{
-
-	if (max_pfn <= MAX_DMA32_PFN)
-		return;
-
-	if (!dma32_bootmem_ptr)
-		return;
-
-	free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
-
-	dma32_bootmem_ptr = NULL;
-	dma32_bootmem_size = 0;
-}
-#else
-void __init dma32_reserve_bootmem(void)
-{
-}
-static void __init dma32_free_bootmem(void)
-{
-}
-
-#endif
-
 void __init pci_iommu_alloc(void)
 {
	struct iommu_table_entry *p;
 
-	/* free the range so iommu could get some range less than 4G */
-	dma32_free_bootmem();
-
	sort_iommu_table(__iommu_table, __iommu_table_end);
	check_iommu_entries(__iommu_table, __iommu_table_end);
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec1181..35ccf75696eb 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
				struct iommu_table_entry *finish)
 {
	struct iommu_table_entry *p, *q, *x;
-	char sym_p[KSYM_SYMBOL_LEN];
-	char sym_q[KSYM_SYMBOL_LEN];
 
	/* Simple cyclic dependency checker. */
	for (p = start; p < finish; p++) {
		q = find_dependents_of(start, finish, p);
		x = find_dependents_of(start, finish, q);
		if (p == x) {
-			sprint_symbol(sym_p, (unsigned long)p->detect);
-			sprint_symbol(sym_q, (unsigned long)q->detect);
-
-			printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
-					" on %s and vice-versa. BREAKING IT.\n",
-					sym_p, sym_q);
+			printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+			       p->detect, q->detect);
			/* Heavy handed way..*/
			x->depend = 0;
		}
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
	for (p = start; p < finish; p++) {
		q = find_dependents_of(p, finish, p);
		if (q && q > p) {
-			sprint_symbol(sym_p, (unsigned long)p->detect);
-			sprint_symbol(sym_q, (unsigned long)q->detect);
-
-			printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
-					"should be called before %s!\n",
-					sym_p, sym_q);
+			printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+			       p->detect, q->detect);
		}
	}
 }
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
 };
 
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+	struct pci_driver *drv = pdev->driver;
+	const struct pci_device_id *id;
+
+	if (pdev->vendor == vendor && pdev->device == device)
+		return true;
+
+	for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+		if (id->vendor == vendor && id->device == device)
+			break;
+
+	return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+		       const unsigned char *rom_list)
+{
+	unsigned short device;
+
+	do {
+		if (probe_kernel_address(rom_list, device) != 0)
+			device = 0;
+
+		if (device && match_id(pdev, vendor, device))
+			break;
+
+		rom_list += 2;
+	} while (device);
+
+	return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+	struct resource *oprom = NULL;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+		struct resource *res = &adapter_rom_resources[i];
+		unsigned short offset, vendor, device, list, rev;
+		const unsigned char *rom;
+
+		if (res->end == 0)
+			break;
+
+		rom = isa_bus_to_virt(res->start);
+		if (probe_kernel_address(rom + 0x18, offset) != 0)
+			continue;
+
+		if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+			continue;
+
+		if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+			continue;
+
+		if (match_id(pdev, vendor, device)) {
+			oprom = res;
+			break;
+		}
+
+		if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+		    probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+		    rev >= 3 && list &&
+		    probe_list(pdev, vendor, rom + offset + list)) {
+			oprom = res;
+			break;
+		}
+	}
+
+	return oprom;
+}
+
+void *pci_map_biosrom(struct pci_dev *pdev)
+{
+	struct resource *oprom = find_oprom(pdev);
+
+	if (!oprom)
+		return NULL;
+
+	return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+	iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+	struct resource *oprom = find_oprom(pdev);
+
+	return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
 #define ROMSIGNATURE 0xaa55
 
 static int __init romsignature(const unsigned char *rom)
resource_size(oprom) : 0; } +EXPORT_SYMBOL(pci_biosrom_size); + #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8fb182956cbc..426a5b66f7e4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,7 +87,7 @@ void exit_thread(void) void show_regs(struct pt_regs *regs) { show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); } void show_regs_common(void) @@ -110,12 +110,9 @@ void show_regs_common(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk(KERN_CONT " "); - printk(KERN_CONT "%s %s", vendor, product); - if (board) { - printk(KERN_CONT "/"); - printk(KERN_CONT "%s", board); - } + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); printk(KERN_CONT "\n"); } @@ -454,7 +451,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); __monitor((void *)&current_thread_info()->flags, 0, 0); @@ -470,7 +467,7 @@ static void mwait_idle(void) if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); trace_cpu_idle(1, smp_processor_id()); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); __monitor((void *)&current_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b4..6c9dd922ac0d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -501,6 +501,10 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that @@ -516,6 +520,10 @@ void set_personality_ia32(void) set_thread_flag(TIF_IA32); current->personality |= force_personality32; + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45892dc4b72a..807c2a2b80f1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -655,6 +658,9 @@ restore: } goto restore; } + + ptrace_put_breakpoints(tsk); + return ((orig_ret < 0) ? 
orig_ret : rc); } @@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (n < HBP_NUM) { struct perf_event *bp; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + bp = thread->ptrace_bps[n]; if (!bp) - return 0; - val = bp->hw.info.address; + val = 0; + else + val = bp->hw.info.address; + + ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event *bp; struct thread_struct *t = &tsk->thread; struct perf_event_attr attr; + int err = 0; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); @@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) - return PTR_ERR(bp); + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto put; + } t->ptrace_bps[nr] = bp; } else { - int err; - bp = t->ptrace_bps[nr]; attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); - if (err) - return err; } - - return 0; +put: + ptrace_put_breakpoints(tsk); + return err; } /* @@ -1347,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, * We must return the syscall number to actually look up in the table. * This can be -1L to skip running any syscall at all. */ -asmregparm long syscall_trace_enter(struct pt_regs *regs) +long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; @@ -1392,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) return ret ?: regs->orig_ax; } -asmregparm void syscall_trace_leave(struct pt_regs *regs) +void syscall_trace_leave(struct pt_regs *regs) { bool step; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 715037caeb43..0c016f727695 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -6,6 +6,7 @@ #include <linux/dmi.h> #include <linux/sched.h> #include <linux/tboot.h> +#include <linux/delay.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -35,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -303,68 +304,16 @@ static int __init reboot_init(void) } core_initcall(reboot_init); -/* The following code and data reboots the machine by switching to real - mode and jumping to the BIOS reset entry point, as if the CPU has - really been reset. The previous version asked the keyboard - controller to pulse the CPU reset line, which is more thorough, but - doesn't work with at least one type of 486 motherboard. It is easy - to stop this code working; hence the copious comments. 
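The ptrace hunks above bracket every access to thread->ptrace_bps[] between ptrace_get_breakpoints() and ptrace_put_breakpoints(), so the breakpoint objects cannot be torn down while a tracer is still using them. A sketch of the resulting pattern (peek_bp_addr() is a hypothetical helper built from the same calls):

static int peek_bp_addr(struct task_struct *tsk, int nr, unsigned long *addr)
{
        struct perf_event *bp;
        int err = 0;

        if (ptrace_get_breakpoints(tsk) < 0)
                return -ESRCH;          /* tracee is already exiting */

        bp = tsk->thread.ptrace_bps[nr];
        if (bp)
                *addr = bp->hw.info.address;
        else
                err = -ENOENT;

        ptrace_put_breakpoints(tsk);    /* every get pairs with a put */
        return err;
}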
*/ -static const unsigned long long -real_mode_gdt_entries [3] = -{ - 0x0000000000000000ULL, /* Null descriptor */ - 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ -}; +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; -static const struct desc_ptr -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }; - -/* This is 16-bit protected mode code to disable paging and the cache, - switch to real mode and jump to the BIOS reset code. - - The instruction that switches to real mode by writing to CR0 must be - followed immediately by a far jump instruction, which set CS to a - valid value for real mode, and flushes the prefetch queue to avoid - running instructions that have already been decoded in protected - mode. - - Clears all the flags except ET, especially PG (paging), PE - (protected-mode enable) and TS (task switch for coprocessor state - save). Flushes the TLB after paging has been disabled. Sets CD and - NW, to disable the cache on a 486, and invalidates the cache. This - is more like the state of a 486 after reset. I don't know if - something else should be done for other chips. - - More could be done here to set up the registers as if a CPU reset had - occurred; hopefully real BIOSs don't assume much. */ -static const unsigned char real_mode_switch [] = +void machine_real_restart(unsigned int type) { - 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ - 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ - 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ - 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ - 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */ - 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */ - 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */ - 0x74, 0x02, /* jz f */ - 0x0f, 0x09, /* wbinvd */ - 0x24, 0x10, /* f: andb $0x10,al */ - 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ -}; -static const unsigned char jump_to_bios [] = -{ - 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ -}; + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; -/* - * Switch to real mode and then execute the code - * specified by the code and length parameters. - * We assume that length will aways be less that 100! - */ -void machine_real_restart(const unsigned char *code, int length) -{ local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -392,41 +341,23 @@ void machine_real_restart(const unsigned char *code, int length) too. */ *((unsigned short *)0x472) = reboot_mode; - /* For the switch to real mode, copy some code to low memory. It has - to be in the first 64k because it is running in 16-bit mode, and it - has to have the same physical and virtual address, because it turns - off paging. Copy it near the end of the first page, out of the way - of BIOS variables. */ - memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), - real_mode_switch, sizeof (real_mode_switch)); - memcpy((void *)(0x1000 - 100), code, length); - - /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); - - /* Set up a GDT from which we can load segment descriptors for real - mode. The GDT is not used in real mode; it is just needed here to - prepare the descriptors. */ - load_gdt(&real_mode_gdt); - - /* Load the data segment registers, and thus the descriptors ready for - real mode. 
The base address of each segment is 0x100, 16 times the - selector value being loaded here. This is so that the segment - registers don't have to be reloaded after switching to real mode: - the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss" : : : "eax"); - - /* Jump to the 16-bit code that we copied earlier. It disables paging - and the cache, switches to real mode, and jumps to the BIOS reset - entry point. */ - __asm__ __volatile__ ("ljmp $0x0008,%0" - : - : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); @@ -547,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } +/* + * Windows compatible x86 hardware expects the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -571,6 +517,13 @@ static void native_machine_emergency_restart(void) outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); @@ -581,7 +534,7 @@ static void native_machine_emergency_restart(void) #ifdef CONFIG_X86_32 case BOOT_BIOS: - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S new file mode 100644 index 000000000000..1d5c46df0d78 --- /dev/null +++ b/arch/x86/kernel/reboot_32.S @@ -0,0 +1,135 @@ +#include <linux/linkage.h> +#include <linux/init.h> +#include <asm/segment.h> +#include <asm/page_types.h> + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. 
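The numbered comment above becomes a small state machine over reboot_type: each pass through the emergency-restart switch either retries the previous method or demotes to the next one. A condensed sketch of the transitions (acpi_reboot() and load_idt() are the real helpers; kbd_pulse_reset() abbreviates the port-0x64 pulse loop):

static void emergency_restart_sketch(void)
{
        int attempt = 0;
        int orig_reboot_type = reboot_type;

        for (;;) {
                switch (reboot_type) {
                case BOOT_ACPI:
                        acpi_reboot();            /* steps 1 and 3 */
                        reboot_type = BOOT_KBD;
                        break;
                case BOOT_KBD:
                        kbd_pulse_reset();        /* steps 2 and 4 */
                        if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
                                attempt = 1;      /* grant ACPI one retry */
                                reboot_type = BOOT_ACPI;
                        } else {
                                reboot_type = BOOT_TRIPLE;
                        }
                        break;
                case BOOT_TRIPLE:
                        load_idt(&no_idt);        /* empty IDT, so ... */
                        asm volatile("int3");     /* #BP -> #DF -> triple fault */
                        reboot_type = BOOT_KBD;   /* still alive? go around */
                        break;
                }
        }
}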
+ */ + .section ".x86_trampoline","a" + .balign 16 + .code32 +ENTRY(machine_real_restart_asm) +r_base = . + /* Get our own relocated address */ + call 1f +1: popl %ebx + subl $(1b - r_base), %ebx + + /* Compute the equivalent real-mode segment */ + movl %ebx, %ecx + shrl $4, %ecx + + /* Patch post-real-mode segment jump */ + movw (dispatch_table - r_base)(%ebx,%eax,2),%ax + movw %ax, (101f - r_base)(%ebx) + movw %cx, (102f - r_base)(%ebx) + + /* Set up the IDT for real mode. */ + lidtl (machine_real_restart_idt - r_base)(%ebx) + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl (machine_real_restart_gdt - r_base)(%ebx) + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpl $8, $1f - r_base +
/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. + * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which sets CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .code16 +1: + xorl %ecx, %ecx + movl %cr0, %eax + andl $0x00000011, %eax + orl $0x60000000, %eax + movl %eax, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %al + movl %eax, %cr0 + .byte 0xea /* ljmpw */ +101: .word 0 /* Offset */ +102: .word 0 /* Segment */ + +bios: + ljmpw $0xf000, $0xfff0 + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + +END(machine_real_restart_asm) + + .balign 16 + /* These must match <asm/reboot.h> */ +dispatch_table: + .word bios - r_base + .word apm - r_base +END(dispatch_table) + + .balign 16 +machine_real_restart_idt: + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) + + .balign 16 +ENTRY(machine_real_restart_gdt) + .quad 0 /* Self-pointer, filled in by PM code */ + .quad 0 /* 16-bit code segment, filled in by PM code */ + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. 
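Both the lowmem_gdt[] patched in machine_real_restart() and the static descriptor just below are built with GDT_ENTRY(flags, base, limit), which scatters base and limit across the split fields of an x86 segment descriptor. A plain-C sketch of the packing (gdt_entry() is illustrative; the shifts mirror the macro in <asm/segment.h>):

#include <stdint.h>

static uint64_t gdt_entry(uint16_t flags, uint32_t base, uint32_t limit)
{
        return ((uint64_t)(base  & 0xff000000) << (56 - 24)) | /* base 31:24 */
               ((uint64_t)(flags & 0x0000f0ff) << 40)        | /* type/flags */
               ((uint64_t)(limit & 0x000f0000) << (48 - 16)) | /* limit 19:16 */
               ((uint64_t)(base  & 0x00ffffff) << 16)        | /* base 23:0 */
                (uint64_t)(limit & 0x0000ffff);                /* limit 15:0 */
}

/* Cross-check: gdt_entry(0x0093, 0x100, 0xffff) == 0x000093000100ffff,
 * the same 16-bit data descriptor that the deleted real_mode_gdt_entries[]
 * in reboot.c spelled out as a literal. */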
+ */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 6f39cab052d5..3f2ad2640d85 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -6,6 +6,7 @@ #include <linux/acpi.h> #include <linux/bcd.h> #include <linux/pnp.h> +#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void) } } #endif + if (of_have_populated_dt()) + return 0; platform_device_register(&rtc_device); dev_info(&rtc_device.dev, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3cfe26c0252..afaf38447ef5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #endif #include <asm/mce.h> #include <asm/alternative.h> +#include <asm/prom.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -297,6 +298,9 @@ static void __init init_gbpages(void) static inline void init_gbpages(void) { } +static void __init cleanup_highmap(void) +{ +} #endif static void __init reserve_brk(void) @@ -429,16 +433,30 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_memremap(pa_data, PAGE_SIZE); + u32 data_len, map_len; + + map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), + (u64)sizeof(struct setup_data)); + data = early_memremap(pa_data, map_len); + data_len = data->len + sizeof(struct setup_data); + if (data_len > map_len) { + early_iounmap(data, map_len); + data = early_memremap(pa_data, data_len); + map_len = data_len; + } + switch (data->type) { case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); + parse_e820_ext(data); + break; + case SETUP_DTB: + add_dtb(pa_data); break; default: break; } pa_data = data->next; - early_iounmap(data, PAGE_SIZE); + early_iounmap(data, map_len); } } @@ -601,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; @@ -680,15 +676,6 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static u64 __init get_max_mapped(void) -{ - u64 end = max_pfn_mapped; - - end <<= PAGE_SHIFT; - - return end; -} - /* * Determine if we were loaded by an EFI loader. 
If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -704,10 +691,6 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { - int acpi = 0; - int amd = 0; - unsigned long flags; - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -922,9 +905,18 @@ void __init setup_arch(char **cmdline_p) */ reserve_brk(); + cleanup_highmap(); + memblock.current_limit = get_max_mapped(); memblock_x86_fill(); + /* + * The EFI specification says that boot service code won't be called + * after ExitBootServices(). This is, in fact, a lie. + */ + if (efi_enabled) + efi_reserve_boot_services(); + /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); @@ -935,15 +927,8 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); - reserve_trampoline_memory(); + setup_trampolines(); -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - * even before init_memory_mapping - */ - acpi_reserve_wakeup_memory(); -#endif init_gbpages(); /* max_pfn_mapped is updated here */ @@ -968,6 +953,8 @@ void __init setup_arch(char **cmdline_p) if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); #endif + /* Allocate bigger log buffer */ + setup_log_buf(1); reserve_initrd(); @@ -984,21 +971,8 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi = acpi_numa_init(); -#endif - -#ifdef CONFIG_AMD_NUMA - if (!acpi) - amd = !amd_numa_init(0, max_pfn); -#endif - - initmem_init(0, max_pfn, acpi, amd); + initmem_init(); memblock_find_dma_reserve(); - dma32_reserve_bootmem(); #ifdef CONFIG_KVM_CLOCK kvmclock_init(); @@ -1008,6 +982,11 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + if (boot_cpu_data.cpuid_level >= 0) { + /* A CPU has %cr4 if and only if it has CPUID */ + mmu_cr4_features = read_cr4(); + } + #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, @@ -1029,8 +1008,8 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. 
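The parse_setup_data() rework above stops assuming a setup_data blob fits in a single page: it first maps enough to read the header, then remaps with the real length when the payload extends past the initial window. The same map-check-remap step as a standalone sketch (consume() is a placeholder for the SETUP_* type switch):

static void __init for_each_setup_data(u64 pa_data,
                                       void (*consume)(struct setup_data *))
{
        while (pa_data) {
                u32 data_len, map_len;
                struct setup_data *data;

                /* to the end of the containing page, at least the header */
                map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
                              (u64)sizeof(struct setup_data));
                data = early_memremap(pa_data, map_len);

                data_len = data->len + sizeof(struct setup_data);
                if (data_len > map_len) {
                        /* payload spills past the window: map it whole */
                        early_iounmap(data, map_len);
                        data = early_memremap(pa_data, data_len);
                        map_len = data_len;
                }

                consume(data);
                pa_data = data->next;
                early_iounmap(data, map_len);
        }
}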
*/ acpi_boot_init(); - sfi_init(); + x86_dtb_init(); /* * get boot-time SMP configuration: @@ -1040,9 +1019,7 @@ prefill_possible_map(); -#ifdef CONFIG_X86_64 init_cpu_to_node(); -#endif init_apic_mappings(); ioapic_and_gsi_init(); @@ -1066,11 +1043,11 @@ #endif x86_init.oem.banner(); + x86_init.timers.wallclock_init(); + mcheck_init(); - local_irq_save(flags); - arch_init_ideal_nop5(); - local_irq_restore(flags); + arch_init_ideal_nops(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 002b79685f73..71f4727da373 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); #endif +#ifdef CONFIG_X86_32 + per_cpu(x86_cpu_to_logical_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); +#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void) */ set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); #endif -#endif /* * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. @@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +#ifdef CONFIG_X86_32 + early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; +#endif +#ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4fd173cd8e57..40a24932a8a1 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs) goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; @@ -682,6 +679,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + sigset_t blocked; int ret; /* Are we from a system call? */ @@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - spin_lock_irq(&current->sighand->siglock); - sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); + sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&current->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + sigaddset(&blocked, sig); + set_current_blocked(&blocked); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228d..013e7eba83bb 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. 
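The signal.c hunks above retire the open-coded siglock sections in favour of set_current_blocked(): the caller computes the complete new mask up front and the helper takes the lock and re-evaluates pending signals itself. The caller-side shape (block_sig_for_handler() is an illustrative wrapper around the real APIs):

static void block_sig_for_handler(struct k_sigaction *ka, int sig)
{
        sigset_t blocked;

        /* build the new mask without holding any lock ... */
        sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
        if (!(ka->sa.sa_flags & SA_NODEFER))
                sigaddset(&blocked, sig);

        /* ... then let the helper take siglock and recalc_sigpending() */
        set_current_blocked(&blocked);
}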
*/ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2c33633595cc..eefd96765e79 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,6 +64,7 @@ #include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -71,10 +72,6 @@ #include <asm/smpboot_hooks.h> #include <asm/i8259.h> -#ifdef CONFIG_X86_32 -u8 apicid_2_node[MAX_APICID]; -#endif - /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -/* which node each logical CPU is on */ -int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_to_node_map); - -/* set up a mapping between cpu and node. */ -static void map_cpu_to_node(int cpu, int node) -{ - printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); - cpumask_set_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = node; -} - -/* undo a mapping between cpu and node. */ -static void unmap_cpu_to_node(int cpu) -{ - int node; - - printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NUMNODES; node++) - cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = 0; -} -#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ -#define map_cpu_to_node(cpu, node) ({}) -#define unmap_cpu_to_node(cpu) ({}) -#endif - -#ifdef CONFIG_X86_32 -static int boot_cpu_logical_apicid; - -u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = - { [0 ... NR_CPUS-1] = BAD_APICID }; - -static void map_cpu_to_logical_apicid(void) -{ - int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); - int node = apic->apicid_to_node(apicid); - - if (!node_online(node)) - node = first_online_node; - - cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, node); -} - -void numa_remove_cpu(int cpu) -{ - cpu_2_logical_apicid[cpu] = BAD_APICID; - unmap_cpu_to_node(cpu); -} -#else -#define map_cpu_to_logical_apicid() do {} while (0) -#endif - /* * Report back to the Boot Processor. * Running on AP. @@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void) apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); - map_cpu_to_logical_apicid(); /* * Need to setup vector mappings before we enable interrupts. @@ -355,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -#ifdef CONFIG_CPUMASK_OFFSTACK -/* In this case, llc_shared_map is a pointer to a cpumask. */ -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - struct cpumask *llc = dst->llc_shared_map; - *dst = *src; - dst->llc_shared_map = llc; -} -#else -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - *dst = *src; -} -#endif /* CONFIG_CPUMASK_OFFSTACK */ - /* * The bootstrap kernel entry code has set these up. 
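With llc_shared_map moved out of struct cpuinfo_x86 and into the per-CPU variable declared above, smp_store_cpu_info() collapses to a plain structure assignment and the copy_cpuinfo_x86() wrappers go away. A sketch of the per-CPU mask and its accessor as the later hunks use them (the alloc/link helpers are illustrative):

DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);

static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
        return per_cpu(cpu_llc_shared_map, cpu);
}

/* one zeroed mask per possible CPU, allocated before first use */
static void __init alloc_llc_masks(void)
{
        int i;

        for_each_possible_cpu(i)
                zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i),
                                   GFP_KERNEL);
}

/* two CPUs that share a last-level cache mark each other */
static void link_llc_siblings(int cpu1, int cpu2)
{
        cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
        cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
}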
Save them for * a given CPU @@ -381,7 +306,7 @@ void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - copy_cpuinfo_x86(c, &boot_cpu_data); + *c = boot_cpu_data; c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); @@ -389,15 +314,12 @@ void __cpuinit smp_store_cpu_info(int id) static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { - struct cpuinfo_x86 *c1 = &cpu_data(cpu1); - struct cpuinfo_x86 *c2 = &cpu_data(cpu2); - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, c2->llc_shared_map); - cpumask_set_cpu(cpu2, c1->llc_shared_map); + cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); } @@ -414,6 +336,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) if (cpu_has(c, X86_FEATURE_TOPOEXT)) { if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && c->compute_unit_id == o->compute_unit_id) link_thread_siblings(cpu, i); } else if (c->phys_proc_id == o->phys_proc_id && @@ -425,7 +348,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); } - cpumask_set_cpu(cpu, c->llc_shared_map); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); @@ -436,8 +359,8 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); + cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); @@ -476,7 +399,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else - return c->llc_shared_map; + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) @@ -788,7 +711,7 @@ do_rest: stack_start = c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ - start_ip = setup_trampoline(); + start_ip = trampoline_address(); /* So we see what's up */ announce_cpu(cpu, apicid); @@ -798,6 +721,8 @@ do_rest: * the targeted processor. */ + printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); + atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { @@ -851,8 +776,8 @@ do_rest: pr_debug("CPU%d: has booted.\n", cpu); else { boot_error = 1; - if (*((volatile unsigned char *)trampoline_base) - == 0xA5) + if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) + == 0xA5A5A5A5) /* trampoline started but...? */ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -878,7 +803,7 @@ do_rest: } /* mark "stuck" area as not stuck */ - *((volatile unsigned long *)trampoline_base) = 0; + *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* @@ -945,6 +870,14 @@ int __cpuinit native_cpu_up(unsigned int cpu) return 0; } +/** + * arch_disable_smp_support() - disables SMP support for x86 at runtime + */ +void arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + /* * Fall back to non SMP mode after errors. 
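The boot handshake above moves from overwriting the trampoline's first bytes (the old *trampoline_base == 0xA5 test) to a dedicated 32-bit trampoline_status word. A sketch of the BSP side (check_ap_progress() is hypothetical; TRAMPOLINE_SYM() rebases a trampoline symbol into its low-memory copy):

extern unsigned char trampoline_status[];   /* asm symbol in the stub */

/* The 16-bit AP stub stores the marker as its very first action, so the
 * BSP can tell "started and then hung" apart from "never started". */
static void check_ap_progress(int cpu)
{
        volatile u32 *status = TRAMPOLINE_SYM(trampoline_status);

        if (*status == 0xA5A5A5A5)
                pr_err("CPU%d: Stuck ??\n", cpu);
        else
                pr_err("CPU%d: Not responding.\n", cpu);

        *status = 0;    /* re-arm the marker for the next CPU */
}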
* @@ -960,7 +893,6 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - map_cpu_to_logical_apicid(); cpumask_set_cpu(0, cpu_sibling_mask(0)); cpumask_set_cpu(0, cpu_core_mask(0)); } @@ -1045,7 +977,7 @@ static int __init smp_sanity_check(unsigned max_cpus) "(tell your hw vendor)\n"); } smpboot_clear_io_apic(); - arch_disable_smp_support(); + disable_ioapic_support(); return -1; } @@ -1089,21 +1021,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) preempt_disable(); smp_cpu_index_default(); - memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); - cpumask_copy(cpu_callin_mask, cpumask_of(0)); - mb(); + /* * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ -#ifdef CONFIG_X86_32 - boot_cpu_logical_apicid = logical_smp_processor_id(); -#endif + cpumask_copy(cpu_callin_mask, cpumask_of(0)); + mb(); + current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); @@ -1139,8 +1069,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) bsp_end_local_APIC_setup(); - map_cpu_to_logical_apicid(); - if (apic->setup_portio_remap) apic->setup_portio_remap(); @@ -1404,9 +1332,9 @@ static inline void mwait_play_dead(void) void *mwait_ptr; struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) + if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) return; - if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 938c8e10a19a..55d9bc03f696 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -9,15 +9,6 @@ #include <linux/uaccess.h> #include <asm/stacktrace.h> -static void save_stack_warning(void *data, char *msg) -{ -} - -static void -save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -} - static int save_stack_stack(void *data, char *name) { return 0; @@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address, .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address_nosched, .walk_stack = print_context_stack, @@ -73,7 +60,7 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } @@ -81,14 +68,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) { - dump_trace(current, regs, NULL, 
&save_stack_ops, trace); + dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 58de45ee08b6..7977f0cfe339 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block) * Make sure block stepping (BTF) is not enabled unless it should be. * Note that we don't try to worry about any is_setting_trap_flag() * instructions after the first when using block stepping. - * So noone should try to use debugger block stepping in a program + * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) { diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8f..fbb0a045a1a2 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,9 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at + .long sys_clock_adjtime + .long sys_syncfs + .long sys_sendmmsg /* 345 */ + .long sys_setns diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 998e972f3b1a..30ac65df7d4e 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), - .cpu_vm_mask = CPU_MASK_ALL, }; static inline void switch_to_tboot_pt(void) diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 787a5e499dd1..3f92ce07e525 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -161,7 +161,7 @@ static int test_NX(void) } #endif - return 0; + return ret; } static void test_exit(void) diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25a28a245937..00cbb272627f 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 7e4515957a1c..8927486a4649 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num) /* * CPU0 cannot be offlined due to several * restrictions and assumptions in kernel. This basically - * doesnt add a control file, one cannot attempt to offline + * doesn't add a control file, one cannot attempt to offline * BSP. 
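With the .warning callbacks gone, a stack-trace consumer now supplies just .stack, .address and .walk_stack, and dump_trace() takes the frame pointer as a new fourth argument (0 meaning "discover it from the current context"). A minimal consumer in the shape of save_stack_ops (the my_* names are illustrative):

static int my_stack(void *data, char *name)
{
        return 0;       /* 0 = keep walking into this sub-stack */
}

static void my_address(void *data, unsigned long addr, int reliable)
{
        if (reliable)
                pr_debug(" [<%p>] %pS\n", (void *)addr, (void *)addr);
}

static const struct stacktrace_ops my_ops = {
        .stack          = my_stack,
        .address        = my_address,
        .walk_stack     = print_context_stack,
};

static void show_my_trace(void)
{
        dump_trace(current, NULL, NULL, 0, &my_ops, NULL);
}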
* * Also certain PCI quirks require not to enable hotplug control diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a375616d77f7..a91ae7709b49 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -2,39 +2,41 @@ #include <linux/memblock.h> #include <asm/trampoline.h> +#include <asm/cacheflush.h> #include <asm/pgtable.h> -#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) -#define __trampinit -#define __trampinitdata -#else -#define __trampinit __cpuinit -#define __trampinitdata __cpuinitdata -#endif +unsigned char *x86_trampoline_base; -/* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base; - -void __init reserve_trampoline_memory(void) +void __init setup_trampolines(void) { phys_addr_t mem; + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); - trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); + x86_trampoline_base = __va(mem); + memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + x86_trampoline_base, (unsigned long long)mem, size); + + memcpy(x86_trampoline_base, x86_trampoline_start, size); } /* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. + * setup_trampolines() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. */ -unsigned long __trampinit setup_trampoline(void) +static int __init configure_trampolines(void) { - memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); - return virt_to_phys(trampoline_base); + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); + + set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); + return 0; } +arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 8508237e8e43..451c0a7ef7fd 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -32,9 +32,11 @@ #include <asm/segment.h> #include <asm/page_types.h> -/* We can free up trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -.code16 +#ifdef CONFIG_SMP + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -44,7 +46,7 @@ r_base = . 
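setup_trampolines() above copies the whole .x86_trampoline section below 1 MiB, but every symbol in it is still linked at its kernel-image address, so users have to rebase through the copy. A sketch of that arithmetic, assuming TRAMPOLINE_SYM() is the base-plus-offset rebase from <asm/trampoline.h>:

extern unsigned char *x86_trampoline_base;      /* the copy below 1 MiB */
extern unsigned char x86_trampoline_start[];    /* link-time section start */
extern unsigned char trampoline_data[];         /* AP entry stub */

/* rebase a trampoline symbol from its link address into the copy */
#define TRAMPOLINE_SYM(x)                                       \
        ((void *)(x86_trampoline_base +                         \
                  ((const char *)(x) - (const char *)x86_trampoline_start)))

/* the physical entry point handed to the INIT/SIPI sequence */
static inline unsigned long trampoline_address(void)
{
        return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
}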
cli # We should be safe anyway - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running /* GDT tables in non default location kernel can be beyond 16MB and @@ -72,5 +74,10 @@ boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L +ENTRY(trampoline_status) + .long 0 + .globl trampoline_end trampoline_end: + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 075d130efcf9..09ff51799e96 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,13 +32,9 @@ #include <asm/segment.h> #include <asm/processor-flags.h> -#ifdef CONFIG_ACPI_SLEEP -.section .rodata, "a", @progbits -#else -/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -#endif -.code16 + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -50,7 +46,7 @@ r_base = . mov %ax, %ss - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running # Setup stack @@ -64,10 +60,13 @@ r_base = . movzx %ax, %esi # Find the 32bit trampoline location shll $4, %esi - # Fixup the vectors - addl %esi, startup_32_vector - r_base - addl %esi, startup_64_vector - r_base - addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + # Fixup the absolute vectors + leal (startup_32 - r_base)(%esi), %eax + movl %eax, startup_32_vector - r_base + leal (startup_64 - r_base)(%esi), %eax + movl %eax, startup_64_vector - r_base + leal (tgdt - r_base)(%esi), %eax + movl %eax, (tgdt + 2 - r_base) /* * GDT tables in non default location kernel can be beyond 16MB and @@ -129,6 +128,7 @@ no_longmode: jmp no_longmode #include "verify_cpu.S" + .balign 4 # Careful these need to be in the same 64K segment as the above; tidt: .word 0 # idt limit = 0 @@ -156,6 +156,10 @@ startup_64_vector: .long startup_64 - r_base .word __KERNEL_CS, 0 + .balign 4 +ENTRY(trampoline_status) + .long 0 + trampoline_stack: .org 0x1000 trampoline_stack_end: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ffe5755caa8b..6cc6922262af 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void) * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when a - * SMI/SMM disturbance happend between the two reads. If the + * SMI/SMM disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * @@ -763,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs) ret : clocksource_tsc.cycle_last; } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret; - - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - rdtsc_barrier(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? 
- ret : __vsyscall_gtod_data.clock.cycle_last; -} -#endif - static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; @@ -900,7 +881,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * - * If there are any calibration anomolies (too many SMIs, etc), + * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 0edefc19a113..b9242bacbe59 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -18,7 +18,7 @@ * This file is expected to run in 32bit code. Currently: * * arch/x86/boot/compressed/head_64.S: Boot cpu verification - * arch/x86/kernel/trampoline_64.S: secondary processor verfication + * arch/x86/kernel/trampoline_64.S: secondary processor verification * arch/x86/kernel/head_32.S: processor startup * * verify_cpu, returns the status of longmode and SSE in register %eax. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bf4700755184..89aed99aafce 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -105,6 +105,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + ENTRY_TEXT IRQENTRY_TEXT *(.fixup) *(.gnu.warning) @@ -160,6 +161,12 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) +#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ + ADDR(.vsyscall_0) + offset \ + : AT(VLOAD(.vsyscall_var_ ## x)) { \ + *(.vsyscall_var_ ## x) \ + } \ + x = VVIRT(.vsyscall_var_ ## x); . = ALIGN(4096); __vsyscall_0 = .; @@ -174,18 +181,6 @@ SECTIONS *(.vsyscall_fn) } - . = ALIGN(L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } @@ -193,21 +188,14 @@ SECTIONS *(.vsyscall_2) } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR @@ -215,6 +203,7 @@ SECTIONS #undef VLOAD #undef VVIRT_OFFSET #undef VVIRT +#undef EMIT_VVAR #endif /* CONFIG_X86_64 */ @@ -230,7 +219,7 @@ SECTIONS * output PHDR, so the next output section - .init.text - should * start another segment - init. */ - PERCPU_VADDR(0, :percpu) + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) #endif INIT_TEXT_SECTION(PAGE_SIZE) @@ -240,6 +229,18 @@ SECTIONS INIT_DATA_SECTION(16) + /* + * Code and data for a variety of lowlevel trampolines, to be + * copied into base memory (< 1 MiB) during initialization. + * Since it is copied early, the main copy can be discarded + * afterwards. 
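The .x86_trampoline output section defined just below brackets its payload with x86_trampoline_start/x86_trampoline_end, the usual idiom for sizing a linker-collected section from C: the symbols are declared as arrays whose addresses, never their contents, are what matters. The C side, sketched:

/* bounds emitted by vmlinux.lds.S; only their addresses are meaningful */
extern unsigned char x86_trampoline_start[];
extern unsigned char x86_trampoline_end[];

static inline size_t trampoline_section_size(void)
{
        /* page-aligned, as setup_trampolines() does before copying */
        return PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
}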
+ */ + .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { + x86_trampoline_start = .; + *(.x86_trampoline) + x86_trampoline_end = .; + } + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) @@ -291,6 +292,14 @@ SECTIONS *(.iommu_table) __iommu_table_end = .; } + + . = ALIGN(8); + .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { + __apicdrivers = .; + *(.apicdrivers); + __apicdrivers_end = .; + } + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with @@ -305,7 +314,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(THREAD_SIZE) + PERCPU_SECTION(INTERNODE_CACHE_BYTES) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c new file mode 100644 index 000000000000..a81aa9e9894c --- /dev/null +++ b/arch/x86/kernel/vread_tsc_64.c @@ -0,0 +1,36 @@ +/* This code runs in userspace. */ + +#define DISABLE_BRANCH_PROFILING +#include <asm/vgtod.h> + +notrace cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a function of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. 
+ */ + asm volatile (""); + return last; +} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c4b694..3e682184d76c 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -49,17 +49,10 @@ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" -/* - * vsyscall_gtod_data contains data that is : - * - readonly from vsyscalls - * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) - * Try to keep this structure as small as possible to avoid cache line ping pongs - */ -int __vgetcpu_mode __section_vgetcpu_mode; - -struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = +DEFINE_VVAR(int, vgetcpu_mode); +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { - .lock = SEQLOCK_UNLOCKED, + .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, */ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __vsyscall_gtod_data.sys_tz; + *tz = VVAR(vsyscall_gtod_data).sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - vread = __vsyscall_gtod_data.clock.vread; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + vread = VVAR(vsyscall_gtod_data).clock.vread; + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || + !vread)) { gettimeofday(tv,NULL); return; } now = vread(); - base = __vsyscall_gtod_data.clock.cycle_last; - mask = __vsyscall_gtod_data.clock.mask; - mult = __vsyscall_gtod_data.clock.mult; - shift = __vsyscall_gtod_data.clock.shift; + base = VVAR(vsyscall_gtod_data).clock.cycle_last; + mask = VVAR(vsyscall_gtod_data).clock.mask; + mult = VVAR(vsyscall_gtod_data).clock.mult; + shift = VVAR(vsyscall_gtod_data).clock.shift; - tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; - nsec = __vsyscall_gtod_data.wall_time_nsec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; + nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; + } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; @@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t) { unsigned seq; time_t result; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) return time_syscall(t); do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - result = __vsyscall_gtod_data.wall_time_sec; + result = VVAR(vsyscall_gtod_data).wall_time_sec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); if (t) *t = result; @@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) We do this here because otherwise user space would do it on its own in a likely inferior way (no access to jiffies). If you don't like it pass NULL. 
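The VVAR conversion in the vsyscall code below replaces per-variable __section__ attributes with one uniform mechanism: DEFINE_VVAR() pins the kernel-writable instance into its own .vsyscall_var_<name> section (laid out by the EMIT_VVAR entries added to vmlinux.lds.S above), and VVAR() is the read-only alias that vsyscall code dereferences. A sketch of the shape of the macro pair (illustrative; the real definitions live in <asm/vvar.h>, and VVAR_BASE stands in for the fixed user-visible mapping address):

/* writer side: an ordinary kernel variable in a dedicated section */
#define DEFINE_VVAR(type, name)                                         \
        type name __attribute__((section(".vsyscall_var_" #name),       \
                                 aligned(16)))

/* reader side: a constant pointer at the variable's fixed address,
 * so vsyscall code compiles down to plain loads */
#define DECLARE_VVAR(offset, type, name)                                \
        static type const * const vvaraddr_ ## name =                   \
                (void *)(VVAR_BASE + (offset));

#define VVAR(name) (*vvaraddr_ ## name)

/* usage, as in vread_tsc() above:
 *      u64 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
 */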
*/ - if (tcache && tcache->blob[0] == (j = __jiffies)) { + if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { p = tcache->blob[1]; - } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ native_read_tscp(&p); } else { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1b950d151e58..9796c2f3d074 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ceb2911aa439..6f164bd5e14d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { } struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, @@ -70,6 +74,7 @@ struct x86_init_ops x86_init __initdata = { .setup_percpu_clockev = setup_boot_APIC_clock, .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, + .wallclock_init = x86_init_noop, }, .iommu = { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 547128546cc3..a3911343976b 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk) /* * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is upto date. + * to do for us, as the memory layout is up to date. 
*/ if ((xstate_bv & pcntxt_mask) == pcntxt_mask) return; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index caf966781d25..d6e2477feb18 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -73,9 +73,15 @@ #define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ +#define GroupMask (7<<14) /* Opcode uses one of the group mechanisms */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupDual (2<<14) /* Alternate decoding of mod == 3 */ +#define Prefix (3<<14) /* Instruction varies with 66/f2/f3 prefix */ +#define RMExt (4<<14) /* Opcode extension in ModRM r/m if mod == 3 */ +#define Sse (1<<17) /* SSE Vector instruction */ /* Misc flags */ +#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ +#define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -101,11 +107,14 @@ struct opcode { u32 flags; + u8 intercept; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; struct group_dual *gdual; + struct gprefix *gprefix; } u; + int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; struct group_dual { @@ -113,6 +122,13 @@ struct group_dual { struct opcode mod3[8]; }; +struct gprefix { + struct opcode pfx_no; + struct opcode pfx_66; + struct opcode pfx_f2; + struct opcode pfx_f3; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -247,42 +263,42 @@ struct group_dual { "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ - break; \ - case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ - break; \ - } \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + 
__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) #define __emulate_1op(_op, _dst, _eflags, _suffix) \ @@ -345,13 +361,25 @@ struct group_dual { } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ - do { \ - switch((_src).bytes) { \ - case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ - case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ - case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "b"); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "w"); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "l"); \ + break; \ + case 8: \ + ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "q")); \ + break; \ } \ } while (0) @@ -387,13 +415,33 @@ struct group_dual { (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ +#define insn_fetch_arr(_arr, _size, _eip) \ ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ }) +static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, + enum x86_intercept intercept, + enum x86_intercept_stage stage) +{ + struct x86_instruction_info info = { + .intercept = intercept, + .rep_prefix = ctxt->decode.rep_prefix, + .modrm_mod = ctxt->decode.modrm_mod, + .modrm_reg = ctxt->decode.modrm_reg, + .modrm_rm = ctxt->decode.modrm_rm, + .src_val = ctxt->decode.src.val64, + .src_bytes = ctxt->decode.src.bytes, + .dst_bytes = ctxt->decode.dst.bytes, + .ad_bytes = ctxt->decode.ad_bytes, + .next_rip = ctxt->eip, + }; + + return ctxt->ops->intercept(ctxt, &info, stage); +} + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -429,6 +477,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel) register_address_increment(c, &c->eip, rel); } +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); + + return desc->g ? 
(limit << 12) | 0xfff : limit; +} + static void set_seg_override(struct decode_cache *c, int seg) { c->has_seg_override = true; @@ -441,11 +496,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(seg, ctxt->vcpu); + return ops->get_cached_segment_base(ctxt, seg); } static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) @@ -454,18 +508,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt, return c->seg_override; } -static ulong linear(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr) -{ - struct decode_cache *c = &ctxt->decode; - ulong la; - - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; - if (c->ad_bytes != 8) - la &= (u32)-1; - return la; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -475,11 +517,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, return X86EMUL_PROPAGATE_FAULT; } +static int emulate_db(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, DB_VECTOR, 0, false); +} + static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, GP_VECTOR, err, true); } +static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) +{ + return emulate_exception(ctxt, SS_VECTOR, err, true); +} + static int emulate_ud(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, UD_VECTOR, 0, false); @@ -495,6 +547,128 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, DE_VECTOR, 0, false); } +static int emulate_nm(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, NM_VECTOR, 0, false); +} + +static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) +{ + u16 selector; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); + return selector; +} + +static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, + unsigned seg) +{ + u16 dummy; + u32 base3; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); +} + +static int __linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, bool fetch, + ulong *linear) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_struct desc; + bool usable; + ulong la; + u32 lim; + u16 sel; + unsigned cpl, rpl; + + la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + switch (ctxt->mode) { + case X86EMUL_MODE_REAL: + break; + case X86EMUL_MODE_PROT64: + if (((signed long)la << 16) >> 16 != la) + return emulate_gp(ctxt, 0); + break; + default: + usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, + addr.seg); + if (!usable) + goto bad; + /* code segment or read-only data segment */ + if (((desc.type & 8) || !(desc.type & 2)) && write) + goto bad; + /* unreadable code segment */ + if (!fetch && (desc.type & 8) && !(desc.type & 2)) + goto bad; + lim = desc_limit_scaled(&desc); + if ((desc.type & 8) || !(desc.type & 4)) { + /* expand-up segment */ + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } else { + /* expand-down segment */ + if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) + goto bad; + lim = desc.d ? 
0xffffffff : 0xffff; + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } + cpl = ctxt->ops->cpl(ctxt); + rpl = sel & 3; + cpl = max(cpl, rpl); + if (!(desc.type & 8)) { + /* data segment */ + if (cpl > desc.dpl) + goto bad; + } else if ((desc.type & 8) && !(desc.type & 4)) { + /* nonconforming code segment */ + if (cpl != desc.dpl) + goto bad; + } else if ((desc.type & 8) && (desc.type & 4)) { + /* conforming code segment */ + if (cpl < desc.dpl) + goto bad; + } + break; + } + if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) + la &= (u32)-1; + *linear = la; + return X86EMUL_CONTINUE; +bad: + if (addr.seg == VCPU_SREG_SS) + return emulate_ss(ctxt, addr.seg); + else + return emulate_gp(ctxt, addr.seg); +} + +static int linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, + ulong *linear) +{ + return __linearize(ctxt, addr, size, write, false, linear); +} + + +static int segmented_read_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, false, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -504,10 +678,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, int size, cur_size; if (eip == fc->end) { + unsigned long linear; + struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; cur_size = fc->end - fc->start; size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); - rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, - size, ctxt->vcpu, &ctxt->exception); + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; fc->end += size; @@ -550,7 +729,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, } static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct segmented_address addr, u16 *size, unsigned long *address, int op_bytes) { @@ -559,13 +737,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, size, 2); if (rc != X86EMUL_CONTINUE) return rc; addr.ea += 2; - rc = ops->read_std(linear(ctxt, addr), address, op_bytes, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, address, op_bytes); return rc; } @@ -622,7 +798,63 @@ static void fetch_register_operand(struct operand *op) } } -static void decode_register_operand(struct operand *op, +static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 
8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, + int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op, struct decode_cache *c, int inhibit_bytereg) { @@ -631,6 +863,15 @@ static void decode_register_operand(struct operand *op, if (!(c->d & ModRM)) reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = reg; + read_sse_reg(ctxt, &op->vec_val, reg); + return; + } + op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { op->addr.reg = decode_register(reg, c->regs, highbyte_regs); @@ -670,6 +911,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = c->modrm_rm; + read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); + return rc; + } fetch_register_operand(op); return rc; } @@ -818,8 +1066,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, - &ctxt->exception, ctxt->vcpu); + rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, + &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -833,6 +1081,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } +static int segmented_read(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, false, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return read_emulated(ctxt, ctxt->ops, linear, data, size); +} + +static int segmented_write(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_emulated(ctxt, linear, data, size, + &ctxt->exception); +} + +static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *orig_data, const void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, + size, &ctxt->exception); +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -853,7 +1145,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (n == 0) n = 1; rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } @@ -863,27 +1155,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } -static u32 desc_limit_scaled(struct desc_struct *desc) -{ - u32 limit = get_desc_limit(desc); - - return desc->g ? (limit << 12) | 0xfff : limit; -} - static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, struct desc_ptr *dt) { if (selector & 1 << 2) { struct desc_struct desc; + u16 sel; + memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? 
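get_descriptor_table_ptr() above selects the LDT when bit 2 (the TI bit) of the selector is set, and the descriptor-table accesses that follow index the table in 8-byte units. The selector layout that makes this work, as a small illustrative program:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t sel = 0x002b;			/* example selector */

	unsigned rpl   = sel & 3;		/* requestor privilege level */
	unsigned ti    = (sel >> 2) & 1;	/* 1 = LDT, 0 = GDT */
	unsigned index = sel >> 3;		/* descriptor slot, 8 bytes each */

	printf("index=%u table=%s rpl=%u byte offset=%u\n",
	       index, ti ? "LDT" : "GDT", rpl, index * 8);
	return 0;
}

The same arithmetic explains the "dt.size < index * 8 + 7" bound check in the descriptor read/write helpers: the full 8-byte descriptor must lie inside the table limit.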
*/ dt->address = get_desc_base(&desc); } else - ops->get_gdt(dt, ctxt->vcpu); + ops->get_gdt(ctxt, dt); } /* allowed just for 8 bytes segments */ @@ -901,8 +1188,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -923,12 +1209,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } +/* Does not support long mode */ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, int seg) @@ -983,7 +1269,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + cpl = ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1039,8 +1325,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + ops->set_segment(ctxt, selector, &seg_desc, 0, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1066,8 +1351,7 @@ static void write_register_operand(struct operand *op) } } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; struct decode_cache *c = &ctxt->decode; @@ -1078,23 +1362,22 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, break; case OP_MEM: if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_cmpxchg(ctxt, + c->dst.addr.mem, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes); else - rc = ops->write_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_write(ctxt, + c->dst.addr.mem, + &c->dst.val, + c->dst.bytes); if (rc != X86EMUL_CONTINUE) return rc; break; + case OP_XMM: + write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); + break; case OP_NONE: /* no writeback */ break; @@ -1104,21 +1387,21 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + struct segmented_address addr; - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); - c->dst.addr.mem.seg = VCPU_SREG_SS; + addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + addr.seg = VCPU_SREG_SS; + + /* Disable writeback. 
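load_segment_descriptor() above derives RPL from the selector, DPL from the descriptor and CPL from ops->cpl(), and rejects mismatches per segment type. For a plain data segment the rule collapses to max(CPL, RPL) <= DPL; here is a toy check of just that case (the emulator additionally handles conforming and non-conforming code segments):

#include <stdbool.h>
#include <stdio.h>

/* Data-segment load check: the least privileged of CPL and RPL must
 * still be privileged enough for the descriptor's DPL. */
static bool data_seg_load_ok(unsigned cpl, unsigned rpl, unsigned dpl)
{
	unsigned eff = cpl > rpl ? cpl : rpl;

	return eff <= dpl;
}

int main(void)
{
	printf("%d\n", data_seg_load_ok(3, 3, 3));	/* 1: allowed */
	printf("%d\n", data_seg_load_ok(3, 3, 0));	/* 0: #GP */
	return 0;
}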
*/ + c->dst.type = OP_NONE; + return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, void *dest, int len) { struct decode_cache *c = &ctxt->decode; @@ -1127,7 +1410,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; - rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); + rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1135,6 +1418,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static int em_pop(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + return emulate_pop(ctxt, &c->dst.val, c->op_bytes); +} + static int emulate_popf(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, void *dest, int len) @@ -1142,9 +1432,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); + int cpl = ops->cpl(ctxt); - rc = emulate_pop(ctxt, ops, &val, len); + rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1176,14 +1466,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int em_popf(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); + c->dst.type = OP_REG; + c->dst.addr.reg = &ctxt->eflags; + c->dst.bytes = c->op_bytes; + return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); +} - emulate_push(ctxt, ops); +static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = get_segment_selector(ctxt, seg); + + return em_push(ctxt); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1193,7 +1493,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, unsigned long selector; int rc; - rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + rc = emulate_pop(ctxt, &selector, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1201,8 +1501,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_pusha(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1213,23 +1512,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt, (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ++reg; } - /* Disable writeback. 
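em_push() above adjusts RSP first and only then stores through SS, while emulate_pop() reads before incrementing; routing both through segmented_write()/segmented_read() is what lets the new linearize-and-fault path apply to stack traffic too. The same mechanics against a flat byte array, as an illustrative model:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint8_t stack_mem[64];		/* stand-in for guest SS */

static void push(uint64_t *rsp, uint64_t val, unsigned op_bytes)
{
	*rsp -= op_bytes;		/* decrement first, like em_push() */
	memcpy(&stack_mem[*rsp], &val, op_bytes);
}

static uint64_t pop(uint64_t *rsp, unsigned op_bytes)
{
	uint64_t val = 0;

	memcpy(&val, &stack_mem[*rsp], op_bytes); /* read, like emulate_pop() */
	*rsp += op_bytes;
	return val;
}

int main(void)
{
	uint64_t rsp = sizeof(stack_mem);	/* empty stack */

	push(&rsp, 0x1234, 2);
	printf("%#llx\n", (unsigned long long)pop(&rsp, 2));
	return 0;
}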
*/ - c->dst.type = OP_NONE; - return rc; } -static int emulate_popa(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_pushf(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = (unsigned long)ctxt->eflags; + return em_push(ctxt); +} + +static int em_popa(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1242,7 +1543,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, --reg; } - rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1262,37 +1563,32 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = c->eip; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - c->dst.type = OP_NONE; - - ops->get_idt(&dt, ctxt->vcpu); + ops->get_idt(ctxt, &dt); eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; - rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; @@ -1336,7 +1632,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add stack limit check */ - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1344,12 +1640,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, if (temp_eip & ~0xffff) return emulate_gp(ctxt, 0); - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1391,15 +1687,31 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, } } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_jmp_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned short sel; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + return X86EMUL_CONTINUE; +} + +static int em_grp1a(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); } -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +static int em_grp2(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; switch (c->modrm_reg) { @@ -1426,10 +1738,10 @@ static inline void emulate_grp2(struct 
x86_emulate_ctxt *ctxt) emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); break; } + return X86EMUL_CONTINUE; } -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp3(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long *rax = &c->regs[VCPU_REGS_RAX]; @@ -1468,10 +1780,10 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp45(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; switch (c->modrm_reg) { case 0: /* inc */ @@ -1485,21 +1797,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; + case 5: /* jmp far */ + rc = em_jmp_far(ctxt); + break; case 6: /* push */ - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } - return X86EMUL_CONTINUE; + return rc; } -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp9(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; u64 old = c->dst.orig_val64; @@ -1525,12 +1839,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, int rc; unsigned long cs; - rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + rc = emulate_pop(ctxt, &c->eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); @@ -1559,8 +1873,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, struct desc_struct *ss) { + u16 selector; + memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1590,44 +1906,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_EFER, &efer); setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); ss_sel = (u16)(msr_data + 8); - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - ops->get_msr(ctxt->vcpu, + ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? 
MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1643,7 +1959,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); @@ -1656,7 +1974,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) @@ -1673,21 +1991,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; ss_sel &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 - || is_long_mode(ctxt->vcpu)) { + if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; @@ -1716,7 +2031,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.dpl = 3; ss.dpl = 3; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); @@ -1736,10 +2051,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->eip = c->regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; @@ -1756,7 +2069,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ops->cpl(ctxt->vcpu) > iopl; + return ops->cpl(ctxt) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, @@ -1764,24 +2077,27 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { struct desc_struct tr_seg; + u32 base3; int r; - u16 io_bitmap_ptr; - u8 perm, bit_idx = port & 0x7; + u16 tr, 
io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; + unsigned long base; - ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); + ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, - ctxt->vcpu, NULL); + base = get_desc_base(&tr_seg); +#ifdef CONFIG_X86_64 + base |= ((u64)base3) << 32; +#endif + r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, - &perm, 1, ctxt->vcpu, NULL); + r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -1822,11 +2138,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, tss->si = c->regs[VCPU_REGS_RSI]; tss->di = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, @@ -1851,11 +2167,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* * Now load segment descriptors. 
If fault happenes at this stage @@ -1889,7 +2205,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -1897,13 +2213,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, save_state_to_tss16(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -1912,10 +2228,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -1930,7 +2246,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->cr3 = ops->get_cr(ctxt, 3); tss->eip = c->eip; tss->eflags = ctxt->eflags; tss->eax = c->regs[VCPU_REGS_RAX]; @@ -1942,13 +2258,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->esi = c->regs[VCPU_REGS_RSI]; tss->edi = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); - tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); - tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); + tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, @@ -1958,7 +2274,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) + if (ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; @@ -1975,13 +2291,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - 
ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); - ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); - ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); + set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2021,7 +2337,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2029,13 +2345,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2044,10 +2360,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -2063,9 +2379,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, { struct desc_struct curr_tss_desc, next_tss_desc; int ret; - u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = - ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; /* FIXME: old_tss_base == ~0 ? */ @@ -2081,7 +2397,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) + ops->cpl(ctxt) > next_tss_desc.dpl) return emulate_gp(ctxt, 0); } @@ -2125,9 +2441,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, &next_tss_desc); } - ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); - ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); + ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { struct decode_cache *c = &ctxt->decode; @@ -2135,7 +2450,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 
4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt, ops); + ret = em_push(ctxt); } return ret; @@ -2155,13 +2470,10 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) { - rc = writeback(ctxt, ops); - if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->eip; - } + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, @@ -2175,12 +2487,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, op->addr.mem.seg = seg; } -static int em_push(struct x86_emulate_ctxt *ctxt) -{ - emulate_push(ctxt, ctxt->ops); - return X86EMUL_CONTINUE; -} - static int em_das(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2227,7 +2533,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ulong old_eip; int rc; - old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = c->eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); @@ -2238,20 +2544,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) memcpy(&c->eip, c->src.valptr, c->op_bytes); c->src.val = old_cs; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = old_eip; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->dst.type = OP_NONE; - - return X86EMUL_CONTINUE; + return em_push(ctxt); } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) @@ -2262,13 +2560,79 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); return X86EMUL_CONTINUE; } +static int em_add(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_or(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_adc(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sbb(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_and(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sub(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_xor(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int 
em_cmp(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2299,13 +2663,10 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { - unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); struct decode_cache *c = &ctxt->decode; u64 tsc = 0; - if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) - return emulate_gp(ctxt, 0); - ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); c->regs[VCPU_REGS_RAX] = (u32)tsc; c->regs[VCPU_REGS_RDX] = tsc >> 32; return X86EMUL_CONTINUE; @@ -2318,22 +2679,375 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movdqu(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); + return X86EMUL_CONTINUE; +} + +static int em_invlpg(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + ulong linear; + + rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); + if (rc == X86EMUL_CONTINUE) + ctxt->ops->invlpg(ctxt, linear); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_clts(struct x86_emulate_ctxt *ctxt) +{ + ulong cr0; + + cr0 = ctxt->ops->get_cr(ctxt, 0); + cr0 &= ~X86_CR0_TS; + ctxt->ops->set_cr(ctxt, 0, cr0); + return X86EMUL_CONTINUE; +} + +static int em_vmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return X86EMUL_UNHANDLEABLE; + + rc = ctxt->ops->fix_hypercall(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->eip; + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_lgdt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_gdt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_vmmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ctxt->ops->fix_hypercall(ctxt); + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return rc; +} + +static int em_lidt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_idt(ctxt, &desc_ptr); + /* Disable writeback. 
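The em_add() through em_cmp() helpers above all follow the execute-callback shape this series introduces: decode fills the operands, a tiny callback performs one operation, and a shared writeback step stores the destination unless the callback disabled it (as em_cmp() does, since CMP only updates flags). A schematic version with invented types:

#include <stdio.h>

struct ectx {
	unsigned long src, dst;
	int writeback;			/* 0: dst is not stored */
};

static int em_add(struct ectx *c)
{
	c->dst += c->src;
	return 0;
}

static int em_cmp(struct ectx *c)
{
	/* flags-only operation; disable writeback like the real em_cmp() */
	c->writeback = 0;
	return 0;
}

static int run(int (*execute)(struct ectx *), struct ectx *c,
	       unsigned long *dest)
{
	int rc = execute(c);

	if (rc == 0 && c->writeback)
		*dest = c->dst;		/* the shared writeback step */
	return rc;
}

int main(void)
{
	unsigned long mem = 0;
	struct ectx c = { .src = 2, .dst = 40, .writeback = 1 };

	run(em_add, &c, &mem);		/* mem becomes 42 */
	run(em_cmp, &c, &mem);		/* mem is left alone */
	printf("%lu\n", mem);
	return 0;
}

Keeping the callbacks this small is the point: the opcode tables can then attach them, plus intercept and permission hooks, purely declaratively.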
*/ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_smsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = 2; + c->dst.val = ctxt->ops->get_cr(ctxt, 0); + return X86EMUL_CONTINUE; +} + +static int em_lmsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) + | (c->src.val & 0x0f)); + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static bool valid_cr(int nr) +{ + switch (nr) { + case 0: + case 2 ... 4: + case 8: + return true; + default: + return false; + } +} + +static int check_cr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + if (!valid_cr(c->modrm_reg)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_cr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int cr = c->modrm_reg; + u64 efer = 0; + + static u64 cr_reserved_bits[] = { + 0xffffffff00000000ULL, + 0, 0, 0, /* CR3 checked later */ + CR4_RESERVED_BITS, + 0, 0, 0, + CR8_RESERVED_BITS, + }; + + if (!valid_cr(cr)) + return emulate_ud(ctxt); + + if (new_val & cr_reserved_bits[cr]) + return emulate_gp(ctxt, 0); + + switch (cr) { + case 0: { + u64 cr4; + if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || + ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) + return emulate_gp(ctxt, 0); + + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && + !(cr4 & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + case 3: { + u64 rsvd = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + rsvd = CR3_L_MODE_RESERVED_BITS; + else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) + rsvd = CR3_PAE_RESERVED_BITS; + else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) + rsvd = CR3_NONPAE_RESERVED_BITS; + + if (new_val & rsvd) + return emulate_gp(ctxt, 0); + + break; + } + case 4: { + u64 cr4; + + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + } + + return X86EMUL_CONTINUE; +} + +static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) +{ + unsigned long dr7; + + ctxt->ops->get_dr(ctxt, 7, &dr7); + + /* Check if DR7.Global_Enable is set */ + return dr7 & (1 << 13); +} + +static int check_dr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int dr = c->modrm_reg; + u64 cr4; + + if (dr > 7) + return emulate_ud(ctxt); + + cr4 = ctxt->ops->get_cr(ctxt, 4); + if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) + return emulate_ud(ctxt); + + if (check_dr7_gd(ctxt)) + return emulate_db(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_dr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int dr = c->modrm_reg; + + if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) + return emulate_gp(ctxt, 0); + + return check_dr_read(ctxt); +} + +static int check_svme(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(efer & EFER_SVME)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_svme_pa(struct x86_emulate_ctxt *ctxt) +{ + u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; + + /* Valid physical address? 
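check_cr_write() above consults a per-register table of reserved bits and injects #GP(0) before the write ever reaches the vcpu. The table-driven shape of that test, reduced to two registers with illustrative masks:

#include <stdint.h>
#include <stdio.h>

static const uint64_t cr_reserved[9] = {
	[0] = 0xffffffff00000000ULL,	/* CR0: upper 32 bits reserved */
	[8] = ~0xfULL,			/* CR8: only TPR bits 0-3 valid */
};

static int cr_write_ok(unsigned cr, uint64_t val)
{
	return (val & cr_reserved[cr]) == 0;	/* else: inject #GP(0) */
}

int main(void)
{
	printf("%d\n", cr_write_ok(8, 0x5));	/* 1: accepted */
	printf("%d\n", cr_write_ok(8, 0x100));	/* 0: rejected */
	return 0;
}

The real check cannot be fully static, as the hunk shows: the CR3 and CR4 masks depend on EFER.LMA and the current paging mode, so those bits are computed per write.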
*/ + if (rax & 0xffff000000000000ULL) + return emulate_gp(ctxt, 0); + + return check_svme(ctxt); +} + +static int check_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); + + if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_rdpmc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); + u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; + + if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || + (rcx > 3)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_in(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_out(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } +#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } #define N D(0) +#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define II(_f, _e, _i) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } +#define IIP(_f, _e, _i, _p) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } +#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) +#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) -#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ - D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ - D2bv(((_f) & ~Lock) | DstAcc | SrcImm) +#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static struct opcode group7_rm1[] = { + DI(SrcNone | ModRM | Priv, monitor), + DI(SrcNone | ModRM | Priv, mwait), + N, N, N, N, N, N, +}; + +static struct opcode group7_rm3[] = { + DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), + DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), +}; + +static struct opcode group7_rm7[] = { + N, + DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + N, N, N, N, N, N, +}; static struct opcode group1[] = { - X7(D(Lock)), N + I(Lock, em_add), + I(Lock, em_or), + I(Lock, em_adc), + I(Lock, em_sbb), + I(Lock, em_and), + I(Lock, em_sub), + I(Lock, em_xor), + I(0, em_cmp), }; static struct opcode group1A[] = { @@ -2359,15 +3073,28 @@ static struct opcode group5[] = { 
D(SrcMem | ModRM | Stack), N, }; +static struct opcode group6[] = { + DI(ModRM | Prot, sldt), + DI(ModRM | Prot, str), + DI(ModRM | Prot | Priv, lldt), + DI(ModRM | Prot | Priv, ltr), + N, N, N, N, +}; + static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), - D(SrcMem | ModRM | ByteOp | Priv | NoAccess), + DI(ModRM | Mov | DstMem | Priv, sgdt), + DI(ModRM | Mov | DstMem | Priv, sidt), + II(ModRM | SrcMem | Priv, em_lgdt, lgdt), + II(ModRM | SrcMem | Priv, em_lidt, lidt), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, + I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + EXT(0, group7_rm1), + N, EXT(0, group7_rm3), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), } }; static struct opcode group8[] = { @@ -2386,35 +3113,40 @@ static struct opcode group11[] = { I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), }; +static struct gprefix pfx_0f_6f_0f_7f = { + N, N, N, I(Sse, em_movdqu), +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D6ALU(Lock), + I6ALU(Lock, em_add), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D6ALU(Lock), + I6ALU(Lock, em_or), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D6ALU(Lock), + I6ALU(Lock, em_adc), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D6ALU(Lock), + I6ALU(Lock, em_sbb), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ - D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - D6ALU(0), N, N, + I6ALU(0, em_cmp), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), + X8(I(DstReg | Stack, em_pop)), /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64, em_pusha), + I(ImplicitOps | Stack | No64, em_popa), N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ @@ -2422,8 +3154,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bv(DstDI | Mov | String), /* insb, insw/insd */ - D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */ + D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -2438,21 +3170,22 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - X8(D(SrcAcc | DstReg)), + DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, 
em_call_far), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + II(ImplicitOps | Stack, em_pushf, pushf), + II(ImplicitOps | Stack, em_popf, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - D2bv(SrcSI | DstDI | String), + I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ D2bv(DstAcc | SrcImm), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - D2bv(SrcAcc | DstDI | String), + I2bv(SrcAcc | DstDI | String, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ @@ -2465,7 +3198,8 @@ static struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + D(ImplicitOps), DI(SrcImmByte, intn), + D(ImplicitOps | No64), DI(ImplicitOps, iret), /* 0xD0 - 0xD7 */ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, @@ -2473,14 +3207,17 @@ static struct opcode opcode_table[256] = { N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ X4(D(SrcImmByte)), - D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + D2bvIP(SrcNone | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out), /* 0xF0 - 0xF7 */ - N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + N, DI(ImplicitOps, icebp), N, N, + DI(ImplicitOps | Priv, hlt), D(ImplicitOps), + G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), @@ -2488,42 +3225,53 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + G(0, group6), GD(0, &group7), N, N, + N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, + DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), + DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), + DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), - D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, + DI(ImplicitOps | Priv, wrmsr), + IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), + DI(ImplicitOps | Priv, rdmsr), + DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), + D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), + N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ X16(D(DstReg | SrcMem | ModRM | Mov)), /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 
0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ X16(D(SrcImm)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), + DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), @@ -2555,10 +3303,13 @@ static struct opcode twobyte_table[256] = { #undef G #undef GD #undef I +#undef GP +#undef EXT #undef D2bv +#undef D2bvIP #undef I2bv -#undef D6ALU +#undef I6ALU static unsigned imm_size(struct decode_cache *c) { @@ -2616,8 +3367,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; - struct opcode opcode, *g_mod012, *g_mod3; + int def_op_bytes, def_ad_bytes, goffset, simd_prefix; + bool op_prefix = false; + struct opcode opcode; struct operand memop = { .type = OP_NONE }; c->eip = ctxt->eip; @@ -2625,7 +3377,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->fetch.end = c->fetch.start + insn_len; if (insn_len > 0) memcpy(c->fetch.data, insn, insn_len); - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); switch (mode) { case X86EMUL_MODE_REAL: @@ -2653,6 +3404,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) for (;;) { switch (c->b = insn_fetch(u8, 1, c->eip)) { case 0x66: /* operand-size override */ + op_prefix = true; /* switch between 2/4 bytes */ c->op_bytes = def_op_bytes ^ 6; break; @@ -2683,10 +3435,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; + c->rep_prefix = c->b; break; default: goto done_prefixes; @@ -2713,34 +3463,57 @@ done_prefixes: } c->d = opcode.flags; - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - - c->d &= ~(Group | GroupDual); - - goffset = (c->modrm >> 3) & 7; + while (c->d & GroupMask) { + switch (c->d & GroupMask) { + case Group: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + opcode = opcode.u.group[goffset]; + break; + case GroupDual: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + if ((c->modrm >> 6) == 3) + opcode = opcode.u.gdual->mod3[goffset]; + else + opcode = opcode.u.gdual->mod012[goffset]; + break; + case RMExt: + goffset = c->modrm & 7; + opcode = opcode.u.group[goffset]; + break; + case Prefix: + if (c->rep_prefix && op_prefix) + return X86EMUL_UNHANDLEABLE; + 
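/*
 * For Prefix (gprefix) entries a leading 0x66, 0xf2 or 0xf3 byte acts
 * as a mandatory-prefix opcode selector rather than an operand-size or
 * REP hint, so it is re-read here to pick one of the four table
 * variants; an operand-size prefix combined with REP/REPNE was rejected
 * above as unhandleable rather than guessed at.
 */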
simd_prefix = op_prefix ? 0x66 : c->rep_prefix; + switch (simd_prefix) { + case 0x00: opcode = opcode.u.gprefix->pfx_no; break; + case 0x66: opcode = opcode.u.gprefix->pfx_66; break; + case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; + case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; + } + break; + default: + return X86EMUL_UNHANDLEABLE; + } - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; + c->d &= ~GroupMask; c->d |= opcode.flags; } c->execute = opcode.u.execute; + c->check_perm = opcode.check_perm; + c->intercept = opcode.intercept; /* Unrecognised? */ if (c->d == 0 || (c->d & Undefined)) return -1; + if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + return -1; + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) c->op_bytes = 8; @@ -2751,6 +3524,9 @@ done_prefixes: c->op_bytes = 4; } + if (c->d & Sse) + c->op_bytes = 16; + /* ModRM and SIB bytes. */ if (c->d & ModRM) { rc = decode_modrm(ctxt, ops, &memop); @@ -2764,7 +3540,7 @@ done_prefixes: if (!c->has_seg_override) set_seg_override(c, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt, ops, c); + memop.addr.mem.seg = seg_override(ctxt, c); if (memop.type == OP_MEM && c->ad_bytes != 8) memop.addr.mem.ea = (u32)memop.addr.mem.ea; @@ -2780,7 +3556,7 @@ done_prefixes: case SrcNone: break; case SrcReg: - decode_register_operand(&c->src, c, 0); + decode_register_operand(ctxt, &c->src, c, 0); break; case SrcMem16: memop.bytes = 2; @@ -2824,7 +3600,7 @@ done_prefixes: c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSI]); - c->src.addr.mem.seg = seg_override(ctxt, ops, c), + c->src.addr.mem.seg = seg_override(ctxt, c); c->src.val = 0; break; case SrcImmFAddr: @@ -2871,7 +3647,7 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { case DstReg: - decode_register_operand(&c->dst, c, + decode_register_operand(ctxt, &c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; case DstImmUByte: @@ -2914,7 +3690,7 @@ done_prefixes: } done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? 
EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) @@ -2967,12 +3743,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if ((c->d & Sse) + && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) + || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); + goto done; + } + + if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); + goto done; + } + + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { + if ((c->d & Priv) && ops->cpl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } + /* Instruction can only be executed in protected mode */ + if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + rc = emulate_ud(ctxt); + goto done; + } + + /* Do instruction specific permission checks */ + if (c->check_perm) { + rc = c->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { @@ -2982,16 +3797,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), - c->src.valptr, c->src.bytes); + rc = segmented_read(ctxt, c->src.addr.mem, + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), - &c->src2.val, c->src2.bytes); + rc = segmented_read(ctxt, c->src2.addr.mem, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -3002,7 +3817,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), + rc = segmented_read(ctxt, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3011,6 +3826,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_MEMACCESS); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->execute) { rc = c->execute(ctxt); if (rc != X86EMUL_CONTINUE) @@ -3022,75 +3844,33 @@ special_insn: goto twobyte_insn; switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); - break; - case 0x10 ... 
0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); break; - case 0x20 ... 0x25: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op("inc", c->dst, ctxt->eflags); break; case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - break; - case 0x60: /* pusha */ - rc = emulate_pusha(ctxt, ops); - break; - case 0x61: /* popa */ - rc = emulate_popa(ctxt, ops); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; @@ -3109,26 +3889,6 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; case 0x84 ... 0x85: test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); @@ -3150,7 +3910,7 @@ special_insn: rc = emulate_ud(ctxt); goto done; } - c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); + c->dst.val = get_segment_selector(ctxt, c->modrm_reg); break; case 0x8d: /* lea r16/r32, m */ c->dst.val = c->src.addr.mem.ea; @@ -3175,7 +3935,7 @@ special_insn: break; } case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); + rc = em_grp1a(ctxt); break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) @@ -3188,31 +3948,17 @@ special_insn: case 8: c->dst.val = (s32)c->dst.val; break; } break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt, ops); - break; - case 0x9d: /* popf */ - c->dst.type = OP_REG; - c->dst.addr.reg = &ctxt->eflags; - c->dst.bytes = c->op_bytes; - rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->dst.type = OP_NONE; /* Disable writeback. */ - goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xae ... 0xaf: /* scas */ - goto cmp; case 0xc0 ... 0xc1: - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = em_pop(ctxt); + break; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); break; @@ -3240,11 +3986,11 @@ special_insn: rc = emulate_iret(ctxt, ops); break; case 0xd0 ... 
0xd1: /* Grp2 */ - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); @@ -3266,23 +4012,14 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: { /* jmp far */ - unsigned short sel; - jump_far: - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - - if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) - goto done; - - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + case 0xea: /* jmp far */ + rc = em_jmp_far(ctxt); break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -3292,11 +4029,6 @@ special_insn: case 0xed: /* in (e/r)ax,dx */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_in: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, &c->dst.val)) goto done; /* IO is needed */ @@ -3305,25 +4037,19 @@ special_insn: case 0xef: /* out dx,(e/r)ax */ c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->dst.val, - c->src.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->dst.val, - &c->src.val, 1, ctxt->vcpu); + ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, + &c->src.val, 1); c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; + ctxt->ops->halt(ctxt); break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); + rc = em_grp3(ctxt); break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; @@ -3354,13 +4080,11 @@ special_insn: ctxt->eflags |= EFLG_DF; break; case 0xfe: /* Grp4 */ - grp45: - rc = emulate_grp45(ctxt, ops); + rc = em_grp45(ctxt); break; case 0xff: /* Grp5 */ - if (c->modrm_reg == 5) - goto jump_far; - goto grp45; + rc = em_grp45(ctxt); + break; default: goto cannot_emulate; } @@ -3369,7 +4093,7 @@ special_insn: goto done; writeback: - rc = writeback(ctxt, ops); + rc = writeback(ctxt); if (rc != X86EMUL_CONTINUE) goto done; @@ -3380,7 +4104,7 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt, ops, c), + string_addr_inc(ctxt, seg_override(ctxt, c), VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) @@ -3415,115 +4139,34 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) ctxt->have_exception = true; + if (rc == X86EMUL_INTERCEPTED) + return EMULATION_INTERCEPTED; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->eip; - /* Disable writeback. 
*/ - c->dst.type = OP_NONE; - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) { - switch (c->modrm_rm) { - case 1: - rc = kvm_fix_hypercall(ctxt->vcpu); - break; - default: - goto cannot_emulate; - } - } else { - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, - c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 4: /* smsw */ - c->dst.bytes = 2; - c->dst.val = ops->get_cr(0, ctxt->vcpu); - break; - case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | - (c->src.val & 0x0f), ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 5: /* not defined */ - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, - linear(ctxt, c->src.addr.mem)); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - default: - goto cannot_emulate; - } - break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); break; case 0x06: - emulate_clts(ctxt->vcpu); + rc = em_clts(ctxt); break; case 0x09: /* wbinvd */ - kvm_emulate_wbinvd(ctxt->vcpu); + (ctxt->ops->wbinvd)(ctxt); break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ break; case 0x20: /* mov cr, reg */ - switch (c->modrm_reg) { - case 1: - case 5 ... 7: - case 9 ... 15: - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); + c->dst.val = ops->get_cr(ctxt, c->modrm_reg); break; case 0x21: /* mov from dr to reg */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); + ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { + if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3531,16 +4174,9 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - - if (ops->set_dr(c->modrm_reg, c->src.val & + if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? 
- ~0ULL : ~0U), ctxt->vcpu) < 0) { + ~0ULL : ~0U)) < 0) { /* #UD condition is already handled by the code above */ emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; @@ -3553,7 +4189,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3562,7 +4198,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3591,7 +4227,7 @@ twobyte_insn: c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3608,7 +4244,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); @@ -3715,7 +4351,7 @@ twobyte_insn: (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops); + rc = em_grp9(ctxt); break; default: goto cannot_emulate; @@ -3727,5 +4363,5 @@ twobyte_insn: goto writeback; cannot_emulate: - return -1; + return EMULATION_FAILED; } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 46d08ca0b48f..51a97426e791 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -33,7 +33,6 @@ struct kvm_kpit_state { }; struct kvm_pit { - unsigned long base_addresss; struct kvm_io_device dev; struct kvm_io_device speaker_dev; struct kvm *kvm; @@ -51,7 +50,6 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac4..19fe855e7953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s) } if (!found) - found = s->kvm->bsp_vcpu; - - if (!found) return; kvm_make_request(KVM_REQ_EVENT, found); @@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s) static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); - s->isr_ack |= (1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; /* @@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) pic_lock(s->pics_state); } -void kvm_pic_clear_isr_ack(struct kvm *kvm) -{ - struct kvm_pic *s = pic_irqchip(kvm); - - pic_lock(s); - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; - pic_unlock(s); -} - /* * set irq level. 
If an edge is detected, then the IRR is set to 1 */ @@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->irr = 0; s->imr = 0; s->isr = 0; - s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); - int irq = pic_get_irq(&s->pics[0]); - s->output = level; - if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { - s->pics[0].isr_ack &= ~(1 << irq); + if (!s->output) s->wakeup_needed = true; - } + s->output = level; } static const struct kvm_io_device_ops picdev_ops = { @@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ba910d149410..53e2d084bffb 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { @@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); -int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d3653..2b2255b1f04b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { result = 1; - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); + if (vcpu->arch.apic->regs) + free_page((unsigned long)vcpu->arch.apic->regs); kfree(vcpu->arch.apic); } @@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (apic->regs_page == NULL) { + apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } - apic->regs = page_address(apic->regs_page); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f5fe32c5edad..52c9e6b9e725 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,7 +13,6 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; - struct page *regs_page; void *regs; gpa_t vapic_addr; struct page *vapic_page; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d44..bd14bb4c8594 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_LEVEL_SHIFT(level) \ (PAGE_SHIFT 
+ (level - 1) * PT64_LEVEL_BITS) -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - #define PT64_INDEX(address, level)\ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) @@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) #define PT32_LVL_OFFSET_MASK(level) \ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) @@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, int min) { - struct page *page; + void *page; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; - cache->objects[cache->nobjs++] = page_address(page); + cache->objects[cache->nobjs++] = page; } return 0; } @@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static struct kvm_memory_slot * +gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); - if (slot && slot->dirty_bitmap) - return true; - return false; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); + free_page((unsigned long)sp->spt); if (!sp->role.direct) - __free_page(virt_to_page(sp->gfns)); + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); kvm_mod_used_mmu_pages(kvm, -1); } @@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +static void nonpaging_update_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + const void *pte) +{ + WARN_ON(1); +} + #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static struct kvm_memory_slot * -pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) - slot = NULL; - - return slot; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; unsigned long hva; - slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) { get_page(bad_page); return page_to_pfn(bad_page); @@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if 
(!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) return -1; ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); @@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; + context->update_pte = paging64_update_pte; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; + context->update_pte = paging32_update_pte; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; @@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static int init_kvm_mmu(struct kvm_vcpu *vcpu) { - vcpu->arch.update_pte.pfn = bad_pfn; - if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -3162,8 +3163,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, + struct kvm_mmu_page *sp, u64 *spte, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { @@ -3172,10 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - if (!sp->role.cr4_pae) - paging32_update_pte(vcpu, sp, spte, new); - else - paging64_update_pte(vcpu, sp, spte, new); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -3210,28 +3207,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & shadow_accessed_mask)); } -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - u64 gpte) -{ - gfn_t gfn; - pfn_t pfn; - - if (!is_present_gpte(gpte)) - return; - gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - - vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - vcpu->arch.update_pte.gfn = gfn; - vcpu->arch.update_pte.pfn = pfn; -} - static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { u64 *spte = vcpu->arch.last_pte_updated; @@ -3253,21 +3228,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - u64 entry, gentry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned 
quadrant; - int level; - int flooded = 0; - int npte; - int r; - int invlpg_counter; + u64 entry, gentry, *spte; + unsigned pte_size, page_offset, misaligned, quadrant, offset; + int level, npte, invlpg_counter, r, flooded = 0; bool remote_flush, local_flush, zap_page; zap_page = remote_flush = local_flush = false; + offset = offset_in_page(gpa); pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -3275,9 +3242,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * Assume that the pte write is on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). + * as the current vcpu paging mode since we update the sptes only + * when they have the same mode. */ if ((is_pae(vcpu) && bytes == 4) || !new) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ @@ -3303,15 +3269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_guess_page_from_pte_write(vcpu, gpa, gentry); spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; - kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { + kvm_mmu_access_page(vcpu, gfn); if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; @@ -3385,10 +3350,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); - if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { - kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); - vcpu->arch.update_pte.pfn = bad_pfn; - } } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) @@ -3538,14 +3499,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (!test_bit(slot, sp->slot_bitmap)) continue; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_shadow_present_pte(pt[i]) || + !is_last_spte(pt[i], sp->role.level)) + continue; + + if (is_large_pte(pt[i])) { + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); + --kvm->stat.lpages; + continue; + } + /* avoid RMW */ if (is_writable_pte(pt[i])) update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + } } kvm_flush_remote_tlbs(kvm); } @@ -3575,15 +3545,16 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } -static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; struct kvm *kvm_freed = NULL; + int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) goto out; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx, freed_pages; @@ -3606,7 +3577,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); out: return percpu_counter_read_positive(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bccc24c4181..6c4dc010c4cb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ 
b/arch/x86/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 @@ -48,7 +47,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 #define CMPXCHG cmpxchg @@ -80,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) +static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + pt_element_t __user *ptep_user, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) { + int npages; pt_element_t ret; pt_element_t *table; struct page *page; - page = gfn_to_page(kvm, table_gfn); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + /* Check if the user is doing something meaningless. */ + if (unlikely(npages != 1)) + return -EFAULT; table = kmap_atomic(page, KM_USER0); ret = CMPXCHG(&table[index], orig_pte, new_pte); @@ -119,6 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gva_t addr, u32 access) { pt_element_t pte; + pt_element_t __user *ptep_user; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; @@ -154,6 +157,9 @@ walk: pt_access = ACC_ALL; for (;;) { + gfn_t real_gfn; + unsigned long host_addr; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -162,43 +168,64 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, - offset, sizeof(pte), - PFERR_USER_MASK|PFERR_WRITE_MASK)) { + real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), + PFERR_USER_MASK|PFERR_WRITE_MASK); + if (unlikely(real_gfn == UNMAPPED_GVA)) { + present = false; + break; + } + real_gfn = gpa_to_gfn(real_gfn); + + host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + if (unlikely(kvm_is_error_hva(host_addr))) { + present = false; + break; + } + + ptep_user = (pt_element_t __user *)((void *)host_addr + offset); + if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { present = false; break; } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { + if (unlikely(!is_present_gpte(pte))) { present = false; break; } - if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { + if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, + walker->level))) { rsvd_fault = true; break; } - if (write_fault && !is_writable_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - eperm = true; + if (unlikely(write_fault && !is_writable_pte(pte) + && (user_fault || is_write_protection(vcpu)))) + eperm = true; - if (user_fault && !(pte & PT_USER_MASK)) + if (unlikely(user_fault && !(pte & PT_USER_MASK))) eperm = true; #if PTTYPE == 64 - if (fetch_fault && (pte & PT64_NX_MASK)) + if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) eperm = true; #endif - if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault + && unlikely(!(pte 
& PT_ACCESSED_MASK))) { + int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_ACCESSED_MASK); + if (unlikely(ret < 0)) { + present = false; + break; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -243,17 +270,21 @@ walk: --walker->level; } - if (!present || eperm || rsvd_fault) + if (unlikely(!present || eperm || rsvd_fault)) goto error; - if (write_fault && !is_dirty_gpte(pte)) { - bool ret; + if (write_fault && unlikely(!is_dirty_gpte(pte))) { + int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_DIRTY_MASK); + if (unlikely(ret < 0)) { + present = false; + goto error; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; @@ -339,16 +370,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); return; - pfn = vcpu->arch.update_pte.pfn; - if (is_error_pfn(pfn)) - return; - if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) - return; - kvm_get_pfn(pfn); + } + /* - * we call mmu_set_spte() with host_writable = true beacuse that + * we call mmu_set_spte() with host_writable = true because the + * pfn above was fetched writable by gfn_to_pfn_atomic(). 
*/ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, @@ -829,7 +858,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef PT_LEVEL_MASK #undef PT_LVL_ADDR_MASK #undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 63fec1531e89..506e4fe23adc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -63,6 +63,10 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -93,14 +97,6 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; - /* - * If we vmexit during an instruction emulation we need this to restore - * the l1 guest rip after the emulation - */ - unsigned long vmexit_rip; - unsigned long vmexit_rsp; - unsigned long vmexit_rax; - /* cache for intercepts of the guest */ u32 intercept_cr; u32 intercept_dr; @@ -135,6 +131,8 @@ struct vcpu_svm { u32 *msrpm; + ulong nmi_iret_rip; + struct nested_state nested; bool nmi_singlestep; @@ -142,8 +140,13 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; u32 apf_reason; + + u64 tsc_ratio; }; +static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define TSC_RATIO_DEFAULT 0x0100000000ULL + #define MSR_INVALID 0xffffffffU static struct svm_direct_access_msrs { @@ -188,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); +static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -374,7 +378,6 @@ struct svm_cpu_data { }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -static uint32_t svm_features; struct svm_init_data { int cpu; @@ -567,6 +570,10 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { + /* Make sure we clean up behind us */ + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + cpu_svm_disable(); } @@ -608,6 +615,11 @@ static int svm_hardware_enable(void *garbage) wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; + } + svm_init_erratum_383(); return 0; @@ -789,6 +801,23 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); + if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 max; + + kvm_has_tsc_control = true; + + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines and a value of 0 is used to disable + * tsc-scaling for the vcpu. 
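 * For example, on a 2.6 GHz host (tsc_khz == 2600000) TSC_RATIO_MAX
 * scales by just under 2^8, giving roughly 665600000; the 0x7fffffff
 * clamp below would therefore only take effect on hosts faster than
 * about 8.4 GHz.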
+ */ + max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); + + kvm_max_guest_tsc_khz = max; + } + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -800,8 +829,6 @@ static __init int svm_hardware_setup(void) goto err; } - svm_features = cpuid_edx(SVM_CPUID_FUNC); - if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; @@ -852,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 __scale_tsc(u64 ratio, u64 tsc) +{ + u64 mult, frac, _tsc; + + mult = ratio >> 32; + frac = ratio & ((1ULL << 32) - 1); + + _tsc = tsc; + _tsc *= mult; + _tsc += (tsc >> 32) * frac; + _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; + + return _tsc; +} + +static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 _tsc = tsc; + + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + _tsc = __scale_tsc(svm->tsc_ratio, tsc); + + return _tsc; +} + +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 ratio; + u64 khz; + + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + return; + + /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ + if (user_tsc_khz == 0) { + vcpu->arch.virtual_tsc_khz = 0; + svm->tsc_ratio = TSC_RATIO_DEFAULT; + return; + } + + khz = user_tsc_khz; + + /* TSC scaling required - calculate ratio */ + ratio = khz << 32; + do_div(ratio, tsc_khz); + + if (ratio == 0 || ratio & TSC_RATIO_RSVD) { + WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return; + } + vcpu->arch.virtual_tsc_khz = user_tsc_khz; + svm->tsc_ratio = ratio; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -878,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = svm_scale_tsc(vcpu, native_read_tsc()); + + return target_tsc - tsc; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -973,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; - save->rflags = 2; + kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1046,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->tsc_ratio = TSC_RATIO_DEFAULT; + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1139,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && + svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { + __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; + wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1153,8 +1255,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); load_gs_index(svm->host.gs); #else +#ifdef CONFIG_X86_32_LAZY_GS loadsegment(gs, svm->host.gs); #endif +#endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } @@ -1361,31 +1465,6 @@ static void 
svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu)) { - /* - * We are here because we run in nested mode, the host kvm - * intercepts cr0 writes but the l1 hypervisor does not. - * But the L1 hypervisor may intercept selective cr0 writes. - * This needs to be checked here. - */ - unsigned long old, new; - - /* Remove bits that would trigger a real cr0 write intercept */ - old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; - new = cr0 & SVM_CR0_SELECTIVE_MASK; - - if (old == new) { - /* cr0 write with ts and mp unchanged */ - svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { - svm->nested.vmexit_rip = kvm_rip_read(vcpu); - svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - return; - } - } - } - #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -2123,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); nested_vmcb->save.cr2 = vmcb->save.cr2; nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; nested_vmcb->save.rax = vmcb->save.rax; @@ -2180,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.ds = hsave->save.ds; svm->vmcb->save.gdtr = hsave->save.gdtr; svm->vmcb->save.idtr = hsave->save.idtr; - svm->vmcb->save.rflags = hsave->save.rflags; + kvm_set_rflags(&svm->vcpu, hsave->save.rflags); svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); @@ -2308,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.efer = svm->vcpu.arch.efer; hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = vmcb->save.rflags; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; @@ -2319,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); - if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; @@ -2337,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.ds = nested_vmcb->save.ds; svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; svm->vmcb->save.idtr = nested_vmcb->save.idtr; - svm->vmcb->save.rflags = nested_vmcb->save.rflags; + kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); @@ -2439,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); @@ -2460,13 +2539,13 @@ static int 
vmsave_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); @@ -2653,6 +2732,7 @@ static int iret_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.nmi_window_exits; clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); return 1; } @@ -2671,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) +{ + unsigned long cr0 = svm->vcpu.arch.cr0; + bool ret = false; + u64 intercept; + + intercept = svm->nested.intercept; + + if (!is_guest_mode(&svm->vcpu) || + (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + return false; + + cr0 &= ~SVM_CR0_SELECTIVE_MASK; + val &= ~SVM_CR0_SELECTIVE_MASK; + + if (cr0 ^ val) { + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); + } + + return ret; +} + #define CR_VALID (1ULL << 63) static int cr_interception(struct vcpu_svm *svm) @@ -2694,7 +2797,11 @@ static int cr_interception(struct vcpu_svm *svm) val = kvm_register_read(&svm->vcpu, reg); switch (cr) { case 0: - err = kvm_set_cr0(&svm->vcpu, val); + if (!check_selective_cr0_intercepted(svm, val)) + err = kvm_set_cr0(&svm->vcpu, val); + else + return 1; + break; case 3: err = kvm_set_cr3(&svm->vcpu, val); @@ -2739,23 +2846,6 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } -static int cr0_write_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - int r; - - r = cr_interception(svm); - - if (svm->nested.vmexit_rip) { - kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); - kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); - kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); - svm->nested.vmexit_rip = 0; - } - - return r; -} - static int dr_interception(struct vcpu_svm *svm) { int reg, dr; @@ -2808,7 +2898,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_TSC: { struct vmcb *vmcb = get_host_vmcb(svm); - *data = vmcb->control.tsc_offset + native_read_tsc(); + *data = vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); + break; } case MSR_STAR: @@ -3043,7 +3135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = cr_interception, [SVM_EXIT_READ_CR8] = cr_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = cr0_write_interception, + [SVM_EXIT_WRITE_CR0] = cr_interception, [SVM_EXIT_WRITE_CR3] = cr_interception, [SVM_EXIT_WRITE_CR4] = cr_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, @@ -3099,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; -void dump_vmcb(struct kvm_vcpu *vcpu) +static void dump_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; pr_err("VMCB Control Area:\n"); - pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); - pr_err("cr_write: 
%04x\n", control->intercept_cr >> 16); - pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); - pr_err("dr_write: %04x\n", control->intercept_dr >> 16); - pr_err("exceptions: %08x\n", control->intercept_exceptions); - pr_err("intercepts: %016llx\n", control->intercept); - pr_err("pause filter count: %d\n", control->pause_filter_count); - pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); - pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); - pr_err("tsc_offset: %016llx\n", control->tsc_offset); - pr_err("asid: %d\n", control->asid); - pr_err("tlb_ctl: %d\n", control->tlb_ctl); - pr_err("int_ctl: %08x\n", control->int_ctl); - pr_err("int_vector: %08x\n", control->int_vector); - pr_err("int_state: %08x\n", control->int_state); - pr_err("exit_code: %08x\n", control->exit_code); - pr_err("exit_info1: %016llx\n", control->exit_info_1); - pr_err("exit_info2: %016llx\n", control->exit_info_2); - pr_err("exit_int_info: %08x\n", control->exit_int_info); - pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); - pr_err("nested_ctl: %lld\n", control->nested_ctl); - pr_err("nested_cr3: %016llx\n", control->nested_cr3); - pr_err("event_inj: %08x\n", control->event_inj); - pr_err("event_inj_err: %08x\n", control->event_inj_err); - pr_err("lbr_ctl: %lld\n", control->lbr_ctl); - pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); + pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); + pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); + pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); + pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); + pr_err("%-20s%d\n", "asid:", control->asid); + pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); + pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); + pr_err("%-20s%08x\n", "int_vector:", control->int_vector); + pr_err("%-20s%08x\n", "int_state:", control->int_state); + pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); + pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); + pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); + pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); + pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); + pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%08x\n", "event_inj:", control->event_inj); + pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); + pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("VMCB State Save Area:\n"); - pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", - save->es.selector, save->es.attrib, - save->es.limit, save->es.base); - pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", - save->cs.selector, save->cs.attrib, - save->cs.limit, save->cs.base); - pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", - save->ss.selector, save->ss.attrib, - save->ss.limit, save->ss.base); - pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", - save->ds.selector, save->ds.attrib, - save->ds.limit, 
save->ds.base); - pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); - pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); - pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->gdtr.selector, save->gdtr.attrib, - save->gdtr.limit, save->gdtr.base); - pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); - pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->idtr.selector, save->idtr.attrib, - save->idtr.limit, save->idtr.base); - pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "es:", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "cs:", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ss:", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ds:", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "fs:", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gs:", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gdtr:", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ldtr:", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "idtr:", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "tr:", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); pr_err("cpl: %d efer: %016llx\n", save->cpl, save->efer); - pr_err("cr0: %016llx cr2: %016llx\n", - save->cr0, save->cr2); - pr_err("cr3: %016llx cr4: %016llx\n", - save->cr3, save->cr4); - pr_err("dr6: %016llx dr7: %016llx\n", - save->dr6, save->dr7); - pr_err("rip: %016llx rflags: %016llx\n", - save->rip, save->rflags); - pr_err("rsp: %016llx rax: %016llx\n", - save->rsp, save->rax); - pr_err("star: %016llx lstar: %016llx\n", - save->star, save->lstar); - pr_err("cstar: %016llx sfmask: %016llx\n", - save->cstar, save->sfmask); - pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", - save->kernel_gs_base, save->sysenter_cs); - pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", - save->sysenter_esp, save->sysenter_eip); - pr_err("gpat: %016llx dbgctl: %016llx\n", - save->g_pat, save->dbgctl); - pr_err("br_from: %016llx br_to: %016llx\n", - save->br_from, save->br_to); - pr_err("excp_from: %016llx excp_to: %016llx\n", - save->last_excp_from, save->last_excp_to); - + pr_err("%-15s %016llx %-13s %016llx\n", + "cr0:", save->cr0, "cr2:", save->cr2); + pr_err("%-15s %016llx %-13s %016llx\n", + "cr3:", save->cr3, "cr4:", save->cr4); + pr_err("%-15s %016llx %-13s %016llx\n", + "dr6:", save->dr6, "dr7:", save->dr7); + pr_err("%-15s %016llx %-13s %016llx\n", + "rip:", save->rip, "rflags:", save->rflags); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsp:", 
save->rsp, "rax:", save->rax); + pr_err("%-15s %016llx %-13s %016llx\n", + "star:", save->star, "lstar:", save->lstar); + pr_err("%-15s %016llx %-13s %016llx\n", + "cstar:", save->cstar, "sfmask:", save->sfmask); + pr_err("%-15s %016llx %-13s %016llx\n", + "kernel_gs_base:", save->kernel_gs_base, + "sysenter_cs:", save->sysenter_cs); + pr_err("%-15s %016llx %-13s %016llx\n", + "sysenter_esp:", save->sysenter_esp, + "sysenter_eip:", save->sysenter_eip); + pr_err("%-15s %016llx %-13s %016llx\n", + "gpat:", save->g_pat, "dbgctl:", save->dbgctl); + pr_err("%-15s %016llx %-13s %016llx\n", + "br_from:", save->br_from, "br_to:", save->br_to); + pr_err("%-15s %016llx %-13s %016llx\n", + "excp_from:", save->last_excp_from, + "excp_to:", save->last_excp_to); } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) @@ -3379,7 +3483,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) return 0; - ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); if (is_guest_mode(vcpu)) return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); @@ -3474,7 +3578,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) { + /* + * If we've made progress since setting HF_IRET_MASK, we've + * executed an IRET and can allow NMI injection. + */ + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) + && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } @@ -3641,19 +3750,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif #endif reload_tss(vcpu); local_irq_disable(); - stgi(); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_before_handle_nmi(&svm->vcpu); + + stgi(); + + /* Any pending NMI will happen here */ + + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_after_handle_nmi(&svm->vcpu); + sync_cr8_to_lapic(vcpu); svm->next_rip = 0; @@ -3850,6 +3970,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) update_cr0_intercept(svm); } +#define PRE_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_PRE_EXCEPT, } +#define POST_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_EXCEPT, } +#define POST_MEM(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_MEMACCESS, } + +static struct __x86_intercept { + u32 exit_code; + enum x86_intercept_stage stage; +} x86_intercept_map[] = { + [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), + [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), + [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), + [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), + [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), + [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), + [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), + 
[x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), + [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), + [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), + [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), + [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), + [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), + [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), + [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), + [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), + [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), + [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), + [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), + [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), + [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), + [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), + [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), + [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), + [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), + [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), + [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), + [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), + [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), + [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), + [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), + [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), + [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), + [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), + [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), + [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), +}; + +#undef PRE_EX +#undef POST_EX +#undef POST_MEM + +static int svm_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int vmexit, ret = X86EMUL_CONTINUE; + struct __x86_intercept icpt_info; + struct vmcb *vmcb = svm->vmcb; + + if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) + goto out; + + icpt_info = x86_intercept_map[info->intercept]; + + if (stage != icpt_info.stage) + goto out; + + switch (icpt_info.exit_code) { + case SVM_EXIT_READ_CR0: + if (info->intercept == x86_intercept_cr_read) + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_WRITE_CR0: { + unsigned long cr0, val; + u64 intercept; + + if (info->intercept == x86_intercept_cr_write) + icpt_info.exit_code += info->modrm_reg; + + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + break; + + intercept = svm->nested.intercept; + + if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + break; + + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; + + if (info->intercept == x86_intercept_lmsw) { + cr0 &= 0xfUL; + val &= 0xfUL; + /* lmsw can't clear PE - catch this here */ + if (cr0 & X86_CR0_PE) + val |= X86_CR0_PE; + } + + if (cr0 ^ val) + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + + break; + } + case SVM_EXIT_READ_DR0: + case SVM_EXIT_WRITE_DR0: + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_MSR: + if (info->intercept == x86_intercept_wrmsr) + vmcb->control.exit_info_1 = 1; + else + vmcb->control.exit_info_1 = 0; + break; + case SVM_EXIT_PAUSE: + /* + * We get this for NOP only, but pause + * is rep not, check this here + */ + if (info->rep_prefix != REPE_PREFIX) + goto out; + case SVM_EXIT_IOIO: { + u64 exit_info; + u32 bytes; + + exit_info = 
(vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + exit_info |= SVM_IOIO_TYPE_MASK; + bytes = info->src_bytes; + } else { + bytes = info->dst_bytes; + } + + if (info->intercept == x86_intercept_outs || + info->intercept == x86_intercept_ins) + exit_info |= SVM_IOIO_STR_MASK; + + if (info->rep_prefix) + exit_info |= SVM_IOIO_REP_MASK; + + bytes = min(bytes, 4u); + + exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; + + exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); + + vmcb->control.exit_info_1 = exit_info; + vmcb->control.exit_info_2 = info->next_rip; + + break; + } + default: + break; + } + + vmcb->control.next_rip = info->next_rip; + vmcb->control.exit_code = icpt_info.exit_code; + vmexit = nested_svm_exit_handled(svm); + + ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED + : X86EMUL_CONTINUE; + +out: + return ret; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -3931,10 +4231,14 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .set_tsc_khz = svm_set_tsc_khz, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, + .compute_tsc_offset = svm_compute_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, + + .check_intercept = svm_check_intercept, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index fc7a101c4a35..abd86e865be3 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) /* * There is a race window between reading and incrementing, but we do - * not care about potentially loosing timer events in the !reinject + * not care about potentially losing timer events in the !reinject * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked * in vcpu_enter_guest. */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1357d7cf4ec8..db932760ea82 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall, TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), TP_STRUCT__entry( - __field( __u16, code ) - __field( bool, fast ) __field( __u16, rep_cnt ) __field( __u16, rep_idx ) __field( __u64, ingpa ) __field( __u64, outgpa ) + __field( __u16, code ) + __field( bool, fast ) ), TP_fast_assign( - __entry->code = code; - __entry->fast = fast; __entry->rep_cnt = rep_cnt; __entry->rep_idx = rep_idx; __entry->ingpa = ingpa; __entry->outgpa = outgpa; + __entry->code = code; + __entry->fast = fast; ), TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb82..4c3fa0f67469 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO); * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually small than 41 cycles. + * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. 
Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); @@ -128,8 +128,11 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u8 cpl; + bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; + ulong rflags; struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; @@ -159,6 +162,10 @@ struct vcpu_vmx { u32 ar; } tr, es, ds, fs, gs; } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment seg[8]; + } segment_cache; int vpid; bool emulation_required; @@ -171,16 +178,25 @@ struct vcpu_vmx { bool rdtscp_enabled; }; +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -643,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, + unsigned field) +{ + bool ret; + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { + vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + ret = vmx->segment_cache.bitmask & mask; + vmx->segment_cache.bitmask |= mask; + return ret; +} + +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) +{ + u16 *p = &vmx->segment_cache.seg[seg].selector; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + return *p; +} + +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) +{ + ulong *p = &vmx->segment_cache.seg[seg].base; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + return *p; +} + +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].limit; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + return *p; +} + +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].ar; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + return *p; +} + static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -970,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - 
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + to_vmx(vcpu)->rflags = rflags; } - return rflags; + return to_vmx(vcpu)->rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1053,7 +1132,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + int inc_eip = 0; + if (kvm_exception_is_soft(nr)) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -1151,6 +1233,16 @@ static u64 guest_read_tsc(void) } /* + * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ + * ioctl. In this case the call-back should update internal vmx state to make + * the changes effective. + */ +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + /* Nothing to do here */ +} + +/* * writes 'offset' into guest's timestamp counter offset register */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1164,6 +1256,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) vmcs_write64(TSC_OFFSET, offset + adjustment); } +static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + return target_tsc - native_read_tsc(); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. 
@@ -1243,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: @@ -1333,19 +1432,25 @@ static __init int vmx_disabled_by_bios(void) rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); if (msr & FEATURE_CONTROL_LOCKED) { + /* launched w/ TXT and VMX disabled */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && tboot_enabled()) return 1; + /* launched w/o TXT and VMX only enabled w/ TXT */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && !tboot_enabled()) { printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - " activate TXT before enabling KVM\n"); + "activate TXT before enabling KVM\n"); return 1; } + /* launched w/o TXT and VMX disabled */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; } return 0; - /* locked but not enabled */ } static void kvm_cpu_vmxon(u64 addr) @@ -1683,6 +1788,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmx_segment_cache_clear(vmx); + + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); @@ -1705,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1756,6 +1866,21 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Call it here with phys address pointing 16M below 4G. + */ + if (!vcpu->kvm->arch.tss_addr) { + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + } + + vmx_segment_cache_clear(vmx); + + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1794,7 +1919,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) continue_rmode: kvm_mmu_reset_context(vcpu); - init_rmode(vcpu->kvm); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -1832,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; + vmx_segment_cache_clear(to_vmx(vcpu)); + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { printk(KERN_DEBUG "%s: tss fixup for long mode. 
\n", @@ -1979,6 +2105,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static u64 construct_eptp(unsigned long root_hpa) @@ -2030,23 +2157,39 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - return vmcs_readl(sf->base); -} - static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_save_segment *save; u32 ar; - var->base = vmcs_readl(sf->base); - var->limit = vmcs_read32(sf->limit); - var->selector = vmcs_read16(sf->selector); - ar = vmcs_read32(sf->ar_bytes); + if (vmx->rmode.vm86_active + && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES + || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS + || seg == VCPU_SREG_GS) + && !emulate_invalid_guest_state) { + switch (seg) { + case VCPU_SREG_TR: save = &vmx->rmode.tr; break; + case VCPU_SREG_ES: save = &vmx->rmode.es; break; + case VCPU_SREG_DS: save = &vmx->rmode.ds; break; + case VCPU_SREG_FS: save = &vmx->rmode.fs; break; + case VCPU_SREG_GS: save = &vmx->rmode.gs; break; + default: BUG(); + } + var->selector = save->selector; + var->base = save->base; + var->limit = save->limit; + ar = save->ar; + if (seg == VCPU_SREG_TR + || var->selector == vmx_read_guest_seg_selector(vmx, seg)) + goto use_saved_rmode_seg; + } + var->base = vmx_read_guest_seg_base(vmx, seg); + var->limit = vmx_read_guest_seg_limit(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + ar = vmx_read_guest_seg_ar(vmx, seg); +use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; @@ -2060,17 +2203,39 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } -static int vmx_get_cpl(struct kvm_vcpu *vcpu) +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment s; + + if (to_vmx(vcpu)->rmode.vm86_active) { + vmx_get_segment(vcpu, &s, seg); + return s.base; + } + return vmx_read_guest_seg_base(to_vmx(vcpu), seg); +} + +static int __vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) return 0; - if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + if (!is_long_mode(vcpu) + && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; - return vmcs_read16(GUEST_CS_SELECTOR) & 3; + return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; +} + +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + } + return to_vmx(vcpu)->cpl; } + static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -2100,7 +2265,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; + vmx_segment_cache_clear(vmx); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; vmx->rmode.tr.base = var->base; vmx->rmode.tr.limit = var->limit; @@ -2135,11 +2303,12 @@ static void vmx_set_segment(struct 
kvm_vcpu *vcpu, ar |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, ar); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { - u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); *db = (ar >> 14) & 1; *l = (ar >> 13) & 1; @@ -2361,11 +2530,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) static int init_rmode_tss(struct kvm *kvm) { - gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + gfn_t fn; u16 data = 0; - int ret = 0; - int r; + int r, idx, ret = 0; + idx = srcu_read_lock(&kvm->srcu); + fn = rmode_tss_base(kvm) >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2389,12 +2559,13 @@ static int init_rmode_tss(struct kvm *kvm) ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, r, ret; + int i, idx, r, ret; pfn_t identity_map_pfn; u32 tmp; @@ -2409,6 +2580,7 @@ static int init_rmode_identity_map(struct kvm *kvm) return 1; ret = 0; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2424,6 +2596,7 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm->arch.ept_identity_pagetable_done = true; ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -2699,22 +2872,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static int init_rmode(struct kvm *kvm) -{ - int idx, ret = 0; - - idx = srcu_read_lock(&kvm->srcu); - if (!init_rmode_tss(kvm)) - goto exit; - if (!init_rmode_identity_map(kvm)) - goto exit; - - ret = 1; -exit: - srcu_read_unlock(&kvm->srcu, idx); - return ret; -} - static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2722,10 +2879,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - if (!init_rmode(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } vmx->rmode.vm86_active = 0; @@ -2742,6 +2895,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (ret != 0) goto out; + vmx_segment_cache_clear(vmx); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode @@ -2805,7 +2960,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (vm_need_tpr_shadow(vmx->vcpu.kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); + __pa(vmx->vcpu.arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } @@ -2871,7 +3026,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + int inc_eip = 0; + if (vcpu->arch.interrupt.soft) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2904,8 +3062,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2928,6 +3087,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 
{ if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; + if (to_vmx(vcpu)->nmi_known_unmasked) + return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } @@ -2941,6 +3102,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) vmx->vnmi_blocked_time = 0; } } else { + vmx->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -2971,6 +3133,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; + if (!init_rmode_tss(kvm)) + return -ENOMEM; + return 0; } @@ -3055,7 +3220,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) enum emulation_result er; vect_info = vmx->idt_vectoring_info; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + intr_info = vmx->exit_intr_info; if (is_machine_check(intr_info)) return handle_machine_check(vcpu); @@ -3086,7 +3251,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) } error_code = 0; - rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -3133,6 +3297,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; + rip = kvm_rip_read(vcpu); kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; @@ -3469,9 +3634,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: @@ -3831,12 +3994,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; + + if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + return; + + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ - if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) - || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI - && is_machine_check(exit_intr_info))) + if (is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -3850,7 +4018,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; bool unblock_nmi; u8 vector; bool idtv_info_valid; @@ -3858,6 +4026,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. 
+ */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -3874,6 +4049,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); @@ -3910,8 +4089,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + vmx_set_nmi_mask(&vmx->vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = @@ -3962,7 +4140,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3991,6 +4169,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" /* placeholder for guest rcx */ "push %%"R"cx \n\t" "cmp %%"R"sp, %c[host_rsp](%0) \n\t" "je 1f \n\t" @@ -4032,10 +4211,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ - "xchg %0, (%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%"R"sp) \n\t" + "pop %0 \n\t" "mov %%"R"ax, %c[rax](%0) \n\t" "mov %%"R"bx, %c[rbx](%0) \n\t" - "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "pop"Q" %c[rcx](%0) \n\t" "mov %%"R"dx, %c[rdx](%0) \n\t" "mov %%"R"si, %c[rsi](%0) \n\t" "mov %%"R"di, %c[rdi](%0) \n\t" @@ -4053,7 +4233,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%cr2, %%"R"ax \n\t" "mov %%"R"ax, %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), @@ -4076,7 +4256,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), + [wordsize]"i"(sizeof(ulong)) : "cc", "memory" , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 @@ -4085,7 +4266,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) ); vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; @@ -4095,7 +4279,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); @@ -4156,8 +4339,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; if (!vmx->guest_msrs) { - err = -ENOMEM; goto uninit_vcpu; } @@ -4176,15 +4359,19 @@ static struct kvm_vcpu 
*vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vmcs; if (vm_need_virtualize_apic_accesses(kvm)) - if (alloc_apic_access_page(kvm) != 0) + err = alloc_apic_access_page(kvm); + if (err) goto free_vmcs; if (enable_ept) { if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + err = -ENOMEM; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + if (!init_rmode_identity_map(kvm)) + goto free_vmcs; } return &vmx->vcpu; @@ -4326,6 +4513,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { } +static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return X86EMUL_CONTINUE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4407,10 +4601,14 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .set_tsc_khz = vmx_set_tsc_khz, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, + .compute_tsc_offset = vmx_compute_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, + + .check_intercept = vmx_check_intercept, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcc0efce85bf..77c9d8673dc4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,30 +60,21 @@ #include <asm/div64.h> #define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) - #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +#define emul_to_vcpu(ctxt) \ + container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM */ #ifdef CONFIG_X86_64 -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +static +u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); #else -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM @@ -99,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +bool kvm_has_tsc_control; +EXPORT_SYMBOL_GPL(kvm_has_tsc_control); +u32 kvm_max_guest_tsc_khz; +EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -156,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; @@ -525,8 +523,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) + if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } if 
((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); @@ -979,7 +979,15 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static inline u64 nsec_to_cycles(u64 nsec) +static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.virtual_tsc_khz) + return vcpu->arch.virtual_tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { u64 ret; @@ -987,25 +995,24 @@ static inline u64 nsec_to_cycles(u64 nsec) if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * __this_cpu_read(cpu_tsc_khz); + ret = nsec * vcpu_tsc_khz(vcpu); do_div(ret, USEC_PER_SEC); return ret; } -static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &kvm->arch.virtual_tsc_shift, - &kvm->arch.virtual_tsc_mult); - kvm->arch.virtual_tsc_khz = this_tsc_khz; + &vcpu->arch.tsc_catchup_shift, + &vcpu->arch.tsc_catchup_mult); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->kvm->arch.virtual_tsc_mult, - vcpu->kvm->arch.virtual_tsc_shift); + vcpu->arch.tsc_catchup_mult, + vcpu->arch.tsc_catchup_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1017,8 +1024,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) unsigned long flags; s64 sdiff; - spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = data - native_read_tsc(); + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; sdiff = data - kvm->arch.last_tsc_write; @@ -1028,19 +1035,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) /* * Special case: close write to TSC within 5 seconds of * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accomodate host load / swapping as + * The 5 seconds is to accommodate host load / swapping as * well as any reset of TSC during the boot process. * * In that case, for a reliable TSC, we can match TSC offsets, * or make a best guest using elapsed value. 
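nsec_to_cycles() above is plain unit conversion, cycles = ns * kHz / USEC_PER_SEC, now evaluated at the per-vcpu rate so that the cycle-domain sdiff and the nanosecond-domain elapsed value are compared against the same 5-second window. A stand-alone check of that window size; the 2.8 GHz figure is an arbitrary example rate:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define USEC_PER_SEC 1000000ULL

/* cycles = ns * khz / 1e6, as in nsec_to_cycles() above */
static uint64_t nsec_to_cycles(uint64_t khz, uint64_t nsec)
{
        return nsec * khz / USEC_PER_SEC;
}

int main(void)
{
        uint64_t khz    = 2800000;      /* assumed 2.8 GHz guest clock */
        uint64_t window = nsec_to_cycles(khz, 5ULL * NSEC_PER_SEC);

        /* TSC writes landing within this many cycles of the last one
         * (and within 5s of wall time) are treated as a sync attempt. */
        printf("5s window at %llu kHz = %llu cycles\n",
               (unsigned long long)khz, (unsigned long long)window);
        return 0;
}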
*/ - if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && elapsed < 5ULL * NSEC_PER_SEC) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { - u64 delta = nsec_to_cycles(elapsed); + u64 delta = nsec_to_cycles(vcpu, elapsed); offset += delta; pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } @@ -1050,7 +1057,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); - spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; @@ -1072,8 +1079,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - + this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -1453,6 +1459,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } +static void kvmclock_reset(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1510,10 +1524,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); vcpu->arch.time = data; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -1592,6 +1603,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else return set_msr_hyperv(vcpu, msr, data); break; + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. + */ + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1846,6 +1863,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) } else return get_msr_hyperv(vcpu, msr, pdata); break; + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't fully documented in current + * silicon. It is however accessed by winxp in very narrow + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. 
Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + data = 0xbe702111; + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1966,6 +1996,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_GET_TSC_KHZ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1992,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XCRS: r = cpu_has_xsave; break; + case KVM_CAP_TSC_CONTROL: + r = kvm_has_tsc_control; + break; default: r = 0; break; @@ -2093,15 +2127,20 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { /* Make sure TSC doesn't go backwards */ - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + s64 tsc_delta; + u64 tsc; + + kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : + tsc - vcpu->arch.last_guest_tsc; + if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); vcpu->arch.tsc_catchup = 1; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2112,7 +2151,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); } static int is_efer_nx(void) @@ -2297,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2368,9 +2413,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int i; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (i = 1; *nent < maxnent; ++i) { - if (entry[i - 1].eax == 0 && i != 2) - break; + for (i = 1; *nent < maxnent && i < 64; ++i) { + if (entry[i].eax == 0) + continue; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -2391,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); entry->ebx = 0; entry->ecx = 0; @@ -2405,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + /*Add support for Centaur's CPUID instruction*/ + case 0xC0000000: + /*Just support up to 0xC0000004 now*/ + entry->eax = min(entry->eax, 0xC0000004); + break; + case 0xC0000001: + entry->edx &= kvm_supported_word5_x86_features; + cpuid_mask(&entry->edx, 5); + break; + case 0xC0000002: + case 0xC0000003: + case 
0xC0000004: + /*Now nothing to do, reserved for the future*/ + break; } kvm_x86_ops->set_supported_cpuid(function, entry); @@ -2451,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (nent >= cpuid->nent) goto out_free; + /* Add support for Centaur's CPUID instruction. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { + do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + limit = cpuid_entries[nent - 1].eax; + for (func = 0xC0000001; + func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + } + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, cpuid->nent); @@ -2575,9 +2655,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - printk(KERN_DEBUG "kvm: set_mce: " - "injects mce exception while " - "previous one is in progress!\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -2648,8 +2725,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); @@ -3024,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + if (!kvm_has_tsc_control) + break; + + user_tsc_khz = (u32)arg; + + if (user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + + r = 0; + goto out; + } + case KVM_GET_TSC_KHZ: { + r = -EIO; + if (check_tsc_unstable()) + goto out; + + r = vcpu_tsc_khz(vcpu); + + goto out; + } default: r = -EINVAL; } @@ -3573,20 +3674,43 @@ static void kvm_init_msr_list(void) static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; - return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + handled += n; + addr += n; + len -= n; + v += n; + } while (len); + + return handled; } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; - return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + handled += n; + addr += n; + len -= n; + v += n; + } while (len); + + return handled; } static void kvm_set_segment(struct kvm_vcpu *vcpu, @@ -3681,37 +3805,43 @@ out: } /* used for instruction fetching */ -static int 
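From user space, the new vcpu ioctls added above take and return the frequency directly: KVM_SET_TSC_KHZ passes the kHz value as the ioctl argument, and KVM_GET_TSC_KHZ returns the rate itself rather than 0 on success (failing with EIO when the host TSC is unstable, or EINVAL without TSC control). A rough usage sketch, assuming a kvm.h that already defines these ioctls, an existing vcpu fd, and that KVM_CAP_TSC_CONTROL was reported; the helper name and error handling here are mine:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd: descriptor from KVM_CREATE_VCPU (setup not shown) */
static int set_guest_tsc_khz(int vcpu_fd, unsigned int khz)
{
        if (ioctl(vcpu_fd, KVM_SET_TSC_KHZ, khz) < 0) {
                perror("KVM_SET_TSC_KHZ");      /* EINVAL if khz too high */
                return -1;
        }
        printf("guest tsc: %d kHz\n", ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0));
        return 0;
}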
kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access | PFERR_FETCH_MASK, exception); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(gva_t addr, void *val, +static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); void *data = val; int r = X86EMUL_CONTINUE; @@ -3739,13 +3869,15 @@ out: return r; } -static int emulator_read_emulated(unsigned long addr, +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; + int handled; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3764,7 +3896,7 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) + if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; @@ -3772,18 +3904,24 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); + handled = vcpu_mmio_read(vcpu, gpa, bytes, val); + + if (handled == bytes) return X86EMUL_CONTINUE; - } + + gpa += handled; + bytes -= handled; + val += handled; trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + vcpu->mmio_index = 0; return X86EMUL_IO_NEEDED; } @@ -3807,6 +3945,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int handled; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3825,25 +3964,35 @@ mmio: /* * Is this MMIO handled locally? 
*/ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + if (handled == bytes) return X86EMUL_CONTINUE; + gpa += handled; + bytes -= handled; + val += handled; + vcpu->mmio_needed = 1; + memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->mmio_index = 0; return X86EMUL_CONTINUE; } -int emulator_write_emulated(unsigned long addr, +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; @@ -3871,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr, (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) #endif -static int emulator_cmpxchg_emulated(unsigned long addr, +static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; struct page *page; char *kaddr; @@ -3933,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, exception, vcpu); + return emulator_write_emulated(ctxt, addr, new, bytes, exception); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3952,9 +4102,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } -static int emulator_pio_in_emulated(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + if (vcpu->arch.pio.count) goto data_avail; @@ -3982,10 +4135,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, return 0; } -static int emulator_pio_out_emulated(int size, unsigned short port, - const void *val, unsigned int count, - struct kvm_vcpu *vcpu) +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + trace_kvm_pio(1, port, size, count); vcpu->arch.pio.port = port; @@ -4015,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) return kvm_x86_ops->get_segment_base(vcpu, seg); } -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { - kvm_mmu_invlpg(vcpu, address); - return X86EMUL_CONTINUE; + kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) @@ -4040,22 +4194,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulate_clts(struct kvm_vcpu *vcpu) +static void emulator_wbinvd(struct 
x86_emulate_ctxt *ctxt) { - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - kvm_x86_ops->fpu_activate(vcpu); - return X86EMUL_CONTINUE; + kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); } -int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return _kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } -int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(vcpu, dr, value); + return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -4063,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); unsigned long value; switch (cr) { @@ -4091,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int res = 0; switch (cr) { @@ -4119,33 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) return res; } -static int emulator_get_cpl(struct kvm_vcpu *vcpu) +static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(vcpu); + return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); } -static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(vcpu, dt); + kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); } -static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(vcpu, dt); + kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); } -static unsigned long emulator_get_cached_segment_base(int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - return get_segment_base(vcpu, seg); + kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) +{ + kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); +} + +static unsigned long emulator_get_cached_segment_base( + struct x86_emulate_ctxt *ctxt, int seg) +{ + return get_segment_base(emul_to_vcpu(ctxt), seg); +} + +static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, + int seg) { struct kvm_segment var; - kvm_get_segment(vcpu, &var, seg); + kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); + *selector = var.selector; if (var.unusable) return false; @@ -4154,6 +4320,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, var.limit >>= 12; set_desc_limit(desc, var.limit); set_desc_base(desc, (unsigned long)var.base); +#ifdef CONFIG_X86_64 + if (base3) + *base3 = var.base >> 32; +#endif desc->type = var.type; desc->s = var.s; desc->dpl = var.dpl; @@ -4166,15 +4336,18 @@ 
static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, + int seg) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_segment var; - /* needed to preserve selector */ - kvm_get_segment(vcpu, &var, seg); - + var.selector = selector; var.base = get_desc_base(desc); +#ifdef CONFIG_X86_64 + var.base |= ((u64)base3) << 32; +#endif var.limit = get_desc_limit(desc); if (desc->g) var.limit = (var.limit << 12) | 0xfff; @@ -4194,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, return; } -static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { - struct kvm_segment kvm_seg; + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); +} - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } -static void emulator_set_segment_selector(u16 sel, int seg, - struct kvm_vcpu *vcpu) +static void emulator_halt(struct x86_emulate_ctxt *ctxt) { - struct kvm_segment kvm_seg; + emul_to_vcpu(ctxt)->arch.halt_request = 1; +} - kvm_get_segment(vcpu, &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); +static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_disable(); + kvm_load_guest_fpu(emul_to_vcpu(ctxt)); + /* + * CR0.TS may reference the host fpu state, not the guest fpu state, + * so it may be clear at this point. 
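/*
 * [Editor's sketch, not part of the patch] How an emulator opcode
 * handler is expected to bracket guest FPU access with the
 * get_fpu/put_fpu callbacks wired up above. em_fpu_touch() is a
 * hypothetical handler name; only the callback pair is real.
 */
static int em_fpu_touch(struct x86_emulate_ctxt *ctxt,
			struct x86_emulate_ops *ops)
{
	ops->get_fpu(ctxt);	/* preemption off, guest FPU loaded, TS clear */
	/* ... emulate an FPU-touching instruction here ... */
	ops->put_fpu(ctxt);	/* re-enable preemption */
	return X86EMUL_CONTINUE;
}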
+ */ + clts(); +} + +static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_enable(); +} + +static int emulator_intercept(struct x86_emulate_ctxt *ctxt, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } static struct x86_emulate_ops emulate_ops = { @@ -4219,22 +4414,29 @@ static struct x86_emulate_ops emulate_ops = { .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, .pio_out_emulated = emulator_pio_out_emulated, - .get_cached_descriptor = emulator_get_cached_descriptor, - .set_cached_descriptor = emulator_set_cached_descriptor, - .get_segment_selector = emulator_get_segment_selector, - .set_segment_selector = emulator_set_segment_selector, + .get_segment = emulator_get_segment, + .set_segment = emulator_set_segment, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_idt = emulator_get_idt, + .set_gdt = emulator_set_gdt, + .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .set_msr = kvm_set_msr, - .get_msr = kvm_get_msr, + .set_msr = emulator_set_msr, + .get_msr = emulator_get_msr, + .halt = emulator_halt, + .wbinvd = emulator_wbinvd, + .fix_hypercall = emulator_fix_hypercall, + .get_fpu = emulator_get_fpu, + .put_fpu = emulator_put_fpu, + .intercept = emulator_intercept, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -4276,12 +4478,17 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ cache_all_regs(vcpu); kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : @@ -4289,11 +4496,13 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int ret; @@ -4302,7 +4511,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.decode.op_bytes = 2; vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + + inc_eip; ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); if (ret != X86EMUL_CONTINUE) @@ -4311,7 +4521,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4373,16 +4583,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + bool writeback = true; kvm_clear_exception_queue(vcpu); - vcpu->arch.mmio_fault_cr2 = cr2; - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. - */ - cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); @@ -4390,41 +4593,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.have_exception = false; vcpu->arch.emulate_ctxt.perm_ok = false; + vcpu->arch.emulate_ctxt.only_vendor_specific_insn + = emulation_type & EMULTYPE_TRAP_UD; + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; trace_kvm_emulate_insn_start(vcpu); - - /* Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall)*/ - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return EMULATE_FAIL; - switch (c->b) { - case 0x01: /* VMMCALL */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - return EMULATE_FAIL; - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - default: - return EMULATE_FAIL; - } - - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return EMULATE_FAIL; - } - ++vcpu->stat.insn_emulation; if (r) { + if (emulation_type & EMULTYPE_TRAP_UD) + return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) @@ -4438,13 +4616,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - /* this is needed for vmware backdor interface to work since it + /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; + 
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + } restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + if (r == EMULATION_INTERCEPTED) + return EMULATE_DONE; + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; @@ -4452,28 +4636,34 @@ restart: return handle_emulation_failure(vcpu); } -done: if (vcpu->arch.emulate_ctxt.have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; + else + writeback = false; r = EMULATE_DO_MMIO; } else if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + writeback = false; r = EMULATE_DO_MMIO; } else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE; - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (writeback) { + toggle_interruptibility(vcpu, + vcpu->arch.emulate_ctxt.interruptibility); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + } else + vcpu->arch.emulate_regs_need_sync_to_vcpu = true; return r; } @@ -4482,7 +4672,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, + size, port, &val, 1); /* do not return to emulator after return from userspace */ vcpu->arch.pio.count = 0; return ret; @@ -4562,7 +4753,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -4572,7 +4763,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -4876,8 +5067,9 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); @@ -4890,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); -} - -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); + return emulator_write_emulated(&vcpu->arch.emulate_ctxt, + rip, instruction, 3, NULL); } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -4955,12 +5134,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 
best = e; break; } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; } return best; } @@ -4980,6 +5153,27 @@ not_found: return 36; } +/* + * If no match is found, check whether we exceed the vCPU's limit + * and return the content of the highest valid _standard_ leaf instead. + * This is to satisfy the CPUID specification. + */ +static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *maxlevel; + + maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + if (!maxlevel || maxlevel->eax >= function) + return NULL; + if (function & 0x80000000) { + maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); + if (!maxlevel) + return NULL; + } + return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); +} + void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 function, index; @@ -4992,6 +5186,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RCX, 0); kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); + + if (!best) + best = check_cpuid_limit(vcpu, function, index); + if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); @@ -5148,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; + bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5191,11 +5390,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + /* + * An NMI can be injected between local nmi_pending read and + * vcpu->arch.nmi_pending read inside inject_pending_event(). + * But in that case, KVM_REQ_EVENT will be set, which makes + * the race described above benign. + */ + nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + if (nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -5213,14 +5420,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_fpu(vcpu); kvm_load_guest_xcr0(vcpu); - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); + vcpu->mode = IN_GUEST_MODE; + + /* We should set ->mode before check ->requests, + * see the comment in make_all_cpus_request. 
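/*
 * [Editor's sketch, not part of the patch] The ordering contract the
 * smp_mb() above enforces, condensed from kvm_vcpu_kick() and
 * make_all_cpus_request(): once both sides execute a full barrier,
 * either the vCPU sees the request before entering the guest, or the
 * requester sees IN_GUEST_MODE and sends the IPI -- missing both is
 * impossible.
 */
static void requester_side(struct kvm_vcpu *vcpu, int req)
{
	set_bit(req, &vcpu->requests);
	smp_mb();				/* pairs with the smp_mb() above */
	if (vcpu->mode == IN_GUEST_MODE)
		smp_send_reschedule(vcpu->cpu);	/* force a vmexit */
}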
+ */ + smp_mb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); @@ -5256,7 +5467,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -5371,6 +5582,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int complete_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int r; + + if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) + return 1; + + if (vcpu->mmio_needed) { + vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + memcpy(vcpu->mmio_data + vcpu->mmio_index, + run->mmio.data, 8); + vcpu->mmio_index += 8; + if (vcpu->mmio_index < vcpu->mmio_size) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; + memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); + run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->mmio_needed = 1; + return 0; + } + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; + } + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5397,20 +5643,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - if (vcpu->arch.pio.count || vcpu->mmio_needed) { - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) { - r = 0; - goto out; - } - } + r = complete_mmio(vcpu); + if (r <= 0) + goto out; + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); @@ -5427,6 +5663,18 @@ out: int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { + /* + * We are here if userspace calls get_regs() in the middle of + * instruction emulation. Registers state needs to be copied + * back from emulation context to vcpu. 
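/*
 * [Editor's note] With the chunking above, one guest access wider than
 * 8 bytes now reaches userspace as several KVM_EXIT_MMIO exits of at
 * most 8 bytes each. A hypothetical userspace run loop handles each
 * chunk independently; vcpu_fd, run (the mmap'ed kvm_run) and
 * handle_mmio() are placeholders, not part of this patch.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

extern void handle_mmio(__u64 addr, __u8 *data, __u32 len, __u8 is_write);

static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	while (ioctl(vcpu_fd, KVM_RUN, 0) == 0 &&
	       run->exit_reason == KVM_EXIT_MMIO) {
		/* each exit carries at most 8 bytes of the original access */
		handle_mmio(run->mmio.phys_addr, run->mmio.data,
			    run->mmio.len, run->mmio.is_write);
	}
}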
Userspace shouldn't do + * that usually, but some badly designed PV devices (vmware + * backdoor interface) need this to work + */ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -5454,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -5564,7 +5815,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } @@ -5574,7 +5825,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int pending_vec, max_bits; + int pending_vec, max_bits, idx; struct desc_ptr dt; dt.size = sregs->idt.limit; @@ -5603,10 +5854,13 @@ kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (sregs->cr4 & X86_CR4_OSXSAVE) update_cpuid(vcpu); + + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); @@ -5617,8 +5871,6 @@ if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -5814,10 +6066,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); @@ -5878,6 +6127,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + kvmclock_reset(vcpu); + kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; @@ -5946,8 +6197,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - if (!kvm->arch.virtual_tsc_khz) - kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + kvm_init_tsc_catchup(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) @@ -6005,7 +6255,7 @@ int kvm_arch_init_vm(struct kvm *kvm) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - spin_lock_init(&kvm->arch.tsc_write_lock); + raw_spin_lock_init(&kvm->arch.tsc_write_lock); return 0; } @@ -6103,7 +6353,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc) { - int npages = 
mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; if (!user_alloc && !old.user_alloc && old.rmap && !npages) { int ret; @@ -6118,12 +6368,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, "failed to munmap memory\n"); } + if (!kvm->arch.n_requested_mmu_pages) + nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + spin_lock(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); } @@ -6157,7 +6407,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (atomic_xchg(&vcpu->guest_mode, 0)) + if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) smp_send_reschedule(cpu); put_cpu(); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c600da830ce0..e407ed3df817 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -77,7 +77,7 @@ static inline u32 bit(int bitno) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0c..e191c096ab90 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -7,7 +7,7 @@ * kernel and insert a module (lg.ko) which allows us to run other Linux * kernels the same way we'd run processes. We call the first kernel the Host, * and the others the Guests. The program which sets up and configures Guests - * (such as the example in Documentation/lguest/lguest.c) is called the + * (such as the example in Documentation/virtual/lguest/lguest.c) is called the * Launcher. * * Secondly, we only run specially modified Guests, not normal kernels: setting @@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void) * instead we just use the real "cpuid" instruction. Then I pretty much turned * off feature bits until the Guest booted. (Don't say that: you'll damage * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) Noone's listening! They don't like you anyway, + * hardly future proof.) No one's listening! They don't like you anyway, * parenthetic weirdo! * * Replacing the cpuid so we can turn features off is great for the kernel, but @@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void) void lguest_setup_irq(unsigned int irq) { irq_alloc_desc_at(irq, 0); - set_irq_chip_and_handler_name(irq, &lguest_irq_controller, + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } @@ -913,8 +913,6 @@ static struct clocksource lguest_clock = { .rating = 200, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << 22, - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -995,9 +993,9 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ - set_irq_handler(0, lguest_time_irq); + irq_set_handler(0, lguest_time_irq); - clocksource_register(&lguest_clock); + clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); /* We can't set cpumask in the initializer: damn C limitations! 
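/*
 * [Editor's note] Why the hard-coded mult/shift pair could go away:
 * the clocksource core converts cycles to nanoseconds as
 * (cycles * mult) >> shift, so mult = 1 << 22 with shift = 22 was the
 * identity map, i.e. a 1 GHz clock -- exactly what
 * clocksource_register_hz(&lguest_clock, NSEC_PER_SEC) now derives for
 * itself at registration time. Illustrative helper, not from the patch:
 */
static inline u64 cyc2ns_demo(u64 cycles, u32 mult, u32 shift)
{
	return (cycles * mult) >> shift;	/* clocksource core formula */
}
/* cyc2ns_demo(c, 1 << 22, 22) == c for any c below 2^42. */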
Set it * here and register our timer device. */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index e10cf070ede0..f2479f19ddde 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -42,4 +42,5 @@ else lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o + lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 2cda60a06e65..e8e7e0d06f42 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -15,14 +15,12 @@ /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi cli .endm .macro UNLOCK reg - popfl - CFI_ADJUST_CFA_OFFSET -4 + popfl_cfi .endm #define BEGIN(op) \ diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 71e080de3352..391a083674b4 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -14,14 +14,12 @@ #include <asm/dwarf2.h> .macro SAVE reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %\reg CFI_REL_OFFSET \reg, 0 .endm .macro RESTORE reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %\reg CFI_RESTORE \reg .endm diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index adbccd0bbb78..78d16a554db0 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -132,11 +130,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -148,11 +144,9 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -260,11 +254,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len @@ -426,17 +415,13 @@ DST( movb %cl, (%edi) ) .previous - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ecx # equivalent to addl $4,%esp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) @@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) 
CFI_STARTPROC - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst @@ -527,14 +509,11 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx ret CFI_ENDPROC diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index aa4326bfb24a..f2145cfa12a6 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,6 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * Zero a page. @@ -14,6 +15,15 @@ ENTRY(clear_page_c) CFI_ENDPROC ENDPROC(clear_page_c) +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) + ENTRY(clear_page) CFI_STARTPROC xorl %eax,%eax @@ -38,21 +48,26 @@ ENTRY(clear_page) .Lclear_page_end: ENDPROC(clear_page) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs support enhanced REP MOVSB/STOSB instructions. + * It is recommended to use this when possible. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original function. + * + */ #include <asm/cpufeature.h> .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: +2: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ +3: .previous .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .word X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b + altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ + .Lclear_page_end-clear_page, 2b-1b + altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ + .Lclear_page_end-clear_page,3b-2b .previous diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S new file mode 100644 index 000000000000..1e572c507d06 --- /dev/null +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -0,0 +1,65 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +#ifdef CONFIG_SMP +#define SEG_PREFIX %gs: +#else +#define SEG_PREFIX +#endif + +.text + +/* + * Inputs: + * %rsi : memory location to compare + * %rax : low 64 bits of old value + * %rdx : high 64 bits of old value + * %rbx : low 64 bits of new value + * %rcx : high 64 bits of new value + * %al : Operation successful + */ +ENTRY(this_cpu_cmpxchg16b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not +# via the ZF. Caller will access %al to get result. +# +# Note that this is only useful for a cpuops operation. Meaning that we +# do *not* have a fully atomic operation but just an operation that is +# *atomic* on a single cpu (as provided by the this_cpu_xx class of +# macros). 
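/*
 * [Editor's sketch, not part of the patch] The C-side consumer this
 * emulation backs is the this_cpu_cmpxchg_double() percpu primitive,
 * which gets patched to a real cmpxchg16b on capable CPUs. The
 * demo_pair/demo_swap() names are illustrative only.
 */
struct demo_pair {
	void *head;
	unsigned long tid;
} __aligned(16);			/* cmpxchg16b needs 16-byte alignment */
static DEFINE_PER_CPU(struct demo_pair, demo);

static int demo_swap(void *old_head, unsigned long old_tid,
		     void *new_head, unsigned long new_tid)
{
	/* atomic only w.r.t. this CPU; IRQ-safe via the pushf/cli above */
	return this_cpu_cmpxchg_double(demo.head, demo.tid,
				       old_head, old_tid,
				       new_head, new_tid);
}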
+# +this_cpu_cmpxchg16b_emu: + pushf + cli + + cmpq SEG_PREFIX(%rsi), %rax + jne not_same + cmpq SEG_PREFIX 8(%rsi), %rdx + jne not_same + + movq %rbx, SEG_PREFIX(%rsi) + movq %rcx, SEG_PREFIX 8(%rsi) + + popf + mov $1, %al + ret + + not_same: + popf + xor %al,%al + ret + +CFI_ENDPROC + +ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index a460158b5ac5..024840266ba0 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -15,23 +15,30 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeature.h> +#include <asm/alternative-asm.h> - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. + */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .word \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm @@ -72,8 +79,10 @@ ENTRY(_copy_to_user) addq %rdx,%rcx jc bad_to_user cmpq TI_addr_limit(%rax),%rcx - jae bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ja bad_to_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_to_user) @@ -85,8 +94,10 @@ ENTRY(_copy_from_user) addq %rdx,%rcx jc bad_from_user cmpq TI_addr_limit(%rax),%rcx - jae bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ja bad_from_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_from_user) @@ -117,7 +128,7 @@ ENDPROC(bad_from_user) * rdx count * * Output: - * eax uncopied bytes or 0 if successfull. + * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC @@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string) .previous CFI_ENDPROC ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. 
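/*
 * [Editor's note] The feature1/feature2 patch ordering above, restated
 * in C: alternative entries are applied in section order, so a CPU with
 * both features ends up with the later (ERMS) replacement.
 * pick_copy_user() is illustrative; the extern declarations of the asm
 * entry points are simplified for the sketch.
 */
typedef void (*copy_fn_t)(void);
extern void copy_user_generic_unrolled(void);
extern void copy_user_generic_string(void);
extern void copy_user_enhanced_fast_string(void);

static copy_fn_t pick_copy_user(void)
{
	copy_fn_t target = copy_user_generic_unrolled;	  /* default        */

	if (boot_cpu_has(X86_FEATURE_REP_GOOD))
		target = copy_user_generic_string;	  /* patched first  */
	if (boot_cpu_has(X86_FEATURE_ERMS))
		target = copy_user_enhanced_fast_string; /* patched last, wins */
	return target;
}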
+ */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + andl %edx,%edx + jz 2f + movl %edx,%ecx +1: rep + movsb +2: xorl %eax,%eax + ret + + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + .section __ex_table,"a" + .align 8 + .quad 1b,12b + .previous + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index f0dba36578ea..fb903b758da8 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -1,6 +1,6 @@ /* - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - * + * Copyright 2002, 2003 Andi Kleen, SuSE Labs. + * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. @@ -11,82 +11,82 @@ /* * Checksum copy with exception handling. - * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the * destination is zeroed. - * + * * Input * rdi source * rsi destination * edx len (32bit) - * ecx sum (32bit) + * ecx sum (32bit) * r8 src_err_ptr (int) * r9 dst_err_ptr (int) * * Output * eax 64bit sum. undefined in case of exception. - * - * Wrappers need to take care of valid exception sum and zeroing. + * + * Wrappers need to take care of valid exception sum and zeroing. * They also should align source or destination to 8 bytes. */ .macro source 10: - .section __ex_table,"a" + .section __ex_table, "a" .align 8 - .quad 10b,.Lbad_source + .quad 10b, .Lbad_source .previous .endm - + .macro dest 20: - .section __ex_table,"a" + .section __ex_table, "a" .align 8 - .quad 20b,.Lbad_dest + .quad 20b, .Lbad_dest .previous .endm - + .macro ignore L=.Lignore 30: - .section __ex_table,"a" + .section __ex_table, "a" .align 8 - .quad 30b,\L + .quad 30b, \L .previous .endm - - + + ENTRY(csum_partial_copy_generic) CFI_STARTPROC - cmpl $3*64,%edx - jle .Lignore + cmpl $3*64, %edx + jle .Lignore -.Lignore: - subq $7*8,%rsp +.Lignore: + subq $7*8, %rsp CFI_ADJUST_CFA_OFFSET 7*8 - movq %rbx,2*8(%rsp) + movq %rbx, 2*8(%rsp) CFI_REL_OFFSET rbx, 2*8 - movq %r12,3*8(%rsp) + movq %r12, 3*8(%rsp) CFI_REL_OFFSET r12, 3*8 - movq %r14,4*8(%rsp) + movq %r14, 4*8(%rsp) CFI_REL_OFFSET r14, 4*8 - movq %r13,5*8(%rsp) + movq %r13, 5*8(%rsp) CFI_REL_OFFSET r13, 5*8 - movq %rbp,6*8(%rsp) + movq %rbp, 6*8(%rsp) CFI_REL_OFFSET rbp, 6*8 - movq %r8,(%rsp) - movq %r9,1*8(%rsp) - - movl %ecx,%eax - movl %edx,%ecx + movq %r8, (%rsp) + movq %r9, 1*8(%rsp) - xorl %r9d,%r9d - movq %rcx,%r12 + movl %ecx, %eax + movl %edx, %ecx - shrq $6,%r12 - jz .Lhandle_tail /* < 64 */ + xorl %r9d, %r9d + movq %rcx, %r12 + + shrq $6, %r12 + jz .Lhandle_tail /* < 64 */ clc - + /* main loop. 
clear in 64 byte blocks */ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ /* r11: temp3, rdx: temp4, r12 loopcnt */ @@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic) .p2align 4 .Lloop: source - movq (%rdi),%rbx + movq (%rdi), %rbx source - movq 8(%rdi),%r8 + movq 8(%rdi), %r8 source - movq 16(%rdi),%r11 + movq 16(%rdi), %r11 source - movq 24(%rdi),%rdx + movq 24(%rdi), %rdx source - movq 32(%rdi),%r10 + movq 32(%rdi), %r10 source - movq 40(%rdi),%rbp + movq 40(%rdi), %rbp source - movq 48(%rdi),%r14 + movq 48(%rdi), %r14 source - movq 56(%rdi),%r13 - + movq 56(%rdi), %r13 + ignore 2f prefetcht0 5*64(%rdi) -2: - adcq %rbx,%rax - adcq %r8,%rax - adcq %r11,%rax - adcq %rdx,%rax - adcq %r10,%rax - adcq %rbp,%rax - adcq %r14,%rax - adcq %r13,%rax +2: + adcq %rbx, %rax + adcq %r8, %rax + adcq %r11, %rax + adcq %rdx, %rax + adcq %r10, %rax + adcq %rbp, %rax + adcq %r14, %rax + adcq %r13, %rax decl %r12d - + dest - movq %rbx,(%rsi) + movq %rbx, (%rsi) dest - movq %r8,8(%rsi) + movq %r8, 8(%rsi) dest - movq %r11,16(%rsi) + movq %r11, 16(%rsi) dest - movq %rdx,24(%rsi) + movq %rdx, 24(%rsi) dest - movq %r10,32(%rsi) + movq %r10, 32(%rsi) dest - movq %rbp,40(%rsi) + movq %rbp, 40(%rsi) dest - movq %r14,48(%rsi) + movq %r14, 48(%rsi) dest - movq %r13,56(%rsi) - + movq %r13, 56(%rsi) + 3: - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - jnz .Lloop + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi - adcq %r9,%rax + jnz .Lloop - /* do last upto 56 bytes */ + adcq %r9, %rax + + /* do last up to 56 bytes */ .Lhandle_tail: /* ecx: count */ - movl %ecx,%r10d - andl $63,%ecx - shrl $3,%ecx - jz .Lfold + movl %ecx, %r10d + andl $63, %ecx + shrl $3, %ecx + jz .Lfold clc .p2align 4 -.Lloop_8: +.Lloop_8: source - movq (%rdi),%rbx - adcq %rbx,%rax + movq (%rdi), %rbx + adcq %rbx, %rax decl %ecx dest - movq %rbx,(%rsi) - leaq 8(%rsi),%rsi /* preserve carry */ - leaq 8(%rdi),%rdi + movq %rbx, (%rsi) + leaq 8(%rsi), %rsi /* preserve carry */ + leaq 8(%rdi), %rdi jnz .Lloop_8 - adcq %r9,%rax /* add in carry */ + adcq %r9, %rax /* add in carry */ .Lfold: /* reduce checksum to 32bits */ - movl %eax,%ebx - shrq $32,%rax - addl %ebx,%eax - adcl %r9d,%eax + movl %eax, %ebx + shrq $32, %rax + addl %ebx, %eax + adcl %r9d, %eax - /* do last upto 6 bytes */ + /* do last up to 6 bytes */ .Lhandle_7: - movl %r10d,%ecx - andl $7,%ecx - shrl $1,%ecx + movl %r10d, %ecx + andl $7, %ecx + shrl $1, %ecx jz .Lhandle_1 - movl $2,%edx - xorl %ebx,%ebx - clc + movl $2, %edx + xorl %ebx, %ebx + clc .p2align 4 -.Lloop_1: +.Lloop_1: source - movw (%rdi),%bx - adcl %ebx,%eax + movw (%rdi), %bx + adcl %ebx, %eax decl %ecx dest - movw %bx,(%rsi) - leaq 2(%rdi),%rdi - leaq 2(%rsi),%rsi + movw %bx, (%rsi) + leaq 2(%rdi), %rdi + leaq 2(%rsi), %rsi jnz .Lloop_1 - adcl %r9d,%eax /* add in carry */ - + adcl %r9d, %eax /* add in carry */ + /* handle last odd byte */ .Lhandle_1: - testl $1,%r10d + testl $1, %r10d jz .Lende - xorl %ebx,%ebx + xorl %ebx, %ebx source - movb (%rdi),%bl + movb (%rdi), %bl dest - movb %bl,(%rsi) - addl %ebx,%eax - adcl %r9d,%eax /* carry */ - + movb %bl, (%rsi) + addl %ebx, %eax + adcl %r9d, %eax /* carry */ + CFI_REMEMBER_STATE .Lende: - movq 2*8(%rsp),%rbx + movq 2*8(%rsp), %rbx CFI_RESTORE rbx - movq 3*8(%rsp),%r12 + movq 3*8(%rsp), %r12 CFI_RESTORE r12 - movq 4*8(%rsp),%r14 + movq 4*8(%rsp), %r14 CFI_RESTORE r14 - movq 5*8(%rsp),%r13 + movq 5*8(%rsp), %r13 CFI_RESTORE r13 - movq 6*8(%rsp),%rbp + movq 6*8(%rsp), %rbp CFI_RESTORE rbp - addq $7*8,%rsp + addq $7*8, %rsp CFI_ADJUST_CFA_OFFSET -7*8 ret 
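/*
 * [Editor's note] A C restatement of the .Lfold sequence above: reduce
 * the 64-bit ones'-complement accumulator to 32 bits, re-adding the
 * end-around carry exactly as the adcl does. Illustrative only.
 */
static inline u32 csum_fold_64(u64 sum)
{
	u32 lo = (u32)sum;
	u32 hi = (u32)(sum >> 32);
	u32 res = lo + hi;

	if (res < lo)		/* carry out of the 32-bit add */
		res++;
	return res;
}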
CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: - movq (%rsp),%rax - testq %rax,%rax + movq (%rsp), %rax + testq %rax, %rax jz .Lende - movl $-EFAULT,(%rax) + movl $-EFAULT, (%rax) jmp .Lende - + .Lbad_dest: - movq 8(%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) + movq 8(%rsp), %rax + testq %rax, %rax + jz .Lende + movl $-EFAULT, (%rax) jmp .Lende CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index bf51144d97e1..9845371c5c36 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len) count64--; } - /* last upto 7 8byte blocks */ + /* last up to 7 8byte blocks */ count %= 8; while (count) { asm("addq %1,%0\n\t" diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e35e38..efbf2a0ecdea 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,6 +4,7 @@ #include <asm/cpufeature.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * memcpy - Copy a memory block. @@ -37,6 +38,23 @@ .Lmemcpy_e: .previous +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + + movl %edx, %ecx + rep movsb + ret +.Lmemcpy_e_e: + .previous + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -49,7 +67,7 @@ ENTRY(memcpy) jb .Lhandle_tail /* - * We check whether memory false dependece could occur, + * We check whether memory false dependence could occur, * then jump to corresponding copy mode. */ cmp %dil, %sil @@ -171,21 +189,22 @@ ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad .Lmemcpy_c - .word X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature. + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REP_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte .Lmemcpy_e - .Lmemcpy_c - .byte .Lmemcpy_e - .Lmemcpy_c + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S new file mode 100644 index 000000000000..d0ec9c2936d7 --- /dev/null +++ b/arch/x86/lib/memmove_64.S @@ -0,0 +1,224 @@ +/* + * Normally compiler builtins are used, but sometimes the compiler calls out + * of line code. Based on asm-i386/string.h. + * + * This assembly file is re-written from memmove_64.c file. 
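/*
 * [Editor's note] What .Lmemcpy_c_e above boils down to, written as C
 * with inline asm; a sketch assuming ERMS hardware, not the kernel's
 * actual implementation (which is patched in via alternatives):
 */
static void *memcpy_erms_demo(void *dst, const void *src, size_t len)
{
	void *ret = dst;

	asm volatile("rep movsb"		/* byte copy, fast under ERMS */
		     : "+D" (dst), "+S" (src), "+c" (len)
		     : : "memory");
	return ret;
}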
+ * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> + */ +#define _STRING_C +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> + +#undef memmove + +/* + * Implement memmove(). This can handle overlap between src and dst. + * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + */ +ENTRY(memmove) + CFI_STARTPROC + + /* Handle more 32bytes in loop */ + mov %rdi, %rax + cmp $0x20, %rdx + jb 1f + + /* Decide forward/backward copy mode */ + cmp %rdi, %rsi + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f + +.Lmemmove_begin_forward: + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + cmp $680, %rdx + jb 3f + /* + * movsq instruction is only good for aligned case. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32byts forward in each loop. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. + */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f +.Lmemmove_end_forward: + + /* + * Handle data backward by movsq. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Start to prepare for backward copy. + */ + .p2align 4 +2: + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32byts backward in each loop. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move data from 2 bytes to 3 bytes. + */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move data for 1 byte. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + retq + CFI_ENDPROC + + .section .altinstr_replacement,"ax" +.Lmemmove_begin_forward_efs: + /* Forward moving data. 
*/ + movq %rdx, %rcx + rep movsb + retq +.Lmemmove_end_forward_efs: + .previous + + .section .altinstructions,"a" + .align 8 + .quad .Lmemmove_begin_forward + .quad .Lmemmove_begin_forward_efs + .word X86_FEATURE_ERMS + .byte .Lmemmove_end_forward-.Lmemmove_begin_forward + .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .previous +ENDPROC(memmove) diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c deleted file mode 100644 index 6d0f0ec41b34..000000000000 --- a/arch/x86/lib/memmove_64.c +++ /dev/null @@ -1,192 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. - */ -#define _STRING_C -#include <linux/string.h> -#include <linux/module.h> - -#undef memmove -void *memmove(void *dest, const void *src, size_t count) -{ - unsigned long d0,d1,d2,d3,d4,d5,d6,d7; - char *ret; - - __asm__ __volatile__( - /* Handle more 32bytes in loop */ - "mov %2, %3\n\t" - "cmp $0x20, %0\n\t" - "jb 1f\n\t" - - /* Decide forward/backward copy mode */ - "cmp %2, %1\n\t" - "jb 2f\n\t" - - /* - * movsq instruction have many startup latency - * so we handle small size by general register. - */ - "cmp $680, %0\n\t" - "jb 3f\n\t" - /* - * movsq instruction is only good for aligned case. - */ - "cmpb %%dil, %%sil\n\t" - "je 4f\n\t" - "3:\n\t" - "sub $0x20, %0\n\t" - /* - * We gobble 32byts forward in each loop. - */ - "5:\n\t" - "sub $0x20, %0\n\t" - "movq 0*8(%1), %4\n\t" - "movq 1*8(%1), %5\n\t" - "movq 2*8(%1), %6\n\t" - "movq 3*8(%1), %7\n\t" - "leaq 4*8(%1), %1\n\t" - - "movq %4, 0*8(%2)\n\t" - "movq %5, 1*8(%2)\n\t" - "movq %6, 2*8(%2)\n\t" - "movq %7, 3*8(%2)\n\t" - "leaq 4*8(%2), %2\n\t" - "jae 5b\n\t" - "addq $0x20, %0\n\t" - "jmp 1f\n\t" - /* - * Handle data forward by movsq. - */ - ".p2align 4\n\t" - "4:\n\t" - "movq %0, %8\n\t" - "movq -8(%1, %0), %4\n\t" - "lea -8(%2, %0), %5\n\t" - "shrq $3, %8\n\t" - "rep movsq\n\t" - "movq %4, (%5)\n\t" - "jmp 13f\n\t" - /* - * Handle data backward by movsq. - */ - ".p2align 4\n\t" - "7:\n\t" - "movq %0, %8\n\t" - "movq (%1), %4\n\t" - "movq %2, %5\n\t" - "leaq -8(%1, %0), %1\n\t" - "leaq -8(%2, %0), %2\n\t" - "shrq $3, %8\n\t" - "std\n\t" - "rep movsq\n\t" - "cld\n\t" - "movq %4, (%5)\n\t" - "jmp 13f\n\t" - - /* - * Start to prepare for backward copy. - */ - ".p2align 4\n\t" - "2:\n\t" - "cmp $680, %0\n\t" - "jb 6f \n\t" - "cmp %%dil, %%sil\n\t" - "je 7b \n\t" - "6:\n\t" - /* - * Calculate copy position to tail. - */ - "addq %0, %1\n\t" - "addq %0, %2\n\t" - "subq $0x20, %0\n\t" - /* - * We gobble 32byts backward in each loop. - */ - "8:\n\t" - "subq $0x20, %0\n\t" - "movq -1*8(%1), %4\n\t" - "movq -2*8(%1), %5\n\t" - "movq -3*8(%1), %6\n\t" - "movq -4*8(%1), %7\n\t" - "leaq -4*8(%1), %1\n\t" - - "movq %4, -1*8(%2)\n\t" - "movq %5, -2*8(%2)\n\t" - "movq %6, -3*8(%2)\n\t" - "movq %7, -4*8(%2)\n\t" - "leaq -4*8(%2), %2\n\t" - "jae 8b\n\t" - /* - * Calculate copy position to head. - */ - "addq $0x20, %0\n\t" - "subq %0, %1\n\t" - "subq %0, %2\n\t" - "1:\n\t" - "cmpq $16, %0\n\t" - "jb 9f\n\t" - /* - * Move data from 16 bytes to 31 bytes. - */ - "movq 0*8(%1), %4\n\t" - "movq 1*8(%1), %5\n\t" - "movq -2*8(%1, %0), %6\n\t" - "movq -1*8(%1, %0), %7\n\t" - "movq %4, 0*8(%2)\n\t" - "movq %5, 1*8(%2)\n\t" - "movq %6, -2*8(%2, %0)\n\t" - "movq %7, -1*8(%2, %0)\n\t" - "jmp 13f\n\t" - ".p2align 4\n\t" - "9:\n\t" - "cmpq $8, %0\n\t" - "jb 10f\n\t" - /* - * Move data from 8 bytes to 15 bytes. 
- */
- "movq 0*8(%1), %4\n\t"
- "movq -1*8(%1, %0), %5\n\t"
- "movq %4, 0*8(%2)\n\t"
- "movq %5, -1*8(%2, %0)\n\t"
- "jmp 13f\n\t"
- "10:\n\t"
- "cmpq $4, %0\n\t"
- "jb 11f\n\t"
- /*
- * Move data from 4 bytes to 7 bytes.
- */
- "movl (%1), %4d\n\t"
- "movl -4(%1, %0), %5d\n\t"
- "movl %4d, (%2)\n\t"
- "movl %5d, -4(%2, %0)\n\t"
- "jmp 13f\n\t"
- "11:\n\t"
- "cmp $2, %0\n\t"
- "jb 12f\n\t"
- /*
- * Move data from 2 bytes to 3 bytes.
- */
- "movw (%1), %4w\n\t"
- "movw -2(%1, %0), %5w\n\t"
- "movw %4w, (%2)\n\t"
- "movw %5w, -2(%2, %0)\n\t"
- "jmp 13f\n\t"
- "12:\n\t"
- "cmp $1, %0\n\t"
- "jb 13f\n\t"
- /*
- * Move data for 1 byte.
- */
- "movb (%1), %4b\n\t"
- "movb %4b, (%2)\n\t"
- "13:\n\t"
- : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
- "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
- :"0" (count),
- "1" (src),
- "2" (dest)
- :"memory");
-
- return ret;
-
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 09d344269652..79bd454b78a3 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string instructions to get better performance than the original function.
+ * The code is simpler and shorter than the original function as well.
  *
  * rdi destination
  * rsi value (char)
@@ -31,6 +35,28 @@
.Lmemset_e:
	.previous
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movl %edx,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
+
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset) ENDPROC(memset) ENDPROC(__memset)
-	/* Some CPUs run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+	/* Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
+	 * It is recommended to use it when possible.
+	 *
+	 * If the enhanced REP MOVSB/STOSB feature is not available, use the
+	 * fast string instructions.
+	 *
+	 * Otherwise, use the original memset function.
+	 *
+	 * In the .altinstructions section, the ERMS feature is placed after
+	 * the REP_GOOD feature to implement the right patch order.
+ */ .section .altinstructions,"a" - .align 8 - .quad memset - .quad .Lmemset_c - .word X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte .Lmemset_e - .Lmemset_c + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e .previous diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S index 41fcf00e49df..67743977398b 100644 --- a/arch/x86/lib/rwsem_64.S +++ b/arch/x86/lib/rwsem_64.S @@ -23,43 +23,50 @@ #include <asm/dwarf2.h> #define save_common_regs \ - pushq %rdi; \ - pushq %rsi; \ - pushq %rcx; \ - pushq %r8; \ - pushq %r9; \ - pushq %r10; \ - pushq %r11 + pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ + pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ + pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ + pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ + pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ + pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ + pushq_cfi %r11; CFI_REL_OFFSET r11, 0 #define restore_common_regs \ - popq %r11; \ - popq %r10; \ - popq %r9; \ - popq %r8; \ - popq %rcx; \ - popq %rsi; \ - popq %rdi + popq_cfi %r11; CFI_RESTORE r11; \ + popq_cfi %r10; CFI_RESTORE r10; \ + popq_cfi %r9; CFI_RESTORE r9; \ + popq_cfi %r8; CFI_RESTORE r8; \ + popq_cfi %rcx; CFI_RESTORE rcx; \ + popq_cfi %rsi; CFI_RESTORE rsi; \ + popq_cfi %rdi; CFI_RESTORE rdi /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC save_common_regs - pushq %rdx + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 movq %rax,%rdi call rwsem_down_read_failed - popq %rdx + popq_cfi %rdx + CFI_RESTORE rdx restore_common_regs ret - ENDPROC(call_rwsem_down_read_failed) + CFI_ENDPROC +ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - ENDPROC(call_rwsem_down_write_failed) + CFI_ENDPROC +ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) + CFI_STARTPROC decl %edx /* do nothing if still outstanding active readers */ jnz 1f save_common_regs @@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - ENDPROC(call_rwsem_wake) + CFI_ENDPROC +ENDPROC(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC save_common_regs - pushq %rdx + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 movq %rax,%rdi call rwsem_downgrade_wake - popq %rdx + popq_cfi %rdx + CFI_RESTORE rdx restore_common_regs ret - ENDPROC(call_rwsem_downgrade_wake) + CFI_ENDPROC +ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 648fe4741782..06691daa4108 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -36,7 +36,7 @@ */ #ifdef CONFIG_SMP ENTRY(__write_lock_failed) - CFI_STARTPROC simple + CFI_STARTPROC FRAME 2: LOCK_PREFIX addl $ RW_LOCK_BIAS,(%eax) @@ -74,29 +74,23 @@ ENTRY(__read_lock_failed) /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx,0 call rwsem_down_read_failed - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx 
CFI_REL_OFFSET ecx,0 calll rwsem_down_write_failed - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) @@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake) CFI_STARTPROC decw %dx /* do nothing if still outstanding active readers */ jnz 1f - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 call rwsem_wake - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx 1: ret CFI_ENDPROC ENDPROC(call_rwsem_wake) @@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx,0 call rwsem_downgrade_wake - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 650b11e00ecc..2930ae05d773 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -7,24 +7,6 @@ #include <linux/linkage.h> -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in eax (arg1) */ .macro thunk_ra name,func diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index bf9a7d5a5428..782b082c9ff7 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -22,26 +22,6 @@ CFI_ENDPROC .endm - /* rdi: arg1 ... normal C conventions. rax is passed from C. */ - .macro thunk_retrax name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore_norax - CFI_ENDPROC - .endm - - - .section .sched.text, "ax" -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed - thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed - thunk rwsem_wake_thunk,rwsem_wake - thunk rwsem_downgrade_thunk,rwsem_downgrade_wake -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in rdi (arg1) */ .macro thunk_ra name,func @@ -72,10 +52,3 @@ restore: RESTORE_ARGS ret CFI_ENDPROC - - CFI_STARTPROC - SAVE_ARGS -restore_norax: - RESTORE_ARGS 1 - ret - CFI_ENDPROC diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 09df2f9a3d69..3d11327c9ab4 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,8 +23,9 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c new file mode 100644 index 000000000000..5247d01329ca --- /dev/null +++ b/arch/x86/mm/amdtopology.c @@ -0,0 +1,197 @@ +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
+ */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <linux/pci_ids.h> +#include <linux/acpi.h> +#include <asm/types.h> +#include <asm/mmzone.h> +#include <asm/proto.h> +#include <asm/e820.h> +#include <asm/pci-direct.h> +#include <asm/numa.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/amd_nb.h> + +static unsigned char __initdata nodeids[8]; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -ENOENT; +} + +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in amd_scan_nodes() + */ +#ifdef CONFIG_X86_MPPARSE + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +#endif +} + +int __init amd_numa_init(void) +{ + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); + unsigned numnodes; + u64 prevbase; + int i, j, nb; + u32 nodeid, reg; + unsigned int bits, cores, apicid_base; + + if (!early_pci_allowed()) + return -EINVAL; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -ENOENT; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + u64 base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeids[i] = nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %Lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %Lx/%Lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); + return -EINVAL; + } + if (node_isset(nodeid, numa_nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %Lx-%Lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. 
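The base/limit arithmetic in amd_numa_init() above decodes the northbridge DRAM map: bits 31:16 of each register carry address bits 39:24, the low bits of a limit register hold the node ID (and of a base register the read/write enable bits), and a limit is inclusive up to the next 16MB boundary. A self-contained sketch of that decoding, with hypothetical register values:

#include <stdio.h>

static unsigned long long decode_base(unsigned int reg)
{
	return ((unsigned long long)reg >> 16) << 24;
}

static unsigned long long decode_limit(unsigned int reg)
{
	unsigned long long limit = ((unsigned long long)reg >> 16) << 24;

	limit |= (1ULL << 24) - 1;	/* fill in address bits 23:0 */
	return limit + 1;		/* exclusive end of the range */
}

int main(void)
{
	/* hypothetical registers for node 7 covering 0 - 4GB;
	 * base bits 1:0 = 3 means read and write enabled */
	printf("base  %#llx\n", decode_base(0x00000003));
	printf("limit %#llx\n", decode_limit(0x00ff0007));
	return 0;
}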
*/ + if (prevbase > base) { + pr_err("Node map not sorted %Lx,%Lx\n", + prevbase, base); + return -EINVAL; + } + + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", + nodeid, base, limit); + + prevbase = base; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); + } + + if (!nodes_weight(numa_nodes_parsed)) + return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* get the APIC ID of the BSP early for systems with apicid lifting */ + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, numa_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + + return 0; +} diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c deleted file mode 100644 index f21962c435ed..000000000000 --- a/arch/x86/mm/amdtopology_64.c +++ /dev/null @@ -1,294 +0,0 @@ -/* - * AMD NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the AMD northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/module.h> -#include <linux/nodemask.h> -#include <linux/memblock.h> - -#include <asm/io.h> -#include <linux/pci_ids.h> -#include <linux/acpi.h> -#include <asm/types.h> -#include <asm/mmzone.h> -#include <asm/proto.h> -#include <asm/e820.h> -#include <asm/pci-direct.h> -#include <asm/numa.h> -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <asm/amd_nb.h> - -static struct bootnode __initdata nodes[8]; -static unsigned char __initdata nodeids[8]; -static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) - continue; - return num; - } - - return -1; -} - -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - -int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long start = PFN_PHYS(start_pfn); - unsigned long end = PFN_PHYS(end_pfn); - unsigned numnodes; - unsigned long prevbase; - int i, nb, found = 0; - u32 nodeid, reg; - - if (!early_pci_allowed()) - return -1; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - pr_info("Scanning NUMA topology in Northbridge %d\n", nb); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -1; - - pr_info("Number of physical nodes %d\n", numnodes); - - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base, limit; - - base = read_pci_config(0, nb, 1, 0x40 + 
i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeids[i] = nodeid = limit & 7; - if ((base & 3) == 0) { - if (i < numnodes) - pr_info("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", - i, base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", - nodeid, (base >> 8) & 3, (limit >> 8) & 3); - return -1; - } - if (node_isset(nodeid, nodes_parsed)) { - pr_info("Node %d already present, skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > end) - limit = end; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - pr_err("Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", - prevbase, base); - return -1; - } - - pr_info("Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - found++; - - nodes[nodeid].start = base; - nodes[nodeid].end = limit; - - prevbase = base; - - node_set(nodeid, nodes_parsed); - } - - if (!found) - return -1; - return 0; -} - -#ifdef CONFIG_NUMA_EMU -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - -void __init amd_get_nodes(struct bootnode *physnodes) -{ - int i; - - for_each_node_mask(i, nodes_parsed) { - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; - } -} - -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for (i = 0; i < 8; i++) - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - return ret; -} - -/* - * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be - * setup to represent the physical topology but reflect the emulated - * environment. For each emulated node, the real node which it appears on is - * found and a fake pxm to nid mapping is created which mirrors the actual - * locality. node_distance() then represents the correct distances between - * emulated nodes by using the fake acpi mappings to pxms. 
- */ -void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base = 0; - int i; - - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) - apicid_base = boot_cpu_physical_apicid; - - for (i = 0; i < nr_nodes; i++) { - int index; - int nid; - int j; - - nid = find_node_by_addr(nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - - index = nodeids[nid] << bits; - if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) - for (j = apicid_base; j < cores + apicid_base; j++) - fake_apicid_to_node[index + j] = i; -#ifdef CONFIG_ACPI_NUMA - __acpi_map_pxm_to_node(nid, i); -#endif - } - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); -} -#endif /* CONFIG_NUMA_EMU */ - -int __init amd_scan_nodes(void) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base; - int i; - - BUG_ON(nodes_empty(nodes_parsed)); - node_possible_map = nodes_parsed; - memnode_shift = compute_hash_shift(nodes, 8, NULL); - if (memnode_shift < 0) { - pr_err("No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - pr_info("Using node hash shift of %d\n", memnode_shift); - - /* use the coreid bits from early_identify_cpu */ - bits = boot_cpu_data.x86_coreid_bits; - cores = (1<<bits); - apicid_base = 0; - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; - } - - for_each_node_mask(i, node_possible_map) { - int j; - - memblock_x86_register_active_regions(i, - nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - for (j = apicid_base; j < cores + apicid_base; j++) - apicid_to_node[(i << bits) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - - numa_init_array(); - return 0; -} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 20e3f8702d1e..2dbf6bf4c7e5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -12,6 +12,7 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <linux/prefetch.h> /* prefetchw */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -822,16 +823,30 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_info_fault(SIGBUS, code, address, tsk, fault); } -static noinline void +static noinline int mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { + /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue pagefault. + */ + if (fatal_signal_pending(current)) { + if (!(fault & VM_FAULT_RETRY)) + up_read(¤t->mm->mmap_sem); + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); + return 1; + } + if (!(fault & VM_FAULT_ERROR)) + return 0; + if (fault & VM_FAULT_OOM) { /* Kernel mode? 
Handle exceptions or die: */ if (!(error_code & PF_USER)) { up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address); - return; + return 1; } out_of_memory(regs, error_code, address); @@ -842,6 +857,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, else BUG(); } + return 1; } static int spurious_fault_check(unsigned long error_code, pte_t *pte) @@ -964,7 +980,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct mm_struct *mm; int fault; int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | (write ? FAULT_FLAG_WRITE : 0); tsk = current; @@ -1132,9 +1148,9 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, fault); - return; + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + if (mm_fault_error(regs, error_code, address, fault)) + return; } /* diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 069ce7c37c01..f581a18c0d4d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* @@ -326,7 +326,7 @@ try_again: if (mm->free_area_cache < len) goto fail; - /* either no address requested or cant fit in requested address hole */ + /* either no address requested or can't fit in requested address hole */ addr = (mm->free_area_cache - len) & huge_page_mask(h); do { /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 947f42abe820..30326443ab81 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -16,11 +16,9 @@ #include <asm/tlb.h> #include <asm/proto.h> -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - -unsigned long __initdata e820_table_start; -unsigned long __meminitdata e820_table_end; -unsigned long __meminitdata e820_table_top; +unsigned long __initdata pgt_buf_start; +unsigned long __meminitdata pgt_buf_end; +unsigned long __meminitdata pgt_buf_top; int after_bootmem; @@ -33,7 +31,7 @@ int direct_gbpages static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { - unsigned long puds, pmds, ptes, tables, start; + unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; @@ -65,29 +63,25 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. 
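find_early_table_space() above sizes the early page-table buffer from the number of PUD, PMD and PTE entries needed to map end bytes. A back-of-the-envelope version of the same arithmetic, using the standard x86-64 4K-page constants (these values are conventional, not taken from this patch):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PUD_SHIFT	30		/* 1GB per PUD entry */
#define PMD_SHIFT	21		/* 2MB per PMD entry */
#define PAGE_SHIFT	12		/* 4KB per PTE entry */
#define PTRS_PER_TABLE	512UL		/* 8-byte entries in a 4K page */

static unsigned long table_pages(unsigned long entries)
{
	return (entries + PTRS_PER_TABLE - 1) / PTRS_PER_TABLE;
}

static unsigned long tables_for(unsigned long end, int use_pse)
{
	unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
	unsigned long bytes;

	bytes  = table_pages(puds) * PAGE_SIZE;
	bytes += table_pages(pmds) * PAGE_SIZE;
	if (!use_pse)	/* 4K mappings also need leaf PTE pages */
		bytes += table_pages((end + PAGE_SIZE - 1) >> PAGE_SHIFT)
			 * PAGE_SIZE;
	return bytes;
}

int main(void)
{
	printf("1GB with 2MB pages: %lu bytes of tables\n",
	       tables_for(1UL << 30, 1));
	printf("1GB with 4K pages:  %lu bytes of tables\n",
	       tables_for(1UL << 30, 0));
	return 0;
}

With 2MB pages a gigabyte costs only a couple of table pages; dropping to 4K pages adds one PTE page for every 2MB mapped, which is what makes placing this buffer carefully worthwhile.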
- */ -#ifdef CONFIG_X86_32 - start = 0x7000; -#else - start = 0x8000; + good_end = max_pfn_mapped << PAGE_SHIFT; #endif - base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, - tables, PAGE_SIZE); + + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (base == MEMBLOCK_ERROR) panic("Cannot find space for the kernel page tables"); - e820_table_start = base >> PAGE_SHIFT; - e820_table_end = e820_table_start; - e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); + end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); +} + +void __init native_pagetable_reserve(u64 start, u64 end) +{ + memblock_x86_reserve_range(start, end, "PGTABLE"); } struct map_range { @@ -279,30 +273,26 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, load_cr3(swapper_pg_dir); #endif -#ifdef CONFIG_X86_64 - if (!after_bootmem && !start) { - pud_t *pud; - pmd_t *pmd; - - mmu_cr4_features = read_cr4(); - - /* - * _brk_end cannot change anymore, but it and _end may be - * located on different 2M pages. cleanup_highmap(), however, - * can only consider _end when it runs, so destroy any - * mappings beyond _brk_end here. - */ - pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); - pmd = pmd_offset(pud, _brk_end - 1); - while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) - pmd_clear(pmd); - } -#endif __flush_tlb_all(); - if (!after_bootmem && e820_table_end > e820_table_start) - memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, - e820_table_end << PAGE_SHIFT, "PGTABLE"); + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. 
+ */ + if (!after_bootmem && pgt_buf_end > pgt_buf_start) + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c821074b7f0b..29f7c6d98179 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = e820_table_end++; + unsigned long pfn = pgt_buf_end++; void *adr; - if (pfn >= e820_table_top) + if (pfn >= pgt_buf_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start - || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start + || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { pte_t *newpte; int i; @@ -644,8 +644,7 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; @@ -679,8 +678,10 @@ static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; +#endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; @@ -717,6 +718,7 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. */ olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_sizes_init(); } @@ -918,7 +920,7 @@ static void mark_nxdata_nx(void) { /* * When this called, init has already been executed and released, - * so everything past _etext sould be NX. + * so everything past _etext should be NX. */ unsigned long start = PFN_ALIGN(_etext); /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c14a5422e152..d865c4aeec55 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -51,6 +51,8 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> static int __init parse_direct_gbpages_off(char *arg) { @@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) * to the compile time generated pmds. This results in invalid pmds up * to the point where we hit the physaddr 0 mapping. * - * We limit the mappings to the region from _text to _end. _end is - * rounded up to the 2MB boundary. This catches the invalid pmds as + * We limit the mappings to the region from _text to _brk_end. _brk_end + * is rounded up to the 2MB boundary. 
This catches the invalid pmds as * well, as they are located before _text: */ void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; + unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; - pmd_t *last_pmd = pmd + PTRS_PER_PMD; - for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) @@ -314,7 +316,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = e820_table_end++; + unsigned long pfn = pgt_buf_end++; void *adr; if (after_bootmem) { @@ -324,7 +326,7 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= e820_table_top) + if (pfn >= pgt_buf_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); @@ -333,12 +335,28 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } +static __ref void *map_low_page(void *virt) +{ + void *adr; + unsigned long phys, left; + + if (after_bootmem) + return virt; + + phys = __pa(virt); + left = phys & (PAGE_SIZE - 1); + adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); + adr = (void *)(((unsigned long)adr) | left); + + return adr; +} + static __ref void unmap_low_page(void *adr) { if (after_bootmem) return; - early_iounmap(adr, PAGE_SIZE); + early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); } static unsigned long __meminit @@ -386,15 +404,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, - pgprot_t prot) -{ - pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - - return phys_pte_init(pte, address, end, prot); -} - -static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long page_size_mask, pgprot_t prot) { @@ -420,8 +429,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pte_update(pmd, address, + pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + last_map_addr = phys_pte_init(pte, address, end, prot); + unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -468,18 +479,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, } static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask, pgprot_t prot) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long last_map_addr; - - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); - __flush_tlb_all(); - return last_map_addr; -} - -static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long page_size_mask) { @@ -504,8 +503,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { - last_map_addr = phys_pmd_update(pud, addr, end, + pmd = map_low_page(pmd_offset(pud, 0)); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); + unmap_low_page(pmd); + __flush_tlb_all(); continue; } /* @@ -553,17 +555,6 @@ 
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, return last_map_addr; } -static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, - unsigned long page_size_mask) -{ - pud_t *pud; - - pud = (pud_t *)pgd_page_vaddr(*pgd); - - return phys_pud_init(pud, addr, end, page_size_mask); -} - unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, @@ -587,8 +578,10 @@ kernel_physical_mapping_init(unsigned long start, next = end; if (pgd_val(*pgd)) { - last_map_addr = phys_pud_update(pgd, __pa(start), + pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); + unmap_low_page(pud); continue; } @@ -612,10 +605,9 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(void) { - memblock_x86_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, 0, max_pfn); } #endif @@ -624,7 +616,9 @@ void __init paging_init(void) unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; @@ -687,14 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; @@ -870,18 +856,18 @@ static struct vm_area_struct gate_vma = { .vm_flags = VM_READ | VM_EXEC }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || mm->context.ia32_compat) return NULL; #endif return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; @@ -890,11 +876,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } @@ -908,6 +894,19 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } +#ifdef CONFIG_X86_UV +#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) + +unsigned long memory_block_size_bytes(void) +{ + if (is_uv_system()) { + printk(KERN_INFO "UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } + return MIN_MEMORY_BLOCK_SIZE; +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 
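Among the init_64.c changes above, map_low_page() replaces the old phys_*_update() helpers: it temporarily maps the 4K page containing an already-written page-table entry via early_memremap() and re-applies the offset within the page. The mask/or idiom in isolation, with a stand-in mapper so the sketch is self-contained (early_memremap() exists only in the kernel):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* stand-in mapper: identity here, a temporary fixmap in the kernel */
static void *map_page(uintptr_t phys_aligned)
{
	return (void *)phys_aligned;
}

static void *map_low_page_sketch(uintptr_t phys)
{
	uintptr_t left = phys & (PAGE_SIZE - 1);	/* offset in page */
	void *adr = map_page(phys & PAGE_MASK);		/* map whole page */

	return (void *)((uintptr_t)adr | left);
}

int main(void)
{
	printf("%p\n", map_low_page_sketch(0x1234));
	return 0;
}

unmap_low_page() correspondingly masks the offset back off before unmapping, which is why the hunk above passes adr & PAGE_MASK to early_iounmap().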
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0369843511dc..be1ef574ce9a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return (__force void __iomem *)phys_to_virt(phys_addr); /* - * Check if the request spans more than any BAR in the iomem resource - * tree. - */ - WARN_ONCE(iomem_map_sanity_check(phys_addr, size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; @@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + return ret_addr; err_free_area: free_vm_area(area); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ebf6d7887a38..f5510d889a22 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,11 +1,39 @@ /* Common code for 32 and 64-bit NUMA */ -#include <linux/topology.h> -#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> #include <linux/bootmem.h> -#include <asm/numa.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> + +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/dma.h> #include <asm/acpi.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; static __init int numa_setup(char *opt) { @@ -26,12 +54,59 @@ static __init int numa_setup(char *opt) early_param("numa", numa_setup); /* - * Which logical CPUs are on which nodes + * apicid, cpu, node mappings */ +s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); /* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + set_cpu_numa_node(cpu, node); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +/* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * @@ -57,7 +132,659 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_DEBUG_PER_CPU_MAPS +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. 
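numa_add_memblk() and the numa_meminfo array behind it are the contract between this common code and the platform init functions (amd_numa_init() above uses exactly this interface). A user-space model of the add/remove bookkeeping, mirroring the patch but illustrative only:

#include <stdio.h>
#include <string.h>

#define NR_BLKS 16

struct memblk { unsigned long long start, end; int nid; };
struct meminfo { int nr_blks; struct memblk blk[NR_BLKS]; };

static int add_memblk(struct meminfo *mi, int nid,
		      unsigned long long start, unsigned long long end)
{
	if (start == end)
		return 0;		/* ignore zero-length blocks */
	if (mi->nr_blks >= NR_BLKS)
		return -1;		/* table full */
	mi->blk[mi->nr_blks++] = (struct memblk){ start, end, nid };
	return 0;
}

/* delete by shifting the tail down, as numa_remove_memblk_from() does */
static void remove_memblk(struct meminfo *mi, int idx)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

int main(void)
{
	struct meminfo mi = { 0 };

	add_memblk(&mi, 0, 0x0ULL, 0x80000000ULL);
	add_memblk(&mi, 1, 0x80000000ULL, 0x100000000ULL);
	remove_memblk(&mi, 0);
	printf("blks=%d, first nid=%d\n", mi.nr_blks, mi.blk[0].nid);
	return 0;
}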
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Initialize NODE_DATA for a node on local memory */
+static void __init setup_node_data(int nid, u64 start, u64 end)
+{
+	const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
+	const u64 nd_high = PFN_PHYS(max_pfn_mapped);
+	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+	bool remapped = false;
+	u64 nd_pa;
+	void *nd;
+	int tnid;
+
+	/*
+	 * Don't confuse VM with a node that doesn't have the
+	 * minimum amount of memory:
+	 */
+	if (end && (end - start) < NODE_MIN_SIZE)
+		return;
+
+	/* initialize remap allocator before aligning to ZONE_ALIGN */
+	init_alloc_remap(nid, start, end);
+
+	start = roundup(start, ZONE_ALIGN);
+
+	printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
+	       nid, start, end);
+
+	/*
+	 * Allocate node data. Try remap allocator first, node-local
+	 * memory and then any node. Never allocate in DMA zone.
+	 */
+	nd = alloc_remap(nid, nd_size);
+	if (nd) {
+		nd_pa = __pa(nd);
+		remapped = true;
+	} else {
+		nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
+						nd_size, SMP_CACHE_BYTES);
+		if (nd_pa == MEMBLOCK_ERROR)
+			nd_pa = memblock_find_in_range(nd_low, nd_high,
+						nd_size, SMP_CACHE_BYTES);
+		if (nd_pa == MEMBLOCK_ERROR) {
+			pr_err("Cannot find %zu bytes in node %d\n",
+			       nd_size, nid);
+			return;
+		}
+		memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
+		nd = __va(nd_pa);
+	}
+
+	/* report and initialize */
+	printk(KERN_INFO "  NODE_DATA [%016Lx - %016Lx]%s\n",
+	       nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+	if (!remapped && tnid != nid)
+		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
+
+	node_data[nid] = nd;
+	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+	NODE_DATA(nid)->node_id = nid;
+	NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
+	NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
+
+	node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Clean up a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unnecessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+	const u64 low = 0;
+	const u64 high = PFN_PHYS(max_pfn);
+	int i, j, k;
+
+	/* first, trim all entries */
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		/* make sure all blocks are inside the limits */
+		bi->start = max(bi->start, low);
+		bi->end = min(bi->end, high);
+
+		/* and there's no empty block */
+		if (bi->start >= bi->end)
+			numa_remove_memblk_from(i--, mi);
+	}
+
+	/* merge neighboring / overlapping entries */
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		for (j = i + 1; j < mi->nr_blks; j++) {
+			struct numa_memblk *bj = &mi->blk[j];
+			u64 start, end;
+
+			/*
+			 * See whether there are overlapping blocks. Whine
+			 * about but allow overlaps of the same nid. They
+			 * will be merged below.
+ */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. 
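The distance table set up here is a flat cnt x cnt byte matrix indexed [from * cnt + to], defaulting to the ACPI-defined LOCAL_DISTANCE (10) on the diagonal and REMOTE_DISTANCE (20) elsewhere. A user-space model of the lookup and of the validation rules in numa_set_distance():

#include <stdio.h>

#define LOCAL_DISTANCE	10
#define REMOTE_DISTANCE	20
#define CNT		4

static unsigned char dist[CNT * CNT];

static void init_distances(void)
{
	for (int i = 0; i < CNT; i++)
		for (int j = 0; j < CNT; j++)
			dist[i * CNT + j] = i == j ? LOCAL_DISTANCE
						   : REMOTE_DISTANCE;
}

static void set_distance(int from, int to, int d)
{
	if (from >= CNT || to >= CNT)
		return;		/* out of bounds: call is ignored */
	if ((unsigned char)d != d || (from == to && d != LOCAL_DISTANCE))
		return;		/* nonsensical value: call is ignored */
	dist[from * CNT + to] = d;
}

int main(void)
{
	init_distances();
	set_distance(0, 1, 16);	/* e.g. from a firmware SLIT entry */
	printf("d(0,1)=%d d(1,0)=%d\n",
	       dist[0 * CNT + 1], dist[1 * CNT + 0]);
	return 0;
}

Note the matrix is not forced to be symmetric: as in the kernel code, each direction is set individually.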
+ * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + PFN_PHYS(max_pfn)) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_data(nid, start, end); + } + + return 0; +} + +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. 
+ */
+static void __init numa_init_array(void)
+{
+	int rr, i;
+
+	rr = first_node(node_online_map);
+	for (i = 0; i < nr_cpu_ids; i++) {
+		if (early_cpu_to_node(i) != NUMA_NO_NODE)
+			continue;
+		numa_set_node(i, rr);
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+	}
+}
+
+static int __init numa_init(int (*init_func)(void))
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		set_apicid_to_node(i, NUMA_NO_NODE);
+
+	nodes_clear(numa_nodes_parsed);
+	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
+	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+	remove_all_active_ranges();
+	numa_reset_distance();
+
+	ret = init_func();
+	if (ret < 0)
+		return ret;
+	ret = numa_cleanup_meminfo(&numa_meminfo);
+	if (ret < 0)
+		return ret;
+
+	numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+	ret = numa_register_memblks(&numa_meminfo);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < nr_cpu_ids; i++) {
+		int nid = early_cpu_to_node(i);
+
+		if (nid == NUMA_NO_NODE)
+			continue;
+		if (!node_online(nid))
+			numa_clear_node(i);
+	}
+	numa_init_array();
+	return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory. This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+	printk(KERN_INFO "%s\n",
+	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+	printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
+	       0LLU, PFN_PHYS(max_pfn));
+
+	node_set(0, numa_nodes_parsed);
+	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+	return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is a dummy single-node config encompassing the whole
+ * memory, which never fails.
+ */
+void __init x86_numa_init(void)
+{
+	if (!numa_off) {
+#ifdef CONFIG_X86_NUMAQ
+		if (!numa_init(numaq_numa_init))
+			return;
+#endif
+#ifdef CONFIG_ACPI_NUMA
+		if (!numa_init(x86_acpi_numa_init))
+			return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+		if (!numa_init(amd_numa_init))
+			return;
+#endif
+	}
+
+	numa_init(dummy_numa_init);
+}
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
+ * apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and the fake-node case (when running a kernel compiled
+ * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round-robin manner in numa_init_array()
+ * prior to this call, and that initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + if (!node_online(node)) + node = find_near_online_node(node); + numa_set_node(cpu, node); + } +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void __cpuinit numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +void debug_cpumask_set_cpu(int cpu, int node, bool enable) +{ + struct cpumask *mask; + char buf[64]; + + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return; +} + +# ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, bool enable) +{ + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +# endif /* !CONFIG_NUMA_EMU */ + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ @@ -80,4 +807,20 @@ const struct cpumask *cpumask_of_node(int node) return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); + +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84a3e4c9f277..849a975d3fa0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,39 +22,11 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
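The memory_add_physaddr_to_nid() helper exported above answers hot-plug queries by scanning the recorded meminfo blocks for the one containing the address, defaulting to the first block's node. A standalone sketch of that lookup, with invented block ranges:

#include <stdio.h>

struct memblk { unsigned long long start, end; int nid; };

/* Hypothetical two-node layout; real data comes from numa_meminfo. */
static struct memblk blk[] = {
	{ 0x00000000ULL,  0x80000000ULL, 0 },	/* node 0: first 2G */
	{ 0x80000000ULL, 0x100000000ULL, 1 },	/* node 1: next 2G */
};

static int physaddr_to_nid(unsigned long long addr)
{
	int nid = blk[0].nid;	/* default to the first block's node */
	unsigned int i;

	for (i = 0; i < sizeof(blk) / sizeof(blk[0]); i++)
		if (blk[i].start <= addr && blk[i].end > addr)
			nid = blk[i].nid;
	return nid;
}

int main(void)
{
	printf("0x90000000 -> node %d\n", physaddr_to_nid(0x90000000ULL));
	return 0;
}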
*/ -#include <linux/mm.h> #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/highmem.h> -#include <linux/initrd.h> -#include <linux/nodemask.h> #include <linux/module.h> -#include <linux/kexec.h> -#include <linux/pfn.h> -#include <linux/swap.h> -#include <linux/acpi.h> - -#include <asm/e820.h> -#include <asm/setup.h> -#include <asm/mmzone.h> -#include <asm/bios_ebda.h> -#include <asm/proto.h> - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. - * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h" #ifdef CONFIG_DISCONTIGMEM /* @@ -99,102 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, } #endif -extern unsigned long find_max_low_pfn(void); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memblock_x86_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. 
- */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - unsigned long pgdat_phys; - pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, - max_pfn_mapped<<PAGE_SHIFT, - sizeof(pg_data_t), - PAGE_SIZE); - NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). + * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. 
+ */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; @@ -203,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); - printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", - (unsigned long)vaddr, - node_remap_start_pfn[node] + pfn); - set_pmd_pfn((ulong) vaddr, - node_remap_start_pfn[node] + pfn, - PAGE_KERNEL_LARGE); - } - } -} - #ifdef CONFIG_HIBERNATION /** * resume_map_numa_kva - add KVA mapping to the temporary page tables created @@ -234,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); pgd_t *pgd = pgd_base + pgd_index(vaddr); pud_t *pud = pud_offset(pgd, vaddr); @@ -258,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long calculate_numa_remap_pages(void) -{ - int nid; - unsigned long size, reserve_pages = 0; - - for_each_online_node(nid) { - u64 node_kva_target; - u64 node_kva_final; - - /* - * The acpi/srat node info can show hot-add memroy zones - * where memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = memblock_find_in_range(node_kva_target, - ((u64)node_end_pfn[nid])<<PAGE_SHIFT, - ((u64)size)<<PAGE_SHIFT, - LARGE_PAGE_BYTES); - node_kva_target -= LARGE_PAGE_BYTES; - } while (node_kva_final == MEMBLOCK_ERROR && - (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); - - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" - " node %d at %llx\n", - size, nid, node_kva_final>>PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. 
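alloc_remap() above is a simple bump allocator over the premapped window; note the hunk also relaxes the boundary test from >= to >, so an allocation may end exactly at node_remap_end_vaddr[]. A self-contained model of the same arithmetic, with a made-up window size and alignment:

#include <stdio.h>
#include <stddef.h>

#define CACHE_ALIGN	64	/* stand-in for L1_CACHE_BYTES */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

static char remap_area[4096];	/* hypothetical remap window */
static char *alloc_ptr = remap_area;
static char *end_ptr = remap_area + sizeof(remap_area);

static void *remap_alloc(size_t size)
{
	void *allocation = alloc_ptr;

	size = ALIGN_UP(size, CACHE_ALIGN);
	/* '>' not '>=': ending exactly at the window limit is fine */
	if (!allocation || alloc_ptr + size > end_ptr)
		return NULL;
	alloc_ptr += size;
	return allocation;
}

int main(void)
{
	printf("first:  %p\n", remap_alloc(100));	/* rounded to 128 */
	printf("second: %p\n", remap_alloc(4096));	/* NULL: window full */
	return 0;
}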
-	 * layout will be: KVA address , KVA RAM
-	 *
-	 * we are supposed to only record the one less then max_low_pfn
-	 * but we could have some hole in high memory, and it will only
-	 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
-	 * to use it as free.
-	 * So memblock_x86_reserve_range here, hope we don't run out of that array
-	 */
-	memblock_x86_reserve_range(node_kva_final,
-			node_kva_final+(((u64)size)<<PAGE_SHIFT),
-			"KVA RAM");
-
-		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
-	}
-	printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
-			reserve_pages);
-	return reserve_pages;
-}
-
-static void init_remap_allocator(int nid)
-{
-	node_remap_start_vaddr[nid] = pfn_to_kaddr(
-			kva_start_pfn + node_remap_offset[nid]);
-	node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-		(node_remap_size[nid] * PAGE_SIZE);
-	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-		ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
-	printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
-		(ulong) node_remap_start_vaddr[nid],
-		(ulong) node_remap_end_vaddr[nid]);
-}
-
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initialize remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem. As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of that size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted. The amount shouldn't be
+ * problematic on machines where this feature will be used.
+ *
+ * Initialization failure isn't fatal. alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+void __init init_alloc_remap(int nid, u64 start, u64 end)
 {
-	int nid;
-	long kva_target_pfn;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long end_pfn = end >> PAGE_SHIFT;
+	unsigned long size, pfn;
+	u64 node_pa, remap_pa;
+	void *remap_va;

 	/*
-	 * When mapping a NUMA machine we allocate the node_mem_map arrays
-	 * from node local memory. They are then mapped directly into KVA
-	 * between zone normal and vmalloc space. Calculate the size of
-	 * this space and use it to adjust the boundary between ZONE_NORMAL
-	 * and ZONE_HIGHMEM.
+	 * The acpi/srat node info can show hot-add memory zones where
+	 * memory could be added but not currently present.
*/ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, start_pfn, end_pfn); + + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, start_pfn, end_pfn); + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + size = ALIGN(size, LARGE_PAGE_BYTES); + + /* allocate node memory and the lowmem remap area */ + node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); + if (node_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return; + } + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (remap_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_x86_free_range(node_pa, node_pa + size); + return; + } + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + remap_va = phys_to_virt(remap_pa); + + /* perform actual remap */ + for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va; + + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); +} - get_memcfg_numa(); - - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, - kva_pages<<PAGE_SHIFT, - PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - - if (kva_start_pfn == MEMBLOCK_ERROR) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); +void __init initmem_init(void) +{ + x86_numa_init(); - /* avoid clash with initrd */ - memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT, - (kva_start_pfn + kva_pages)<<PAGE_SHIFT, - "KVA PG"); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) @@ -403,51 +257,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - - allocate_pgdat(nid); - } - remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) { - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(nid)->node_id = nid; - } setup_bootmem_allocator(); } - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). 
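The sizing step in init_alloc_remap() adds a page-aligned pg_data_t to the memmap footprint and rounds the sum up to whole large (PMD) pages before reserving both areas. A sketch of that arithmetic, assuming 4K pages, 512 PTEs per table, and invented memmap/pgdat sizes:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PTRS_PER_PTE		512UL
#define LARGE_PAGE_BYTES	(PTRS_PER_PTE * PAGE_SIZE)	/* 2 MB */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long memmap_bytes = 14680064UL; /* hypothetical memmap size */
	unsigned long pgdat_bytes = 2048UL;	 /* hypothetical pg_data_t */
	unsigned long size;

	size = memmap_bytes;
	size += ALIGN_UP(pgdat_bytes, PAGE_SIZE);	/* page-aligned pgdat */
	size = ALIGN_UP(size, LARGE_PAGE_BYTES);	/* whole PMD pages */

	printf("reserve %lu bytes (%lu large pages)\n",
	       size, size / LARGE_PAGE_BYTES);	/* 16777216 bytes, 8 pages */
	return 0;
}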
- * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ? nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1337c51b07d7..dd27f401f0a0 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -2,663 +2,13 @@ * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/init.h> #include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/ctype.h> -#include <linux/module.h> -#include <linux/nodemask.h> -#include <linux/sched.h> -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/dma.h> -#include <asm/numa.h> -#include <asm/acpi.h> -#include <asm/amd_nb.h> +#include "numa_internal.h" -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -struct memnode memnode; - -s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct bootnode *nodes, - int numnodes, int shift, int *nodeids) -{ - unsigned long addr, end; - int i, res = -1; - - memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < numnodes; i++) { - addr = nodes[i].start; - end = nodes[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != NUMA_NO_NODE) - return -1; - - if (!nodeids) - memnodemap[addr >> shift] = i; - else - memnodemap[addr >> shift] = nodeids[i]; - - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) - return 0; - - addr = 0x8000; - nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, - nodemap_size, L1_CACHE_BYTES); - if (nodemap_addr == MEMBLOCK_ERROR) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - memnodemap = phys_to_virt(nodemap_addr); - memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. 
- */ -static int __init extract_lsb_from_nodes(const struct bootnode *nodes, - int numnodes) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < numnodes; i++) { - start = nodes[i].start; - end = nodes[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -int __init compute_hash_shift(struct bootnode *nodes, int numnodes, - int *nodeids) -{ - int shift; - - shift = extract_lsb_from_nodes(nodes, numnodes); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your kernel with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - return shift; -} - -int __meminit __early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} - -static void * __init early_node_mem(int nodeid, unsigned long start, - unsigned long end, unsigned long size, - unsigned long align) -{ - unsigned long mem; - - /* - * put it on high as possible - * something will go with NODE_DATA - */ - if (start < (MAX_DMA_PFN<<PAGE_SHIFT)) - start = MAX_DMA_PFN<<PAGE_SHIFT; - if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && - end > (MAX_DMA32_PFN<<PAGE_SHIFT)) - start = MAX_DMA32_PFN<<PAGE_SHIFT; - mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align); - if (mem != MEMBLOCK_ERROR) - return __va(mem); - - /* extend the search scope */ - end = max_pfn_mapped << PAGE_SHIFT; - start = MAX_DMA_PFN << PAGE_SHIFT; - mem = memblock_find_in_range(start, end, size, align); - if (mem != MEMBLOCK_ERROR) - return __va(mem); - - printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); - - return NULL; -} - -/* Initialize bootmem allocator for a node */ -void __init -setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, last_pfn, nodedata_phys; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - int nid; - - if (!end) - return; - - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, - start, end); - - start_pfn = start >> PAGE_SHIFT; - last_pfn = end >> PAGE_SHIFT; - - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, - SMP_CACHE_BYTES); - if (node_data[nodeid] == NULL) - return; - nodedata_phys = __pa(node_data[nodeid]); - memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, - nodedata_phys + pgdat_size - 1); - nid = phys_to_nid(nodedata_phys); - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - - memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->node_id = nodeid; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; - - node_set_online(nodeid); -} - -/* - * There are unfortunately some poorly designed mainboards around that - * only connect memory to a single CPU. This breaks the 1:1 cpu->node - * mapping. 
To avoid this fill in the mapping for all possible CPUs, - * as the number of CPUs is not known yet. We round robin the existing - * nodes. - */ -void __init numa_init_array(void) -{ - int rr, i; - - rr = first_node(node_online_map); - for (i = 0; i < nr_cpu_ids; i++) { - if (early_cpu_to_node(i) != NUMA_NO_NODE) - continue; - numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); - } -} - -#ifdef CONFIG_NUMA_EMU -/* Numa emulation */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; -static char *cmdline __initdata; - -void __init numa_emu_cmdline(char *str) -{ - cmdline = str; -} - -static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int amd) -{ - int ret = 0; - int i; - - memset(physnodes, 0, sizeof(physnodes)); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_get_nodes(physnodes, start, end); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_get_nodes(physnodes); -#endif - /* - * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or AMD code incorrectly reported the topology or the mem= - * kernel parameter is used. - */ - for (i = 0; i < MAX_NUMNODES; i++) { - if (physnodes[i].start == physnodes[i].end) - continue; - if (physnodes[i].start > end) { - physnodes[i].end = physnodes[i].start; - continue; - } - if (physnodes[i].end < start) { - physnodes[i].start = physnodes[i].end; - continue; - } - if (physnodes[i].start < start) - physnodes[i].start = start; - if (physnodes[i].end > end) - physnodes[i].end = end; - ret++; - } - - /* - * If no physical topology was detected, a single node is faked to cover - * the entire address space. - */ - if (!ret) { - physnodes[ret].start = start; - physnodes[ret].end = end; - ret = 1; - } - return ret; -} - -static void __init fake_physnodes(int acpi, int amd, int nr_nodes) -{ - int i; - - BUG_ON(acpi && amd); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_fake_nodes(nodes, nr_nodes); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_fake_nodes(nodes, nr_nodes); -#endif - if (!acpi && !amd) - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); -} - -/* - * Setups up nid to range from addr to addr + size. If the end - * boundary is greater than max_addr, then max_addr is used instead. - * The return value is 0 if there is additional memory left for - * allocation past addr and -1 otherwise. addr is adjusted to be at - * the end of the node. - */ -static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) -{ - int ret = 0; - nodes[nid].start = *addr; - *addr += size; - if (*addr >= max_addr) { - *addr = max_addr; - ret = -1; - } - nodes[nid].end = *addr; - node_set(nid, node_possible_map); - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - nodes[nid].start, nodes[nid].end, - (nodes[nid].end - nodes[nid].start) >> 20); - return ret; -} - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. 
- */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) -{ - nodemask_t physnode_mask = NODE_MASK_NONE; - u64 size; - int big; - int ret = 0; - int i; - - if (nr_nodes <= 0) - return -1; - if (nr_nodes > MAX_NUMNODES) { - pr_info("numa=fake=%d too large, reducing to %d\n", - nr_nodes, MAX_NUMNODES); - nr_nodes = MAX_NUMNODES; - } - - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the remainder. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / - FAKE_NODE_MIN_SIZE; - - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - pr_err("Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = 0; i < MAX_NUMNODES; i++) - if (physnodes[i].start != physnodes[i].end) - node_set(i, physnode_mask); - - /* - * Continue to fill physical nodes with fake nodes until there is no - * memory left on any of them. - */ - while (nodes_weight(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 end = physnodes[i].start + size; - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - - if (ret < big) - end += FAKE_NODE_MIN_SIZE; - - /* - * Continue to add memory to this fake node if its - * non-reserved memory is less than the per-node size. - */ - while (end - physnodes[i].start - - memblock_x86_hole_size(physnodes[i].start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > physnodes[i].end) { - end = physnodes[i].end; - break; - } - } - - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (physnodes[i].end - end - - memblock_x86_hole_size(end, physnodes[i].end) < size) - end = physnodes[i].end; - - /* - * Avoid allocating more nodes than requested, which can - * happen as a result of rounding down each node's size - * to FAKE_NODE_MIN_SIZE. - */ - if (nodes_weight(physnode_mask) + ret >= nr_nodes) - end = physnodes[i].end; - - if (setup_node_range(ret++, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) - node_clear(i, physnode_mask); - } - } - return ret; -} - -/* - * Returns the end address of a node so that there is at least `size' amount of - * non-reserved memory or `max_addr' is reached. - */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ - u64 end = start + size; - - while (end - start - memblock_x86_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - return end; -} - -/* - * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. - */ -static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) -{ - nodemask_t physnode_mask = NODE_MASK_NONE; - u64 min_size; - int ret = 0; - int i; - - if (!size) - return -1; - /* - * The limit on emulated nodes is MAX_NUMNODES, so the size per node is - * increased accordingly if the requested size is too small. This - * creates a uniform distribution of node sizes across the entire - * machine (but not necessarily over physical nodes). 
- */ - min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / - MAX_NUMNODES; - min_size = max(min_size, FAKE_NODE_MIN_SIZE); - if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) - min_size = (min_size + FAKE_NODE_MIN_SIZE) & - FAKE_NODE_MIN_HASH_MASK; - if (size < min_size) { - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); - size = min_size; - } - size &= FAKE_NODE_MIN_HASH_MASK; - - for (i = 0; i < MAX_NUMNODES; i++) - if (physnodes[i].start != physnodes[i].end) - node_set(i, physnode_mask); - /* - * Fill physical nodes with fake nodes of size until there is no memory - * left on any of them. - */ - while (nodes_weight(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; - u64 end; - - end = find_end_of_node(physnodes[i].start, - physnodes[i].end, size); - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (physnodes[i].end - end - - memblock_x86_hole_size(end, physnodes[i].end) < size) - end = physnodes[i].end; - - /* - * Setup the fake node that will be allocated as bootmem - * later. If setup_node_range() returns non-zero, there - * is no more memory available on this physical node. - */ - if (setup_node_range(ret++, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) - node_clear(i, physnode_mask); - } - } - return ret; -} - -/* - * Sets up the system RAM area from start_pfn to last_pfn according to the - * numa=fake command-line option. - */ -static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int amd) +void __init initmem_init(void) { - u64 addr = start_pfn << PAGE_SHIFT; - u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes; - int i; - - /* - * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. Otherwise, if it is just a single number N, - * split the system RAM into N fake nodes. - */ - if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { - u64 size; - - size = memparse(cmdline, &cmdline); - num_nodes = split_nodes_size_interleave(addr, max_addr, size); - } else { - unsigned long n; - - n = simple_strtoul(cmdline, NULL, 0); - num_nodes = split_nodes_interleave(addr, max_addr, n); - } - - if (num_nodes < 0) - return num_nodes; - memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. NUMA emulation " - "disabled.\n"); - return -1; - } - - /* - * We need to vacate all active ranges that may have been registered for - * the e820 memory map. 
- */ - remove_all_active_ranges(); - for_each_node_mask(i, node_possible_map) { - memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - setup_physnodes(addr, max_addr, acpi, amd); - fake_physnodes(acpi, amd, num_nodes); - numa_init_array(); - return 0; -} -#endif /* CONFIG_NUMA_EMU */ - -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int amd) -{ - int i; - - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - -#ifdef CONFIG_NUMA_EMU - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) - return; - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif - -#ifdef CONFIG_ACPI_NUMA - if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT)) - return; - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif - -#ifdef CONFIG_AMD_NUMA - if (!numa_off && amd && !amd_scan_nodes()) - return; - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif - printk(KERN_INFO "%s\n", - numa_off ? "NUMA turned off" : "No NUMA configuration found"); - - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap = memnode.embedded_map; - memnodemap[0] = 0; - node_set_online(0); - node_set(0, node_possible_map); - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); - memblock_x86_register_active_regions(0, start_pfn, last_pfn); - setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); + x86_numa_init(); } unsigned long __init numa_free_all_bootmem(void) @@ -673,253 +23,3 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } - -#ifdef CONFIG_NUMA - -static __init int find_near_online_node(int node) -{ - int n, val; - int min_val = INT_MAX; - int best_node = -1; - - for_each_online_node(n) { - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; -} - -/* - * Setup early cpu_to_node. - * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. - * - * Called before the per_cpu areas are setup. 
- */ -void __init init_cpu_to_node(void) -{ - int cpu; - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - - BUG_ON(cpu_to_apicid == NULL); - - for_each_possible_cpu(cpu) { - int node; - u16 apicid = cpu_to_apicid[cpu]; - - if (apicid == BAD_APICID) - continue; - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - node = find_near_online_node(node); - numa_set_node(cpu, node); - } -} -#endif - - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - /* early setting, no percpu area yet */ - if (cpu_to_node_map) { - cpu_to_node_map[cpu] = node; - return; - } - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { - printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); - dump_stack(); - return; - } -#endif - per_cpu(x86_cpu_to_node_map, cpu) = node; - - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -#ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) -{ - cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} -#else -void __cpuinit numa_add_cpu(int cpu) -{ - unsigned long addr; - u16 apicid; - int physnid; - int nid = NUMA_NO_NODE; - - nid = early_cpu_to_node(cpu); - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - - /* - * Use the starting address of the emulated node to find which physical - * node it is allocated on. - */ - addr = node_start_pfn(nid) << PAGE_SHIFT; - for (physnid = 0; physnid < MAX_NUMNODES; physnid++) - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - break; - - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. - */ - for_each_online_node(nid) { - addr = node_start_pfn(nid) << PAGE_SHIFT; - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); - } -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - int i; - - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -#endif /* !CONFIG_NUMA_EMU */ - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ -static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - char buf[64]; - - mask = node_to_cpumask_map[node]; - if (!mask) { - pr_err("node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return NULL; - } - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? 
"numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); - return mask; -} - -/* - * --------- debug versions of the numa functions --------- - */ -#ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); -} -#else -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - int i; - - for_each_online_node(i) { - unsigned long addr; - - addr = node_start_pfn(i) << PAGE_SHIFT; - if (addr < physnodes[node].start || - addr >= physnodes[node].end) - continue; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); - } -} -#endif /* CONFIG_NUMA_EMU */ - -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); -} - -int __cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(__cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!cpu_possible(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 000000000000..d0ed086b6247 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,492 @@ +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <asm/dma.h> + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static char *emu_cmdline __initdata; + +void __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. 
+ */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + eb->start, eb->end, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - + memblock_x86_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. 
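The node-size computation in split_nodes_interleave() above deliberately divides in page units so that gcc does not emit a u64/u64 division helper (__udivdi3) on x86_32, as its comment notes. The same trick with made-up numbers:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_PHYS(pfn)	((unsigned long long)(pfn) << PAGE_SHIFT)

int main(void)
{
	unsigned long long usable = 0x200000000ULL;	/* 8G usable space */
	int nr_nodes = 3;
	unsigned long long size;

	/* Divide in ulong pages, then convert back to bytes, exactly as
	 * size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes)
	 * does above; no 64-by-64 division is generated on 32-bit. */
	size = PFN_PHYS((unsigned long)(usable >> PAGE_SHIFT) / nr_nodes);

	printf("per-node size: %llu bytes\n", size);
	return 0;
}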
+ */ + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - memblock_x86_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int nid = 0; + int i, ret; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. 
+ * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = PFN_PHYS(max_pfn); + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, NULL, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + u64 phys; + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + phys_size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); + phys_dist = __va(phys); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = 0; + dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (dfl_phys_nid == NUMA_NO_NODE) + dfl_phys_nid = emu_nid_to_phys[i]; + } + } + if (dfl_phys_nid == NUMA_NO_NODE) { + pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); + goto no_emu; + } + + /* commit */ + *numa_meminfo = ei; + + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? 
j : 0; + } + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + if (phys_dist) + memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void __cpuinit numa_add_cpu(int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void __cpuinit numa_set_cpumask(int cpu, bool enable) +{ + int nid, physnid; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 000000000000..7178c3afe05e --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,39 @@ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include <linux/types.h> +#include <asm/numa.h> + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +void __init x86_numa_init(void); + +#ifdef CONFIG_X86_64 +static inline void init_alloc_remap(int nid, u64 start, u64 end) { } +#else +void __init init_alloc_remap(int nid, u64 start, u64 end); +#endif + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif + +#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 90825f2eb0f4..f9e526742fa1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -310,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long 
address, * these shared mappings are made of small page mappings. * Thus this don't enforce !RW mapping for small page kernel * text mapping logic will help Linux Xen parvirt guest boot - * aswell. + * as well. */ if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) pgprot_val(forbidden) |= _PAGE_RW; diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 38e6d174c497..9f0614daea85 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char *p; struct prefix_bits prf; int i; - unsigned long rv; p = (unsigned char *)ins_addr; p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) - if (reg_rop[i] == opcode) { - rv = REG_READ; + if (reg_rop[i] == opcode) goto do_work; - } for (i = 0; i < ARRAY_SIZE(reg_wop); i++) - if (reg_wop[i] == opcode) { - rv = REG_WRITE; + if (reg_wop[i] == opcode) goto do_work; - } printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " "0x%02x\n", opcode); @@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char *p; struct prefix_bits prf; int i; - unsigned long rv; p = (unsigned char *)ins_addr; p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) - if (imm_wop[i] == opcode) { - rv = IMM_WRITE; + if (imm_wop[i] == opcode) goto do_work; - } printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " "0x%02x\n", opcode); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0113d19c8aa6..8573b83a63d0 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -168,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ - if (mm == current->active_mm) - write_cr3(read_cr3()); + flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 000000000000..81dbfdeb080d --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,184 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. 
+ */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/mmzone.h> +#include <linux/bitmap.h> +#include <linux/module.h> +#include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <asm/proto.h> +#include <asm/numa.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +int acpi_numa __initdata; + +static __init int setup_node(int pxm) +{ + return acpi_map_pxm_to_node(pxm); +} + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ + return acpi_numa < 0; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) + for (j = 0; j < slit->locality_count; j++) + numa_set_distance(pxm_to_node(i), pxm_to_node(j), + slit->entry[slit->locality_count * i + j]); +} + +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", + pxm, apic_id, node); +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (get_uv_system_type() >= UV_X2APIC) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", + pxm, apic_id, node); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + u64 start, end; + int node, pxm; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; + + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + return; + start = ma->base_address; + end = start + 
ma->length; + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + + if (numa_add_memblk(node, start, end) < 0) { + bad_srat(); + return; + } + + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, + start, end); +} + +void __init acpi_numa_arch_fixup(void) {} + +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index ae96e7b8051d..000000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen <gone@us.ibm.com> - */ -#include <linux/mm.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/acpi.h> -#include <linux/nodemask.h> -#include <asm/srat.h> -#include <asm/topology.h> -#include <asm/smp.h> -#include <asm/e820.h> - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE 3 -#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { - unsigned long start_pfn; - unsigned long end_pfn; - u8 pxm; // proximity domain of node - u8 nid; // which cnode contains this chunk? 
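All three SRAT callbacks in the new srat.c follow the same validate-then-record shape: reject a malformed header length via bad_srat(), skip entries that are not enabled, map the proximity domain to a node id, then record the mapping. For the memory-affinity case the per-entry decision reduces to roughly the following (flag values as in ACPICA; the helper itself is an illustrative sketch, not kernel code):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ACPI_SRAT_MEM_ENABLED        (1 << 0)
    #define ACPI_SRAT_MEM_HOT_PLUGGABLE  (1 << 1)

    /* should this SRAT memory-affinity entry become a NUMA memblk? */
    static bool srat_mem_entry_usable(uint32_t flags, bool have_hotplug)
    {
        if (!(flags & ACPI_SRAT_MEM_ENABLED))
            return false;   /* disabled entry: skip silently */
        if ((flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !have_hotplug)
            return false;   /* hotplug range without CONFIG_MEMORY_HOTPLUG */
        return true;
    }

    int main(void)
    {
        printf("%d %d\n",
               srat_mem_entry_usable(ACPI_SRAT_MEM_ENABLED, false),
               srat_mem_entry_usable(ACPI_SRAT_MEM_ENABLED |
                                     ACPI_SRAT_MEM_HOT_PLUGGABLE, false));
        return 0;   /* prints "1 0" */
    }

Entries that pass are handed to numa_add_memblk(); a bad header length anywhere poisons the whole table, which is why bad_srat() simply sets acpi_numa to -1 and lets x86_acpi_numa_init() return -EINVAL.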
- u8 bank; // which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_APICID]; - -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ - if (srat_disabled()) - return; - if (cpu_affinity->header.length != - sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - - if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; /* empty entry */ - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - - /* don't need to check apic_id here, because it is always 8 bits */ - apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - - printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", - cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ - unsigned long long paddr, size; - unsigned long start_pfn, end_pfn; - u8 pxm; - struct node_memory_chunk_s *p, *q, *pend; - - if (srat_disabled()) - return; - if (memory_affinity->header.length != - sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - - if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; /* empty entry */ - - pxm = memory_affinity->proximity_domain & 0xff; - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, pxm); - - /* calculate info for memory chunk structure */ - paddr = memory_affinity->base_address; - size = memory_affinity->length; - - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; - - - if (num_memory_chunks >= MAXCHUNKS) { - printk(KERN_WARNING "Too many mem chunks in SRAT." - " Ignoring %lld MBytes at %llx\n", - size/(1024*1024), paddr); - return; - } - - /* Insertion sort based on base address */ - pend = &node_memory_chunk[num_memory_chunks]; - for (p = &node_memory_chunk[0]; p < pend; p++) { - if (start_pfn < p->start_pfn) - break; - } - if (p < pend) { - for (q = pend; q >= p; q--) - *(q + 1) = *q; - } - p->start_pfn = start_pfn; - p->end_pfn = end_pfn; - p->pxm = pxm; - - num_memory_chunks++; - - printk(KERN_DEBUG "Memory range %08lx to %08lx" - " in proximity domain %02x %s\n", - start_pfn, end_pfn, - pxm, - ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ - /* - * Only add present memory as told by the e820. 
- * There is no guarantee from the SRAT that the memory it - * enumerates is present at boot time because it represents - * *possible* memory hotplug areas the same as normal RAM. - */ - if (memory_chunk->start_pfn >= max_pfn) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - memory_chunk->start_pfn, memory_chunk->end_pfn); - return -1; - } - if (memory_chunk->nid != nid) - return -1; - - if (!node_has_online_mem(nid)) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_start_pfn[nid] > memory_chunk->start_pfn) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_end_pfn[nid] < memory_chunk->end_pfn) - node_end_pfn[nid] = memory_chunk->end_pfn; - - return 0; -} - -int __init get_memcfg_from_srat(void) -{ - int i, j, nid; - - - if (srat_disabled()) - goto out_fail; - - if (num_memory_chunks == 0) { - printk(KERN_DEBUG - "could not find any ACPI SRAT memory areas.\n"); - goto out_fail; - } - - /* Calculate total number of nodes in system from PXM bitmap and create - * a set of sequential node IDs starting at zero. (ACPI doesn't seem - * to specify the range of _PXM values.) - */ - /* - * MCD - we no longer HAVE to number nodes sequentially. PXM domain - * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically - * 32, so we will continue numbering them in this manner until MAX_NUMNODES - * approaches MAX_PXM_DOMAINS for i386. - */ - nodes_clear(node_online_map); - for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { - int nid = acpi_map_pxm_to_node(i); - node_set_online(nid); - } - } - BUG_ON(num_online_nodes() == 0); - - /* set cnode id in memory chunk structure */ - for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - - printk(KERN_DEBUG "pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk(KERN_CONT "%02x ", pxm_bitmap[i]); - } - printk(KERN_CONT "\n"); - printk(KERN_DEBUG "Number of logical nodes in system = %d\n", - num_online_nodes()); - printk(KERN_DEBUG "Number of memory chunks in system = %d\n", - num_memory_chunks); - - for (i = 0; i < MAX_APICID; i++) - apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); - - for (j = 0; j < num_memory_chunks; j++){ - struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk(KERN_DEBUG - "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", - j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - if (node_read_chunk(chunk->nid, chunk)) - continue; - - memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); - } - /* for out of order entries in SRAT */ - sort_node_map(); - - for_each_online_node(nid) { - unsigned long start = node_start_pfn[nid]; - unsigned long end = min(node_end_pfn[nid], max_pfn); - - memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); - } - return 1; -out_fail: - printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" - " table\n"); - return 0; -} diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c deleted file mode 100644 index 603d285d1daa..000000000000 --- a/arch/x86/mm/srat_64.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * ACPI 3.0 based NUMA setup - * Copyright 2004 Andi Kleen, SuSE Labs. - * - * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. - * - * Called from acpi_numa_init while reading the SRAT and SLIT tables. - * Assumes all memory regions belonging to a single proximity domain - * are in one chunk. 
Holes between them will be included in the node. - */ - -#include <linux/kernel.h> -#include <linux/acpi.h> -#include <linux/mmzone.h> -#include <linux/bitmap.h> -#include <linux/module.h> -#include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mm.h> -#include <asm/proto.h> -#include <asm/numa.h> -#include <asm/e820.h> -#include <asm/apic.h> -#include <asm/uv/uv.h> - -int acpi_numa __initdata; - -static struct acpi_table_slit *acpi_slit; - -static nodemask_t nodes_parsed __initdata; -static nodemask_t cpu_nodes_parsed __initdata; -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; - -static int num_node_memblks __initdata; -static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; -static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; - -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - -static __init int conflicting_memblks(unsigned long start, unsigned long end) -{ - int i; - for (i = 0; i < num_node_memblks; i++) { - struct bootnode *nd = &node_memblk_range[i]; - if (nd->start == nd->end) - continue; - if (nd->end > start && nd->start < end) - return memblk_nodeid[i]; - if (nd->end == end && nd->start == start) - return memblk_nodeid[i]; - } - return -1; -} - -static __init void cutoff_node(int i, unsigned long start, unsigned long end) -{ - struct bootnode *nd = &nodes[i]; - - if (nd->start < start) { - nd->start = start; - if (nd->end < nd->start) - nd->start = nd->end; - } - if (nd->end > end) { - nd->end = end; - if (nd->start > nd->end) - nd->start = nd->end; - } -} - -static __init void bad_srat(void) -{ - int i; - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - for (i = 0; i < MAX_LOCAL_APIC; i++) - apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) { - nodes[i].start = nodes[i].end = 0; - nodes_add[i].start = nodes_add[i].end = 0; - } - remove_all_active_ranges(); -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - unsigned length; - unsigned long phys; - - length = slit->header.length; - phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, - PAGE_SIZE); - - if (phys == MEMBLOCK_ERROR) - panic(" Can not save slit!\n"); - - acpi_slit = __va(phys); - memcpy(acpi_slit, slit, length); - memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); -} - -/* Callback for Proximity Domain -> x2APIC mapping */ -void __init -acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) -{ - int pxm, node; - int apic_id; - - if (srat_disabled()) - return; - if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - - apic_id = pa->apic_id; - if (apic_id >= MAX_LOCAL_APIC) { - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); - return; - } - apicid_to_node[apic_id] = node; - node_set(node, cpu_nodes_parsed); - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); -} - -/* Callback for Proximity Domain -> LAPIC mapping */ -void __init -acpi_numa_processor_affinity_init(struct 
acpi_srat_cpu_affinity *pa) -{ - int pxm, node; - int apic_id; - - if (srat_disabled()) - return; - if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain_lo; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - - if (get_uv_system_type() >= UV_X2APIC) - apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; - else - apic_id = pa->apic_id; - - if (apic_id >= MAX_LOCAL_APIC) { - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); - return; - } - - apicid_to_node[apic_id] = node; - node_set(node, cpu_nodes_parsed); - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif -/* - * Update nodes_add[] - * This code supports one contiguous hot add area per node - */ -static void __init -update_nodes_add(int node, unsigned long start, unsigned long end) -{ - unsigned long s_pfn = start >> PAGE_SHIFT; - unsigned long e_pfn = end >> PAGE_SHIFT; - int changed = 0; - struct bootnode *nd = &nodes_add[node]; - - /* I had some trouble with strange memory hotadd regions breaking - the boot. Be very strict here and reject anything unexpected. - If you want working memory hotadd write correct SRATs. - - The node size check is a basic sanity check to guard against - mistakes */ - if ((signed long)(end - start) < NODE_MIN_SIZE) { - printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return; - } - - /* This check might be a bit too strict, but I'm keeping it for now. */ - if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR - "SRAT: Hotplug area %lu -> %lu has existing memory\n", - s_pfn, e_pfn); - return; - } - - /* Looks good */ - - if (nd->start == nd->end) { - nd->start = start; - nd->end = end; - changed = 1; - } else { - if (nd->start == end) { - nd->start = start; - changed = 1; - } - if (nd->end == start) { - nd->end = end; - changed = 1; - } - if (!changed) - printk(KERN_ERR "SRAT: Hotplug zone not continuous. 
Partly ignored\n"); - } - - if (changed) { - node_set(node, cpu_nodes_parsed); - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", - nd->start, nd->end); - } -} - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - struct bootnode *nd, oldnode; - unsigned long start, end; - int node, pxm; - int i; - - if (srat_disabled()) - return; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return; - } - i = conflicting_memblks(start, end); - if (i == node) { - printk(KERN_WARNING - "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, nodes[i].start, nodes[i].end); - } else if (i >= 0) { - printk(KERN_ERR - "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", - pxm, start, end, node_to_pxm(i), - nodes[i].start, nodes[i].end); - bad_srat(); - return; - } - nd = &nodes[node]; - oldnode = *nd; - if (!node_test_and_set(node, nodes_parsed)) { - nd->start = start; - nd->end = end; - } else { - if (start < nd->start) - nd->start = start; - if (nd->end < end) - nd->end = end; - } - - printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, - start, end); - - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { - update_nodes_add(node, start, end); - /* restore nodes[node] */ - *nd = oldnode; - if ((nd->start | nd->end) == 0) - node_clear(node, nodes_parsed); - } - - node_memblk_range[num_node_memblks].start = start; - node_memblk_range[num_node_memblks].end = end; - memblk_nodeid[num_node_memblks] = node; - num_node_memblks++; -} - -/* Sanity check to catch more bad SRATs (they are amazingly common). - Make sure the PXMs cover all memory. */ -static int __init nodes_cover_memory(const struct bootnode *nodes) -{ - int i; - unsigned long pxmram, e820ram; - - pxmram = 0; - for_each_node_mask(i, nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; - pxmram += e - s; - pxmram -= __absent_pages_in_range(i, s, e); - if ((long)pxmram < 0) - pxmram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { - printk(KERN_ERR - "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", - (pxmram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; - } - return 1; -} - -void __init acpi_numa_arch_fixup(void) {} - -#ifdef CONFIG_NUMA_EMU -void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end) -{ - int i; - - for_each_node_mask(i, nodes_parsed) { - cutoff_node(i, start, end); - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; - } -} -#endif /* CONFIG_NUMA_EMU */ - -/* Use the information discovered above to actually set up the nodes. 
*/ -int __init acpi_scan_nodes(unsigned long start, unsigned long end) -{ - int i; - - if (acpi_numa <= 0) - return -1; - - /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) - cutoff_node(i, start, end); - - /* - * Join together blocks on the same node, holes between - * which don't overlap with memory on other nodes. - */ - for (i = 0; i < num_node_memblks; ++i) { - int j, k; - - for (j = i + 1; j < num_node_memblks; ++j) { - unsigned long start, end; - - if (memblk_nodeid[i] != memblk_nodeid[j]) - continue; - start = min(node_memblk_range[i].end, - node_memblk_range[j].end); - end = max(node_memblk_range[i].start, - node_memblk_range[j].start); - for (k = 0; k < num_node_memblks; ++k) { - if (memblk_nodeid[i] == memblk_nodeid[k]) - continue; - if (start < node_memblk_range[k].end && - end > node_memblk_range[k].start) - break; - } - if (k < num_node_memblks) - continue; - start = min(node_memblk_range[i].start, - node_memblk_range[j].start); - end = max(node_memblk_range[i].end, - node_memblk_range[j].end); - printk(KERN_INFO "SRAT: Node %d " - "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - memblk_nodeid[i], - node_memblk_range[i].start, - node_memblk_range[i].end, - node_memblk_range[j].start, - node_memblk_range[j].end, - start, end); - node_memblk_range[i].start = start; - node_memblk_range[i].end = end; - k = --num_node_memblks - j; - memmove(memblk_nodeid + j, memblk_nodeid + j+1, - k * sizeof(*memblk_nodeid)); - memmove(node_memblk_range + j, node_memblk_range + j+1, - k * sizeof(*node_memblk_range)); - --j; - } - } - - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, - memblk_nodeid); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); - bad_srat(); - return -1; - } - - for (i = 0; i < num_node_memblks; i++) - memblock_x86_register_active_regions(memblk_nodeid[i], - node_memblk_range[i].start >> PAGE_SHIFT, - node_memblk_range[i].end >> PAGE_SHIFT); - - /* for out of order entries in SRAT */ - sort_node_map(); - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } - - /* Account for nodes with cpus and no memory */ - nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); - - /* Finally register nodes */ - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - /* Try again in case setup_node_bootmem missed one due - to missing bootmem */ - for_each_node_mask(i, node_possible_map) - if (!node_online(i)) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - - for (i = 0; i < nr_cpu_ids; i++) { - int node = early_cpu_to_node(i); - - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - numa_clear_node(i); - } - numa_init_array(); - return 0; -} - -#ifdef CONFIG_NUMA_EMU -static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL -}; -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for_each_node_mask(i, nodes_parsed) { - /* - * Find the real node that this emulated node appears on. For - * the sake of simplicity, we only use a real node's starting - * address to determine which emulated node it appears on. 
- */ - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - } - return ret; -} - -/* - * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID - * mappings that respect the real ACPI topology but reflect our emulated - * environment. For each emulated node, we find which real node it appears on - * and create PXM to NID mappings for those fake nodes which mirror that - * locality. SLIT will now represent the correct distances between emulated - * nodes as a result of the real topology. - */ -void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) -{ - int i, j; - - for (i = 0; i < num_nodes; i++) { - int nid, pxm; - - nid = find_node_by_addr(fake_nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - pxm = node_to_pxm(nid); - if (pxm == PXM_INVAL) - continue; - fake_node_to_pxm_map[i] = pxm; - /* - * For each apicid_to_node mapping that exists for this real - * node, it must now point to the fake node ID. - */ - for (j = 0; j < MAX_LOCAL_APIC; j++) - if (apicid_to_node[j] == nid && - fake_apicid_to_node[j] == NUMA_NO_NODE) - fake_apicid_to_node[j] = i; - } - - /* - * If there are apicid-to-node mappings for physical nodes that do not - * have a corresponding emulated node, it should default to a guaranteed - * value. - */ - for (i = 0; i < MAX_LOCAL_APIC; i++) - if (apicid_to_node[i] != NUMA_NO_NODE && - fake_apicid_to_node[i] == NUMA_NO_NODE) - fake_apicid_to_node[i] = 0; - - for (i = 0; i < num_nodes; i++) - __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); - - nodes_clear(nodes_parsed); - for (i = 0; i < num_nodes; i++) - if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, nodes_parsed); -} - -static int null_slit_node_compare(int a, int b) -{ - return node_to_pxm(a) == node_to_pxm(b); -} -#else -static int null_slit_node_compare(int a, int b) -{ - return a == b; -} -#endif /* CONFIG_NUMA_EMU */ - -int __node_distance(int a, int b) -{ - int index; - - if (!acpi_slit) - return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : - REMOTE_DISTANCE; - index = acpi_slit->locality_count * node_to_pxm(a); - return acpi_slit->entry[index + node_to_pxm(b)]; -} - -EXPORT_SYMBOL(__node_distance); - -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) -int memory_add_physaddr_to_nid(u64 start) -{ - int i, ret = 0; - - for_each_node(i) - if (nodes_add[i].start <= start && nodes_add[i].end > start) - ret = i; - - return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8f..d6c0418c3e47 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; - /* - * Could avoid this lock when - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - * probably not worth checking this for a cache-hot lock. 
- */ - raw_spin_lock(&f->tlbstate_lock); + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - raw_spin_unlock(&f->tlbstate_lock); + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask, if (is_uv_system()) { unsigned int cpu; - cpu = get_cpu(); + cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); if (cpumask) flush_tlb_others_ipi(cpumask, mm, va); - put_cpu(); return; } flush_tlb_others_ipi(cpumask, mm, va); diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile new file mode 100644 index 000000000000..90568c33ddb0 --- /dev/null +++ b/arch/x86/net/Makefile @@ -0,0 +1,4 @@ +# +# Arch-specific network modules +# +obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S new file mode 100644 index 000000000000..66870223f8c5 --- /dev/null +++ b/arch/x86/net/bpf_jit.S @@ -0,0 +1,140 @@ +/* bpf_jit.S : BPF JIT helper functions + * + * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +/* + * Calling convention : + * rdi : skb pointer + * esi : offset of byte(s) to fetch in skb (can be scratched) + * r8 : copy of skb->data + * r9d : hlen = skb->len - skb->data_len + */ +#define SKBDATA %r8 + +sk_load_word_ind: + .globl sk_load_word_ind + + add %ebx,%esi /* offset += X */ +# test %esi,%esi /* if (offset < 0) goto bpf_error; */ + js bpf_error + +sk_load_word: + .globl sk_load_word + + mov %r9d,%eax # hlen + sub %esi,%eax # hlen - offset + cmp $3,%eax + jle bpf_slow_path_word + mov (SKBDATA,%rsi),%eax + bswap %eax /* ntohl() */ + ret + + +sk_load_half_ind: + .globl sk_load_half_ind + + add %ebx,%esi /* offset += X */ + js bpf_error + +sk_load_half: + .globl sk_load_half + + mov %r9d,%eax + sub %esi,%eax # hlen - offset + cmp $1,%eax + jle bpf_slow_path_half + movzwl (SKBDATA,%rsi),%eax + rol $8,%ax # ntohs() + ret + +sk_load_byte_ind: + .globl sk_load_byte_ind + add %ebx,%esi /* offset += X */ + js bpf_error + +sk_load_byte: + .globl sk_load_byte + + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ + jle bpf_slow_path_byte + movzbl (SKBDATA,%rsi),%eax + ret + +/** + * sk_load_byte_msh - BPF_S_LDX_B_MSH helper + * + * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf) + * Must preserve A accumulator (%eax) + * Inputs : %esi is the offset value, already known positive + */ +ENTRY(sk_load_byte_msh) + CFI_STARTPROC + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */ + jle bpf_slow_path_byte_msh + movzbl (SKBDATA,%rsi),%ebx + and $15,%bl + shl $2,%bl + ret + CFI_ENDPROC +ENDPROC(sk_load_byte_msh) + +bpf_error: +# force a return 0 from jit handler + xor %eax,%eax + mov -8(%rbp),%rbx + leaveq + ret + +/* rsi contains offset and can be scratched */ +#define bpf_slow_path_common(LEN) \ + push %rdi; /* save skb */ \ + push %r9; \ + push SKBDATA; \ +/* rsi already has offset */ \ + mov $LEN,%ecx; /* len */ \ + lea -12(%rbp),%rdx; \ + call skb_copy_bits; \ + test 
%eax,%eax; \ + pop SKBDATA; \ + pop %r9; \ + pop %rdi + + +bpf_slow_path_word: + bpf_slow_path_common(4) + js bpf_error + mov -12(%rbp),%eax + bswap %eax + ret + +bpf_slow_path_half: + bpf_slow_path_common(2) + js bpf_error + mov -12(%rbp),%ax + rol $8,%ax + movzwl %ax,%eax + ret + +bpf_slow_path_byte: + bpf_slow_path_common(1) + js bpf_error + movzbl -12(%rbp),%eax + ret + +bpf_slow_path_byte_msh: + xchg %eax,%ebx /* don't lose A; X is about to be scratched */ + bpf_slow_path_common(1) + js bpf_error + movzbl -12(%rbp),%eax + and $15,%al + shl $2,%al + xchg %eax,%ebx + ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c new file mode 100644 index 000000000000..bfab3fa10edc --- /dev/null +++ b/arch/x86/net/bpf_jit_comp.c @@ -0,0 +1,654 @@ +/* bpf_jit_comp.c : BPF JIT compiler + * + * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/moduleloader.h> +#include <asm/cacheflush.h> +#include <linux/netdevice.h> +#include <linux/filter.h> + +/* + * Conventions : + * EAX : BPF A accumulator + * EBX : BPF X accumulator + * RDI : pointer to skb (first argument given to JIT function) + * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n) + * ECX,EDX,ESI : scratch registers + * r9d : skb->len - skb->data_len (headlen) + * r8 : skb->data + * -8(RBP) : saved RBX value + * -16(RBP)..-80(RBP) : BPF_MEMWORDS values + */ +int bpf_jit_enable __read_mostly; + +/* + * assembly code in arch/x86/net/bpf_jit.S + */ +extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; +extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[]; + +static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) +{ + if (len == 1) + *ptr = bytes; + else if (len == 2) + *(u16 *)ptr = bytes; + else { + *(u32 *)ptr = bytes; + barrier(); + } + return ptr + len; +} + +#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) + +#define EMIT1(b1) EMIT(b1, 1) +#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) +#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) +#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0) + +#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */ +#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */ + +static inline bool is_imm8(int value) +{ + return value <= 127 && value >= -128; +} + +static inline bool is_near(int offset) +{ + return offset <= 127 && offset >= -128; +} + +#define EMIT_JMP(offset) \ +do { \ + if (offset) { \ + if (is_near(offset)) \ + EMIT2(0xeb, offset); /* jmp .+off8 */ \ + else \ + EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \ + } \ +} while (0)
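Expressed in C, the fast/slow split that sk_load_word and bpf_slow_path_word implement between them looks roughly like this (a sketch of the logic only, written against the real skb helpers skb_headlen() and skb_copy_bits(); the JIT of course keeps hlen and skb->data cached in r9d/r8 instead of recomputing them):

    #include <linux/skbuff.h>

    /* what the JITed "load word at offset" helper computes */
    static u32 jit_load_word(const struct sk_buff *skb, int off, bool *err)
    {
        __be32 v;

        /* fast path: all four bytes sit in the linear header */
        if (off >= 0 && (int)skb_headlen(skb) - off >= 4)
            return ntohl(*(__be32 *)(skb->data + off));

        /* slow path: gather across fragments; failure kills the filter */
        if (off < 0 || skb_copy_bits(skb, off, &v, 4) < 0) {
            *err = true;    /* bpf_error: filter returns 0 */
            return 0;
        }
        return ntohl(v);
    }

The half and byte variants differ only in width and byte swap, and the _ind variants add X to the offset first, which is why they test the sign of the result before falling through to the load.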
+ +/* list of x86 cond jumps opcodes (. + s8) + * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) + */ +#define X86_JB 0x72 +#define X86_JAE 0x73 +#define X86_JE 0x74 +#define X86_JNE 0x75 +#define X86_JBE 0x76 +#define X86_JA 0x77 + +#define EMIT_COND_JMP(op, offset) \ +do { \ + if (is_near(offset)) \ + EMIT2(op, offset); /* jxx .+off8 */ \ + else { \ + EMIT2(0x0f, op + 0x10); \ + EMIT(offset, 4); /* jxx .+off32 */ \ + } \ +} while (0) + +#define COND_SEL(CODE, TOP, FOP) \ + case CODE: \ + t_op = TOP; \ + f_op = FOP; \ + goto cond_branch + + +#define SEEN_DATAREF 1 /* might call external helpers */ +#define SEEN_XREG 2 /* ebx is used */ +#define SEEN_MEM 4 /* use mem[] for temporary storage */ + +static inline void bpf_flush_icache(void *start, void *end) +{ + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + smp_wmb(); + flush_icache_range((unsigned long)start, (unsigned long)end); + set_fs(old_fs); +} + + +void bpf_jit_compile(struct sk_filter *fp) +{ + u8 temp[64]; + u8 *prog; + unsigned int proglen, oldproglen = 0; + int ilen, i; + int t_offset, f_offset; + u8 t_op, f_op, seen = 0, pass; + u8 *image = NULL; + u8 *func; + int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */ + unsigned int cleanup_addr; /* epilogue code offset */ + unsigned int *addrs; + const struct sock_filter *filter = fp->insns; + int flen = fp->len; + + if (!bpf_jit_enable) + return; + + addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL); + if (addrs == NULL) + return; + + /* Before first pass, make a rough estimate of addrs[]: + * each bpf instruction is translated to less than 64 bytes + */ + for (proglen = 0, i = 0; i < flen; i++) { + proglen += 64; + addrs[i] = proglen; + } + cleanup_addr = proglen; /* epilogue address */ + + for (pass = 0; pass < 10; pass++) { + /* no prologue/epilogue for trivial filters (RET something) */ + proglen = 0; + prog = temp; + + if (seen) { + EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ + EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ + /* note: must save %rbx in case bpf_error is hit */ + if (seen & (SEEN_XREG | SEEN_DATAREF)) + EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ + if (seen & SEEN_XREG) + CLEAR_X(); /* make sure we don't leak kernel memory */ + + /* + * If this filter needs to access skb data, + * loads r9 and r8 with: + * r9 = skb->len - skb->data_len + * r8 = skb->data + */ + if (seen & SEEN_DATAREF) { + if (offsetof(struct sk_buff, len) <= 127) + /* mov off8(%rdi),%r9d */ + EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); + else { + /* mov off32(%rdi),%r9d */ + EMIT3(0x44, 0x8b, 0x8f); + EMIT(offsetof(struct sk_buff, len), 4); + } + if (is_imm8(offsetof(struct sk_buff, data_len))) + /* sub off8(%rdi),%r9d */ + EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len)); + else { + EMIT3(0x44, 0x2b, 0x8f); + EMIT(offsetof(struct sk_buff, data_len), 4); + } + + if (is_imm8(offsetof(struct sk_buff, data))) + /* mov off8(%rdi),%r8 */ + EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data)); + else { + /* mov off32(%rdi),%r8 */ + EMIT3(0x4c, 0x8b, 0x87); + EMIT(offsetof(struct sk_buff, data), 4); + } + } + } + + switch (filter[0].code) { + case BPF_S_RET_K: + case BPF_S_LD_W_LEN: + case BPF_S_ANC_PROTOCOL: + case BPF_S_ANC_IFINDEX: + case BPF_S_ANC_MARK: + case BPF_S_ANC_RXHASH: + case BPF_S_ANC_CPU: + case BPF_S_ANC_QUEUE: + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: + /* first instruction sets A register (or is RET 'constant') */ + break; + default: + /* make sure we don't leak kernel information to user */ + CLEAR_A(); /* A = 0 */ + }
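In the per-instruction loop that follows, one case deserves a remark: BPF_S_ALU_DIV_K emits an imul plus shr $0x20 rather than a div instruction, because sk_chk_filter() has already replaced the constant divisor with its multiplicative reciprocal before the JIT ever sees it. A self-contained sketch with a worked value (helper bodies reproduced here for illustration; the kernel's versions live in include/linux/reciprocal_div.h):

    #include <stdint.h>
    #include <stdio.h>

    /* computed once when the filter is checked/attached */
    static uint32_t reciprocal_value(uint32_t b)
    {
        return (uint32_t)(((1ULL << 32) + b - 1) / b);
    }

    /* what "imul imm32,%rax,%rax; shr $0x20,%rax" computes at run time */
    static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
    {
        return (uint32_t)(((uint64_t)a * r) >> 32);
    }

    int main(void)
    {
        uint32_t r = reciprocal_value(3);       /* 0x55555556 */

        /* 9 * 0x55555556 = 0x300000006; the high 32 bits are 3 */
        printf("9 / 3 = %u\n", reciprocal_divide(9, r));
        return 0;
    }

The multiply-shift trick is an approximation that is not exact for every operand pair, which is part of why later kernels eventually dropped it, but it matches what the sk_run_filter() interpreter computed at the time.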
+ + for (i = 0; i < flen; i++) { + unsigned int K = filter[i].k; + + switch (filter[i].code) { + case BPF_S_ALU_ADD_X: /* A += X; */ + seen |= SEEN_XREG; + EMIT2(0x01, 0xd8); /* add %ebx,%eax */ + break; + case BPF_S_ALU_ADD_K: /* A += K; */ + if (!K) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xc0, K); /* add imm8,%eax */ + else + EMIT1_off32(0x05, K); /* add imm32,%eax */ + break; + case BPF_S_ALU_SUB_X: /* A -= X; */ + seen |= SEEN_XREG; + EMIT2(0x29, 0xd8); /* sub %ebx,%eax */ + break; + case BPF_S_ALU_SUB_K: /* A -= K */ + if (!K) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */ + else + EMIT1_off32(0x2d, K); /* sub imm32,%eax */ + break; + case BPF_S_ALU_MUL_X: /* A *= X; */ + seen |= SEEN_XREG; + EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */ + break; + case BPF_S_ALU_MUL_K: /* A *= K */ + if (is_imm8(K)) + EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */ + else { + EMIT2(0x69, 0xc0); /* imul imm32,%eax */ + EMIT(K, 4); + } + break; + case BPF_S_ALU_DIV_X: /* A /= X; */ + seen |= SEEN_XREG; + EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ + if (pc_ret0 != -1) + EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4)); + else { + EMIT_COND_JMP(X86_JNE, 2 + 5); + CLEAR_A(); + EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ + } + EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ + break; + case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ + EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ + EMIT(K, 4); + EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ + break; + case BPF_S_ALU_AND_X: + seen |= SEEN_XREG; + EMIT2(0x21, 0xd8); /* and %ebx,%eax */ + break; + case BPF_S_ALU_AND_K: + if (K >= 0xFFFFFF00) { + EMIT2(0x24, K & 0xFF); /* and imm8,%al */ + } else if (K >= 0xFFFF0000) { + EMIT2(0x66, 0x25); /* and imm16,%ax */ + EMIT2(K, 2); + } else { + EMIT1_off32(0x25, K); /* and imm32,%eax */ + } + break; + case BPF_S_ALU_OR_X: + seen |= SEEN_XREG; + EMIT2(0x09, 0xd8); /* or %ebx,%eax */ + break; + case BPF_S_ALU_OR_K: + if (is_imm8(K)) + EMIT3(0x83, 0xc8, K); /* or imm8,%eax */ + else + EMIT1_off32(0x0d, K); /* or imm32,%eax */ + break; + case BPF_S_ALU_LSH_X: /* A <<= X; */ + seen |= SEEN_XREG; + EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ + break; + case BPF_S_ALU_LSH_K: + if (K == 0) + break; + else if (K == 1) + EMIT2(0xd1, 0xe0); /* shl %eax */ + else + EMIT3(0xc1, 0xe0, K); + break; + case BPF_S_ALU_RSH_X: /* A >>= X; */ + seen |= SEEN_XREG; + EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */ + break; + case BPF_S_ALU_RSH_K: /* A >>= K; */ + if (K == 0) + break; + else if (K == 1) + EMIT2(0xd1, 0xe8); /* shr %eax */ + else + EMIT3(0xc1, 0xe8, K); + break; + case BPF_S_ALU_NEG: + EMIT2(0xf7, 0xd8); /* neg %eax */ + break; + case BPF_S_RET_K: + if (!K) { + if (pc_ret0 == -1) + pc_ret0 = i; + CLEAR_A(); + } else { + EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + } + /* fall through */ + case BPF_S_RET_A: + if (seen) { + if (i != flen - 1) { + EMIT_JMP(cleanup_addr - addrs[i]); + break; + } + if (seen & SEEN_XREG) + EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ + EMIT1(0xc9); /* leaveq */ + } + EMIT1(0xc3); /* ret */ + break; + case BPF_S_MISC_TAX: /* X = A */ + seen |= SEEN_XREG; + EMIT2(0x89, 0xc3); /* mov %eax,%ebx */ + break; + case BPF_S_MISC_TXA: /* A = X */ + seen |= SEEN_XREG; + EMIT2(0x89, 0xd8); /* mov %ebx,%eax */ + break; + case BPF_S_LD_IMM: /* A = K */ + if (!K) + CLEAR_A(); + else + EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + break; + case BPF_S_LDX_IMM: /* X = K */ + seen |= SEEN_XREG; + if (!K) + CLEAR_X(); + 
else + EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */ + break; + case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */ + seen |= SEEN_MEM; + EMIT3(0x8b, 0x45, 0xf0 - K*4); + break; + case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */ + seen |= SEEN_XREG | SEEN_MEM; + EMIT3(0x8b, 0x5d, 0xf0 - K*4); + break; + case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */ + seen |= SEEN_MEM; + EMIT3(0x89, 0x45, 0xf0 - K*4); + break; + case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ + seen |= SEEN_XREG | SEEN_MEM; + EMIT3(0x89, 0x5d, 0xf0 - K*4); + break; + case BPF_S_LD_W_LEN: /* A = skb->len; */ + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len)); + else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, len), 4); + } + break; + case BPF_S_LDX_W_LEN: /* X = skb->len; */ + seen |= SEEN_XREG; + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov off8(%rdi),%ebx */ + EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len)); + else { + EMIT2(0x8b, 0x9f); + EMIT(offsetof(struct sk_buff, len), 4); + } + break; + case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + if (is_imm8(offsetof(struct sk_buff, protocol))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, protocol), 4); + } + EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */ + break; + case BPF_S_ANC_IFINDEX: + if (is_imm8(offsetof(struct sk_buff, dev))) { + /* movq off8(%rdi),%rax */ + EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev)); + } else { + EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */ + EMIT(offsetof(struct sk_buff, dev), 4); + } + EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */ + EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6)); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */ + EMIT(offsetof(struct net_device, ifindex), 4); + break; + case BPF_S_ANC_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + if (is_imm8(offsetof(struct sk_buff, mark))) { + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark)); + } else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, mark), 4); + } + break; + case BPF_S_ANC_RXHASH: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); + if (is_imm8(offsetof(struct sk_buff, rxhash))) { + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); + } else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, rxhash), 4); + } + break; + case BPF_S_ANC_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + if (is_imm8(offsetof(struct sk_buff, queue_mapping))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, queue_mapping), 4); + } + break; + case BPF_S_ANC_CPU: +#ifdef CONFIG_SMP + EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */ + EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */ +#else + CLEAR_A(); +#endif + break; + case BPF_S_LD_W_ABS: + func = sk_load_word; +common_load: seen |= SEEN_DATAREF; + if ((int)K < 0) + goto out; + t_offset = func - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + 
EMIT1_off32(0xe8, t_offset); /* call */ + break; + case BPF_S_LD_H_ABS: + func = sk_load_half; + goto common_load; + case BPF_S_LD_B_ABS: + func = sk_load_byte; + goto common_load; + case BPF_S_LDX_B_MSH: + if ((int)K < 0) { + if (pc_ret0 != -1) { + EMIT_JMP(addrs[pc_ret0] - addrs[i]); + break; + } + CLEAR_A(); + EMIT_JMP(cleanup_addr - addrs[i]); + break; + } + seen |= SEEN_DATAREF | SEEN_XREG; + t_offset = sk_load_byte_msh - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */ + break; + case BPF_S_LD_W_IND: + func = sk_load_word_ind; +common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; + t_offset = func - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */ + break; + case BPF_S_LD_H_IND: + func = sk_load_half_ind; + goto common_load_ind; + case BPF_S_LD_B_IND: + func = sk_load_byte_ind; + goto common_load_ind; + case BPF_S_JMP_JA: + t_offset = addrs[i + K] - addrs[i]; + EMIT_JMP(t_offset); + break; + COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE); + COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB); + COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE); + COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE); + COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE); + COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB); + COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE); + COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE); + +cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; + t_offset = addrs[i + filter[i].jt] - addrs[i]; + + /* same targets, can avoid doing the test :) */ + if (filter[i].jt == filter[i].jf) { + EMIT_JMP(t_offset); + break; + } + + switch (filter[i].code) { + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JEQ_X: + seen |= SEEN_XREG; + EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */ + break; + case BPF_S_JMP_JSET_X: + seen |= SEEN_XREG; + EMIT2(0x85, 0xd8); /* test %ebx,%eax */ + break; + case BPF_S_JMP_JEQ_K: + if (K == 0) { + EMIT2(0x85, 0xc0); /* test %eax,%eax */ + break; + } + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGE_K: + if (K <= 127) + EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */ + else + EMIT1_off32(0x3d, K); /* cmp imm32,%eax */ + break; + case BPF_S_JMP_JSET_K: + if (K <= 0xFF) + EMIT2(0xa8, K); /* test imm8,%al */ + else if (!(K & 0xFFFF00FF)) + EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */ + else if (K <= 0xFFFF) { + EMIT2(0x66, 0xa9); /* test imm16,%ax */ + EMIT(K, 2); + } else { + EMIT1_off32(0xa9, K); /* test imm32,%eax */ + } + break; + } + if (filter[i].jt != 0) { + if (filter[i].jf) + t_offset += is_near(f_offset) ? 
2 : 6; + EMIT_COND_JMP(t_op, t_offset); + if (filter[i].jf) + EMIT_JMP(f_offset); + break; + } + EMIT_COND_JMP(f_op, f_offset); + break; + default: + /* hmm, too complex filter, give up on the jit compiler */ + goto out; + } + ilen = prog - temp; + if (image) { + if (unlikely(proglen + ilen > oldproglen)) { + pr_err("bpf_jit_compile fatal error\n"); + kfree(addrs); + module_free(NULL, image); + return; + } + memcpy(image + proglen, temp, ilen); + } + proglen += ilen; + addrs[i] = proglen; + prog = temp; + } + /* last bpf instruction is always a RET : + * use it to give the cleanup instruction(s) addr + */ + cleanup_addr = proglen - 1; /* ret */ + if (seen) + cleanup_addr -= 1; /* leaveq */ + if (seen & SEEN_XREG) + cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ + + if (image) { + WARN_ON(proglen != oldproglen); + break; + } + if (proglen == oldproglen) { + image = module_alloc(max_t(unsigned int, + proglen, + sizeof(struct work_struct))); + if (!image) + goto out; + } + oldproglen = proglen; + } + if (bpf_jit_enable > 1) + pr_err("flen=%d proglen=%u pass=%d image=%p\n", + flen, proglen, pass, image); + + if (image) { + if (bpf_jit_enable > 1) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS, + 16, 1, image, proglen, false); + + bpf_flush_icache(image, image + proglen); + + fp->bpf_func = (void *)image; + } +out: + kfree(addrs); + return; +} + +static void jit_free_defer(struct work_struct *arg) +{ + module_free(NULL, arg); +} + +/* run from softirq, we must use a work_struct to call + * module_free() from process context + */ +void bpf_jit_free(struct sk_filter *fp) +{ + if (fp->bpf_func != sk_run_filter) { + struct work_struct *work = (struct work_struct *)fp->bpf_func; + + INIT_WORK(work, jit_free_defer); + schedule_work(work); + } +} diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 72cbec14d783..a5b64ab4cd6e 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -16,17 +16,6 @@ #include <asm/stacktrace.h> #include <linux/compat.h> -static void backtrace_warning_symbol(void *data, char *msg, - unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { /* Yes, we want all stacks */ @@ -42,8 +31,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack, @@ -126,7 +113,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, + dump_trace(NULL, regs, (unsigned long *)stack, 0, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e2b7b0c06cdf..cf9750004a08 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -15,7 +15,7 @@ #include <linux/notifier.h> #include <linux/smp.h> #include <linux/oprofile.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/slab.h> #include <linux/moduleparam.h> #include <linux/kdebug.h> @@ -49,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; val |= counter_config->kernel ? 
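Stepping back to bpf_jit_compile() as a whole: the reason for the bounded pass loop is that emitted sizes can only shrink as the addrs[] estimates tighten (jumps flip from rel32 to rel8 forms), so the compiler re-measures until the length stops changing and only then allocates the image and runs one final pass to write it. A toy model of that convergence (purely illustrative, no kernel APIs):

    #include <stdio.h>

    static unsigned int gen_pass(unsigned int oldlen)
    {
        return oldlen > 40 ? oldlen - 8 : 40;   /* stand-in for code generation */
    }

    int main(void)
    {
        unsigned int oldlen = 64, len = 0;      /* 64 bytes/insn first guess */
        int pass, have_image = 0;

        for (pass = 0; pass < 10; pass++) {
            len = gen_pass(oldlen);
            if (have_image)
                break;              /* final pass wrote the code */
            if (len == oldlen)
                have_image = 1;     /* sizes stable: allocate image */
            oldlen = len;
        }
        printf("settled at %u bytes on pass %d\n", len, pass);
        return 0;
    }

Note also the module_alloc(max_t(unsigned int, proglen, sizeof(struct work_struct))) above: sizing the image to at least one work_struct is what lets bpf_jit_free() later overlay a work_struct on the code buffer and defer module_free() to process context.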
ARCH_PERFMON_EVENTSEL_OS : 0; val |= (counter_config->unit_mask & 0xFF) << 8; + counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_CMASK); + val |= counter_config->extra; event &= model->event_mask ? model->event_mask : 0xFF; val |= event & 0xFF; val |= (event & 0x0F00) << 24; @@ -440,6 +444,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); } return 0; @@ -536,7 +541,7 @@ static void nmi_shutdown(void) #ifdef CONFIG_PM -static int nmi_suspend(struct sys_device *dev, pm_message_t state) +static int nmi_suspend(void) { /* Only one CPU left, just stop that one */ if (nmi_enabled == 1) @@ -544,49 +549,31 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int nmi_resume(struct sys_device *dev) +static void nmi_resume(void) { if (nmi_enabled == 1) nmi_cpu_start(NULL); - return 0; } -static struct sysdev_class oprofile_sysclass = { - .name = "oprofile", +static struct syscore_ops oprofile_syscore_ops = { .resume = nmi_resume, .suspend = nmi_suspend, }; -static struct sys_device device_oprofile = { - .id = 0, - .cls = &oprofile_sysclass, -}; - -static int __init init_sysfs(void) +static void __init init_suspend_resume(void) { - int error; - - error = sysdev_class_register(&oprofile_sysclass); - if (error) - return error; - - error = sysdev_register(&device_oprofile); - if (error) - sysdev_class_unregister(&oprofile_sysclass); - - return error; + register_syscore_ops(&oprofile_syscore_ops); } -static void exit_sysfs(void) +static void exit_suspend_resume(void) { - sysdev_unregister(&device_oprofile); - sysdev_class_unregister(&oprofile_sysclass); + unregister_syscore_ops(&oprofile_syscore_ops); } #else -static inline int init_sysfs(void) { return 0; } -static inline void exit_sysfs(void) { } +static inline void init_suspend_resume(void) { } +static inline void exit_suspend_resume(void) { } #endif /* CONFIG_PM */ @@ -789,9 +776,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) mux_init(ops); - ret = init_sysfs(); - if (ret) - return ret; + init_suspend_resume(); printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; @@ -799,5 +784,5 @@ int __init op_nmi_init(struct oprofile_operations *ops) void op_nmi_exit(void) { - exit_sysfs(); + exit_suspend_resume(); } diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index e28398df0df2..0b7b7b179cbe 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -22,6 +22,7 @@ struct op_counter_config { unsigned long kernel; unsigned long user; unsigned long unit_mask; + unsigned long extra; }; extern struct op_counter_config counter_config[]; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index c3b8e24f2b16..9fd8a567fe1e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -316,16 +316,23 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -static inline int eilvt_is_available(int offset) +static inline int get_eilvt(int offset) { - /* check if we may assign a vector */ return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); } +static inline int put_eilvt(int offset) +{ + return 
!setup_APIC_eilvt(offset, 0, 0, 1); +} + static inline int ibs_eilvt_valid(void) { int offset; u64 val; + int valid = 0; + + preempt_disable(); rdmsrl(MSR_AMD64_IBSCTL, val); offset = val & IBSCTL_LVT_OFFSET_MASK; @@ -333,16 +340,20 @@ static inline int ibs_eilvt_valid(void) if (!(val & IBSCTL_LVT_OFFSET_VALID)) { pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - return 0; + goto out; } - if (!eilvt_is_available(offset)) { + if (!get_eilvt(offset)) { pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - return 0; + goto out; } - return 1; + valid = 1; +out: + preempt_enable(); + + return valid; } static inline int get_ibs_offset(void) @@ -600,67 +611,69 @@ static int setup_ibs_ctl(int ibs_eilvt_off) static int force_ibs_eilvt_setup(void) { - int i; + int offset; int ret; - /* find the next free available EILVT entry */ - for (i = 1; i < 4; i++) { - if (!eilvt_is_available(i)) - continue; - ret = setup_ibs_ctl(i); - if (ret) - return ret; - pr_err(FW_BUG "using offset %d for IBS interrupts\n", i); - return 0; + /* + * find the next free available EILVT entry, skip offset 0, + * pin search to this cpu + */ + preempt_disable(); + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; } + preempt_enable(); - printk(KERN_DEBUG "No EILVT entry available\n"); - - return -EBUSY; -} - -static int __init_ibs_nmi(void) -{ - int ret; - - if (ibs_eilvt_valid()) - return 0; + if (offset == APIC_EILVT_NR_MAX) { + printk(KERN_DEBUG "No EILVT entry available\n"); + return -EBUSY; + } - ret = force_ibs_eilvt_setup(); + ret = setup_ibs_ctl(offset); if (ret) - return ret; + goto out; - if (!ibs_eilvt_valid()) - return -EFAULT; + if (!ibs_eilvt_valid()) { + ret = -EFAULT; + goto out; + } + pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); return 0; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return ret; } /* * check and reserve APIC extended interrupt LVT offset for IBS if * available - * - * init_ibs() preforms implicitly cpu-local operations, so pin this - * thread to its current CPU */ static void init_ibs(void) { - preempt_disable(); - ibs_caps = get_ibs_caps(); + if (!ibs_caps) + return; + + if (ibs_eilvt_valid()) goto out; - if (__init_ibs_nmi() < 0) - ibs_caps = 0; - else - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); + if (!force_ibs_eilvt_setup()) + goto out; + + /* Failed to setup ibs */ + ibs_caps = 0; + return; out: - preempt_enable(); + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9fadec074142..98ab13058f89 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -50,7 +50,7 @@ static inline void setup_num_counters(void) #endif } -static int inline addr_increment(void) +static inline int addr_increment(void) { #ifdef CONFIG_SMP return smp_num_siblings == 2 ? 
2 : 1; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e27dffbbb1a7..026e4931d162 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -350,7 +350,7 @@ static int __init early_fill_mp_bus_info(void) #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs(void *unused) +static void __cpuinit enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 9260b3eb18d4..67858be4b52b 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -255,7 +255,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value) static int ce4100_conf_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { - int i, retval = 1; + int i; if (bus == 1) { for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index bd33620b0071..e6fd8473fb7b 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -280,12 +280,9 @@ void __init pci_direct_init(int type) int __init pci_direct_probe(void) { - struct resource *region, *region2; - if ((pci_probe & PCI_PROBE_CONF1) == 0) goto type2; - region = request_region(0xCF8, 8, "PCI conf1"); - if (!region) + if (!request_region(0xCF8, 8, "PCI conf1")) goto type2; if (pci_check_type1()) { @@ -293,16 +290,14 @@ int __init pci_direct_probe(void) port_cf9_safe = true; return 1; } - release_resource(region); + release_region(0xCF8, 8); type2: if ((pci_probe & PCI_PROBE_CONF2) == 0) return 0; - region = request_region(0xCF8, 4, "PCI conf2"); - if (!region) + if (!request_region(0xCF8, 4, "PCI conf2")) return 0; - region2 = request_region(0xC000, 0x1000, "PCI conf2"); - if (!region2) + if (!request_region(0xC000, 0x1000, "PCI conf2")) goto fail2; if (pci_check_type2()) { @@ -311,8 +306,8 @@ int __init pci_direct_probe(void) return 2; } - release_resource(region2); + release_region(0xC000, 0x1000); fail2: - release_resource(region); + release_region(0xCF8, 4); return 0; } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index b1805b78842f..494f2e7ea2b4 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -241,7 +241,7 @@ void __init pcibios_resource_survey(void) e820_reserve_resources_late(); /* * Insert the IO APIC resources after PCI initialization has - * occured to handle IO APICS that are mapped in on a BAR in + * occurred to handle IO APICS that are mapped in on a BAR in * PCI space, but before trying to assign unassigned pci res. */ ioapic_insert_resources(); @@ -304,7 +304,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here - * aswell. + * as well. 
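+ * (A conflicting attribute request for the same physical range would
+ * otherwise be rejected by the PAT tracking code.)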
*/ prot |= _PAGE_CACHE_UC_MINUS; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 87e6c8323117..372e9b8989b3 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -597,21 +597,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { - r->name = "PIIX/ICH"; - r->get = pirq_piix_get; - r->set = pirq_piix_set; - return 1; - } return 0; } diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index e282886616a0..750c346ef50a 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -606,6 +606,16 @@ static void __init __pci_mmcfg_init(int early) if (list_empty(&pci_mmcfg_list)) return; + if (pcibios_last_bus < 0) { + const struct pci_mmcfg_region *cfg; + + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->segment) + break; + pcibios_last_bus = cfg->end_bus; + } + } + if (pci_mmcfg_arch_init()) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; else { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 25cd4a07d09f..8214724ce54d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -20,7 +20,8 @@ #include <asm/xen/pci.h> #ifdef CONFIG_ACPI -static int xen_hvm_register_pirq(u32 gsi, int triggering) +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) { int rc, irq; struct physdev_map_pirq map_irq; @@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return -1; } - if (triggering == ACPI_EDGE_SENSITIVE) { + if (trigger == ACPI_EDGE_SENSITIVE) { shareable = 0; name = "ioapic-edge"; } else { @@ -49,18 +50,12 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) name = "ioapic-level"; } - irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); + irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); return irq; } - -static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, - int trigger, int polarity) -{ - return xen_hvm_register_pirq(gsi, trigger); -} #endif #if defined(CONFIG_PCI_MSI) @@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, pirq, ret = 0; + int irq, pirq; struct msi_desc *msidesc; struct msi_msg msg; @@ -99,39 +94,33 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) __read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 
- "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); - if (irq < 0) + if (msg.data != XEN_PIRQ_MSI_DATA || + xen_irq_from_pirq(pirq) < 0) { + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) goto error; - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" - " pirq=%d\n", irq, pirq); - return 0; + xen_msi_compose_msg(dev, pirq, &msg); + __write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); + } else { + dev_dbg(&dev->dev, + "xen: msi already bound to pirq=%d\n", pirq); } - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); - if (irq < 0 || pirq < 0) + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi", + DOMID_SELF); + if (irq < 0) goto error; - printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); - xen_msi_compose_msg(dev, pirq, &msg); - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - write_msi_msg(irq, &msg); + dev_dbg(&dev->dev, + "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); - - return ret; + dev_err(&dev->dev, + "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + return -ENODEV; } /* @@ -150,35 +139,27 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return -ENOMEM; if (type == PCI_CAP_ID_MSIX) - ret = xen_pci_frontend_enable_msix(dev, &v, nvec); + ret = xen_pci_frontend_enable_msix(dev, v, nvec); else - ret = xen_pci_frontend_enable_msi(dev, &v); + ret = xen_pci_frontend_enable_msi(dev, v); if (ret) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_allocate_pirq(v[i], 0, /* not sharable */ - (type == PCI_CAP_ID_MSIX) ? - "pcifront-msi-x" : "pcifront-msi"); - if (irq < 0) { - ret = -1; + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : + "pcifront-msi", + DOMID_SELF); + if (irq < 0) goto free; - } - - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error_while; i++; } kfree(v); return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); free: kfree(v); return ret; @@ -193,6 +174,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) xen_pci_frontend_disable_msix(dev); else xen_pci_frontend_disable_msi(dev); + + /* Free the IRQ's and the msidesc using the generic code. */ + default_teardown_msi_irqs(dev); } static void xen_teardown_msi_irq(unsigned int irq) @@ -200,47 +184,100 @@ static void xen_teardown_msi_irq(unsigned int irq) xen_destroy_irq(irq); } +#ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret; + int ret = 0; struct msi_desc *msidesc; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_create_msi_irq(dev, msidesc, type); - if (irq < 0) - return -1; + struct physdev_map_pirq map_irq; + domid_t domid; - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error; - } - return 0; + domid = ret = xen_find_device_domain_owner(dev); + /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED, + * hence check ret value for < 0. 
*/ + if (ret < 0) + domid = DOMID_SELF; -error: - xen_destroy_irq(irq); + memset(&map_irq, 0, sizeof(map_irq)); + map_irq.domid = domid; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; + + if (type == PCI_CAP_ID_MSIX) { + int pos; + u32 table_offset, bir; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + + pci_read_config_dword(dev, pos + PCI_MSIX_TABLE, + &table_offset); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + + map_irq.table_base = pci_resource_start(dev, bir); + map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + } + + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (ret) { + dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n", + ret, domid); + goto out; + } + + ret = xen_bind_pirq_msi_to_irq(dev, msidesc, + map_irq.pirq, map_irq.index, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi", + domid); + if (ret < 0) + goto out; + } + ret = 0; +out: return ret; } #endif +#endif static int xen_pcifront_enable_irq(struct pci_dev *dev) { int rc; int share = 1; + int pirq; + u8 gsi; - dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); + rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", + rc); + return rc; + } - if (dev->irq < 0) - return -EINVAL; + rc = xen_allocate_pirq_gsi(gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", + gsi, rc); + return rc; + } + pirq = rc; - if (dev->irq < NR_IRQS_LEGACY) + if (gsi < NR_IRQS_LEGACY) share = 0; - rc = xen_allocate_pirq(dev->irq, share, "pcifront"); + rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", - dev->irq, rc); + dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", + gsi, pirq, rc); return rc; } + + dev->irq = rc; + dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); return 0; } @@ -292,7 +329,7 @@ int __init pci_xen_hvm_init(void) #ifdef CONFIG_XEN_DOM0 static int xen_register_pirq(u32 gsi, int triggering) { - int rc, irq; + int rc, pirq, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; @@ -308,17 +345,20 @@ static int xen_register_pirq(u32 gsi, int triggering) name = "ioapic-level"; } - irq = xen_allocate_pirq(gsi, shareable, name); - - printk(KERN_DEBUG "xen: --> irq=%d\n", irq); + pirq = xen_allocate_pirq_gsi(gsi); + if (pirq < 0) + goto out; + irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); if (irq < 0) goto out; + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq); + map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; - map_irq.pirq = irq; + map_irq.pirq = pirq; rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { @@ -405,13 +445,18 @@ static int __init pci_xen_initial_domain(void) void __init xen_setup_pirqs(void) { - int irq; + int pirq, irq; pci_xen_initial_domain(); if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) - xen_allocate_pirq(irq, 0, "xt-pic"); + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + pirq = xen_allocate_pirq_gsi(irq); + if (WARN(pirq < 0, + "Could not allocate PIRQ for legacy interrupt\n")) + break; + irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); + } return; } @@ -427,3 +472,78 @@ void __init xen_setup_pirqs(void) } } #endif + +#ifdef CONFIG_XEN_DOM0 +struct xen_device_domain_owner { + 
domid_t domain; + struct pci_dev *dev; + struct list_head list; +}; + +static DEFINE_SPINLOCK(dev_domain_list_spinlock); +static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); + +static struct xen_device_domain_owner *find_device(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + list_for_each_entry(owner, &dev_domain_list, list) { + if (owner->dev == dev) + return owner; + } + return NULL; +} + +int xen_find_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + int domain = -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (owner) + domain = owner->domain; + spin_unlock(&dev_domain_list_spinlock); + return domain; +} +EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); + +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) +{ + struct xen_device_domain_owner *owner; + + owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); + if (!owner) + return -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + if (find_device(dev)) { + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return -EEXIST; + } + owner->domain = domain; + owner->dev = dev; + list_add_tail(&owner->list, &dev_domain_list); + spin_unlock(&dev_domain_list_spinlock); + return 0; +} +EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); + +int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (!owner) { + spin_unlock(&dev_domain_list_spinlock); + return -ENODEV; + } + list_del(&owner->list); + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return 0; +} +EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); +#endif diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index cd6f184c3b3f..28071bb31db7 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -16,21 +16,19 @@ #include <linux/serial_8250.h> #include <asm/ce4100.h> +#include <asm/prom.h> #include <asm/setup.h> +#include <asm/i8259.h> #include <asm/io.h> +#include <asm/io_apic.h> static int ce4100_i8042_detect(void) { return 0; } -static void __init sdv_find_smp_config(void) -{ -} - #ifdef CONFIG_SERIAL_8250 - static unsigned int mem_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; @@ -119,6 +117,15 @@ static void __init sdv_arch_setup(void) sdv_serial_fixup(); } +#ifdef CONFIG_X86_IO_APIC +static void __cpuinit sdv_pci_init(void) +{ + x86_of_pci_init(); + /* We can't set this earlier, because we need to calibrate the timer */ + legacy_pic = &null_legacy_pic; +} +#endif + /* * CE4100 specific x86_init function overrides and early setup * calls. 
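A note on the pattern in the next hunk: x86_init is a table of function pointers shipped with safe defaults, and board code overrides individual hooks before the corresponding subsystem runs. A minimal self-contained sketch of that idea, with invented names (pci_init_hooks and friends are illustrative, not the kernel's actual structs):

struct pci_init_hooks {
	void (*init)(void);		/* bus enumeration */
	void (*init_irq)(void);		/* interrupt routing */
};

static void default_noop(void) { }

static void board_pci_init_irq(void)
{
	/* board-specific IRQ wiring would go here */
}

/* generic defaults, as a stock kernel would ship them */
static struct pci_init_hooks pci_hooks = {
	.init		= default_noop,
	.init_irq	= default_noop,
};

static void board_early_setup(void)
{
	/* runs early; later code calls pci_hooks.init_irq() unconditionally */
	pci_hooks.init_irq = board_pci_init_irq;
}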
@@ -129,6 +136,11 @@ void __init x86_ce4100_early_setup(void) x86_platform.i8042_detect = ce4100_i8042_detect; x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; - x86_init.mpparse.find_smp_config = sdv_find_smp_config; + x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.pci.init = ce4100_pci_init; + +#ifdef CONFIG_X86_IO_APIC + x86_init.pci.init_irq = sdv_pci_init; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; +#endif } diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts new file mode 100644 index 000000000000..e70be38ce039 --- /dev/null +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -0,0 +1,430 @@ +/* + * CE4100 on Falcon Falls + * + * (c) Copyright 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + */ +/dts-v1/; +/ { + model = "intel,falconfalls"; + compatible = "intel,falconfalls"; + #address-cells = <1>; + #size-cells = <1>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "intel,ce4100"; + reg = <0>; + lapic = <&lapic0>; + }; + }; + + soc@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "intel,ce4100-cp"; + ranges; + + ioapic1: interrupt-controller@fec00000 { + #interrupt-cells = <2>; + compatible = "intel,ce4100-ioapic"; + interrupt-controller; + reg = <0xfec00000 0x1000>; + }; + + timer@fed00000 { + compatible = "intel,ce4100-hpet"; + reg = <0xfed00000 0x200>; + }; + + lapic0: interrupt-controller@fee00000 { + compatible = "intel,ce4100-lapic"; + reg = <0xfee00000 0x1000>; + }; + + pci@3fc { + #address-cells = <3>; + #size-cells = <2>; + compatible = "intel,ce4100-pci", "pci"; + device_type = "pci"; + bus-range = <0 0>; + ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000 + 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000 + 0x0000000 0 0x0 0x0 0 0x100>; + + /* Secondary IO-APIC */ + ioapic2: interrupt-controller@0,1 { + #interrupt-cells = <2>; + compatible = "intel,ce4100-ioapic"; + interrupt-controller; + reg = <0x100 0x0 0x0 0x0 0x0>; + assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>; + }; + + pci@1,0 { + #address-cells = <3>; + #size-cells = <2>; + compatible = "intel,ce4100-pci", "pci"; + device_type = "pci"; + bus-range = <1 1>; + reg = <0x0800 0x0 0x0 0x0 0x0>; + ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>; + + interrupt-parent = <&ioapic2>; + + display@2,0 { + compatible = "pci8086,2e5b.2", + "pci8086,2e5b", + "pciclass038000", + "pciclass0380"; + + reg = <0x11000 0x0 0x0 0x0 0x0>; + interrupts = <0 1>; + }; + + multimedia@3,0 { + compatible = "pci8086,2e5c.2", + "pci8086,2e5c", + "pciclass048000", + "pciclass0480"; + + reg = <0x11800 0x0 0x0 0x0 0x0>; + interrupts = <2 1>; + }; + + multimedia@4,0 { + compatible = "pci8086,2e5d.2", + "pci8086,2e5d", + "pciclass048000", + "pciclass0480"; + + reg = <0x12000 0x0 0x0 0x0 0x0>; + interrupts = <4 1>; + }; + + multimedia@4,1 { + compatible = "pci8086,2e5e.2", + "pci8086,2e5e", + "pciclass048000", + "pciclass0480"; + + reg = <0x12100 0x0 0x0 0x0 0x0>; + interrupts = <5 1>; + }; + + sound@6,0 { + compatible = "pci8086,2e5f.2", + "pci8086,2e5f", + "pciclass040100", + "pciclass0401"; + + reg = <0x13000 0x0 0x0 0x0 0x0>; + interrupts = <6 1>; + }; + + sound@6,1 { + compatible = "pci8086,2e5f.2", + "pci8086,2e5f", + 
"pciclass040100", + "pciclass0401"; + + reg = <0x13100 0x0 0x0 0x0 0x0>; + interrupts = <7 1>; + }; + + sound@6,2 { + compatible = "pci8086,2e60.2", + "pci8086,2e60", + "pciclass040100", + "pciclass0401"; + + reg = <0x13200 0x0 0x0 0x0 0x0>; + interrupts = <8 1>; + }; + + display@8,0 { + compatible = "pci8086,2e61.2", + "pci8086,2e61", + "pciclass038000", + "pciclass0380"; + + reg = <0x14000 0x0 0x0 0x0 0x0>; + interrupts = <9 1>; + }; + + display@8,1 { + compatible = "pci8086,2e62.2", + "pci8086,2e62", + "pciclass038000", + "pciclass0380"; + + reg = <0x14100 0x0 0x0 0x0 0x0>; + interrupts = <10 1>; + }; + + multimedia@8,2 { + compatible = "pci8086,2e63.2", + "pci8086,2e63", + "pciclass048000", + "pciclass0480"; + + reg = <0x14200 0x0 0x0 0x0 0x0>; + interrupts = <11 1>; + }; + + entertainment-encryption@9,0 { + compatible = "pci8086,2e64.2", + "pci8086,2e64", + "pciclass101000", + "pciclass1010"; + + reg = <0x14800 0x0 0x0 0x0 0x0>; + interrupts = <12 1>; + }; + + localbus@a,0 { + compatible = "pci8086,2e65.2", + "pci8086,2e65", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15000 0x0 0x0 0x0 0x0>; + }; + + serial@b,0 { + compatible = "pci8086,2e66.2", + "pci8086,2e66", + "pciclass070003", + "pciclass0700"; + + reg = <0x15800 0x0 0x0 0x0 0x0>; + interrupts = <14 1>; + }; + + gpio@b,1 { + compatible = "pci8086,2e67.2", + "pci8086,2e67", + "pciclassff0000", + "pciclassff00"; + + #gpio-cells = <2>; + reg = <0x15900 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + gpio-controller; + }; + + i2c-controller@b,2 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "pci8086,2e68.2", + "pci8086,2e68", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15a00 0x0 0x0 0x0 0x0>; + interrupts = <16 1>; + ranges = <0 0 0x02000000 0 0xdffe0500 0x100 + 1 0 0x02000000 0 0xdffe0600 0x100 + 2 0 0x02000000 0 0xdffe0700 0x100>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <0 0 0x100>; + }; + + i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <1 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + + i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <2 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + }; + + smard-card@b,3 { + compatible = "pci8086,2e69.2", + "pci8086,2e69", + "pciclass070500", + "pciclass0705"; + + reg = <0x15b00 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + }; + + spi-controller@b,4 { + #address-cells = <1>; + #size-cells = <0>; + compatible = + "pci8086,2e6a.2", + "pci8086,2e6a", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15c00 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + + dac@0 { + compatible = "ti,pcm1755"; + reg = <0>; + spi-max-frequency = <115200>; + }; + + dac@1 { + compatible = "ti,pcm1609a"; + reg = <1>; + spi-max-frequency = <115200>; + }; + + eeprom@2 { + compatible = "atmel,at93c46"; + reg = <2>; + spi-max-frequency = <115200>; + }; + }; + + multimedia@b,7 { + compatible = "pci8086,2e6d.2", + "pci8086,2e6d", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15f00 0x0 0x0 0x0 0x0>; + }; + + ethernet@c,0 { + compatible = "pci8086,2e6e.2", + "pci8086,2e6e", + "pciclass020000", + "pciclass0200"; + + reg = <0x16000 0x0 0x0 0x0 0x0>; + interrupts = <21 1>; + }; + + clock@c,1 { + compatible = "pci8086,2e6f.2", + "pci8086,2e6f", + "pciclassff0000", + 
"pciclassff00"; + + reg = <0x16100 0x0 0x0 0x0 0x0>; + interrupts = <3 1>; + }; + + usb@d,0 { + compatible = "pci8086,2e70.2", + "pci8086,2e70", + "pciclass0c0320", + "pciclass0c03"; + + reg = <0x16800 0x0 0x0 0x0 0x0>; + interrupts = <22 1>; + }; + + usb@d,1 { + compatible = "pci8086,2e70.2", + "pci8086,2e70", + "pciclass0c0320", + "pciclass0c03"; + + reg = <0x16900 0x0 0x0 0x0 0x0>; + interrupts = <22 1>; + }; + + sata@e,0 { + compatible = "pci8086,2e71.0", + "pci8086,2e71", + "pciclass010601", + "pciclass0106"; + + reg = <0x17000 0x0 0x0 0x0 0x0>; + interrupts = <23 1>; + }; + + flash@f,0 { + compatible = "pci8086,701.1", + "pci8086,701", + "pciclass050100", + "pciclass0501"; + + reg = <0x17800 0x0 0x0 0x0 0x0>; + interrupts = <13 1>; + }; + + entertainment-encryption@10,0 { + compatible = "pci8086,702.1", + "pci8086,702", + "pciclass101000", + "pciclass1010"; + + reg = <0x18000 0x0 0x0 0x0 0x0>; + }; + + co-processor@11,0 { + compatible = "pci8086,703.1", + "pci8086,703", + "pciclass0b4000", + "pciclass0b40"; + + reg = <0x18800 0x0 0x0 0x0 0x0>; + interrupts = <1 1>; + }; + + multimedia@12,0 { + compatible = "pci8086,704.0", + "pci8086,704", + "pciclass048000", + "pciclass0480"; + + reg = <0x19000 0x0 0x0 0x0 0x0>; + }; + }; + + isa@1f,0 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "isa"; + reg = <0xf800 0x0 0x0 0x0 0x0>; + ranges = <1 0 0 0 0 0x100>; + + rtc@70 { + compatible = "intel,ce4100-rtc", "motorola,mc146818"; + interrupts = <8 3>; + interrupt-parent = <&ioapic1>; + ctrl-reg = <2>; + freq-reg = <0x26>; + reg = <1 0x70 2>; + }; + }; + }; + }; +}; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 0fe27d7c6258..0d3a4fa34560 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -145,17 +145,6 @@ static void virt_efi_reset_system(int reset_type, data_size, data); } -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); -} - static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, @@ -315,6 +304,40 @@ static void __init print_efi_memmap(void) } #endif /* EFI_DEBUG */ +void __init efi_reserve_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + + memblock_x86_reserve_range(start, start + size, "EFI Boot"); + } +} + +static void __init efi_free_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + + free_bootmem_late(start, size); + } +} + void __init efi_init(void) { efi_config_table_t *config_tables; @@ -468,11 +491,25 @@ void __init efi_init(void) #endif } +void __init efi_set_executable(efi_memory_desc_t *md, bool executable) +{ + u64 addr, npages; + + addr = md->virt_addr; + npages = md->num_pages; + + memrange_efi_to_native(&addr, &npages); + + if 
(executable) + set_memory_x(addr, npages); + else + set_memory_nx(addr, npages); +} + static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; void *p; - u64 addr, npages; /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -481,10 +518,7 @@ static void __init runtime_code_page_mkexec(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); + efi_set_executable(md, true); } } @@ -498,16 +532,47 @@ static void __init runtime_code_page_mkexec(void) */ void __init efi_enter_virtual_mode(void) { - efi_memory_desc_t *md; + efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; u64 end, systab, addr, npages, end_pfn; - void *p, *va; + void *p, *va, *new_memmap = NULL; + int count = 0; efi.systab = NULL; + + /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + u64 prev_size; md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) + + if (!prev_md) { + prev_md = md; + continue; + } + + if (prev_md->type != md->type || + prev_md->attribute != md->attribute) { + prev_md = md; + continue; + } + + prev_size = prev_md->num_pages << EFI_PAGE_SHIFT; + + if (md->phys_addr == (prev_md->phys_addr + prev_size)) { + prev_md->num_pages += md->num_pages; + md->type = EFI_RESERVED_TYPE; + md->attribute = 0; + continue; + } + prev_md = md; + } + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) continue; size = md->num_pages << EFI_PAGE_SHIFT; @@ -541,15 +606,21 @@ void __init efi_enter_virtual_mode(void) systab += md->virt_addr - md->phys_addr; efi.systab = (efi_system_table_t *) (unsigned long) systab; } + new_memmap = krealloc(new_memmap, + (count + 1) * memmap.desc_size, + GFP_KERNEL); + memcpy(new_memmap + (count * memmap.desc_size), md, + memmap.desc_size); + count++; } BUG_ON(!efi.systab); status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, + memmap.desc_size * count, memmap.desc_size, memmap.desc_version, - memmap.phys_map); + (efi_memory_desc_t *)__pa(new_memmap)); if (status != EFI_SUCCESS) { printk(KERN_ALERT "Unable to switch EFI into virtual mode " @@ -558,6 +629,13 @@ void __init efi_enter_virtual_mode(void) } /* + * Thankfully, it does seem that no runtime services other than + * SetVirtualAddressMap() will touch boot services code, so we can + * get rid of it all at this point + */ + efi_free_boot_services(); + + /* * Now that EFI is in virtual mode, update the function * pointers in the runtime service table to the new virtual addresses. 
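+ * (SetVirtualAddressMap() itself may only be called once, while still
+ * in physical mode, so its pointer is cleared rather than remapped.)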
* @@ -572,11 +650,12 @@ void __init efi_enter_virtual_mode(void) efi.set_variable = virt_efi_set_variable; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + efi.set_virtual_address_map = NULL; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; + kfree(new_memmap); } /* diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac0621a7ac3d..ac3aa54e2654 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -41,22 +41,7 @@ static pgd_t save_pgd __initdata; static unsigned long efi_flags __initdata; -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) +static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; void *p; @@ -64,14 +49,12 @@ static void __init early_runtime_code_mapping_set_exec(int executable) if (!(__supported_pte_mask & _PAGE_NX)) return; - /* Make EFI runtime service code area executable */ + /* Make EFI service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); - } + if (md->type == EFI_RUNTIME_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_CODE) + efi_set_executable(md, executable); } } @@ -79,7 +62,7 @@ void __init efi_call_phys_prelog(void) { unsigned long vaddress; - early_runtime_code_mapping_set_exec(1); + early_code_mapping_set_exec(1); local_irq_save(efi_flags); vaddress = (unsigned long)__va(0x0UL); save_pgd = *pgd_offset_k(0x0UL); @@ -95,7 +78,7 @@ void __init efi_call_phys_epilog(void) set_pgd(pgd_offset_k(0x0UL), save_pgd); __flush_tlb_all(); local_irq_restore(efi_flags); - early_runtime_code_mapping_set_exec(0); + early_code_mapping_set_exec(0); } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, @@ -107,8 +90,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, return ioremap(phys_addr, size); last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { + unsigned long top = last_map_pfn << PAGE_SHIFT; + efi_ioremap(top, size - (top - phys_addr), type); + } return (void __iomem *)__va(phys_addr); } diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index ea6529e93c6f..7000e74b3087 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/mrst.h> +#include <asm/mrst-vrtc.h> #include <asm/io.h> #include <asm/i8259.h> #include <asm/intel_scu_ipc.h> @@ -96,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) pentry->freq_hz, pentry->irq); if (!pentry->irq) continue; - mp_irq.type = MP_IOAPIC; + mp_irq.type = 
MP_INTSRC; mp_irq.irqtype = mp_INT; /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ mp_irq.irqflag = 5; - mp_irq.srcbus = 0; + mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; @@ -167,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_IOAPIC; + mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; mp_irq.irqflag = 0xf; /* level trigger and active low */ - mp_irq.srcbus = 0; + mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; @@ -193,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void) return 0; } -void __init mrst_time_init(void) +static void __init mrst_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { @@ -215,7 +216,7 @@ void __init mrst_time_init(void) apbt_time_init(); } -void __cpuinit mrst_arch_setup(void) +static void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; @@ -268,6 +269,7 @@ void __init x86_mrst_early_setup(void) x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_platform.i8042_detect = mrst_i8042_detect; + x86_init.timers.wallclock_init = mrst_rtc_init; x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; @@ -280,7 +282,7 @@ void __init x86_mrst_early_setup(void) /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; - + set_bit(MP_BUS_ISA, mp_bus_not_pci); } /* diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 32cd7edd71a0..73d70d65e76e 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -100,22 +100,16 @@ int vrtc_set_mmss(unsigned long nowtime) void __init mrst_rtc_init(void) { - unsigned long rtc_paddr; - void __iomem *virt_base; + unsigned long vrtc_paddr; sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); - if (!sfi_mrtc_num) - return; - - rtc_paddr = sfi_mrtc_array[0].phys_addr; - /* vRTC's register address may not be page aligned */ - set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); - - virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC); - virt_base += rtc_paddr & ~PAGE_MASK; - vrtc_virt_base = virt_base; + vrtc_paddr = sfi_mrtc_array[0].phys_addr; + if (!sfi_mrtc_num || !vrtc_paddr) + return; + vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC, + vrtc_paddr); x86_platform.get_wallclock = vrtc_get_time; x86_platform.set_wallclock = vrtc_set_mmss; } diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index e797428b163b..81c5e2165c24 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o -obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 127775696d6c..ab81fb271760 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include 
<linux/platform_device.h> #include <linux/pm.h> +#include <linux/mfd/core.h> #include <asm/io.h> #include <asm/olpc.h> @@ -56,25 +57,24 @@ static void xo1_power_off(void) static int __devinit olpc_xo1_probe(struct platform_device *pdev) { struct resource *res; + int err; /* don't run on non-XOs */ if (!machine_is_olpc()) return -ENODEV; + err = mfd_cell_enable(pdev); + if (err) + return err; + res = platform_get_resource(pdev, IORESOURCE_IO, 0); if (!res) { dev_err(&pdev->dev, "can't fetch device resource info\n"); return -EIO; } - - if (!request_region(res->start, resource_size(res), DRV_NAME)) { - dev_err(&pdev->dev, "can't request region\n"); - return -EIO; - } - if (strcmp(pdev->name, "cs5535-pms") == 0) pms_base = res->start; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) acpi_base = res->start; /* If we have both addresses, we can override the poweroff hook */ @@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) static int __devexit olpc_xo1_remove(struct platform_device *pdev) { - struct resource *r; - - r = platform_get_resource(pdev, IORESOURCE_IO, 0); - release_region(r->start, resource_size(r)); + mfd_cell_disable(pdev); if (strcmp(pdev->name, "cs5535-pms") == 0) pms_base = 0; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) acpi_base = 0; pm_power_off = NULL; @@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = { static struct platform_driver cs5535_acpi_drv = { .driver = { - .name = "cs5535-acpi", + .name = "olpc-xo1-pm-acpi", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -143,7 +140,7 @@ static void __exit olpc_xo1_exit(void) MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); +MODULE_ALIAS("platform:cs5535-pms"); module_init(olpc_xo1_init); module_exit(olpc_xo1_exit); diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index edaf3fe8dc5e..0060fd59ea00 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -18,6 +18,7 @@ #include <linux/io.h> #include <linux/string.h> #include <linux/platform_device.h> +#include <linux/of.h> #include <asm/geode.h> #include <asm/setup.h> @@ -187,41 +188,43 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -static bool __init check_ofw_architecture(void) +static bool __init check_ofw_architecture(struct device_node *root) { - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; + const char *olpc_arch; + int propsize; - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } + olpc_arch = of_get_property(root, "architecture", &propsize); return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; } -static u32 __init get_board_revision(void) +static u32 __init get_board_revision(struct device_node *root) { - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); + int propsize; + const __be32 *rev; + + rev = of_get_property(root, "board-revision-int", &propsize); + if (propsize != 4) + return 0; + + return be32_to_cpu(*rev); } static bool __init platform_detect(void) { - if (!check_ofw_architecture()) + 
struct device_node *root = of_find_node_by_path("/"); + bool success; + + if (!root) return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; + + success = check_ofw_architecture(root); + if (success) { + olpc_platform_info.boardrev = get_board_revision(root); + olpc_platform_info.flags |= OLPC_F_PRESENT; + } + + of_node_put(root); + return success; } static int __init add_xo1_platform_devices(void) diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 044bda5b3174..d39f63d017d2 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -19,7 +19,9 @@ #include <linux/kernel.h> #include <linux/bootmem.h> #include <linux/of.h> +#include <linux/of_platform.h> #include <linux/of_pdt.h> +#include <asm/olpc.h> #include <asm/olpc_ofw.h> static phandle __init olpc_dt_getsibling(phandle node) @@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void) pr_info("PROM DT: Built device tree with %u bytes of memory.\n", prom_early_allocated); } + +/* A list of DT node/bus matches that we want to expose as platform devices */ +static struct of_device_id __initdata of_ids[] = { + { .compatible = "olpc,xo1-battery" }, + { .compatible = "olpc,xo1-dcon" }, + { .compatible = "olpc,xo1-rtc" }, + {}, +}; + +static int __init olpc_create_platform_devices(void) +{ + if (machine_is_olpc()) + return of_platform_bus_probe(NULL, of_ids, NULL); + else + return 0; +} +device_initcall(olpc_create_platform_devices); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index a7b38d35c29a..68e467f69fec 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. 
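Regarding the olpc_dt.c hunk above: of_platform_bus_probe() is the stock OF helper for turning matched device-tree nodes into platform devices. A minimal sketch of the same pattern, with a hypothetical compatible string:

#include <linux/init.h>
#include <linux/of_platform.h>

/* hypothetical match table; only nodes whose "compatible" property
 * matches an entry are registered as platform devices */
static struct of_device_id __initdata demo_of_ids[] = {
	{ .compatible = "acme,demo-sensor" },
	{ },
};

static int __init demo_create_platform_devices(void)
{
	/* a NULL root means: walk from the top of the device tree */
	return of_platform_bus_probe(NULL, demo_of_ids, NULL);
}
device_initcall(demo_create_platform_devices);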
@@ -11,6 +11,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/delay.h> #include <asm/mmu_context.h> #include <asm/uv/uv.h> @@ -34,6 +35,7 @@ static int timeout_base_ns[] = { 5242880, 167772160 }; + static int timeout_us; static int nobau; static int baudisabled; @@ -41,20 +43,70 @@ static spinlock_t disable_lock; static cycles_t congested_cycles; /* tunables: */ -static int max_bau_concurrent = MAX_BAU_CONCURRENT; -static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; -static int plugged_delay = PLUGGED_DELAY; -static int plugsb4reset = PLUGSB4RESET; -static int timeoutsb4reset = TIMEOUTSB4RESET; -static int ipi_reset_limit = IPI_RESET_LIMIT; -static int complete_threshold = COMPLETE_THRESHOLD; -static int congested_response_us = CONGESTED_RESPONSE_US; -static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; +static int max_concurr = MAX_BAU_CONCURRENT; +static int max_concurr_const = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_respns_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; + +static struct tunables tunables[] = { + {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ + {&plugged_delay, PLUGGED_DELAY}, + {&plugsb4reset, PLUGSB4RESET}, + {&timeoutsb4reset, TIMEOUTSB4RESET}, + {&ipi_reset_limit, IPI_RESET_LIMIT}, + {&complete_threshold, COMPLETE_THRESHOLD}, + {&congested_respns_us, CONGESTED_RESPONSE_US}, + {&congested_reps, CONGESTED_REPS}, + {&congested_period, CONGESTED_PERIOD} +}; + static struct dentry *tunables_dir; static struct dentry *tunables_file; -static int __init setup_nobau(char *arg) +/* these correspond to the statistics printed by ptc_seq_show() */ +static char *stat_description[] = { + "sent: number of shootdown messages sent", + "stime: time spent sending messages", + "numuvhubs: number of hubs targeted with shootdown", + "numuvhubs16: number times 16 or more hubs targeted", + "numuvhubs8: number times 8 or more hubs targeted", + "numuvhubs4: number times 4 or more hubs targeted", + "numuvhubs2: number times 2 or more hubs targeted", + "numuvhubs1: number times 1 hub targeted", + "numcpus: number of cpus targeted with shootdown", + "dto: number of destination timeouts", + "retries: destination timeout retries sent", + "rok: : destination timeouts successfully retried", + "resetp: ipi-style resource resets for plugs", + "resett: ipi-style resource resets for timeouts", + "giveup: fall-backs to ipi-style shootdowns", + "sto: number of source timeouts", + "bz: number of stay-busy's", + "throt: number times spun in throttle", + "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE", + "recv: shootdown messages received", + "rtime: time spent processing messages", + "all: shootdown all-tlb messages", + "one: shootdown one-tlb messages", + "mult: interrupts that found multiple messages", + "none: interrupts that found no messages", + "retry: number of retry messages processed", + "canc: number messages canceled by retries", + "nocan: number retries that found nothing to cancel", + "reset: number of ipi-style reset requests processed", + "rcan: number messages canceled by reset requests", + "disable: number times use of the BAU was disabled", + "enable: number times use of the BAU 
was re-enabled" +}; + +static int __init +setup_nobau(char *arg) { nobau = 1; return 0; @@ -62,7 +114,7 @@ static int __init setup_nobau(char *arg) early_param("nobau", setup_nobau); /* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; +static int uv_base_pnode __read_mostly; /* position of pnode (which is nasid>>1): */ static int uv_nshift __read_mostly; static unsigned long uv_mmask __read_mostly; @@ -108,60 +160,52 @@ static int __init uvhub_to_first_apicid(int uvhub) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static inline void uv_reply_to_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) { unsigned long dw; - struct bau_payload_queue_entry *msg; + struct bau_pq_entry *msg; msg = mdp->msg; if (!msg->canceled) { - dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | - msg->sw_ack_vector; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); + dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; + write_mmr_sw_ack(dw); } msg->replied_to = 1; - msg->sw_ack_vector = 0; + msg->swack_vec = 0; } /* * Process the receipt of a RETRY message */ -static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_retry_msg(struct msg_desc *mdp, + struct bau_control *bcp) { int i; int cancel_count = 0; - int slot2; unsigned long msg_res; unsigned long mmr = 0; - struct bau_payload_queue_entry *msg; - struct bau_payload_queue_entry *msg2; - struct ptc_stats *stat; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *msg2; + struct ptc_stats *stat = bcp->statp; - msg = mdp->msg; - stat = bcp->statp; stat->d_retries++; /* * cancel any message from msg+1 to the retry itself */ for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { - if (msg2 > mdp->va_queue_last) - msg2 = mdp->va_queue_first; + if (msg2 > mdp->queue_last) + msg2 = mdp->queue_first; if (msg2 == msg) break; - /* same conditions for cancellation as uv_do_reset */ + /* same conditions for cancellation as do_reset */ if ((msg2->replied_to == 0) && (msg2->canceled == 0) && - (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & - msg->sw_ack_vector) == 0) && + (msg2->swack_vec) && ((msg2->swack_vec & + msg->swack_vec) == 0) && (msg2->sending_cpu == msg->sending_cpu) && (msg2->msg_type != MSG_NOOP)) { - slot2 = msg2 - mdp->va_queue_first; - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg2->sw_ack_vector; + mmr = read_mmr_sw_ack(); + msg_res = msg2->swack_vec; /* * This is a message retry; clear the resources held * by the previous message only if they timed out. @@ -169,6 +213,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, * situation to report. */ if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { + unsigned long mr; /* * is the resource timed out? * make everyone ignore the cancelled message. @@ -176,10 +221,8 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, msg2->canceled = 1; stat->d_canceled++; cancel_count++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); + mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; + write_mmr_sw_ack(mr); } } } @@ -191,20 +234,19 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, * Do all the things a cpu should do for a TLB shootdown message. 
* Other cpus may come here at the same time for this message. */ -static void uv_bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, + struct bau_control *bcp) { - int msg_ack_count; short socket_ack_count = 0; - struct ptc_stats *stat; - struct bau_payload_queue_entry *msg; + short *sp; + struct atomic_short *asp; + struct ptc_stats *stat = bcp->statp; + struct bau_pq_entry *msg = mdp->msg; struct bau_control *smaster = bcp->socket_master; /* * This must be a normal message, or retry of a normal message */ - msg = mdp->msg; - stat = bcp->statp; if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); stat->d_alltlb++; @@ -221,30 +263,32 @@ static void uv_bau_process_message(struct msg_desc *mdp, * cpu number. */ if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) - uv_bau_process_retry_msg(mdp, bcp); + bau_process_retry_msg(mdp, bcp); /* - * This is a sw_ack message, so we have to reply to it. + * This is a swack message, so we have to reply to it. * Count each responding cpu on the socket. This avoids * pinging the count's cache line back and forth between * the sockets. */ - socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) - &smaster->socket_acknowledge_count[mdp->msg_slot]); + sp = &smaster->socket_acknowledge_count[mdp->msg_slot]; + asp = (struct atomic_short *)sp; + socket_ack_count = atom_asr(1, asp); if (socket_ack_count == bcp->cpus_in_socket) { + int msg_ack_count; /* * Both sockets dump their completed count total into * the message's count. */ smaster->socket_acknowledge_count[mdp->msg_slot] = 0; - msg_ack_count = atomic_add_short_return(socket_ack_count, - (struct atomic_short *)&msg->acknowledge_count); + asp = (struct atomic_short *)&msg->acknowledge_count; + msg_ack_count = atom_asr(socket_ack_count, asp); if (msg_ack_count == bcp->cpus_in_uvhub) { /* * All cpus in uvhub saw it; reply */ - uv_reply_to_message(mdp, bcp); + reply_to_message(mdp, bcp); } } @@ -267,62 +311,51 @@ static int uvhub_to_first_cpu(int uvhub) * Last resort when we get a large number of destination timeouts is * to clear resources held by a given cpu. * Do this with IPI so that all messages in the BAU message queue - * can be identified by their nonzero sw_ack_vector field. + * can be identified by their nonzero swack_vec field. * * This is entered for a single cpu on the uvhub. * The sender wants this uvhub to free a specific message's - * sw_ack resources. + * swack resources. */ -static void -uv_do_reset(void *ptr) +static void do_reset(void *ptr) { int i; - int slot; - int count = 0; - unsigned long mmr; - unsigned long msg_res; - struct bau_control *bcp; - struct reset_args *rap; - struct bau_payload_queue_entry *msg; - struct ptc_stats *stat; + struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id()); + struct reset_args *rap = (struct reset_args *)ptr; + struct bau_pq_entry *msg; + struct ptc_stats *stat = bcp->statp; - bcp = &per_cpu(bau_control, smp_processor_id()); - rap = (struct reset_args *)ptr; - stat = bcp->statp; stat->d_resets++; - /* * We're looking for the given sender, and - * will free its sw_ack resource. + * will free its swack resource. * If all cpus finally responded after the timeout, its * message 'replied_to' was set.
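+ * Entries whose swack_vec is still nonzero may be holding resources;
+ * only those, and only if still pending in the MMR, are reset below.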
*/ - for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { - /* uv_do_reset: same conditions for cancellation as - uv_bau_process_retry_msg() */ + for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { + unsigned long msg_res; + /* do_reset: same conditions for cancellation as + bau_process_retry_msg() */ if ((msg->replied_to == 0) && (msg->canceled == 0) && (msg->sending_cpu == rap->sender) && - (msg->sw_ack_vector) && + (msg->swack_vec) && (msg->msg_type != MSG_NOOP)) { + unsigned long mmr; + unsigned long mr; /* * make everyone else ignore this message */ msg->canceled = 1; - slot = msg - bcp->va_queue_first; - count++; /* * only reset the resource if it is still pending */ - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg->sw_ack_vector; + mmr = read_mmr_sw_ack(); + msg_res = msg->swack_vec; + mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; if (mmr & msg_res) { stat->d_rcanceled++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); + write_mmr_sw_ack(mr); } } } @@ -333,39 +366,38 @@ uv_do_reset(void *ptr) * Use IPI to get all target uvhubs to release resources held by * a given sending cpu number. */ -static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, - int sender) +static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender) { int uvhub; - int cpu; + int maskbits; cpumask_t mask; struct reset_args reset_args; reset_args.sender = sender; - cpus_clear(mask); /* find a single cpu for each uvhub in this distribution mask */ - for (uvhub = 0; - uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; - uvhub++) { + maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE; + for (uvhub = 0; uvhub < maskbits; uvhub++) { + int cpu; if (!bau_uvhub_isset(uvhub, distribution)) continue; /* find a cpu for this uvhub */ cpu = uvhub_to_first_cpu(uvhub); cpu_set(cpu, mask); } - /* IPI all cpus; Preemption is already disabled */ - smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); + + /* IPI all cpus; preemption is already disabled */ + smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1); return; } -static inline unsigned long -cycles_2_us(unsigned long long cyc) +static inline unsigned long cycles_2_us(unsigned long long cyc) { unsigned long long ns; unsigned long us; - ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) - >> CYC2NS_SCALE_FACTOR; + int cpu = smp_processor_id(); + + ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; us = ns / 1000; return us; } @@ -375,56 +407,56 @@ cycles_2_us(unsigned long long cyc) * leaves uvhub_quiesce set so that no new broadcasts are started by * bau_flush_send_and_wait() */ -static inline void -quiesce_local_uvhub(struct bau_control *hmaster) +static inline void quiesce_local_uvhub(struct bau_control *hmaster) { - atomic_add_short_return(1, (struct atomic_short *) - &hmaster->uvhub_quiesce); + atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce); } /* * mark this quiet-requestor as done */ -static inline void -end_uvhub_quiesce(struct bau_control *hmaster) +static inline void end_uvhub_quiesce(struct bau_control *hmaster) +{ + atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce); +} + +static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift) { - atomic_add_short_return(-1, (struct atomic_short *) - &hmaster->uvhub_quiesce); + unsigned long descriptor_status; + + descriptor_status = uv_read_local_mmr(mmr_offset); + 
descriptor_status >>= right_shift; + descriptor_status &= UV_ACT_STATUS_MASK; + return descriptor_status; } /* * Wait for completion of a broadcast software ack message * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP */ -static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift, int this_cpu, - struct bau_control *bcp, struct bau_control *smaster, long try) +static int uv1_wait_completion(struct bau_desc *bau_desc, + unsigned long mmr_offset, int right_shift, + struct bau_control *bcp, long try) { unsigned long descriptor_status; - cycles_t ttime; + cycles_t ttm; struct ptc_stats *stat = bcp->statp; - struct bau_control *hmaster; - - hmaster = bcp->uvhub_master; + descriptor_status = uv1_read_status(mmr_offset, right_shift); /* spin on the status MMR, waiting for it to go idle */ - while ((descriptor_status = (((unsigned long) - uv_read_local_mmr(mmr_offset) >> - right_shift) & UV_ACT_STATUS_MASK)) != - DESC_STATUS_IDLE) { + while ((descriptor_status != DS_IDLE)) { /* - * Our software ack messages may be blocked because there are - * no swack resources available. As long as none of them - * has timed out hardware will NACK our message and its - * state will stay IDLE. + * Our software ack messages may be blocked because + * there are no swack resources available. As long + * as none of them has timed out hardware will NACK + * our message and its state will stay IDLE. */ - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + if (descriptor_status == DS_SOURCE_TIMEOUT) { stat->s_stimeout++; return FLUSH_GIVEUP; - } else if (descriptor_status == - DESC_STATUS_DESTINATION_TIMEOUT) { + } else if (descriptor_status == DS_DESTINATION_TIMEOUT) { stat->s_dtimeout++; - ttime = get_cycles(); + ttm = get_cycles(); /* * Our retries may be blocked by all destination @@ -432,8 +464,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * pending. In that case hardware returns the * ERROR that looks like a destination timeout. */ - if (cycles_2_us(ttime - bcp->send_message) < - timeout_us) { + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { bcp->conseccompletes = 0; return FLUSH_RETRY_PLUGGED; } @@ -446,80 +477,160 @@ static int uv_wait_completion(struct bau_desc *bau_desc, */ cpu_relax(); } + descriptor_status = uv1_read_status(mmr_offset, right_shift); } bcp->conseccompletes++; return FLUSH_COMPLETE; } -static inline cycles_t -sec_2_cycles(unsigned long sec) +/* + * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 
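uv1_wait_completion() above is the UV1 half of the split: spin on the 2-bit descriptor status until it leaves busy, treating a source timeout as fatal and classifying a destination timeout as "plugged" when it arrives faster than timeout_us. The control flow reduced to a sketch, with the statistics omitted; read_status() and elapsed_us() stand in for the MMR read and cycles_2_us(now - send_message):

enum ds   { DS_IDLE, DS_BUSY, DS_SOURCE_TIMEOUT, DS_DESTINATION_TIMEOUT };
enum fres { FLUSH_COMPLETE, FLUSH_GIVEUP, FLUSH_RETRY_PLUGGED,
            FLUSH_RETRY_TIMEOUT };

static enum fres wait_idle(enum ds (*read_status)(void),
                           unsigned long (*elapsed_us)(void),
                           unsigned long timeout_us)
{
    enum ds s;

    for (s = read_status(); s != DS_IDLE; s = read_status()) {
        if (s == DS_SOURCE_TIMEOUT)
            return FLUSH_GIVEUP;
        if (s == DS_DESTINATION_TIMEOUT)
            return elapsed_us() < timeout_us ?
                FLUSH_RETRY_PLUGGED : FLUSH_RETRY_TIMEOUT;
        /* still busy: keep spinning (cpu_relax() in the kernel) */
    }
    return FLUSH_COMPLETE;
}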
+ */ +static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) { - unsigned long ns; - cycles_t cyc; + unsigned long descriptor_status; + unsigned long descriptor_status2; - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; + descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); + descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; + descriptor_status = (descriptor_status << 1) | descriptor_status2; + return descriptor_status; +} + +static int uv2_wait_completion(struct bau_desc *bau_desc, + unsigned long mmr_offset, int right_shift, + struct bau_control *bcp, long try) +{ + unsigned long descriptor_stat; + cycles_t ttm; + int cpu = bcp->uvhub_cpu; + struct ptc_stats *stat = bcp->statp; + + descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + + /* spin on the status MMR, waiting for it to go idle */ + while (descriptor_stat != UV2H_DESC_IDLE) { + /* + * Our software ack messages may be blocked because + * there are no swack resources available. As long + * as none of them has timed out hardware will NACK + * our message and its state will stay IDLE. + */ + if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || + (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) || + (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { + stat->s_stimeout++; + return FLUSH_GIVEUP; + } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { + stat->s_dtimeout++; + ttm = get_cycles(); + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + */ + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { + bcp->conseccompletes = 0; + return FLUSH_RETRY_PLUGGED; + } + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + } else { + /* + * descriptor_stat is still BUSY + */ + cpu_relax(); + } + descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + } + bcp->conseccompletes++; + return FLUSH_COMPLETE; } /* - * conditionally add 1 to *v, unless *v is >= u - * return 0 if we cannot add 1 to *v because it is >= u - * return 1 if we can add 1 to *v because it is < u - * the add is atomic - * - * This is close to atomic_add_unless(), but this allows the 'u' value - * to be lowered below the current 'v'. atomic_add_unless can only stop - * on equal. + * There are 2 status registers; each and array[32] of 2 bits. Set up for + * which register to read and position in that register based on cpu in + * current hub. 
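uv2_read_status() widens the status to three bits by pulling one extra per-cpu bit out of the second status register and gluing it beneath the classic two-bit field. The recombination in isolation:

#include <stdint.h>

#define UV_ACT_STATUS_MASK 0x3UL       /* the two architected bits */

static inline unsigned long uv2_status(uint64_t status_mmr, int rshft,
                                       uint64_t status2_mmr, int cpu)
{
    unsigned long s  = (status_mmr >> rshft) & UV_ACT_STATUS_MASK;
    unsigned long s2 = (unsigned long)(status2_mmr >> cpu) & 0x1UL;

    return (s << 1) | s2;              /* 3-bit UV2 status */
}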
*/ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +static int wait_completion(struct bau_desc *bau_desc, + struct bau_control *bcp, long try) { - spin_lock(lock); - if (atomic_read(v) >= u) { - spin_unlock(lock); - return 0; + int right_shift; + unsigned long mmr_offset; + int cpu = bcp->uvhub_cpu; + + if (cpu < UV_CPUS_PER_AS) { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + } else { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } - atomic_inc(v); - spin_unlock(lock); - return 1; + + if (is_uv1_hub()) + return uv1_wait_completion(bau_desc, mmr_offset, right_shift, + bcp, try); + else + return uv2_wait_completion(bau_desc, mmr_offset, right_shift, + bcp, try); +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + unsigned long ns; + cycles_t cyc; + + ns = sec * 1000000000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; } /* - * Our retries are blocked by all destination swack resources being + * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately * returns the ERROR that looks like a destination timeout. */ -static void -destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, +static void destination_plugged(struct bau_desc *bau_desc, + struct bau_control *bcp, struct bau_control *hmaster, struct ptc_stats *stat) { udelay(bcp->plugged_delay); bcp->plugged_tries++; + if (bcp->plugged_tries >= bcp->plugsb4reset) { bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + reset_with_ipi(&bau_desc->distribution, bcp->cpu); spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; stat->s_resets_plug++; } } -static void -destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) +static void destination_timeout(struct bau_desc *bau_desc, + struct bau_control *bcp, struct bau_control *hmaster, + struct ptc_stats *stat) { - hmaster->max_bau_concurrent = 1; + hmaster->max_concurr = 1; bcp->timeout_tries++; if (bcp->timeout_tries >= bcp->timeoutsb4reset) { bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + reset_with_ipi(&bau_desc->distribution, bcp->cpu); spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; stat->s_resets_timeout++; } @@ -529,34 +640,104 @@ destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, * Completions are taking a very long time due to a congested numalink * network. 
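The new wait_completion() wrapper keeps the old slot arithmetic, packing 2-bit status fields UV_CPUS_PER_AS to a register with higher hub-local cpu numbers spilling into ACTIVATION_STATUS_1, and then dispatches to the uv1/uv2 pollers. The selection as a plain function; 32 fields per register is assumed from the 64-bit register width:

#define UV_CPUS_PER_AS     32          /* assumed fields per register */
#define UV_ACT_STATUS_SIZE  2          /* bits per field */

struct status_slot { int reg; int shift; };

static struct status_slot slot_for(int uvhub_cpu)
{
    struct status_slot s;

    s.reg   = uvhub_cpu / UV_CPUS_PER_AS;   /* STATUS_0 or STATUS_1 */
    s.shift = (uvhub_cpu % UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE;
    return s;
}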
*/ -static void -disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +static void disable_for_congestion(struct bau_control *bcp, + struct ptc_stats *stat) { - int tcpu; - struct bau_control *tbcp; - /* let only one cpu do this disabling */ spin_lock(&disable_lock); + if (!baudisabled && bcp->period_requests && ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + int tcpu; + struct bau_control *tbcp; /* it becomes this cpu's job to turn on the use of the BAU again */ baudisabled = 1; bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles() + - sec_2_cycles(bcp->congested_period); + bcp->set_bau_on_time = get_cycles(); + bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); stat->s_bau_disabled++; for_each_present_cpu(tcpu) { tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; + tbcp->baudisabled = 1; } } + spin_unlock(&disable_lock); } -/** - * uv_flush_send_and_wait - * +static void count_max_concurr(int stat, struct bau_control *bcp, + struct bau_control *hmaster) +{ + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + if (stat != FLUSH_COMPLETE) + return; + if (bcp->conseccompletes <= bcp->complete_threshold) + return; + if (hmaster->max_concurr >= hmaster->max_concurr_const) + return; + hmaster->max_concurr++; +} + +static void record_send_stats(cycles_t time1, cycles_t time2, + struct bau_control *bcp, struct ptc_stats *stat, + int completion_status, int try) +{ + cycles_t elapsed; + + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->cong_reps)) + disable_for_congestion(bcp, stat); + } + } else + stat->s_requestor--; + + if (completion_status == FLUSH_COMPLETE && try > 1) + stat->s_retriesok++; + else if (completion_status == FLUSH_GIVEUP) + stat->s_giveup++; +} + +/* + * Because of a uv1 hardware bug only a limited number of concurrent + * requests can be made. + */ +static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat) +{ + spinlock_t *lock = &hmaster->uvhub_lock; + atomic_t *v; + + v = &hmaster->active_descriptor_count; + if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) { + stat->s_throttles++; + do { + cpu_relax(); + } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)); + } +} + +/* + * Handle the completion status of a message send. + */ +static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, + struct bau_control *bcp, struct bau_control *hmaster, + struct ptc_stats *stat) +{ + if (completion_status == FLUSH_RETRY_PLUGGED) + destination_plugged(bau_desc, bcp, hmaster, stat); + else if (completion_status == FLUSH_RETRY_TIMEOUT) + destination_timeout(bau_desc, bcp, hmaster, stat); +} + +/* * Send a broadcast and wait for it to complete. * * The flush_mask contains the cpus the broadcast is to be sent to including @@ -567,44 +748,23 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * returned to the kernel. 
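uv1_throttle() packages the old open-coded spin on atomic_inc_unless_ge(), whose deleted comment block above spells out the contract: add 1 only while the count is below the bound, where, unlike atomic_add_unless(), the bound may be lowered beneath the current value, hence the spinlock. A pthread model of the pair:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int active;                     /* active_descriptor_count */

static int inc_unless_ge(int bound)
{
    int ok;

    pthread_mutex_lock(&lock);
    ok = active < bound;
    if (ok)
        active++;
    pthread_mutex_unlock(&lock);
    return ok;
}

static void throttle(int bound)
{
    while (!inc_unless_ge(bound))
        ;                              /* cpu_relax() in the kernel */
}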
*/ int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) + struct cpumask *flush_mask, struct bau_control *bcp) { - int right_shift; - int completion_status = 0; int seq_number = 0; + int completion_stat = 0; long try = 0; - int cpu = bcp->uvhub_cpu; - int this_cpu = bcp->cpu; - unsigned long mmr_offset; unsigned long index; cycles_t time1; cycles_t time2; - cycles_t elapsed; struct ptc_stats *stat = bcp->statp; - struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; - if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)) { - stat->s_throttles++; - do { - cpu_relax(); - } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)); - } + if (is_uv1_hub()) + uv1_throttle(hmaster, stat); + while (hmaster->uvhub_quiesce) cpu_relax(); - if (cpu < UV_CPUS_PER_ACT_STATUS) { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - } else { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = - ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); - } time1 = get_cycles(); do { if (try == 0) { @@ -614,64 +774,134 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, bau_desc->header.msg_type = MSG_RETRY; stat->s_retry_messages++; } + bau_desc->header.sequence = seq_number; - index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - bcp->uvhub_cpu; + index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + + write_mmr_activation(index); + try++; - completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift, this_cpu, bcp, smaster, try); + completion_stat = wait_completion(bau_desc, bcp, try); + + handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); - if (completion_status == FLUSH_RETRY_PLUGGED) { - destination_plugged(bau_desc, bcp, hmaster, stat); - } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - destination_timeout(bau_desc, bcp, hmaster, stat); - } if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; - completion_status = FLUSH_GIVEUP; + completion_stat = FLUSH_GIVEUP; break; } cpu_relax(); - } while ((completion_status == FLUSH_RETRY_PLUGGED) || - (completion_status == FLUSH_RETRY_TIMEOUT)); + } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_TIMEOUT)); + time2 = get_cycles(); - bcp->plugged_tries = 0; - bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && - (bcp->conseccompletes > bcp->complete_threshold) && - (hmaster->max_bau_concurrent < - hmaster->max_bau_concurrent_constant)) - hmaster->max_bau_concurrent++; + + count_max_concurr(completion_stat, bcp, hmaster); + while (hmaster->uvhub_quiesce) cpu_relax(); + atomic_dec(&hmaster->active_descriptor_count); - if (time2 > time1) { - elapsed = time2 - time1; - stat->s_time += elapsed; - if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { - bcp->period_requests++; - bcp->period_time += elapsed; - if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->congested_reps)) { - disable_for_congestion(bcp, stat); + + record_send_stats(time1, time2, bcp, stat, completion_stat, try); + + if (completion_stat == FLUSH_GIVEUP) + return 1; + return 0; +} + +/* + * The BAU is disabled. 
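Stripped of the helpers it now delegates to, uv_flush_send_and_wait() is a bounded retry loop: the first attempt goes out as MSG_REGULAR with a fresh sequence number, later attempts as MSG_RETRY, until completion or until the reset budget is spent. A skeleton, with the plug/timeout reset bookkeeping collapsed into one counter (a simplification of the real ipi_attempts accounting):

enum bau_msg { MSG_REGULAR, MSG_RETRY };
enum bau_res { R_COMPLETE, R_GIVEUP, R_RETRY_PLUGGED, R_RETRY_TIMEOUT };

/* Returns 1 if the caller must fall back to conventional IPIs. */
static int send_and_wait(void (*send)(enum bau_msg, int seq),
                         enum bau_res (*wait)(long try), int reset_limit)
{
    int seq = 0, resets = 0;
    long try = 0;
    enum bau_res res;

    do {
        if (try == 0)
            seq++;
        send(try == 0 ? MSG_REGULAR : MSG_RETRY, seq);
        try++;
        res = wait(try);
        if (res != R_COMPLETE && ++resets >= reset_limit) {
            res = R_GIVEUP;
            break;
        }
    } while (res == R_RETRY_PLUGGED || res == R_RETRY_TIMEOUT);

    return res == R_GIVEUP;
}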
When the disabled time period has expired, the cpu + * that disabled it must re-enable it. + * Return 0 if it is re-enabled for all cpus. + */ +static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; } + return 0; } + } + return -1; +} + +static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs, + int remotes, struct bau_desc *bau_desc) +{ + stat->s_requestor++; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); } else - stat->s_requestor--; - if (completion_status == FLUSH_COMPLETE && try > 1) - stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) { - stat->s_giveup++; - return 1; + stat->s_ntargremoteuvhub += hubs; + + stat->s_ntarguvhub += hubs; + + if (hubs >= 16) + stat->s_ntarguvhub16++; + else if (hubs >= 8) + stat->s_ntarguvhub8++; + else if (hubs >= 4) + stat->s_ntarguvhub4++; + else if (hubs >= 2) + stat->s_ntarguvhub2++; + else + stat->s_ntarguvhub1++; +} + +/* + * Translate a cpu mask to the uvhub distribution mask in the BAU + * activation descriptor. + */ +static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, + struct bau_desc *bau_desc, int *localsp, int *remotesp) +{ + int cpu; + int pnode; + int cnt = 0; + struct hub_and_pnode *hpp; + + for_each_cpu(cpu, flush_mask) { + /* + * The distribution vector is a bit map of pnodes, relative + * to the partition base pnode (and the partition base nasid + * in the header). + * Translate cpu to pnode and hub using a local memory array. + */ + hpp = &bcp->socket_master->thp[cpu]; + pnode = hpp->pnode - bcp->partition_base_pnode; + bau_uvhub_set(pnode, &bau_desc->distribution); + cnt++; + if (hpp->uvhub == bcp->uvhub) + (*localsp)++; + else + (*remotesp)++; } + if (!cnt) + return 1; return 0; } -/** - * uv_flush_tlb_others - globally purge translation cache of a virtual - * address or all TLB's +/* + * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) @@ -695,11 +925,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, * done. The returned pointer is valid till preemption is re-enabled. 
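set_distrib_bits() replaces the uv_cpu_to_blade_id() walk in uv_flush_tlb_others(): each target cpu is translated through the socket master's local thp[] array, and the distribution bit is the cpu's pnode relative to the partition base. A compact model, with a single 64-bit word standing in for the real distribution mask:

#include <stdint.h>

struct hub_and_pnode { short uvhub; short pnode; };

/* Returns 1 if the flush mask turned out to be empty. */
static int set_distrib_bits(const int *cpus, int ncpus,
                            const struct hub_and_pnode *thp,
                            int base_pnode, int my_uvhub,
                            uint64_t *distribution,
                            int *locals, int *remotes)
{
    int i;

    for (i = 0; i < ncpus; i++) {
        const struct hub_and_pnode *hpp = &thp[cpus[i]];

        *distribution |= 1ULL << (hpp->pnode - base_pnode);
        if (hpp->uvhub == my_uvhub)
            (*locals)++;
        else
            (*remotes)++;
    }
    return ncpus == 0;
}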
*/ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long va, unsigned int cpu) + struct mm_struct *mm, unsigned long va, + unsigned int cpu) { - int tcpu; - int uvhub; int locals = 0; int remotes = 0; int hubs = 0; @@ -707,7 +935,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; - struct bau_control *tbcp; /* kernel was booted 'nobau' */ if (nobau) @@ -718,20 +945,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* bau was disabled due to slow response */ if (bcp->baudisabled) { - /* the cpu that disabled it must re-enable it */ - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 0; - tbcp->period_requests = 0; - tbcp->period_time = 0; - } - } - } - return cpumask; + if (check_enable(bcp, stat)) + return cpumask; } /* @@ -742,52 +957,20 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (cpu_isset(cpu, *cpumask)) stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; + bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - - /* cpu statistics */ - for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) - locals++; - else - remotes++; - } - if ((locals + remotes) == 0) + if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; - stat->s_requestor++; - stat->s_ntargcpu += remotes + locals; - stat->s_ntargremotes += remotes; - stat->s_ntarglocals += locals; - remotes = bau_uvhub_weight(&bau_desc->distribution); - /* uvhub statistics */ - hubs = bau_uvhub_weight(&bau_desc->distribution); - if (locals) { - stat->s_ntarglocaluvhub++; - stat->s_ntargremoteuvhub += (hubs - 1); - } else - stat->s_ntargremoteuvhub += hubs; - stat->s_ntarguvhub += hubs; - if (hubs >= 16) - stat->s_ntarguvhub16++; - else if (hubs >= 8) - stat->s_ntarguvhub8++; - else if (hubs >= 4) - stat->s_ntarguvhub4++; - else if (hubs >= 2) - stat->s_ntarguvhub2++; - else - stat->s_ntarguvhub1++; + record_send_statistics(stat, locals, hubs, remotes, bau_desc); bau_desc->payload.address = va; bau_desc->payload.sending_cpu = cpu; - /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. 
@@ -816,26 +999,31 @@ void uv_bau_message_interrupt(struct pt_regs *regs) { int count = 0; cycles_t time_start; - struct bau_payload_queue_entry *msg; + struct bau_pq_entry *msg; struct bau_control *bcp; struct ptc_stats *stat; struct msg_desc msgdesc; time_start = get_cycles(); + bcp = &per_cpu(bau_control, smp_processor_id()); stat = bcp->statp; - msgdesc.va_queue_first = bcp->va_queue_first; - msgdesc.va_queue_last = bcp->va_queue_last; + + msgdesc.queue_first = bcp->queue_first; + msgdesc.queue_last = bcp->queue_last; + msg = bcp->bau_msg_head; - while (msg->sw_ack_vector) { + while (msg->swack_vec) { count++; - msgdesc.msg_slot = msg - msgdesc.va_queue_first; - msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; + + msgdesc.msg_slot = msg - msgdesc.queue_first; + msgdesc.swack_slot = ffs(msg->swack_vec) - 1; msgdesc.msg = msg; - uv_bau_process_message(&msgdesc, bcp); + bau_process_message(&msgdesc, bcp); + msg++; - if (msg > msgdesc.va_queue_last) - msg = msgdesc.va_queue_first; + if (msg > msgdesc.queue_last) + msg = msgdesc.queue_first; bcp->bau_msg_head = msg; } stat->d_time += (get_cycles() - time_start); @@ -843,18 +1031,17 @@ void uv_bau_message_interrupt(struct pt_regs *regs) stat->d_nomsg++; else if (count > 1) stat->d_multmsg++; + ack_APIC_irq(); } /* - * uv_enable_timeouts - * - * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have + * Each target uvhub (i.e. a uvhub that has cpu's) needs to have * shootdown message timeouts enabled. The timeout does not cause * an interrupt, but causes an error message to be returned to * the sender. */ -static void uv_enable_timeouts(void) +static void __init enable_timeouts(void) { int uvhub; int nuvhubs; @@ -868,47 +1055,44 @@ static void uv_enable_timeouts(void) continue; pnode = uv_blade_to_pnode(uvhub); - mmr_image = - uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); + mmr_image = read_mmr_misc_control(pnode); /* * Set the timeout period and then lock it in, in three * steps; captures and locks in the period. * * To program the period, the SOFT_ACK_MODE must be off. */ - mmr_image &= ~((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + mmr_image &= ~(1L << SOFTACK_MSHIFT); + write_mmr_misc_control(pnode, mmr_image); /* * Set the 4-bit period. */ - mmr_image &= ~((unsigned long)0xf << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT); + mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT); + write_mmr_misc_control(pnode, mmr_image); /* + * UV1: * Subsequent reversals of the timebase bit (3) cause an * immediate timeout of one or all INTD resources as * indicated in bits 2:0 (7 causes all of them to timeout). 
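The interrupt handler's consumption loop is unchanged in shape: walk the payload queue from the saved head until an entry with a clear swack_vec is reached, wrapping from queue_last back to queue_first, and remember where to resume. As a standalone ring walk:

struct bau_pq_entry { unsigned char swack_vec; /* payload omitted */ };

/* Process pending entries; returns the new head for the next IRQ. */
static struct bau_pq_entry *
drain_queue(struct bau_pq_entry *head,
            struct bau_pq_entry *first, struct bau_pq_entry *last,
            void (*process)(struct bau_pq_entry *))
{
    struct bau_pq_entry *msg = head;

    while (msg->swack_vec) {
        process(msg);
        msg++;
        if (msg > last)
            msg = first;
    }
    return msg;
}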
*/ - mmr_image |= ((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + mmr_image |= (1L << SOFTACK_MSHIFT); + if (is_uv2_hub()) { + mmr_image |= (1L << UV2_LEG_SHFT); + mmr_image |= (1L << UV2_EXT_SHFT); + } + write_mmr_misc_control(pnode, mmr_image); } } -static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +static void *ptc_seq_start(struct seq_file *file, loff_t *offset) { if (*offset < num_possible_cpus()) return offset; return NULL; } -static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) { (*offset)++; if (*offset < num_possible_cpus()) @@ -916,12 +1100,11 @@ static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) return NULL; } -static void uv_ptc_seq_stop(struct seq_file *file, void *data) +static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long -microsec_2_cycles(unsigned long microsec) +static inline unsigned long long usec_2_cycles(unsigned long microsec) { unsigned long ns; unsigned long long cyc; @@ -932,29 +1115,27 @@ microsec_2_cycles(unsigned long microsec) } /* - * Display the statistics thru /proc. + * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number + * Note: see the descriptions in stat_description[]. */ -static int uv_ptc_seq_show(struct seq_file *file, void *data) +static int ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; int cpu; cpu = *(loff_t *)data; - if (!cpu) { seq_printf(file, "# cpu sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto "); - seq_printf(file, - "retries rok resetp resett giveup sto bz throt "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok "); seq_printf(file, - "sw_ack recv rtime all "); + "resetp resett giveup sto bz throt swack recv rtime "); seq_printf(file, - "one mult none retry canc nocan reset rcan "); + "all one mult none retry canc nocan reset rcan "); seq_printf(file, "disable enable\n"); } @@ -981,8 +1162,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) /* destination side statistics */ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - uv_read_global_mmr64(uv_cpu_to_pnode(cpu), - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), + read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), stat->d_requestee, cycles_2_us(stat->d_time), stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, stat->d_nomsg, stat->d_retries, stat->d_canceled, @@ -991,7 +1171,6 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "%ld %ld\n", stat->s_bau_disabled, stat->s_bau_reenabled); } - return 0; } @@ -999,18 +1178,18 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) * Display the tunables thru debugfs */ static ssize_t tunables_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { char *buf; int ret; buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_bau_concurrent plugged_delay plugsb4reset", + "max_concur plugged_delay plugsb4reset", "timeoutsb4reset ipi_reset_limit complete_threshold", "congested_response_us congested_reps congested_period", - max_bau_concurrent, plugged_delay, plugsb4reset, + max_concurr, plugged_delay, plugsb4reset, timeoutsb4reset, 
ipi_reset_limit, complete_threshold, - congested_response_us, congested_reps, congested_period); + congested_respns_us, congested_reps, congested_period); if (!buf) return -ENOMEM; @@ -1021,13 +1200,16 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, } /* - * -1: resetf the statistics + * handle a write to /proc/sgi_uv/ptc_statistics + * -1: reset the statistics * 0: display meaning of the statistics */ -static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) +static ssize_t ptc_proc_write(struct file *file, const char __user *user, + size_t count, loff_t *data) { int cpu; + int i; + int elements; long input_arg; char optstr[64]; struct ptc_stats *stat; @@ -1037,79 +1219,18 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, if (copy_from_user(optstr, user, count)) return -EFAULT; optstr[count - 1] = '\0'; + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } if (input_arg == 0) { + elements = sizeof(stat_description)/sizeof(*stat_description); printk(KERN_DEBUG "# cpu: cpu number\n"); printk(KERN_DEBUG "Sender statistics:\n"); - printk(KERN_DEBUG - "sent: number of shootdown messages sent\n"); - printk(KERN_DEBUG - "stime: time spent sending messages\n"); - printk(KERN_DEBUG - "numuvhubs: number of hubs targeted with shootdown\n"); - printk(KERN_DEBUG - "numuvhubs16: number times 16 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs8: number times 8 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs4: number times 4 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs2: number times 2 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs1: number times 1 hub targeted\n"); - printk(KERN_DEBUG - "numcpus: number of cpus targeted with shootdown\n"); - printk(KERN_DEBUG - "dto: number of destination timeouts\n"); - printk(KERN_DEBUG - "retries: destination timeout retries sent\n"); - printk(KERN_DEBUG - "rok: : destination timeouts successfully retried\n"); - printk(KERN_DEBUG - "resetp: ipi-style resource resets for plugs\n"); - printk(KERN_DEBUG - "resett: ipi-style resource resets for timeouts\n"); - printk(KERN_DEBUG - "giveup: fall-backs to ipi-style shootdowns\n"); - printk(KERN_DEBUG - "sto: number of source timeouts\n"); - printk(KERN_DEBUG - "bz: number of stay-busy's\n"); - printk(KERN_DEBUG - "throt: number times spun in throttle\n"); - printk(KERN_DEBUG "Destination side statistics:\n"); - printk(KERN_DEBUG - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); - printk(KERN_DEBUG - "recv: shootdown messages received\n"); - printk(KERN_DEBUG - "rtime: time spent processing messages\n"); - printk(KERN_DEBUG - "all: shootdown all-tlb messages\n"); - printk(KERN_DEBUG - "one: shootdown one-tlb messages\n"); - printk(KERN_DEBUG - "mult: interrupts that found multiple messages\n"); - printk(KERN_DEBUG - "none: interrupts that found no messages\n"); - printk(KERN_DEBUG - "retry: number of retry messages processed\n"); - printk(KERN_DEBUG - "canc: number messages canceled by retries\n"); - printk(KERN_DEBUG - "nocan: number retries that found nothing to cancel\n"); - printk(KERN_DEBUG - "reset: number of ipi-style reset requests processed\n"); - printk(KERN_DEBUG - "rcan: number messages canceled by reset requests\n"); - printk(KERN_DEBUG - "disable: number times use of the BAU was disabled\n"); - printk(KERN_DEBUG - "enable: number times use of the BAU was re-enabled\n"); + for (i = 0; i 
< elements; i++) + printk(KERN_DEBUG "%s\n", stat_description[i]); } else if (input_arg == -1) { for_each_present_cpu(cpu) { stat = &per_cpu(ptcstats, cpu); @@ -1136,27 +1257,18 @@ static int local_atoi(const char *name) } /* - * set the tunables - * 0 values reset them to defaults + * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables. + * Zero values reset them to defaults. */ -static ssize_t tunables_write(struct file *file, const char __user *user, - size_t count, loff_t *data) +static int parse_tunables_write(struct bau_control *bcp, char *instr, + int count) { - int cpu; - int cnt = 0; - int val; char *p; char *q; - char instr[64]; - struct bau_control *bcp; - - if (count == 0 || count > sizeof(instr)-1) - return -EINVAL; - if (copy_from_user(instr, user, count)) - return -EFAULT; + int cnt = 0; + int val; + int e = sizeof(tunables) / sizeof(*tunables); - instr[count] = '\0'; - /* count the fields */ p = instr + strspn(instr, WHITESPACE); q = p; for (; *p; p = q + strspn(q, WHITESPACE)) { @@ -1165,8 +1277,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user, if (q == p) break; } - if (cnt != 9) { - printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); + if (cnt != e) { + printk(KERN_INFO "bau tunable error: should be %d values\n", e); return -EINVAL; } @@ -1178,97 +1290,80 @@ static ssize_t tunables_write(struct file *file, const char __user *user, switch (cnt) { case 0: if (val == 0) { - max_bau_concurrent = MAX_BAU_CONCURRENT; - max_bau_concurrent_constant = - MAX_BAU_CONCURRENT; + max_concurr = MAX_BAU_CONCURRENT; + max_concurr_const = MAX_BAU_CONCURRENT; continue; } - bcp = &per_cpu(bau_control, smp_processor_id()); if (val < 1 || val > bcp->cpus_in_uvhub) { printk(KERN_DEBUG "Error: BAU max concurrent %d is invalid\n", val); return -EINVAL; } - max_bau_concurrent = val; - max_bau_concurrent_constant = val; - continue; - case 1: - if (val == 0) - plugged_delay = PLUGGED_DELAY; - else - plugged_delay = val; + max_concurr = val; + max_concurr_const = val; continue; - case 2: - if (val == 0) - plugsb4reset = PLUGSB4RESET; - else - plugsb4reset = val; - continue; - case 3: - if (val == 0) - timeoutsb4reset = TIMEOUTSB4RESET; - else - timeoutsb4reset = val; - continue; - case 4: - if (val == 0) - ipi_reset_limit = IPI_RESET_LIMIT; - else - ipi_reset_limit = val; - continue; - case 5: - if (val == 0) - complete_threshold = COMPLETE_THRESHOLD; - else - complete_threshold = val; - continue; - case 6: - if (val == 0) - congested_response_us = CONGESTED_RESPONSE_US; - else - congested_response_us = val; - continue; - case 7: - if (val == 0) - congested_reps = CONGESTED_REPS; - else - congested_reps = val; - continue; - case 8: + default: if (val == 0) - congested_period = CONGESTED_PERIOD; + *tunables[cnt].tunp = tunables[cnt].deflt; else - congested_period = val; + *tunables[cnt].tunp = val; continue; } if (q == p) break; } + return 0; +} + +/* + * Handle a write to debugfs. 
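The nine-arm switch in the old tunables_write() collapses into the tunables[] table that parse_tunables_write() indexes by field position: each entry pairs a pointer to the global with its default, and writing 0 restores the default. Sketched here with hypothetical entries and values:

struct tunable { int *tunp; int deflt; };

static int plugged_delay   = 10;       /* hypothetical globals */
static int plugsb4reset    = 4;
static int ipi_reset_limit = 1;

static struct tunable tunables[] = {
    { &plugged_delay,   10 },
    { &plugsb4reset,     4 },
    { &ipi_reset_limit,  1 },
};

static void set_tunable(unsigned idx, int val)
{
    *tunables[idx].tunp = val ? val : tunables[idx].deflt;
}

The table also gives the parser its field count for free: sizeof(tunables) / sizeof(*tunables) replaces the hard-coded 9.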
(/sys/kernel/debug/sgi_uv/bau_tunables) + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int ret; + char instr[100]; + struct bau_control *bcp; + + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + + bcp = &per_cpu(bau_control, smp_processor_id()); + + ret = parse_tunables_write(bcp, instr, count); + if (ret) + return ret; + for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; + bcp->max_concurr = max_concurr; + bcp->max_concurr_const = max_concurr; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->cong_response_us = congested_respns_us; + bcp->cong_reps = congested_reps; + bcp->cong_period = congested_period; } return count; } static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show + .start = ptc_seq_start, + .next = ptc_seq_next, + .stop = ptc_seq_stop, + .show = ptc_seq_show }; -static int uv_ptc_proc_open(struct inode *inode, struct file *file) +static int ptc_proc_open(struct inode *inode, struct file *file) { return seq_open(file, &uv_ptc_seq_ops); } @@ -1279,9 +1374,9 @@ static int tunables_open(struct inode *inode, struct file *file) } static const struct file_operations proc_uv_ptc_operations = { - .open = uv_ptc_proc_open, + .open = ptc_proc_open, .read = seq_read, - .write = uv_ptc_proc_write, + .write = ptc_proc_write, .llseek = seq_lseek, .release = seq_release, }; @@ -1315,7 +1410,7 @@ static int __init uv_ptc_init(void) return -EINVAL; } tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, - tunables_dir, NULL, &tunables_fops); + tunables_dir, NULL, &tunables_fops); if (!tunables_file) { printk(KERN_ERR "unable to create debugfs file %s\n", UV_BAU_TUNABLES_FILE); @@ -1325,53 +1420,52 @@ static int __init uv_ptc_init(void) } /* - * initialize the sending side's sending buffers + * Initialize the sending side's sending buffers. 
*/ -static void -uv_activation_descriptor_init(int node, int pnode) +static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; unsigned long pa; unsigned long m; unsigned long n; + size_t dsize; struct bau_desc *bau_desc; struct bau_desc *bd2; struct bau_control *bcp; /* - * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE) + * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC) + * per cpu; and one per cpu on the uvhub (ADP_SZ) */ - bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE - * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); + dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC; + bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); BUG_ON(!bau_desc); pa = uv_gpa(bau_desc); /* need the real nasid*/ n = pa >> uv_nshift; m = pa & uv_mmask; - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - + /* the 14-bit pnode */ + write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * Initializing all 8 (ITEMS_PER_DESC) descriptors for each * cpu even though we only use the first one; one descriptor can * describe a broadcast to 256 uv hubs. */ - for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); - i++, bd2++) { + for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.sw_ack_flag = 1; + bd2->header.swack_flag = 1; /* - * base_dest_nodeid is the nasid of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. + * The base_dest_nasid set in the message header is the nasid + * of the first uvhub in the partition. The bit map will + * indicate destination pnode numbers relative to that base. + * They may not be consecutive if nasid striding is being used. 
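activation_descriptor_init() still splits the descriptor buffer's global address into nasid bits and node offset before programming the descriptor-base MMR; only the constant names changed. The arithmetic in isolation, with the 49-bit pnode field shift taken as an assumption:

#include <stdint.h>

#define UV_DESC_PSHIFT 49              /* assumed pnode field shift */

static inline uint64_t desc_base_value(uint64_t gpa, unsigned m_val)
{
    uint64_t n = gpa >> m_val;                  /* node (nasid) bits */
    uint64_t m = gpa & ((1ULL << m_val) - 1);   /* offset within node */

    return (n << UV_DESC_PSHIFT) | m;
}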
*/ - bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); - bd2->header.dest_subnodeid = 0x10; /* the LB */ - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; + bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); + bd2->header.dest_subnodeid = UV_LB_SUBNODEID; + bd2->header.command = UV_NET_ENDPOINT_INTD; + bd2->header.int_both = 1; /* * all others need to be set to zero: * fairness chaining multilevel count replied_to @@ -1391,57 +1485,55 @@ uv_activation_descriptor_init(int node, int pnode) * - node is first node (kernel memory notion) on the uvhub * - pnode is the uvhub's physical identifier */ -static void -uv_payload_queue_init(int node, int pnode) +static void pq_init(int node, int pnode) { - int pn; int cpu; + size_t plsize; char *cp; - unsigned long pa; - struct bau_payload_queue_entry *pqp; - struct bau_payload_queue_entry *pqp_malloc; + void *vp; + unsigned long pn; + unsigned long first; + unsigned long pn_first; + unsigned long last; + struct bau_pq_entry *pqp; struct bau_control *bcp; - pqp = kmalloc_node((DEST_Q_SIZE + 1) - * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); + plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); + vp = kmalloc_node(plsize, GFP_KERNEL, node); + pqp = (struct bau_pq_entry *)vp; BUG_ON(!pqp); - pqp_malloc = pqp; cp = (char *)pqp + 31; - pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); + pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5); for_each_present_cpu(cpu) { if (pnode != uv_cpu_to_pnode(cpu)) continue; /* for every cpu on this pnode: */ bcp = &per_cpu(bau_control, cpu); - bcp->va_queue_first = pqp; - bcp->bau_msg_head = pqp; - bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); + bcp->queue_first = pqp; + bcp->bau_msg_head = pqp; + bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } /* * need the pnode of where the memory was really allocated */ - pa = uv_gpa(pqp); - pn = pa >> uv_nshift; - uv_write_global_mmr64(pnode, - UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, - ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, - (unsigned long) - uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); + pn = uv_gpa(pqp) >> uv_nshift; + first = uv_physnodeaddr(pqp); + pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; + last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); + write_mmr_payload_first(pnode, pn_first); + write_mmr_payload_tail(pnode, first); + write_mmr_payload_last(pnode, last); + /* in effect, all msg_type's are set to MSG_NOOP */ - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); + memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); } /* * Initialization of each UV hub's structures */ -static void __init uv_init_uvhub(int uvhub, int vector) +static void __init init_uvhub(int uvhub, int vector, int base_pnode) { int node; int pnode; @@ -1449,24 +1541,24 @@ static void __init uv_init_uvhub(int uvhub, int vector) node = uvhub_to_first_node(uvhub); pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); - uv_payload_queue_init(node, pnode); + + activation_descriptor_init(node, pnode, base_pnode); + + pq_init(node, pnode); /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS + * The below initialization can't be in firmware because the + * messaging IRQ will be 
determined by the OS. */ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | vector)); + write_mmr_data_config(pnode, ((apicid << 32) | vector)); } /* * We will set BAU_MISC_CONTROL with a timeout period. * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. - * So the destination timeout period has be be calculated from them. + * So the destination timeout period has to be calculated from them. */ -static int -calculate_destination_timeout(void) +static int calculate_destination_timeout(void) { unsigned long mmr_image; int mult1; @@ -1476,135 +1568,223 @@ calculate_destination_timeout(void) int ret; unsigned long ts_ns; - mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); - index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; - mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); - mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; - ret = ts_ns / 1000; + if (is_uv1_hub()) { + mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + } else { + /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */ + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; + if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) + mult1 = 80; + else + mult1 = 10; + base = mmr_image & UV2_ACK_MASK; + ret = mult1 * base; + } return ret; } +static void __init init_per_cpu_tunables(void) +{ + int cpu; + struct bau_control *bcp; + + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = usec_2_cycles(2*timeout_us); + bcp->max_concurr = max_concurr; + bcp->max_concurr_const = max_concurr; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->cong_response_us = congested_respns_us; + bcp->cong_reps = congested_reps; + bcp->cong_period = congested_period; + } +} + /* - * initialize the bau_control structure for each cpu + * Scan all cpus to collect blade and socket summaries. 
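Back in pq_init() above, the payload queue is over-allocated by one entry and the pointer is rounded up to a 32-byte boundary with a shift pair ((x >> 5) << 5 after adding 31); that is the ordinary add-then-mask align-up:

#include <stdint.h>

/* Equivalent to pq_init()'s (((uintptr_t)p + 31) >> 5) << 5; the
 * spare entry guarantees the aligned array still fits. */
static inline void *align32(void *p)
{
    return (void *)(((uintptr_t)p + 31) & ~(uintptr_t)31);
}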
*/ -static int __init uv_init_per_cpu(int nuvhubs) +static int __init get_cpu_topology(int base_pnode, + struct uvhub_desc *uvhub_descs, + unsigned char *uvhub_mask) { - int i; int cpu; int pnode; int uvhub; - int have_hmaster; - short socket = 0; - unsigned short socket_mask; - unsigned char *uvhub_mask; + int socket; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; - struct bau_control *hmaster = NULL; - struct bau_control *smaster = NULL; - struct socket_desc { - short num_cpus; - short cpu_number[MAX_CPUS_PER_SOCKET]; - }; - struct uvhub_desc { - unsigned short socket_mask; - short num_cpus; - short uvhub; - short pnode; - struct socket_desc socket[2]; - }; - struct uvhub_desc *uvhub_descs; - - timeout_us = calculate_destination_timeout(); - uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); - memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); - uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); + memset(bcp, 0, sizeof(struct bau_control)); + pnode = uv_cpu_hub_info(cpu)->pnode; + if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) { + printk(KERN_EMERG + "cpu %d pnode %d-%d beyond %d; BAU disabled\n", + cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE); + return 1; + } + + bcp->osnode = cpu_to_node(cpu); + bcp->partition_base_pnode = base_pnode; + uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; + bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; + /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); + socket = bcp->osnode & 1; bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { - printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus); + printk(KERN_EMERG "%d cpus per socket invalid\n", + sdp->num_cpus); return 1; } } + return 0; +} + +/* + * Each socket is to get a local array of pnodes/hubs. + */ +static void make_per_cpu_thp(struct bau_control *smaster) +{ + int cpu; + size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); + + smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode); + memset(smaster->thp, 0, hpsz); + for_each_present_cpu(cpu) { + smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; + smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + } +} + +/* + * Initialize all the per_cpu information for the cpu's on a given socket, + * given what has been gathered into the socket_desc struct. + * And reports the chosen hub and socket masters back to the caller. 
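get_cpu_topology() tracks populated hubs in a bare byte-array bitmap via *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); the same pair of operations written as helpers (names hypothetical):

static inline void hub_set(unsigned char *mask, int hub)
{
    mask[hub / 8] |= (unsigned char)(1u << (hub % 8));
}

static inline int hub_isset(const unsigned char *mask, int hub)
{
    return (mask[hub / 8] >> (hub % 8)) & 1;
}

summarize_uvhub_sockets() reads the map back with the second form before walking each hub's socket_mask.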
+ */ +static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, + struct bau_control **smasterp, + struct bau_control **hmasterp) +{ + int i; + int cpu; + struct bau_control *bcp; + + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; + bcp = &per_cpu(bau_control, cpu); + bcp->cpu = cpu; + if (i == 0) { + *smasterp = bcp; + if (!(*hmasterp)) + *hmasterp = bcp; + } + bcp->cpus_in_uvhub = bdp->num_cpus; + bcp->cpus_in_socket = sdp->num_cpus; + bcp->socket_master = *smasterp; + bcp->uvhub = bdp->uvhub; + bcp->uvhub_master = *hmasterp; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { + printk(KERN_EMERG "%d cpus per uvhub invalid\n", + bcp->uvhub_cpu); + return 1; + } + } + return 0; +} + +/* + * Summarize the blade and socket topology into the per_cpu structures. + */ +static int __init summarize_uvhub_sockets(int nuvhubs, + struct uvhub_desc *uvhub_descs, + unsigned char *uvhub_mask) +{ + int socket; + int uvhub; + unsigned short socket_mask; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + struct uvhub_desc *bdp; + struct bau_control *smaster = NULL; + struct bau_control *hmaster = NULL; + if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) continue; - have_hmaster = 0; + bdp = &uvhub_descs[uvhub]; socket_mask = bdp->socket_mask; socket = 0; while (socket_mask) { - if (!(socket_mask & 1)) - goto nextsocket; - sdp = &bdp->socket[socket]; - for (i = 0; i < sdp->num_cpus; i++) { - cpu = sdp->cpu_number[i]; - bcp = &per_cpu(bau_control, cpu); - bcp->cpu = cpu; - if (i == 0) { - smaster = bcp; - if (!have_hmaster) { - have_hmaster++; - hmaster = bcp; - } - } - bcp->cpus_in_uvhub = bdp->num_cpus; - bcp->cpus_in_socket = sdp->num_cpus; - bcp->socket_master = smaster; - bcp->uvhub = bdp->uvhub; - bcp->uvhub_master = hmaster; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> - blade_processor_id; - if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { - printk(KERN_EMERG - "%d cpus per uvhub invalid\n", - bcp->uvhub_cpu); + struct socket_desc *sdp; + if ((socket_mask & 1)) { + sdp = &bdp->socket[socket]; + if (scan_sock(sdp, bdp, &smaster, &hmaster)) return 1; - } } -nextsocket: socket++; socket_mask = (socket_mask >> 1); + make_per_cpu_thp(smaster); } } + return 0; +} + +/* + * initialize the bau_control structure for each cpu + */ +static int __init init_per_cpu(int nuvhubs, int base_part_pnode) +{ + unsigned char *uvhub_mask; + void *vp; + struct uvhub_desc *uvhub_descs; + + timeout_us = calculate_destination_timeout(); + + vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); + uvhub_descs = (struct uvhub_desc *)vp; + memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); + + if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) + return 1; + + if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) + return 1; + kfree(uvhub_descs); kfree(uvhub_mask); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->baudisabled = 0; - bcp->statp = &per_cpu(ptcstats, cpu); - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = 
congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } + init_per_cpu_tunables(); return 0; } @@ -1617,8 +1797,9 @@ static int __init uv_bau_init(void) int pnode; int nuvhubs; int cur_cpu; + int cpus; int vector; - unsigned long mmr; + cpumask_var_t *mask; if (!is_uv_system()) return 0; @@ -1626,45 +1807,47 @@ static int __init uv_bau_init(void) if (nobau) return 0; - for_each_possible_cpu(cur_cpu) - zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), - GFP_KERNEL, cpu_to_node(cur_cpu)); + for_each_possible_cpu(cur_cpu) { + mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); + } uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); spin_lock_init(&disable_lock); - congested_cycles = microsec_2_cycles(congested_response_us); + congested_cycles = usec_2_cycles(congested_respns_us); + + uv_base_pnode = 0x7fffffff; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + cpus = uv_blade_nr_possible_cpus(uvhub); + if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode)) + uv_base_pnode = uv_blade_to_pnode(uvhub); + } - if (uv_init_per_cpu(nuvhubs)) { + if (init_per_cpu(nuvhubs, uv_base_pnode)) { nobau = 1; return 0; } - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) - if (uv_blade_nr_possible_cpus(uvhub) && - (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) - uv_partition_base_pnode = uv_blade_to_pnode(uvhub); - vector = UV_BAU_MESSAGE; for_each_possible_blade(uvhub) if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); + init_uvhub(uvhub, vector, uv_base_pnode); - uv_enable_timeouts(); + enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { if (uv_blade_nr_possible_cpus(uvhub)) { + unsigned long val; + unsigned long mmr; pnode = uv_blade_to_pnode(uvhub); /* INIT the bau */ - uv_write_global_mmr64(pnode, - UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); + val = 1L << 63; + write_gmmr_activation(pnode, val); mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, - mmr); + write_mmr_data_broadcast(pnode, mmr); } } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 7b24460917d5..374a05d8ad22 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; @@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, else irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); mmr_value = 0; diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9daf5d1af9f1..9f29a01ee1b3 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = { .rating = 400, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ 
-100,8 +99,12 @@ static void uv_rtc_send_IPI(int cpu) /* Check for an RTC interrupt pending */ static int uv_intr_pending(int pnode) { - return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & - UVH_EVENT_OCCURRED0_RTC1_MASK; + if (is_uv1_hub()) + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UV1H_EVENT_OCCURRED0_RTC1_MASK; + else + return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) & + UV2H_EVENT_OCCURRED2_RTC_1_MASK; } /* Setup interrupt and return non-zero if early expiration occurred. */ @@ -115,8 +118,12 @@ static int uv_setup_intr(int cpu, u64 expires) UVH_RTC1_INT_CONFIG_M_MASK); uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, - UVH_EVENT_OCCURRED0_RTC1_MASK); + if (is_uv1_hub()) + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UV1H_EVENT_OCCURRED0_RTC1_MASK); + else + uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS, + UV2H_EVENT_OCCURRED2_RTC_1_MASK); val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); @@ -372,14 +379,11 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - /* If single blade, prefer tsc */ if (uv_num_possible_blades() == 1) clocksource_uv.rating = 250; - rc = clocksource_register(&clocksource_uv); + rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); else diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 632037671746..c7abf13a213f 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -471,15 +471,7 @@ static unsigned int startup_piix4_master_irq(struct irq_data *data) { legacy_pic->init(0); enable_cobalt_irq(data); -} - -static void end_piix4_master_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(data); - spin_unlock_irqrestore(&cobalt_lock, flags); + return 0; } static struct irq_chip piix4_master_irq_type = { @@ -492,7 +484,7 @@ static void pii4_mask(struct irq_data *data) { } static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .mask = pii4_mask, + .irq_mask = pii4_mask, }; /* @@ -569,18 +561,20 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NO_THREAD, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static inline void set_piix4_virtual_irq_type(void) { - piix4_virtual_irq_type.enable = i8259A_chip.unmask; - piix4_virtual_irq_type.disable = i8259A_chip.mask; - piix4_virtual_irq_type.unmask = i8259A_chip.unmask; + piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask; + piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask; + piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask; } static void __init visws_pre_intr_init(void) @@ -597,7 +591,7 @@ static void __init visws_pre_intr_init(void) else if (i == CO_IRQ_IDE0) chip = &cobalt_irq_type; else if (i == CO_IRQ_IDE1) - >chip = &cobalt_irq_type; + chip = &cobalt_irq_type; else if (i == CO_IRQ_8259) chip = &piix4_master_irq_type; else if (i < CO_IRQ_APIC0) @@ -606,7 +600,7 @@ static void __init visws_pre_intr_init(void) chip = &cobalt_irq_type; if (chip) - set_irq_chip(i, chip); + irq_set_chip(i, 
chip); } setup_irq(CO_IRQ_8259, &master_action); diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index b6552b189bcd..bef0bc962400 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images) # files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o # files to link into kernel obj-$(VDSO64-y) += vma.o vdso.o @@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls $(vobjs): KBUILD_CFLAGS += $(CFL) +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vvar.o = -pg + targets += vdso-syms.lds obj-$(VDSO64-y) += vdso-syms.lds diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index ee55754cc3c5..a724905fdae7 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -2,7 +2,7 @@ * Copyright 2006 Andi Kleen, SUSE Labs. * Subject to the GNU Public License, v.2 * - * Fast user context implementation of clock_gettime and gettimeofday. + * Fast user context implementation of clock_gettime, gettimeofday, and time. * * The code should have no internal unresolved relocations. * Check with readelf after changing. @@ -22,9 +22,8 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> -#include "vextern.h" -#define gtod vdso_vsyscall_gtod_data +#define gtod (&VVAR(vsyscall_gtod_data)) notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { @@ -56,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts) return 0; } -/* Copy of the version in kernel/time.c which we cannot directly access */ -notrace static void -vset_normalized_timespec(struct timespec *ts, long sec, long nsec) -{ - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} - notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; @@ -82,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts) secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; } while (unlikely(read_seqretry(&gtod->lock, seq))); - vset_normalized_timespec(ts, secs, ns); + + /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec + * are all guaranteed to be nonnegative. + */ + while (ns >= NSEC_PER_SEC) { + ns -= NSEC_PER_SEC; + ++secs; + } + ts->tv_sec = secs; + ts->tv_nsec = ns; + return 0; } @@ -107,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; } while (unlikely(read_seqretry(&gtod->lock, seq))); - vset_normalized_timespec(ts, secs, ns); + + /* wall_time_nsec and wall_to_monotonic.tv_nsec are + * guaranteed to be between 0 and NSEC_PER_SEC.
+ */ + if (ns >= NSEC_PER_SEC) { + ns -= NSEC_PER_SEC; + ++secs; + } + ts->tv_sec = secs; + ts->tv_nsec = ns; + return 0; } @@ -157,3 +160,32 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) } int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); + +/* This will break when the xtime seconds get inaccurate, but that is + * unlikely */ + +static __always_inline long time_syscall(long *t) +{ + long secs; + asm volatile("syscall" + : "=a" (secs) + : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); + return secs; +} + +notrace time_t __vdso_time(time_t *t) +{ + time_t result; + + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) + return time_syscall(t); + + /* This is atomic on x86_64 so we don't need any locks. */ + result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + + if (t) + *t = result; + return result; +} +int time(time_t *t) + __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index 4e5dd3b4de7f..b96b2677cad8 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -23,15 +23,10 @@ VERSION { __vdso_gettimeofday; getcpu; __vdso_getcpu; + time; + __vdso_time; local: *; }; } VDSO64_PRELINK = VDSO_PRELINK; - -/* - * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL. - */ -#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; -#include "vextern.h" -#undef VEXTERN diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 36df991985b2..468d591dde31 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ + /* + * Check to see if the corresponding task was created in compat vdso + * mode. + */ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) return &gate_vma; return NULL; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(task); + const struct vm_area_struct *vma = get_gate_vma(mm); return vma && addr >= vma->vm_start && addr < vma->vm_end; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h deleted file mode 100644 index 1683ba2ae3e8..000000000000 --- a/arch/x86/vdso/vextern.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef VEXTERN -#include <asm/vsyscall.h> -#define VEXTERN(x) \ - extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); -#endif - -#define VMAGIC 0xfeedbabeabcdefabUL - -/* Any kernel variables used in the vDSO must be exported in the main - kernel's vmlinux.lds.S/vsyscall.h/proper __section and - put into vextern.h and be referenced as a pointer with vdso prefix. - The main kernel later fills in the values. 
*/ - -VEXTERN(jiffies) -VEXTERN(vgetcpu_mode) -VEXTERN(vsyscall_gtod_data) diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 9fbc6b20026b..5463ad558573 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -11,14 +11,13 @@ #include <linux/time.h> #include <asm/vsyscall.h> #include <asm/vgtod.h> -#include "vextern.h" notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; - if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { + if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ native_read_tscp(&p); } else { diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 4b5d26f108bb..7abd2be0f9b9 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -15,9 +15,6 @@ #include <asm/proto.h> #include <asm/vdso.h> -#include "vextern.h" /* Just for VMAGIC. */ -#undef VEXTERN - unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; @@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid; static struct page **vdso_pages; static unsigned vdso_size; -static inline void *var_ref(void *p, char *name) -{ - if (*(void **)p != (void *)VMAGIC) { - printk("VDSO: variable %s broken\n", name); - vdso_enabled = 0; - } - return p; -} - static int __init init_vdso_vars(void) { int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; - char *vbase; vdso_size = npages << PAGE_SHIFT; vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); @@ -54,20 +41,6 @@ static int __init init_vdso_vars(void) copy_page(page_address(p), vdso_start + i*PAGE_SIZE); } - vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); - if (!vbase) - goto oom; - - if (memcmp(vbase, "\177ELF", 4)) { - printk("VDSO: I'm broken; not ELF\n"); - vdso_enabled = 0; - } - -#define VEXTERN(x) \ - *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; -#include "vextern.h" -#undef VEXTERN - vunmap(vbase); return 0; oom: diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c deleted file mode 100644 index 1b7e703684f9..000000000000 --- a/arch/x86/vdso/vvar.c +++ /dev/null @@ -1,12 +0,0 @@ -/* Define pointer to external vDSO variables. - These are part of the vDSO. The kernel fills in the real addresses - at boot time. This is done because when the vdso is linked the - kernel isn't yet and we don't know the final addresses. */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/vgtod.h> - -#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC; -#include "vextern.h" diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5b54892e4bc3..5cc821cb2e09 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -38,7 +38,8 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on XEN && PM + depends on XEN + select HIBERNATE_CALLBACKS default y config XEN_DEBUG_FS @@ -48,3 +49,11 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. + +config XEN_DEBUG + bool "Enable Xen debug checks" + depends on XEN + default n + help + Enable various WARN_ON checks in the Xen MMU code. + Enabling this option WILL incur a significant performance overhead. 
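The vdso hunks above wire time() into the vDSO alongside clock_gettime() and gettimeofday(), so a process can read all three clocks without entering the kernel. A minimal userspace sketch to observe this (a hypothetical test program, not part of the patch; whether libc routes each call through the vDSO depends on the libc build, and older glibc needs -lrt for clock_gettime):

/* vdso-demo.c: on x86-64, none of these calls should normally show up
 * as syscalls under strace when the vDSO fast paths are in use. */
#include <stdio.h>
#include <time.h>
#include <sys/time.h>

int main(void)
{
	struct timespec ts;
	struct timeval tv;
	time_t t;

	clock_gettime(CLOCK_MONOTONIC, &ts);	/* __vdso_clock_gettime */
	gettimeofday(&tv, NULL);		/* __vdso_gettimeofday */
	t = time(NULL);				/* __vdso_time, exported above */

	printf("mono=%ld.%09ld wall=%ld.%06ld time=%ld\n",
	       (long)ts.tv_sec, (long)ts.tv_nsec,
	       (long)tv.tv_sec, (long)tv.tv_usec, (long)t);
	return 0;
}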
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50542efe45fb..dd7b88f2ec7a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -235,9 +235,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *dx &= maskedx; } -static __init void xen_init_cpuid_mask(void) +static void __init xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; + unsigned int xsave_mask; cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ @@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void) cpuid_leaf1_edx_mask &= ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ - ax = 1; - cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); - /* cpuid claims we support xsave; try enabling it to see what happens */ - if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { - unsigned long cr4; - - set_in_cr4(X86_CR4_OSXSAVE); - - cr4 = read_cr4(); - - if ((cr4 & X86_CR4_OSXSAVE) == 0) - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); + xsave_mask = + (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); - clear_in_cr4(X86_CR4_OSXSAVE); - } + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + if ((cx & xsave_mask) != xsave_mask) + cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ } static void xen_set_debugreg(int reg, unsigned long val) @@ -407,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr) /* * load_gdt for early boot, when the gdt is only mapped once */ -static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; @@ -669,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, * Version of write_gdt_entry for use at early boot-time needed to * update an entry as simply as possible. 
*/ -static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, const void *desc, int type) { switch (type) { @@ -940,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct pv_info xen_info __initdata = { +static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initdata = { +static const struct pv_init_ops xen_init_ops __initconst = { .patch = xen_patch, }; -static const struct pv_cpu_ops xen_cpu_ops __initdata = { +static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, @@ -1011,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .end_context_switch = xen_end_context_switch, }; -static const struct pv_apic_ops xen_apic_ops __initdata = { +static const struct pv_apic_ops xen_apic_ops __initconst = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, #endif @@ -1062,7 +1055,7 @@ int xen_panic_handler_init(void) return 0; } -static const struct machine_ops __initdata xen_machine_ops = { +static const struct machine_ops xen_machine_ops __initconst = { .restart = xen_restart, .halt = xen_machine_halt, .power_off = xen_machine_halt, @@ -1284,15 +1277,14 @@ static int init_hvm_pv_info(int *major, int *minor) xen_setup_features(); - pv_info = xen_info; - pv_info.kernel_rpl = 0; + pv_info.name = "Xen HVM"; xen_domain_type = XEN_HVM_DOMAIN; return 0; } -void xen_hvm_init_shared_info(void) +void __ref xen_hvm_init_shared_info(void) { int cpu; struct xen_add_to_physmap xatp; @@ -1331,6 +1323,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + if (xen_have_vector_callback) + xen_init_lock_cpu(cpu); break; default: break; @@ -1338,7 +1332,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { +static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { .notifier_call = xen_hvm_cpu_notify, }; @@ -1355,6 +1349,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; + xen_hvm_smp_init(); register_cpu_notifier(&xen_hvm_cpu_notifier); xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; @@ -1386,7 +1381,7 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { +const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { .name = "Xen HVM", .detect = xen_hvm_platform, .init_platform = xen_hvm_guest_init, diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 6a6fe8939645..8bbb465b6f0a 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -113,7 +113,7 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initdata = { +static const struct pv_irq_ops xen_irq_ops __initconst = { .save_fl = PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f6089421147a..dc708dcc62f1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,7 @@ #include <linux/module.h> #include <linux/gfp.h> 
#include <linux/memblock.h> +#include <linux/seq_file.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -74,68 +75,12 @@ #include "mmu.h" #include "debugfs.h" -#define MMU_UPDATE_HISTO 30 - /* * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. + * Also protects non-atomic updates of current_pages and balloon lists. */ DEFINE_SPINLOCK(xen_reservation_lock); -#ifdef CONFIG_XEN_DEBUG_FS - -static struct { - u32 pgd_update; - u32 pgd_update_pinned; - u32 pgd_update_batched; - - u32 pud_update; - u32 pud_update_pinned; - u32 pud_update_batched; - - u32 pmd_update; - u32 pmd_update_pinned; - u32 pmd_update_batched; - - u32 pte_update; - u32 pte_update_pinned; - u32 pte_update_batched; - - u32 mmu_update; - u32 mmu_update_extended; - u32 mmu_update_histo[MMU_UPDATE_HISTO]; - - u32 prot_commit; - u32 prot_commit_batched; - - u32 set_pte_at; - u32 set_pte_at_batched; - u32 set_pte_at_pinned; - u32 set_pte_at_current; - u32 set_pte_at_kernel; -} mmu_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - if (unlikely(zero_stats)) { - memset(&mmu_stats, 0, sizeof(mmu_stats)); - zero_stats = 0; - } -} - -#define ADD_STATS(elem, val) \ - do { check_zero(); mmu_stats.elem += (val); } while(0) - -#else /* !CONFIG_XEN_DEBUG_FS */ - -#define ADD_STATS(elem, val) do { (void)(val); } while(0) - -#endif /* CONFIG_XEN_DEBUG_FS */ - - /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -243,11 +188,6 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } -static bool xen_iomap_pte(pte_t pte) -{ - return pte_flags(pte) & _PAGE_IOMAP; -} - void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; @@ -257,7 +197,7 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) u = mcs.args; /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = arbitrary_virt_to_machine(ptep).maddr; + u->ptr = virt_to_machine(ptep).maddr; u->val = pte_val_ma(pteval); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); @@ -266,11 +206,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) } EXPORT_SYMBOL_GPL(xen_set_domain_pte); -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) -{ - xen_set_domain_pte(ptep, pteval, DOMID_IO); -} - static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -279,27 +214,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update) mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); if (mcs.mc != NULL) { - ADD_STATS(mmu_update_extended, 1); - ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); - mcs.mc->args[1]++; - - if (mcs.mc->args[1] < MMU_UPDATE_HISTO) - ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); - else - ADD_STATS(mmu_update_histo[0], 1); } else { - ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); - ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; *u = *update; } -void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) { struct mmu_update u; @@ -312,17 +237,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); - ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } 
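xen_extend_mmu_update() above is the single funnel for queued page-table writes: it extends the multicall entry that is already open, or starts a fresh HYPERVISOR_mmu_update entry if none is pending. A condensed sketch of the calling pattern the pmd/pud setters follow (queue_pt_write is a hypothetical name; the other identifiers are the patch's own):

/* Queue one page-table write and let lazy-MMU mode decide when the
 * hypercall is actually issued. */
static void queue_pt_write(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();
	xen_mc_batch();

	u.ptr = virt_to_machine(ptr).maddr;	/* machine address of the entry */
	u.val = pmd_val_ma(val);		/* new value, in machine frames */
	xen_extend_mmu_update(&u);		/* append to the open batch */

	/* Flushes immediately unless inside a lazy-MMU section */
	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

The payoff is that a run of set_pte()/set_pmd() calls made under lazy MMU mode collapses into a single hypercall instead of one hypercall per entry.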
-void xen_set_pmd(pmd_t *ptr, pmd_t val) +static void xen_set_pmd(pmd_t *ptr, pmd_t val) { - ADD_STATS(pmd_update, 1); - /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -330,8 +251,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) return; } - ADD_STATS(pmd_update_pinned, 1); - xen_set_pmd_hyper(ptr, val); } @@ -344,35 +263,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); } -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) { - if (xen_iomap_pte(pteval)) { - xen_set_iomap_pte(ptep, pteval); - goto out; - } + struct mmu_update u; - ADD_STATS(set_pte_at, 1); -// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); - ADD_STATS(set_pte_at_current, mm == current->mm); - ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + return false; - if (mm == current->mm || mm == &init_mm) { - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - struct multicall_space mcs; - mcs = xen_mc_entry(0); + xen_mc_batch(); - MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); - ADD_STATS(set_pte_at_batched, 1); - xen_mc_issue(PARAVIRT_LAZY_MMU); - goto out; - } else - if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - goto out; - } - xen_set_pte(ptep, pteval); + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); -out: return; + return true; +} + +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ + if (!xen_batched_set_pte(ptep, pteval)) + native_set_pte(ptep, pteval); +} + +static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + xen_set_pte(ptep, pteval); } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, @@ -389,13 +307,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, xen_mc_batch(); - u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); - ADD_STATS(prot_commit, 1); - ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -416,8 +331,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - unsigned long mfn = pfn_to_mfn(pfn); + unsigned long mfn; + if (!xen_feature(XENFEAT_auto_translated_physmap)) + mfn = get_phys_to_machine(pfn); + else + mfn = pfn; /* * If there's no mfn for the pfn, then just create an * empty non-present pte. Unfortunately this loses @@ -427,8 +346,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (unlikely(mfn == INVALID_P2M_ENTRY)) { mfn = 0; flags = 0; + } else { + /* + * Paramount to do this test _after_ the + * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & + * IDENTITY_FRAME_BIT resolves to true. 
+ */ + mfn &= ~FOREIGN_FRAME_BIT; + if (mfn & IDENTITY_FRAME_BIT) { + mfn &= ~IDENTITY_FRAME_BIT; + flags |= _PAGE_IOMAP; + } } - val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } @@ -449,7 +378,7 @@ static pteval_t iomap_pte(pteval_t val) return val; } -pteval_t xen_pte_val(pte_t pte) +static pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; @@ -466,7 +395,7 @@ pteval_t xen_pte_val(pte_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); -pgdval_t xen_pgd_val(pgd_t pgd) +static pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } @@ -497,7 +426,7 @@ void xen_set_pat(u64 pat) WARN_ON(pat != 0x0007010600070106ull); } -pte_t xen_make_pte(pteval_t pte) +static pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); @@ -532,20 +461,55 @@ pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -pgd_t xen_make_pgd(pgdval_t pgd) +#ifdef CONFIG_XEN_DEBUG +pte_t xen_make_pte_debug(pteval_t pte) +{ + phys_addr_t addr = (pte & PTE_PFN_MASK); + phys_addr_t other_addr; + bool io_page = false; + pte_t _pte; + + if (pte & _PAGE_IOMAP) + io_page = true; + + _pte = xen_make_pte(pte); + + if (!addr) + return _pte; + + if (io_page && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { + other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; + WARN_ONCE(addr != other_addr, + "0x%lx is using VM_IO, but it is 0x%lx!\n", + (unsigned long)addr, (unsigned long)other_addr); + } else { + pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; + other_addr = (_pte.pte & PTE_PFN_MASK); + WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set), + "0x%lx is missing VM_IO (and wasn't fixed)!\n", + (unsigned long)addr); + } + + return _pte; +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); +#endif + +static pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); -pmdval_t xen_pmd_val(pmd_t pmd) +static pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); -void xen_set_pud_hyper(pud_t *ptr, pud_t val) +static void xen_set_pud_hyper(pud_t *ptr, pud_t val) { struct mmu_update u; @@ -558,17 +522,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); - ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } -void xen_set_pud(pud_t *ptr, pud_t val) +static void xen_set_pud(pud_t *ptr, pud_t val) { - ADD_STATS(pud_update, 1); - /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -576,56 +536,28 @@ void xen_set_pud(pud_t *ptr, pud_t val) return; } - ADD_STATS(pud_update_pinned, 1); - xen_set_pud_hyper(ptr, val); } -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - if (xen_iomap_pte(pte)) { - xen_set_iomap_pte(ptep, pte); - return; - } - - ADD_STATS(pte_update, 1); -// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); - ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - -#ifdef CONFIG_X86_PAE - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -#else - *ptep = pte; -#endif -} - #ifdef CONFIG_X86_PAE -void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - if (xen_iomap_pte(pte)) { - xen_set_iomap_pte(ptep, pte); - return; - } - set_64bit((u64 *)ptep, native_pte_val(pte)); } -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep->pte_low = 0; - smp_wmb(); /* make sure low gets written first */ - ptep->pte_high = 0; + if (!xen_batched_set_pte(ptep, native_make_pte(0))) + native_pte_clear(mm, addr, ptep); } -void xen_pmd_clear(pmd_t *pmdp) +static void xen_pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } #endif /* CONFIG_X86_PAE */ -pmd_t xen_make_pmd(pmdval_t pmd) +static pmd_t xen_make_pmd(pmdval_t pmd) { pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); @@ -633,13 +565,13 @@ pmd_t xen_make_pmd(pmdval_t pmd) PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 -pudval_t xen_pud_val(pud_t pud) +static pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); -pud_t xen_make_pud(pudval_t pud) +static pud_t xen_make_pud(pudval_t pud) { pud = pte_pfn_to_mfn(pud); @@ -647,7 +579,7 @@ pud_t xen_make_pud(pudval_t pud) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); -pgd_t *xen_get_user_pgd(pgd_t *pgd) +static pgd_t *xen_get_user_pgd(pgd_t *pgd) { pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); unsigned offset = pgd - pgd_page; @@ -679,7 +611,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) * 2. It is always pinned * 3. It has no user pagetable attached to it */ -void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) { preempt_disable(); @@ -692,12 +624,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) preempt_enable(); } -void xen_set_pgd(pgd_t *ptr, pgd_t val) +static void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); - ADD_STATS(pgd_update, 1); - /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -709,9 +639,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) return; } - ADD_STATS(pgd_update_pinned, 1); - ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - /* If it's pinned, then we can at least batch the kernel and user updates together. */ xen_mc_batch(); @@ -1005,7 +932,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { SetPagePinned(page); @@ -1113,14 +1040,14 @@ void xen_mm_unpin_all(void) spin_unlock(&pgd_lock); } -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); xen_pgd_pin(mm); @@ -1138,7 +1065,7 @@ static void drop_other_mm_ref(void *info) active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure @@ -1207,7 +1134,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) * pagetable because of lazy tlb flushing. This means we need need to * switch all CPUs off this pagetable before we can unpin it. 
*/ -void xen_exit_mmap(struct mm_struct *mm) +static void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ xen_drop_mm_ref(mm); @@ -1222,13 +1149,27 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static __init void xen_pagetable_setup_start(pgd_t *base) +static void __init xen_pagetable_setup_start(pgd_t *base) +{ +} + +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) { + /* reserve the range used */ + native_pagetable_reserve(start, end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, + PFN_PHYS(pgt_buf_top)); + while (end < PFN_PHYS(pgt_buf_top)) { + make_lowmem_page_readwrite(__va(end)); + end += PAGE_SIZE; + } } static void xen_post_allocator_init(void); -static __init void xen_pagetable_setup_done(pgd_t *base) +static void __init xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); xen_post_allocator_init(); @@ -1424,32 +1365,39 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - #ifdef CONFIG_X86_32 +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) +{ /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & pte_val_ma(pte)); -#endif + + return pte; +} +#else /* CONFIG_X86_64 */ +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot, make sure it is RO. + * early_ioremap fixmap slot as a freshly allocated page, make sure + * it is RO. */ - if (!is_early_ioremap_ptep(ptep) && - pfn >= e820_table_start && pfn < e820_table_end) + if (((!is_early_ioremap_ptep(ptep) && + pfn >= pgt_buf_start && pfn < pgt_buf_top)) || + (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) pte = pte_wrprotect(pte); return pte; } +#endif /* CONFIG_X86_64 */ /* Init-time set_pte while constructing initial pagetables, which doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { pte = mask_rw_pte(ptep, pte); @@ -1467,7 +1415,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -1477,7 +1425,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) } /* Used for pmd and pud */ -static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -1487,13 +1435,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. 
*/ -static __init void xen_release_pte_init(unsigned long pfn) +static void __init xen_release_pte_init(unsigned long pfn) { pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static __init void xen_release_pmd_init(unsigned long pfn) +static void __init xen_release_pmd_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -1619,7 +1567,7 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; unsigned ident_pte; @@ -1651,9 +1599,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - if (!pte_none(pte_page[pteidx])) continue; @@ -1695,7 +1640,7 @@ static void convert_pfn_mfn(void *v) } /* - * Set up the inital kernel pagetable. + * Set up the initial kernel pagetable. * * We can construct this by grafting the Xen provided pagetable into * head_64.S's preconstructed pagetables. We copy the Xen L2's into @@ -1705,12 +1650,18 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); @@ -1770,7 +1721,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); -static __init void xen_write_cr3_init(unsigned long cr3) +static void __init xen_write_cr3_init(unsigned long cr3) { unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); @@ -1807,7 +1758,7 @@ static __init void xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1815,9 +1766,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); @@ -1915,7 +1864,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_ident_map_ISA(void) +void __init xen_ident_map_ISA(void) { unsigned long pa; @@ -1938,8 +1887,11 @@ __init void xen_ident_map_ISA(void) xen_flush_tlb(); } -static __init void xen_post_allocator_init(void) +static void __init xen_post_allocator_init(void) { +#ifdef CONFIG_XEN_DEBUG + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); +#endif pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; 
pv_mmu_ops.set_pud = xen_set_pud; @@ -1972,7 +1924,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initdata = { +static const struct pv_mmu_ops xen_mmu_ops __initconst = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, @@ -2045,6 +1997,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { void __init xen_init_mmu_ops(void) { + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; @@ -2072,7 +2025,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, in_frames[i] = virt_to_mfn(vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); if (out_frames) out_frames[i] = virt_to_pfn(vaddr); @@ -2296,7 +2249,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, struct remap_data *rmd = data; pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); - rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; rmd->mmu_update->val = pte_val_ma(pte); rmd->mmu_update++; @@ -2350,64 +2303,15 @@ out: EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); #ifdef CONFIG_XEN_DEBUG_FS - -static struct dentry *d_mmu_debug; - -static int __init xen_mmu_debugfs(void) +static int p2m_dump_open(struct inode *inode, struct file *filp) { - struct dentry *d_xen = xen_init_debugfs(); - - if (d_xen == NULL) - return -ENOMEM; - - d_mmu_debug = debugfs_create_dir("mmu", d_xen); - - debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); - - debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); - debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, - &mmu_stats.pgd_update_pinned); - debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, - &mmu_stats.pgd_update_pinned); - - debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); - debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, - &mmu_stats.pud_update_pinned); - debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, - &mmu_stats.pud_update_pinned); - - debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); - debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, - &mmu_stats.pmd_update_pinned); - debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, - &mmu_stats.pmd_update_pinned); - - debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); -// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, -// &mmu_stats.pte_update_pinned); - debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, - &mmu_stats.pte_update_pinned); - - debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); - debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, - &mmu_stats.mmu_update_extended); - xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, - mmu_stats.mmu_update_histo, 20); - - debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); - debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, - &mmu_stats.set_pte_at_batched); - debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, - &mmu_stats.set_pte_at_current); - debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, - &mmu_stats.set_pte_at_kernel); - - 
debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); - debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, - &mmu_stats.prot_commit_batched); - - return 0; + return single_open(filp, p2m_dump_show, NULL); } -fs_initcall(xen_mmu_debugfs); -#endif /* CONFIG_XEN_DEBUG_FS */ +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 537bb9aab777..73809bb951b4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); -void xen_exit_mmap(struct mm_struct *mm); - -pteval_t xen_pte_val(pte_t); -pmdval_t xen_pmd_val(pmd_t); -pgdval_t xen_pgd_val(pgd_t); - -pte_t xen_make_pte(pteval_t); -pmd_t xen_make_pmd(pmdval_t); -pgd_t xen_make_pgd(pgdval_t); - -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - -#ifdef CONFIG_X86_PAE -void xen_set_pte_atomic(pte_t *ptep, pte_t pte); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); -#endif /* CONFIG_X86_PAE */ - -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); -void xen_set_pud(pud_t *ptr, pud_t val); -void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); -void xen_set_pud_hyper(pud_t *ptr, pud_t val); - -#if PAGETABLE_LEVELS == 4 -pudval_t xen_pud_val(pud_t pud); -pud_t xen_make_pud(pudval_t pudval); -void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); -void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); -#endif - -pgd_t *xen_get_user_pgd(pgd_t *pgd); - pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index fd12d7ce7ff9..58efeb9d5440 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -23,6 +23,129 @@ * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to * 512 and 1024 entries respectively. + * + * In short, these structures contain the Machine Frame Number (MFN) of the PFN. + * + * However not all entries are filled with MFNs. Specifically for all other + * leaf entries, or for the top root, or middle one, for which there is a void + * entry, we assume it is "missing". So (for example) + * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * + * We also have the possibility of setting 1-1 mappings on certain regions, so + * that: + * pfn_to_mfn(0xc0000)=0xc0000 + * + * The benefit of this is, that we can assume for non-RAM regions (think + * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * get the PFN value to match the MFN. + * + * For this to work efficiently we have one new page p2m_identity and + * allocate (via reserved_brk) any other pages we need to cover the sides + * (1GB or 4MB boundary violations). All entries in p2m_identity are set to + * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, + * no other fancy value). 
+ * + * On lookup we spot that the entry points to p2m_identity and return the + * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. + * If the entry points to an allocated page, we just proceed as before and + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * appropriate functions (pfn_to_mfn). + * + * The reason for having the IDENTITY_FRAME_BIT instead of just returning the + * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a + * non-identity pfn. To protect ourselves against we elect to set (and get) the + * IDENTITY_FRAME_BIT on all identity mapped PFNs. + * + * This simplistic diagram is used to explain the more subtle piece of code. + * There is also a digram of the P2M at the end that can help. + * Imagine your E820 looking as so: + * + * 1GB 2GB + * /-------------------+---------\/----\ /----------\ /---+-----\ + * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | + * \-------------------+---------/\----/ \----------/ \---+-----/ + * ^- 1029MB ^- 2001MB + * + * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), + * 2048MB = 524288 (0x80000)] + * + * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB + * is actually not present (would have to kick the balloon driver to put it in). + * + * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: + * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start + * of the PFN and the end PFN (263424 and 512256 respectively). The first step + * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page + * covers 512^2 of page estate (1GB) and in case the start or end PFN is not + * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn + * to end pfn. We reserve_brk top leaf pages if they are missing (means they + * point to p2m_mid_missing). + * + * With the E820 example above, 263424 is not 1GB aligned so we allocate a + * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. + * Each entry in the allocate page is "missing" (points to p2m_missing). + * + * Next stage is to determine if we need to do a more granular boundary check + * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. + * We check if the start pfn and end pfn violate that boundary check, and if + * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * granularity of setting which PFNs are missing and which ones are identity. + * In our example 263424 and 512256 both fail the check so we reserve_brk two + * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" + * values) and assign them to p2m[1][2] and p2m[1][488] respectively. + * + * At this point we would at minimum reserve_brk one page, but could be up to + * three. Each call to set_phys_range_identity has at maximum a three page + * cost. If we were to query the P2M at this stage, all those entries from + * start PFN through end PFN (so 1029MB -> 2001MB) would return + * INVALID_P2M_ENTRY ("missing"). + * + * The next step is to walk from the start pfn to the end pfn setting + * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. + * If we find that the middle leaf is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. 
At this + * point we do not need to worry about boundary aligment (so no need to + * reserve_brk a middle page, figure out which PFNs are "missing" and which + * ones are identity), as that has been done earlier. If we find that the + * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference + * that page (which covers 512 PFNs) and set the appropriate PFN with + * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we + * set from p2m[1][2][256->511] and p2m[1][488][0->256] with + * IDENTITY_FRAME_BIT set. + * + * All other regions that are void (or not filled) either point to p2m_missing + * (considered missing) or have the default value of INVALID_P2M_ENTRY (also + * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] + * contain the INVALID_P2M_ENTRY value and are considered "missing." + * + * This is what the p2m ends up looking (for the E820 above) with this + * fabulous drawing: + * + * p2m /--------------\ + * /-----\ | &mfn_list[0],| /-----------------\ + * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | + * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | + * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | + * |-----| \ | [p2m_identity]+\\ | .... | + * | 2 |--\ \-------------------->| ... | \\ \----------------/ + * |-----| \ \---------------/ \\ + * | 3 |\ \ \\ p2m_identity + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ / | [p2m_identity]+-->| ..., ~0 | + * / /---------------\ | .... | \-----------------/ + * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | + * / | IDENTITY[@256]|<----/ \---------------/ + * / | ~0, ~0, .... | + * | \---------------/ + * | + * p2m_missing p2m_missing + * /------------------\ /------------\ + * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | + * | [p2m_mid_missing]+---->| ..., ~0 | + * \------------------/ \------------/ + * + * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ #include <linux/init.h> @@ -30,6 +153,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/sched.h> +#include <linux/seq_file.h> #include <asm/cache.h> #include <asm/setup.h> @@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); + RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +/* We might hit two boundary violations at the start and end, at max each + * boundary violation will require three middle nodes. */ +RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m) * - After resume we're called from within stop_machine, but the mfn * tree should alreay be completely allocated. 
*/ -void xen_build_mfn_list_list(void) +void __ref xen_build_mfn_list_list(void) { unsigned long pfn; @@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); + /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* + * The INVALID_P2M_ENTRY is filled in both p2m_*identity + * and in p2m_*missing, so returning the INVALID_P2M_ENTRY + * would be wrong. + */ + if (p2m_top[topidx][mididx] == p2m_identity) + return IDENTITY_FRAME(pfn); + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); @@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn) p2m_top_mfn_p[topidx] = mid_mfn; } - if (p2m_top[topidx][mididx] == p2m_missing) { + if (p2m_top[topidx][mididx] == p2m_identity || + p2m_top[topidx][mididx] == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; + unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) @@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn) p2m_init(p2m); - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) free_p2m_page(p2m); else mid_mfn[mididx] = virt_to_mfn(p2m); @@ -354,11 +497,117 @@ static bool alloc_p2m(unsigned long pfn) return true; } +static bool __init __early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* Pfff.. No boundary cross-over, lets get out. */ + if (!idx) + return false; + + WARN(p2m_top[topidx][mididx] == p2m_identity, + "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", + topidx, mididx); + + /* + * Could be done by xen_build_dynamic_phys_to_machine.. + */ + if (p2m_top[topidx][mididx] != p2m_missing) + return false; + + /* Boundary cross-over for the edges: */ + if (idx) { + unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; + + p2m_init(p2m); + + p2m_top[topidx][mididx] = p2m; + + /* For save/restore we need to MFN of the P2M saved */ + + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + } + return idx != 0; +} +unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e) +{ + unsigned long pfn; + + if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + return 0; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn_e - pfn_s; + + if (pfn_s > pfn_e) + return 0; + + for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); + pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); + pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) + { + unsigned topidx = p2m_top_index(pfn); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. 
*/ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in __early_alloc_p2m */ + } + } + + __early_alloc_p2m(pfn_s); + __early_alloc_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e; pfn++) + if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) + break; + + if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + "Identity mapping failed. We are %ld short of 1-1 mappings!\n", + (pfn_e - pfn_s) - (pfn - pfn_s))) + printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + + return pfn - pfn_s; +} + /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, mididx, idx; + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; @@ -368,6 +617,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* For sparse holes were the p2m leaf has real PFN along with + * PCI holes, stick in the PFN as the MFN value. + */ + if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx][mididx] == p2m_identity) + return true; + + /* Swap over from MISSING to IDENTITY if needed. */ + if (p2m_top[topidx][mididx] == p2m_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, + p2m_identity) != p2m_missing); + return true; + } + } + if (p2m_top[topidx][mididx] == p2m_missing) return mfn == INVALID_P2M_ENTRY; @@ -378,11 +642,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; @@ -417,11 +676,11 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page) +int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) { unsigned long flags; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -429,7 +688,6 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); - if (WARN(ptep == NULL || level != PG_LEVEL_4K, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; @@ -438,24 +696,25 @@ int m2p_add_override(unsigned long mfn, struct page *page) page->private = mfn; page->index = pfn_to_mfn(pfn); - __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); - if (!PageHighMem(page)) + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + return -ENOMEM; + + if (clear_pte && !PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); - spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); return 0; } - -int m2p_remove_override(struct page *page) +EXPORT_SYMBOL_GPL(m2p_add_override); +int m2p_remove_override(struct page *page, bool clear_pte) 
{ unsigned long flags; unsigned long mfn; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -476,9 +735,9 @@ int m2p_remove_override(struct page *page) spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - __set_phys_to_machine(pfn, page->index); + set_phys_to_machine(pfn, page->index); - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); /* No tlb flush necessary because the caller already @@ -486,6 +745,7 @@ int m2p_remove_override(struct page *page) return 0; } +EXPORT_SYMBOL_GPL(m2p_remove_override); struct page *m2p_find_override(unsigned long mfn) { @@ -520,3 +780,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) return ret; } EXPORT_SYMBOL_GPL(m2p_find_override_pfn); + +#ifdef CONFIG_XEN_DEBUG_FS + +int p2m_dump_show(struct seq_file *m, void *v) +{ + static const char * const level_name[] = { "top", "middle", + "entry", "abnormal" }; + static const char * const type_name[] = { "identity", "missing", + "pfn", "abnormal"}; +#define TYPE_IDENTITY 0 +#define TYPE_MISSING 1 +#define TYPE_PFN 2 +#define TYPE_UNKNOWN 3 + unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; + unsigned int uninitialized_var(prev_level); + unsigned int uninitialized_var(prev_type); + + if (!p2m_top) + return 0; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned idx = p2m_index(pfn); + unsigned lvl, type; + + lvl = 4; + type = TYPE_UNKNOWN; + if (p2m_top[topidx] == p2m_mid_missing) { + lvl = 0; type = TYPE_MISSING; + } else if (p2m_top[topidx] == NULL) { + lvl = 0; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == NULL) { + lvl = 1; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == p2m_identity) { + lvl = 1; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx] == p2m_missing) { + lvl = 1; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == 0) { + lvl = 2; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { + lvl = 2; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { + lvl = 2; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == pfn) { + lvl = 2; type = TYPE_PFN; + } else if (p2m_top[topidx][mididx][idx] != pfn) { + lvl = 2; type = TYPE_PFN; + } + if (pfn == 0) { + prev_level = lvl; + prev_type = type; + } + if (pfn == MAX_DOMAIN_PAGES-1) { + lvl = 3; + type = TYPE_UNKNOWN; + } + if (prev_type != type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", + prev_pfn_type, pfn, type_name[prev_type]); + prev_pfn_type = pfn; + prev_type = type; + } + if (prev_level != lvl) { + seq_printf(m, " [0x%lx->0x%lx] level %s\n", + prev_pfn_level, pfn, level_name[prev_level]); + prev_pfn_level = pfn; + prev_level = lvl; + } + } + return 0; +#undef TYPE_IDENTITY +#undef TYPE_MISSING +#undef TYPE_PFN +#undef TYPE_UNKNOWN +} +#endif diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index bfd0632fe65e..b480d4207a4c 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void) /* If running as PV guest, either iommu=soft, or swiotlb=force will * activate this IOMMU. If running as PV privileged, activate it - * irregardlesss. + * irregardless. 
 	 */
 	if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
 	    (xen_pv_domain()))
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a50d446..be1a464f6d66 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -50,8 +50,10 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
  */
 #define EXTRA_MEM_RATIO		(10)
 
-static __init void xen_add_extra_mem(unsigned long pages)
+static void __init xen_add_extra_mem(unsigned long pages)
 {
+	unsigned long pfn;
+
 	u64 size = (u64)pages * PAGE_SIZE;
 	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
 
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
 	xen_extra_mem_size += size;
 
 	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+
+	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
 static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
 		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
 		     start, end, ret);
 		if (ret == 1) {
-			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 			len++;
 		}
 	}
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
 	return released;
 }
 
+static unsigned long __init xen_set_identity(const struct e820entry *list,
+					     ssize_t map_size)
+{
+	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+	phys_addr_t start_pci = last;
+	const struct e820entry *entry;
+	unsigned long identity = 0;
+	int i;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t start = entry->addr;
+		phys_addr_t end = start + entry->size;
+
+		if (start < last)
+			start = last;
+
+		if (end <= start)
+			continue;
+
+		/* Skip over the 1MB region. */
+		if (last > end)
+			continue;
+
+		if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
+			if (start > start_pci)
+				identity += set_phys_range_identity(
+						PFN_UP(start_pci), PFN_DOWN(start));
+
+			/* Without saving 'last' we would gobble RAM too
+			 * at the end of the loop. */
+			last = end;
+			start_pci = end;
+			continue;
+		}
+		start_pci = min(start, start_pci);
+		last = end;
+	}
+	if (last > start_pci)
+		identity += set_phys_range_identity(
+					PFN_UP(start_pci), PFN_DOWN(last));
+	return identity;
+}
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
+	static struct e820entry map_raw[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long extra_pages = 0;
 	unsigned long extra_limit;
+	unsigned long identity_pages = 0;
 	int i;
 	int op;
 
@@ -176,8 +225,13 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);
 
+	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
+#ifdef CONFIG_X86_32
 	xen_extra_mem_start = mem_end;
+#else
+	xen_extra_mem_start = max((1ULL << 32), mem_end);
+#endif
 	for (i = 0; i < memmap.nr_entries; i++) {
 		unsigned long long end;
 
@@ -194,6 +248,15 @@ char * __init xen_memory_setup(void)
 			end -= delta;
 
 			extra_pages += PFN_DOWN(delta);
+			/*
+			 * Mark RAM below 4GB that is not assigned to us as
+			 * unusable. This keeps "System RAM" address space
+			 * from being handed out later as an I/O resource
+			 * (which happens when 'allocate_resource' is called).
+			 */
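
xen_set_identity() above is essentially a gap-walk over the E820: everything that is not RAM gets 1-1 p2m entries. A simplified standalone model of that walk follows; the region values are invented for illustration, and it ignores the dom0/ISA-region and E820_UNUSABLE special cases the real function handles:

/* Toy model of the xen_set_identity() gap-walk over a hypothetical E820. */
#include <stdio.h>

#define E820_RAM	1
#define E820_RESERVED	2
#define PAGE_SHIFT	12

struct region { unsigned long long addr, size; int type; };

int main(void)
{
	const struct region map[] = {
		{ 0x0000000000ULL, 0x0009f000ULL, E820_RAM      },
		{ 0x00000f0000ULL, 0x00010000ULL, E820_RESERVED },
		{ 0x0000100000ULL, 0x7ff00000ULL, E820_RAM      },
	};
	unsigned long long last = 0, start_gap = 0;
	unsigned long identity = 0;
	unsigned i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		unsigned long long start = map[i].addr;
		unsigned long long end = start + map[i].size;

		if (map[i].type == E820_RAM) {
			/* Everything between the end of the previous RAM
			 * region and the start of this one is a hole. */
			if (start > start_gap) {
				printf("identity: %#llx-%#llx\n", start_gap, start);
				identity += (start - start_gap) >> PAGE_SHIFT;
			}
			start_gap = end;
		}
		last = end > last ? end : last;
	}
	if (last > start_gap) {
		printf("identity: %#llx-%#llx\n", start_gap, last);
		identity += (last - start_gap) >> PAGE_SHIFT;
	}
	printf("%lu identity page(s)\n", identity);
	return 0;
}

With the toy map above, the reserved region and the hole around it (0x9f000-0x100000) come out as one identity range, which is the same coalescing the printk in xen_memory_setup() reports.
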
+			if (delta &&
+				(xen_initial_domain() && end < 0x100000000ULL))
+				e820_add_region(end, delta, E820_UNUSABLE);
 		}
 
 		if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +314,13 @@ char * __init xen_memory_setup(void)
 
 	xen_add_extra_mem(extra_pages);
 
+	/*
+	 * Set the P2M entries for all non-RAM pages and E820 gaps to be
+	 * identity-type PFNs. We supply the non-sanitized version of
+	 * the E820 for this.
+	 */
+	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }
 
@@ -270,7 +340,7 @@ static void __init fiddle_vdso(void)
 #endif
 }
 
-static __cpuinit int register_callback(unsigned type, const void *func)
+static int __cpuinit register_callback(unsigned type, const void *func)
 {
 	struct callback_register callback = {
 		.type = type,
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c7959045..41038c01de40 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
 
 /*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule callback.
  */
 static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 {
 	inc_irq_stat(irq_resched_count);
+	scheduler_ipi();
 
 	return IRQ_HANDLED;
 }
 
-static __cpuinit void cpu_bringup(void)
+static void __cpuinit cpu_bringup(void)
 {
 	int cpu = smp_processor_id();
 
@@ -85,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
 	wmb();			/* make sure everything is out */
 }
 
-static __cpuinit void cpu_bringup_and_idle(void)
+static void __cpuinit cpu_bringup_and_idle(void)
 {
 	cpu_bringup();
 	cpu_idle();
@@ -242,7 +241,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 	}
 }
 
-static __cpuinit int
+static int __cpuinit
 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 	struct vcpu_guest_context *ctxt;
@@ -486,7 +485,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static const struct smp_ops xen_smp_ops __initdata = {
+static const struct smp_ops xen_smp_ops __initconst = {
 	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
 	.smp_prepare_cpus = xen_smp_prepare_cpus,
 	.smp_cpus_done = xen_smp_cpus_done,
@@ -509,3 +508,41 @@ void __init xen_smp_init(void)
 	xen_fill_possible_map();
 	xen_init_spinlocks();
 }
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	if (!xen_have_vector_callback)
+		return;
+	xen_init_lock_cpu(0);
+	xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+	int rc;
+	rc = native_cpu_up(cpu);
+	WARN_ON(xen_smp_intr_init(cpu));
+	return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_up = xen_hvm_cpu_up;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
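
Note how xen_hvm_smp_init() above installs individual hooks over the already-populated native smp_ops rather than supplying a whole table, so any operation Xen does not override keeps going through the baremetal path. A sketch of that override pattern, with stand-in names rather than kernel symbols:

/* Sketch of the partial ops-table override used by xen_hvm_smp_init(). */
#include <stdio.h>

struct smp_ops_model {
	void (*prepare_cpus)(unsigned int max_cpus);
	int  (*cpu_up)(unsigned int cpu);
};

static void native_prepare_cpus(unsigned int n) { printf("native prepare %u\n", n); }
static int  native_cpu_up(unsigned int c)       { printf("native up %u\n", c); return 0; }

static struct smp_ops_model smp_ops = {
	.prepare_cpus = native_prepare_cpus,
	.cpu_up = native_cpu_up,
};

/* The Xen HVM variant wraps the native bring-up and adds its own
 * per-cpu interrupt setup, mirroring xen_hvm_cpu_up() above. */
static int xen_hvm_cpu_up_model(unsigned int cpu)
{
	int rc = native_cpu_up(cpu);
	printf("bind per-cpu Xen IPIs for cpu %u\n", cpu);
	return rc;
}

int main(void)
{
	smp_ops.cpu_up = xen_hvm_cpu_up_model;	/* override just one hook */
	smp_ops.prepare_cpus(4);
	smp_ops.cpu_up(1);
	return 0;
}

The matching xen_hvm_cpu_die() then unbinds exactly the four per-cpu IRQs that xen_smp_intr_init() bound, keeping bring-up and tear-down symmetric.
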
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a129b5..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
 		BUG();
 }
 
-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
+#ifdef CONFIG_XEN_PVHVM
 	int cpu;
 	xen_hvm_init_shared_info();
 	xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 			xen_setup_runstate_info(cpu);
 		}
 	}
+#endif
 }
 
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e3d6a5..5158c505bef9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
 
 #include "xen-ops.h"
 
-#define XEN_SHIFT 22
-
 /* Xen may fire a timer up to this many ns early */
 #define TIMER_SLOP	100000
 #define NS_PER_TICK	(1000000000LL / HZ)
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
 	.rating = 400,
 	.read = xen_clocksource_get_cycles,
 	.mask = ~0,
-	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
-	.shift = XEN_SHIFT,
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -397,7 +393,9 @@ void xen_setup_timer(int cpu)
 		name = "<timer kasprintf failed>";
 
 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+				      IRQF_DISABLED|IRQF_PERCPU|
+				      IRQF_NOBALANCING|IRQF_TIMER|
+				      IRQF_FORCE_RESUME,
 				      name, NULL);
 
 	evt = &per_cpu(xen_clock_events, cpu);
@@ -437,16 +435,16 @@ void xen_timer_resume(void)
 	}
 }
 
-static const struct pv_time_ops xen_time_ops __initdata = {
+static const struct pv_time_ops xen_time_ops __initconst = {
 	.sched_clock = xen_clocksource_read,
 };
 
-static __init void xen_time_init(void)
+static void __init xen_time_init(void)
 {
 	int cpu = smp_processor_id();
 	struct timespec tp;
 
-	clocksource_register(&xen_clocksource);
+	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
 
 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
 		/* Successfully turned off 100Hz tick, so we have the
@@ -466,7 +464,7 @@ static __init void xen_time_init(void)
 	xen_setup_cpu_clockevents();
 }
 
-__init void xen_init_time_ops(void)
+void __init xen_init_time_ops(void)
 {
 	pv_time_ops = xen_time_ops;
 
@@ -488,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
 	xen_setup_cpu_clockevents();
 }
 
-__init void xen_hvm_init_time_ops(void)
+void __init xen_hvm_init_time_ops(void)
 {
 	/* vector callback is needed otherwise we cannot receive interrupts
 	 * on cpu > 0 and at this point we don't know how many cpus are
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
 	__FINIT
 
 .pushsection .text
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE_asm
+	.skip PAGE_SIZE
 .popsection
 
 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
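
On the time.c change above: dropping the hand-rolled XEN_SHIFT in favour of clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC) lets the clocksource core compute mult/shift itself. The core converts counter deltas as ns = (delta * mult) >> shift, so for a counter that already ticks in nanoseconds any mult == 1 << shift is an exact identity; that is all the old hardcoded mult = 1<<22, shift = 22 pair ever expressed. A quick standalone check of that arithmetic:

/* Verify that mult = 1 << shift is an identity for a nanosecond counter. */
#include <stdio.h>
#include <stdint.h>

static uint64_t cyc2ns(uint64_t delta, uint32_t mult, uint32_t shift)
{
	return (delta * mult) >> shift;
}

int main(void)
{
	uint64_t delta = 123456789;	/* "cycles" that are already ns */

	printf("%llu cycles -> %llu ns\n",
	       (unsigned long long)delta,
	       (unsigned long long)cyc2ns(delta, 1u << 22, 22));
	return 0;
}
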
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf985757..97dfdc8757b3 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,15 +64,17 @@ void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
 
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init xen_init_spinlocks(void);
 
-__cpuinit void xen_init_lock_cpu(int cpu);
+void __cpuinit xen_init_lock_cpu(int cpu);
 void xen_uninit_lock_cpu(int cpu);
 #else
 static inline void xen_init_spinlocks(void)