diff options
Diffstat (limited to 'arch/x86')
212 files changed, 8204 insertions, 3044 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ded8a6774ac9..d69f1cd87fd9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 @@ -144,7 +145,7 @@ config INSTRUCTION_DECODER config PERF_EVENTS_INTEL_UNCORE def_bool y - depends on PERF_EVENTS && SUP_SUP_INTEL && PCI + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI config OUTPUT_FORMAT string @@ -248,6 +249,10 @@ config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI +config X86_INTEL_MPX + def_bool y + depends on CPU_SUP_INTEL + config X86_32_SMP def_bool y depends on X86_32 && SMP @@ -988,6 +993,24 @@ config X86_ESPFIX64 def_bool y depends on X86_16BIT && X86_64 +config X86_VSYSCALL_EMULATION + bool "Enable vsyscall emulation" if EXPERT + default y + depends on X86_64 + ---help--- + This enables emulation of the legacy vsyscall page. Disabling + it is roughly equivalent to booting with vsyscall=none, except + that it will also disable the helpful warning if a program + tries to use a vsyscall. With this option set to N, offending + programs will just segfault, citing addresses of the form + 0xffffffffff600?00. + + This option is required by many programs built before 2013, and + care should be used even with newer programs if set to N. + + Disabling this option saves about 7K of kernel size and + possibly 4K of additional runtime pagetable memory. + config TOSHIBA tristate "Toshiba Laptop support" depends on X86_32 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 704f58aa79cd..d999398928bc 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -3,6 +3,18 @@ # # create a compressed vmlinux image from the original vmlinux # +# vmlinuz is: +# decompression code (*.o) +# asm globals (piggy.S), including: +# vmlinux.bin.(gz|bz2|lzma|...) +# +# vmlinux.bin is: +# vmlinux stripped of debugging and comments +# vmlinux.bin.all is: +# vmlinux.bin + vmlinux.relocs +# vmlinux.bin.(gz|bz2|lzma|...) is: +# (see scripts/Makefile.lib size_append) +# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 @@ -35,7 +47,8 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone -vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o +vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ + $(objtree)/drivers/firmware/efi/libstub/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) @@ -76,8 +89,10 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 +RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ + perl $(srctree)/arch/x86/tools/calc_run_size.pl) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 1acf605a646d..92b9a5f2aed6 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -21,8 +21,10 @@ static efi_system_table_t *sys_table; static struct efi_config *efi_early; -#define efi_call_early(f, ...) \ - efi_early->call(efi_early->f, __VA_ARGS__); +__pure const struct efi_config *__efi_early(void) +{ + return efi_early; +} #define BOOT_SERVICES(bits) \ static void setup_boot_services##bits(struct efi_config *c) \ @@ -285,8 +287,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } } -#include "../../../../drivers/firmware/efi/libstub/efi-stub-helper.c" - static void find_bits(unsigned long mask, u8 *pos, u8 *size) { u8 first, len; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index c88c31ecad12..d487e727f1ec 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -103,20 +103,4 @@ struct efi_uga_draw_protocol { void *blt; }; -struct efi_config { - u64 image_handle; - u64 table; - u64 allocate_pool; - u64 allocate_pages; - u64 get_memory_map; - u64 free_pool; - u64 free_pages; - u64 locate_handle; - u64 handle_protocol; - u64 exit_boot_services; - u64 text_output; - efi_status_t (*call)(unsigned long, ...); - bool is64; -} __packed; - #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index cbed1407a5cd..1d7fbbcc196d 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -207,7 +207,8 @@ relocated: * Do the decompression, and jump to the new kernel.. */ /* push arguments for decompress_kernel: */ - pushl $z_output_len /* decompressed length */ + pushl $z_run_size /* size of kernel with .bss and .brk */ + pushl $z_output_len /* decompressed length, end of relocs */ leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ @@ -217,7 +218,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel /* returns kernel location in %eax */ - addl $24, %esp + addl $28, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2884e0c3e8a5..6b1766c6c082 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -402,13 +402,16 @@ relocated: * Do the decompression, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ + movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ + pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ - movq $z_output_len, %r9 /* decompressed length */ + movq $z_output_len, %r9 /* decompressed length, end of relocs */ call decompress_kernel /* returns kernel location in %rax */ + popq %r9 popq %rsi /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 57ab74df7eea..dcc1c536cc21 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -260,7 +260,7 @@ static void handle_relocations(void *output, unsigned long output_len) /* * Process relocations: 32 bit relocations first then 64 bit after. - * Two sets of binary relocations are added to the end of the kernel + * Three sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel * address of the location which needs to be updated stored as a * 32-bit value which is sign extended to 64 bits. @@ -270,6 +270,8 @@ static void handle_relocations(void *output, unsigned long output_len) * kernel bits... * 0 - zero terminator for 64 bit relocations * 64 bit relocation repeated + * 0 - zero terminator for inverse 32 bit relocations + * 32 bit inverse relocation repeated * 0 - zero terminator for 32 bit relocations * 32 bit relocation repeated * @@ -286,6 +288,16 @@ static void handle_relocations(void *output, unsigned long output_len) *(uint32_t *)ptr += delta; } #ifdef CONFIG_X86_64 + while (*--reloc) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("inverse 32-bit relocation outside of kernel!\n"); + + *(int32_t *)ptr -= delta; + } for (reloc--; *reloc; reloc--) { long extended = *reloc; extended += map; @@ -358,7 +370,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len) + unsigned long output_len, + unsigned long run_size) { real_mode = rmode; @@ -381,8 +394,14 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; - output = choose_kernel_location(input_data, input_len, - output, output_len); + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + */ + output = choose_kernel_location(input_data, input_len, output, + output_len > run_size ? output_len + : run_size); /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index b669ab65bf6c..d8222f213182 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -36,11 +36,13 @@ int main(int argc, char *argv[]) uint32_t olen; long ilen; unsigned long offs; + unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 2) { - fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + if (argc < 3) { + fprintf(stderr, "Usage: %s compressed_file run_size\n", + argv[0]); goto bail; } @@ -74,6 +76,7 @@ int main(int argc, char *argv[]) offs += olen >> 12; /* Add 8 bytes for each 32K block */ offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + run_size = atoi(argv[2]); printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); @@ -85,6 +88,8 @@ int main(int argc, char *argv[]) /* z_extract_offset_negative allows simplification of head_32.S */ printf(".globl z_extract_offset_negative\n"); printf("z_extract_offset_negative = -0x%lx\n", offs); + printf(".globl z_run_size\n"); + printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 32d2e7056c87..419819d6dab3 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -8,6 +8,7 @@ CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_FHANDLE=y CONFIG_AUDIT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index a481dd4755d5..4c311ddd973b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -7,6 +7,7 @@ CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_FHANDLE=y CONFIG_AUDIT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index aafe8ce0d65d..e26984f7ab8d 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -66,5 +66,5 @@ module_exit(aes_fini); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); -MODULE_ALIAS("aes-asm"); +MODULE_ALIAS_CRYPTO("aes"); +MODULE_ALIAS_CRYPTO("aes-asm"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 888950f29fd9..ae855f4f64b7 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -43,10 +43,6 @@ #include <asm/crypto/glue_helper.h> #endif -#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) -#define HAS_PCBC -#endif - /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. @@ -547,7 +543,7 @@ static int ablk_ctr_init(struct crypto_tfm *tfm) #endif -#ifdef HAS_PCBC +#if IS_ENABLED(CONFIG_CRYPTO_PCBC) static int ablk_pcbc_init(struct crypto_tfm *tfm) { return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); @@ -1377,7 +1373,7 @@ static struct crypto_alg aesni_algs[] = { { }, }, #endif -#ifdef HAS_PCBC +#if IS_ENABLED(CONFIG_CRYPTO_PCBC) }, { .cra_name = "pcbc(aes)", .cra_driver_name = "pcbc-aes-aesni", @@ -1550,4 +1546,4 @@ module_exit(aesni_exit); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); +MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 8af519ed73d1..17c05531dfd1 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -478,5 +478,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); -MODULE_ALIAS("blowfish"); -MODULE_ALIAS("blowfish-asm"); +MODULE_ALIAS_CRYPTO("blowfish"); +MODULE_ALIAS_CRYPTO("blowfish-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 4209a76fcdaa..9a07fafe3831 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -582,5 +582,5 @@ module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 87a041a10f4a..ed38d959add6 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -574,5 +574,5 @@ module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index c171dcbf192d..5c8b6266a394 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1725,5 +1725,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e57e20ab5e0b..60ada677a928 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -491,4 +491,4 @@ module_exit(cast5_exit); MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("cast5"); +MODULE_ALIAS_CRYPTO("cast5"); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 09f3677393e4..0160f68a57ff 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -611,4 +611,4 @@ module_exit(cast6_exit); MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("cast6"); +MODULE_ALIAS_CRYPTO("cast6"); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 9d014a74ef96..1937fc1d8763 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -197,5 +197,5 @@ module_exit(crc32_pclmul_mod_fini); MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crc32"); -MODULE_ALIAS("crc32-pclmul"); +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 6812ad98355c..28640c3d6af7 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -280,5 +280,5 @@ MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.c MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crc32c"); -MODULE_ALIAS("crc32c-intel"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-intel"); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 7845d7fd54c0..b6c67bf30fdf 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -147,5 +147,5 @@ MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crct10dif"); -MODULE_ALIAS("crct10dif-pclmul"); +MODULE_ALIAS_CRYPTO("crct10dif"); +MODULE_ALIAS_CRYPTO("crct10dif-pclmul"); diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 0e9c0668fe4e..38a14f818ef1 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -502,8 +502,8 @@ module_exit(des3_ede_x86_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); -MODULE_ALIAS("des3_ede"); -MODULE_ALIAS("des3_ede-asm"); -MODULE_ALIAS("des"); -MODULE_ALIAS("des-asm"); +MODULE_ALIAS_CRYPTO("des3_ede"); +MODULE_ALIAS_CRYPTO("des3_ede-asm"); +MODULE_ALIAS_CRYPTO("des"); +MODULE_ALIAS_CRYPTO("des-asm"); MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>"); diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index 98d7a188f46b..f368ba261739 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/crypto.h> #include <asm/i387.h> struct crypto_fpu_ctx { @@ -159,3 +160,5 @@ void __exit crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } + +MODULE_ALIAS_CRYPTO("fpu"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 88bb7ba8b175..8253d85aa165 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -341,4 +341,4 @@ module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " "acclerated by PCLMULQDQ-NI"); -MODULE_ALIAS("ghash"); +MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 5e8e67739bb5..399a29d067d6 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -119,5 +119,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); -MODULE_ALIAS("salsa20"); -MODULE_ALIAS("salsa20-asm"); +MODULE_ALIAS_CRYPTO("salsa20"); +MODULE_ALIAS_CRYPTO("salsa20-asm"); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 2fae489b1524..437e47a4d302 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -558,5 +558,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("serpent"); -MODULE_ALIAS("serpent-asm"); +MODULE_ALIAS_CRYPTO("serpent"); +MODULE_ALIAS_CRYPTO("serpent-asm"); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index ff4870870972..7e217398b4eb 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -617,4 +617,4 @@ module_exit(serpent_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("serpent"); +MODULE_ALIAS_CRYPTO("serpent"); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8c95f8637306..bf025adaea01 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -618,4 +618,4 @@ module_exit(serpent_sse2_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("serpent"); +MODULE_ALIAS_CRYPTO("serpent"); diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 99eefd812958..a225a5ca1037 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -204,8 +204,7 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str continue; } - if (ctx) - ctx->status = HASH_CTX_STS_IDLE; + ctx->status = HASH_CTX_STS_IDLE; return ctx; } diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 74d16ef707c7..6c20fe04a738 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -278,4 +278,4 @@ module_exit(sha1_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha1"); +MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index f248546da1ca..8fad72f4dfd2 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -211,7 +211,7 @@ static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) sha256_ssse3_final(desc, D); memcpy(hash, D, SHA224_DIGEST_SIZE); - memset(D, 0, SHA256_DIGEST_SIZE); + memzero_explicit(D, SHA256_DIGEST_SIZE); return 0; } @@ -318,5 +318,5 @@ module_exit(sha256_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha256"); -MODULE_ALIAS("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha224"); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 8626b03e83b7..0b6af26832bf 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -219,7 +219,7 @@ static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) sha512_ssse3_final(desc, D); memcpy(hash, D, SHA384_DIGEST_SIZE); - memset(D, 0, SHA512_DIGEST_SIZE); + memzero_explicit(D, SHA512_DIGEST_SIZE); return 0; } @@ -326,5 +326,5 @@ module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha512"); -MODULE_ALIAS("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha384"); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 4e3c665be129..1ac531ea9bcc 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -579,4 +579,4 @@ module_exit(twofish_exit); MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("twofish"); +MODULE_ALIAS_CRYPTO("twofish"); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 0a5202303501..77e06c2da83d 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -96,5 +96,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); +MODULE_ALIAS_CRYPTO("twofish"); +MODULE_ALIAS_CRYPTO("twofish-asm"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 13e63b3e1dfb..56d8a08ee479 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -495,5 +495,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); +MODULE_ALIAS_CRYPTO("twofish"); +MODULE_ALIAS_CRYPTO("twofish-asm"); diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 5d7b381da692..2eccc8932ae6 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall) case __NR_socketcall: return 4; case __NR_execve: + case __NR_execveat: return 5; default: return 1; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index df91466f973d..ae6aad1d24f7 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -342,8 +342,8 @@ static int load_aout_binary(struct linux_binprm *bprm) time_after(jiffies, error_time + 5*HZ)) { printk(KERN_WARNING "fd_offset is not page aligned. Please convert " - "program: %s\n", - bprm->file->f_path.dentry->d_name.name); + "program: %pD\n", + bprm->file); error_time = jiffies; } #endif @@ -429,8 +429,8 @@ static int load_aout_library(struct file *file) if (time_after(jiffies, error_time + 5*HZ)) { printk(KERN_WARNING "N_TXTOFF is not page aligned. Please convert " - "library: %s\n", - file->f_path.dentry->d_name.name); + "library: %pD\n", + file); error_time = jiffies; } #endif diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffe71228fc10..82e8a1d44658 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -480,6 +480,7 @@ GLOBAL(\label) PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn PTREGSCALL stub32_execve, compat_sys_execve + PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0f4460b5636d..2ab1eb33106e 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -24,78 +24,28 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() #ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() +#define dma_rmb() rmb() #else -# define smp_rmb() barrier() +#define dma_rmb() barrier() #endif +#define dma_wmb() barrier() + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) #endif /* SMP */ +#define read_barrier_depends() do { } while (0) +#define smp_read_barrier_depends() do { } while (0) + #if defined(CONFIG_X86_PPRO_FENCE) /* diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9863ee3747da..47c8e32f621a 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,65 +5,6 @@ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> -#ifdef CONFIG_X86_PAT -/* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. X86 PAT supports 3 - * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and - * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_UC here. - */ - -#define _PGMT_DEFAULT 0 -#define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_CLEAR_MASK (~_PGMT_MASK) - -static inline unsigned long get_page_memtype(struct page *pg) -{ - unsigned long pg_flags = pg->flags & _PGMT_MASK; - - if (pg_flags == _PGMT_DEFAULT) - return -1; - else if (pg_flags == _PGMT_WC) - return _PAGE_CACHE_WC; - else if (pg_flags == _PGMT_UC_MINUS) - return _PAGE_CACHE_UC_MINUS; - else - return _PAGE_CACHE_WB; -} - -static inline void set_page_memtype(struct page *pg, unsigned long memtype) -{ - unsigned long memtype_flags = _PGMT_DEFAULT; - unsigned long old_flags; - unsigned long new_flags; - - switch (memtype) { - case _PAGE_CACHE_WC: - memtype_flags = _PGMT_WC; - break; - case _PAGE_CACHE_UC_MINUS: - memtype_flags = _PGMT_UC_MINUS; - break; - case _PAGE_CACHE_WB: - memtype_flags = _PGMT_WB; - break; - } - - do { - old_flags = pg->flags; - new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; - } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); -} -#else -static inline unsigned long get_page_memtype(struct page *pg) { return -1; } -static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } -#endif - /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0bb1335313b2..aede2c347bde 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -189,6 +189,11 @@ #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ +#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ +#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ +#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ +#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 97534a7d38e3..f226df064660 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,6 +10,12 @@ * cpu_feature_enabled(). */ +#ifdef CONFIG_X86_INTEL_MPX +# define DISABLE_MPX 0 +#else +# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -34,6 +40,6 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 #define DISABLED_MASK8 0 -#define DISABLED_MASK9 0 +#define DISABLED_MASK9 (DISABLE_MPX) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 0bdb0c54d9a1..fe884e18fa6e 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -70,7 +70,7 @@ #define MAX_DMA_CHANNELS 8 /* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9b11757975d0..25bce45c6fc4 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -158,6 +158,30 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( } #endif /* CONFIG_EFI_MIXED */ + +/* arch specific definitions used by the stub code */ + +struct efi_config { + u64 image_handle; + u64 table; + u64 allocate_pool; + u64 allocate_pages; + u64 get_memory_map; + u64 free_pool; + u64 free_pages; + u64 locate_handle; + u64 handle_protocol; + u64 exit_boot_services; + u64 text_output; + efi_status_t (*call)(unsigned long, ...); + bool is64; +} __packed; + +__pure const struct efi_config *__efi_early(void); + +#define efi_call_early(f, ...) \ + __efi_early()->call(__efi_early()->f, __VA_ARGS__); + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 2519d0679d99..c3dd5e71f439 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -8,8 +8,12 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) { + unsigned long prot; + + prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; if (boot_cpu_data.x86 > 3) - pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; + pgprot_val(vma->vm_page_prot) = + prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); } extern int fb_is_primary_device(struct fb_info *info); diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index ffb1733ac91f..f80d70009ff8 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -69,7 +69,9 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, #else +#ifdef CONFIG_X86_VSYSCALL_EMULATION VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, +#endif #ifdef CONFIG_PARAVIRT_CLOCK PVCLOCK_FIXMAP_BEGIN, PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, @@ -136,9 +138,7 @@ enum fixed_addresses { extern void reserve_top_address(unsigned long reserve); #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) -#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) extern int fixmaps_set; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index e1f7fecaa7d6..f45acad3c4b6 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,39 +1,6 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H -#ifdef __ASSEMBLY__ - - /* skip is set if the stack was already partially adjusted */ - .macro MCOUNT_SAVE_FRAME skip=0 - /* - * We add enough stack to save all regs. - */ - subq $(SS+8-\skip), %rsp - movq %rax, RAX(%rsp) - movq %rcx, RCX(%rsp) - movq %rdx, RDX(%rsp) - movq %rsi, RSI(%rsp) - movq %rdi, RDI(%rsp) - movq %r8, R8(%rsp) - movq %r9, R9(%rsp) - /* Move RIP to its proper location */ - movq SS+8(%rsp), %rdx - movq %rdx, RIP(%rsp) - .endm - - .macro MCOUNT_RESTORE_FRAME skip=0 - movq R9(%rsp), %r9 - movq R8(%rsp), %r8 - movq RDI(%rsp), %rdi - movq RSI(%rsp), %rsi - movq RDX(%rsp), %rdx - movq RCX(%rsp), %rcx - movq RAX(%rsp), %rax - addq $(SS+8-\skip), %rsp - .endm - -#endif - #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY # define MCOUNT_ADDR ((long)(__fentry__)) diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h deleted file mode 100644 index e8c58f88b1d4..000000000000 --- a/arch/x86/include/asm/hash.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_X86_HASH_H -#define _ASM_X86_HASH_H - -struct fast_hash_ops; -extern void setup_arch_fast_hash(struct fast_hash_ops *ops); - -#endif /* _ASM_X86_HASH_H */ diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 302a323b3f67..04e9d023168f 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -38,17 +38,20 @@ extern unsigned long highstart_pfn, highend_pfn; /* * Ordering is: * - * FIXADDR_TOP - * fixed_addresses - * FIXADDR_START - * temp fixed addresses - * FIXADDR_BOOT_START - * Persistent kmap area - * PKMAP_BASE - * VMALLOC_END - * Vmalloc area - * VMALLOC_START - * high_memory + * high memory on: high_memory off: + * FIXADDR_TOP FIXADDR_TOP + * fixed addresses fixed addresses + * FIXADDR_START FIXADDR_START + * temp fixed addresses/persistent kmap area VMALLOC_END + * PKMAP_BASE temp fixed addresses/vmalloc area + * VMALLOC_END VMALLOC_START + * vmalloc area high_memory + * VMALLOC_START + * high_memory + * + * The temp fixed area is only used during boot for early_ioremap(), and + * it is unused when the ioremap() is functional. vmalloc/pkmap area become + * available after early boot so the temp fixed area is available for re-use. */ #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 48eb30a86062..47f29b1d1846 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -65,6 +65,7 @@ struct insn { unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; @@ -96,7 +97,7 @@ struct insn { #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ -extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); @@ -115,12 +116,13 @@ static inline void insn_get_attribute(struct insn *insn) extern int insn_rip_relative(struct insn *insn); /* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +static inline void kernel_insn_init(struct insn *insn, + const void *kaddr, int buf_len) { #ifdef CONFIG_X86_64 - insn_init(insn, kaddr, 1); + insn_init(insn, kaddr, buf_len, 1); #else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, 0); + insn_init(insn, kaddr, buf_len, 0); #endif } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index b8237d8a1e0c..34a5b93704d3 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -74,6 +74,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb_relaxed(v, a) __writeb(v, a) +#define writew_relaxed(v, a) __writew(v, a) +#define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel @@ -86,6 +89,7 @@ build_mmio_read(readq, "q", unsigned long, "=r", :"memory") build_mmio_write(writeq, "q", unsigned long, "r", :"memory") #define readq_relaxed(a) readq(a) +#define writeq_relaxed(v, a) writeq(v, a) #define __raw_readq(a) readq(a) #define __raw_writeq(val, addr) writeq(val, addr) @@ -310,11 +314,11 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) -extern void *xlate_dev_mem_ptr(unsigned long phys); -extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); +extern void *xlate_dev_mem_ptr(phys_addr_t phys); +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, - unsigned long prot_val); + enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ed0c30d6a0c..d89c6b828c96 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -33,7 +33,7 @@ #define KVM_MAX_VCPUS 255 #define KVM_SOFT_MAX_VCPUS 160 -#define KVM_USER_MEM_SLOTS 125 +#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) @@ -51,6 +51,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL +#define CR3_PCID_INVD (1UL << 63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -361,6 +362,7 @@ struct kvm_vcpu_arch { int mp_state; u64 ia32_misc_enable_msr; bool tpr_access_reporting; + u64 ia32_xss; /* * Paging state of the vcpu @@ -542,7 +544,7 @@ struct kvm_apic_map { struct rcu_head rcu; u8 ldr_bits; /* fields bellow are used to decode ldr values in different modes */ - u32 cid_shift, cid_mask, lid_mask; + u32 cid_shift, cid_mask, lid_mask, broadcast; struct kvm_lapic *phys_map[256]; /* first index is cluster id second is cpu id in a cluster */ struct kvm_lapic *logical_map[16][16]; @@ -602,6 +604,9 @@ struct kvm_arch { struct kvm_xen_hvm_config xen_hvm_config; + /* reads protected by irq_srcu, writes by irq_lock */ + struct hlist_head mask_notifier_list; + /* fields used by HYPER-V emulation */ u64 hv_guest_os_id; u64 hv_hypercall; @@ -659,6 +664,16 @@ struct msr_data { u64 data; }; +struct kvm_lapic_irq { + u32 vector; + u32 delivery_mode; + u32 dest_mode; + u32 level; + u32 trig_mode; + u32 shorthand; + u32 dest_id; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -767,6 +782,7 @@ struct kvm_x86_ops { enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); + bool (*xsaves_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); @@ -818,6 +834,19 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask); + extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); @@ -863,7 +892,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector); +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); @@ -895,6 +924,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); +bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); static inline int __kvm_irq_line_state(unsigned long *irq_state, int irq_source_id, int level) @@ -1066,6 +1096,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 958b90f761e5..51b26e895933 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -34,6 +34,10 @@ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ +/* AMD-specific bits */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ + /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is * bits 15:0. But bit 12 is the 'F' bit, defined for corrected @@ -78,7 +82,6 @@ /* Software defined banks */ #define MCE_EXTENDED_BANK 128 #define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 64dc362506b7..201b520521ed 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -78,6 +78,7 @@ static inline void __exit exit_amd_microcode(void) {} extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); +void reload_early_microcode(void); #else static inline void __init load_ucode_bsp(void) {} static inline void load_ucode_ap(void) {} @@ -85,6 +86,7 @@ static inline int __init save_microcode_in_initrd(void) { return 0; } +static inline void reload_early_microcode(void) {} #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index b7b10b82d3e5..af935397e053 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); +extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; @@ -68,10 +68,12 @@ extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; extern void __init load_ucode_amd_bsp(void); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); +void reload_ucode_amd(void); #else static inline void __init load_ucode_amd_bsp(void) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } +void reload_ucode_amd(void) {} #endif #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index bbe296e0bce1..dd4c20043ce7 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -68,11 +68,13 @@ extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); extern int __init save_microcode_in_initrd_intel(void); +void reload_ucode_intel(void); #else static inline __init void load_ucode_intel_bsp(void) {} static inline void load_ucode_intel_ap(void) {} static inline void show_ucode_info_early(void) {} static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } +static inline void reload_ucode_intel(void) {} #endif #if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 166af2a8e865..40269a2bf6f9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -10,9 +10,8 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> +#include <asm/mpx.h> #ifndef CONFIG_PARAVIRT -#include <asm-generic/mm_hooks.h> - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { @@ -53,7 +52,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* Load the LDT, if the LDT is different: */ + /* + * Load the LDT, if the LDT is different. + * + * It's possible leave_mm(prev) has been called. If so, + * then prev->context.ldt could be out of sync with the + * LDT descriptor or the LDT register. This can only happen + * if prev->context.ldt is non-null, since we never free + * an LDT. But LDTs can't be shared across mms, so + * prev->context.ldt won't be equal to next->context.ldt. + */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } @@ -102,4 +110,27 @@ do { \ } while (0) #endif +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + paravirt_arch_dup_mmap(oldmm, mm); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ + paravirt_arch_exit_mmap(mm); +} + +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + mpx_mm_init(mm); +} + +static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + mpx_notify_unmap(mm, vma, start, end); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h new file mode 100644 index 000000000000..a952a13d59a7 --- /dev/null +++ b/arch/x86/include/asm/mpx.h @@ -0,0 +1,103 @@ +#ifndef _ASM_X86_MPX_H +#define _ASM_X86_MPX_H + +#include <linux/types.h> +#include <asm/ptrace.h> +#include <asm/insn.h> + +/* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ +#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1) +#define MPX_BNDCFG_ENABLE_FLAG 0x1 +#define MPX_BD_ENTRY_VALID_FLAG 0x1 + +#ifdef CONFIG_X86_64 + +/* upper 28 bits [47:20] of the virtual address in 64-bit used to + * index into bounds directory (BD). + */ +#define MPX_BD_ENTRY_OFFSET 28 +#define MPX_BD_ENTRY_SHIFT 3 +/* bits [19:3] of the virtual address in 64-bit used to index into + * bounds table (BT). + */ +#define MPX_BT_ENTRY_OFFSET 17 +#define MPX_BT_ENTRY_SHIFT 5 +#define MPX_IGN_BITS 3 +#define MPX_BD_ENTRY_TAIL 3 + +#else + +#define MPX_BD_ENTRY_OFFSET 20 +#define MPX_BD_ENTRY_SHIFT 2 +#define MPX_BT_ENTRY_OFFSET 10 +#define MPX_BT_ENTRY_SHIFT 4 +#define MPX_IGN_BITS 2 +#define MPX_BD_ENTRY_TAIL 2 + +#endif + +#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) +#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) + +#define MPX_BNDSTA_TAIL 2 +#define MPX_BNDCFG_TAIL 12 +#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1)) +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1)) + +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BNDSTA_ERROR_CODE 0x3 + +#define MPX_BD_ENTRY_MASK ((1<<MPX_BD_ENTRY_OFFSET)-1) +#define MPX_BT_ENTRY_MASK ((1<<MPX_BT_ENTRY_OFFSET)-1) +#define MPX_GET_BD_ENTRY_OFFSET(addr) ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \ + MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) +#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ + MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) + +#ifdef CONFIG_X86_INTEL_MPX +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf); +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ + /* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; +} +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#else +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + return NULL; +} +static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + return -EINVAL; +} +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return 0; +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ +} +static inline void mpx_notify_unmap(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} +#endif /* CONFIG_X86_INTEL_MPX */ + +#endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f48b17df4224..3a52ee0e726d 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -20,7 +20,6 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index f408caf73430..b3bebf9e5746 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,6 +39,8 @@ void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ -#define __HAVE_ARCH_GATE_AREA 1 +#ifdef CONFIG_X86_VSYSCALL_EMULATION +# define __HAVE_ARCH_GATE_AREA 1 +#endif #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 678205195ae1..75450b2c7be4 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e1610e29e..32444ae939ca 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -330,13 +330,13 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); } -static inline void arch_exit_mmap(struct mm_struct *mm) +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); } @@ -986,5 +986,15 @@ extern void default_banner(void); #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop +#ifndef __ASSEMBLY__ +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) +{ +} +#endif /* __ASSEMBLY__ */ #endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index e2c1668dde7a..91bc4ba95f91 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -11,16 +11,17 @@ static const int pat_enabled; #endif extern void pat_init(void); +void pat_init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, - unsigned long req_type, unsigned long *ret_type); + enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, - unsigned long flag); + enum page_cache_mode pcm); int io_reserve_memtype(resource_size_t start, resource_size_t end, - unsigned long *type); + enum page_cache_mode *pcm); void io_free_memtype(resource_size_t start, resource_size_t end); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index fd472181a1d0..e0ba66ca68c6 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -64,7 +64,7 @@ #define __percpu_prefix "" #endif -#define __percpu_arg(x) __percpu_prefix "%P" #x +#define __percpu_arg(x) __percpu_prefix "%" #x /* * Initialized pointers to per-cpu variables needed for the boot @@ -179,29 +179,58 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var, constraint) \ +#define percpu_from_op(op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pfo_ret__; \ +}) + +#define percpu_stable_op(op, var) \ +({ \ + typeof(var) pfo_ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_arg(P1)",%0" \ + : "=q" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 2: \ + asm(op "w "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 4: \ + asm(op "l "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 8: \ + asm(op "q "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ break; \ default: __bad_percpu_size(); \ } \ @@ -359,11 +388,11 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. */ -#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define this_cpu_read_stable(var) percpu_stable_op("mov", var) -#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) #define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) @@ -381,9 +410,9 @@ do { \ #define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) -#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) #define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) @@ -435,7 +464,7 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) #define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) #define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) @@ -444,7 +473,7 @@ do { \ #define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) @@ -522,7 +551,7 @@ static inline int x86_this_cpu_variable_test_bit(int nr, #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). */ -DECLARE_PER_CPU(unsigned long, this_cpu_off); +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8dfc9fd094a3..dc0f6ed35b08 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -177,6 +177,9 @@ struct x86_pmu_capability { #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) #define IBS_CAPS_RIPINVALIDCHK (1U<<7) +#define IBS_CAPS_OPBRNFUSE (1U<<8) +#define IBS_CAPS_FETCHCTLEXTD (1U<<9) +#define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index aa97a070f09f..e8a5454acc99 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -9,9 +9,10 @@ /* * Macro to mark a page protection value as UC- */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | \ + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) #ifndef __ASSEMBLY__ @@ -99,6 +100,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY; +} + static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; @@ -404,8 +410,8 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, - unsigned long flags, - unsigned long new_flags) + enum page_cache_mode pcm, + enum page_cache_mode new_pcm) { /* * PAT type is always WB for untracked ranges, so no need to check. @@ -419,10 +425,10 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back */ - if ((flags == _PAGE_CACHE_UC_MINUS && - new_flags == _PAGE_CACHE_WB) || - (flags == _PAGE_CACHE_WC && - new_flags == _PAGE_CACHE_WB)) { + if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WC && + new_pcm == _PAGE_CACHE_MODE_WB)) { return 0; } diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index ed5903be26fe..9fb2f2bc8245 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -37,7 +37,7 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ +#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ & PMD_MASK) #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 7166e25ecb57..602b6028c5b6 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -63,6 +63,8 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) +#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 07789647bf33..25bcd4a89517 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -128,11 +128,28 @@ _PAGE_SOFT_DIRTY | _PAGE_NUMA) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) -#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) -#define _PAGE_CACHE_WB (0) -#define _PAGE_CACHE_WC (_PAGE_PWT) -#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) -#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) +/* + * The cache modes defined here are used to translate between pure SW usage + * and the HW defined cache mode bits and/or PAT entries. + * + * The resulting bits for PWT, PCD and PAT should be chosen in a way + * to have the WB mode at index 0 (all bits clear). This is the default + * right now and likely would break too much if changed. + */ +#ifndef __ASSEMBLY__ +enum page_cache_mode { + _PAGE_CACHE_MODE_WB = 0, + _PAGE_CACHE_MODE_WC = 1, + _PAGE_CACHE_MODE_UC_MINUS = 2, + _PAGE_CACHE_MODE_UC = 3, + _PAGE_CACHE_MODE_WT = 4, + _PAGE_CACHE_MODE_WP = 5, + _PAGE_CACHE_MODE_NUM = 8 +}; +#endif + +#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) +#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -156,41 +173,27 @@ #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) -#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) -#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) -#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) -#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) -#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) /* xwr */ #define __P000 PAGE_NONE @@ -341,6 +344,59 @@ static inline pmdval_t pmdnuma_flags(pmd_t pmd) #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) +extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; +extern uint8_t __pte2cachemode_tbl[8]; + +#define __pte2cm_idx(cb) \ + ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ + (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ + (((cb) >> _PAGE_BIT_PWT) & 1)) +#define __cm_idx2pte(i) \ + ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ + (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ + (((i) & 1) << _PAGE_BIT_PWT)) + +static inline unsigned long cachemode2protval(enum page_cache_mode pcm) +{ + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; +} +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +{ + return __pgprot(cachemode2protval(pcm)); +} +static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; + + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} +static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) +{ + pgprot_t new; + unsigned long val; + + val = pgprot_val(pgprot); + pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); + return new; +} +static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) +{ + pgprot_t new; + unsigned long val; + + val = pgprot_val(pgprot); + pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT_LARGE) >> + (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); + return new; +} + typedef struct page *pgtable_t; @@ -396,6 +452,7 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); +extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 0a4e140315b6..7249e6d0902d 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -16,6 +16,9 @@ #include <linux/sfi.h> +#define MAX_NUM_STREAMS_MRFLD 25 +#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD + enum sst_audio_task_id_mrfld { SST_TASK_ID_NONE = 0, SST_TASK_ID_SBA = 1, @@ -73,6 +76,65 @@ struct sst_platform_data { unsigned int strm_map_size; }; +struct sst_info { + u32 iram_start; + u32 iram_end; + bool iram_use; + u32 dram_start; + u32 dram_end; + bool dram_use; + u32 imr_start; + u32 imr_end; + bool imr_use; + u32 mailbox_start; + bool use_elf; + bool lpe_viewpt_rqd; + unsigned int max_streams; + u32 dma_max_len; + u8 num_probes; +}; + +struct sst_lib_dnld_info { + unsigned int mod_base; + unsigned int mod_end; + unsigned int mod_table_offset; + unsigned int mod_table_size; + bool mod_ddr_dnld; +}; + +struct sst_res_info { + unsigned int shim_offset; + unsigned int shim_size; + unsigned int shim_phy_addr; + unsigned int ssp0_offset; + unsigned int ssp0_size; + unsigned int dma0_offset; + unsigned int dma0_size; + unsigned int dma1_offset; + unsigned int dma1_size; + unsigned int iram_offset; + unsigned int iram_size; + unsigned int dram_offset; + unsigned int dram_size; + unsigned int mbox_offset; + unsigned int mbox_size; + unsigned int acpi_lpe_res_index; + unsigned int acpi_ddr_index; + unsigned int acpi_ipc_irq_index; +}; + +struct sst_ipc_info { + int ipc_offset; + unsigned int mbox_recv_off; +}; + +struct sst_platform_info { + const struct sst_info *probe_data; + const struct sst_ipc_info *ipc_info; + const struct sst_res_info *res_info; + const struct sst_lib_dnld_info *lib_info; + const char *platform; +}; int add_sst_platform_device(void); #endif diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 400873450e33..8f3271842533 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,9 +30,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ } while (0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eb71ec794732..a092a0cce0b7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -127,7 +127,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); +}; #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 @@ -151,7 +151,7 @@ extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP -DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #else #define cpu_info boot_cpu_data @@ -374,13 +374,14 @@ struct lwp_struct { u8 reserved[128]; }; -struct bndregs_struct { - u64 bndregs[8]; +struct bndreg { + u64 lower_bound; + u64 upper_bound; } __packed; -struct bndcsr_struct { - u64 cfg_reg_u; - u64 status_reg; +struct bndcsr { + u64 bndcfgu; + u64 bndstatus; } __packed; struct xsave_hdr_struct { @@ -394,8 +395,8 @@ struct xsave_struct { struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; struct lwp_struct lwp; - struct bndregs_struct bndregs; - struct bndcsr_struct bndcsr; + struct bndreg bndreg[4]; + struct bndcsr bndcsr; /* new processor state extensions will go here */ } __attribute__ ((packed, aligned (64))); @@ -893,7 +894,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #else /* - * User space process size. 47bits minus one guard page. + * User space process size. 47bits minus one guard page. The guard + * page is necessary on Intel CPUs: if a SYSCALL instruction is at + * the highest possible canonical userspace address, then that + * syscall will enter the kernel with a non-canonical return + * address, and SYSRET will explode dangerously. We avoid this + * particular problem by preventing anything from being mapped + * at the maximum canonical address. */ #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) @@ -953,6 +960,24 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +/* Register/unregister a process' MPX related resource */ +#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) +#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) + +#ifdef CONFIG_X86_INTEL_MPX +extern int mpx_enable_management(struct task_struct *tsk); +extern int mpx_disable_management(struct task_struct *tsk); +#else +static inline int mpx_enable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +static inline int mpx_disable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +#endif /* CONFIG_X86_INTEL_MPX */ + extern u16 amd_get_nb_id(int cpu); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 6f1c3a8a33ab..db257a58571f 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -23,6 +23,15 @@ #define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +#define SEGMENT_RPL_MASK 0x3 /* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ +#define USER_RPL 0x3 /* User mode is privilege level 3 */ +#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ +#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ + #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: @@ -125,16 +134,6 @@ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 - -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 /* * Matching rules for certain types of segments. @@ -192,17 +191,6 @@ #define get_kernel_rpl() 0 #endif -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 - -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 - #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd27e08e23c..8cd1cc3bc835 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -150,6 +150,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) } void cpu_disable_common(void); +void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 9295016485c9..a4efe477ceab 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -183,8 +183,20 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (arch_spin_is_locked(lock)) + __ticket_t head = ACCESS_ONCE(lock->tickets.head); + + for (;;) { + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* + * We need to check "unlocked" in a loop, tmp.head == head + * can be false positive because of overflow. + */ + if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || + tmp.head != head) + break; + cpu_relax(); + } } /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b78ac3..751bf4b7bf11 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 854053889d4d..547e344a6dc6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -141,7 +141,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY) + _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bc8352e7010a..707adc6549d8 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -39,6 +39,7 @@ asmlinkage void simd_coprocessor_error(void); #ifdef CONFIG_TRACING asmlinkage void trace_page_fault(void); +#define trace_stack_segment stack_segment #define trace_divide_error divide_error #define trace_bounds bounds #define trace_invalid_op invalid_op diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 2d60a7813dfe..fc808b83fccb 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -33,8 +33,8 @@ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ -#define MAX_CPUS_PER_UVHUB 64 -#define MAX_CPUS_PER_SOCKET 32 +#define MAX_CPUS_PER_UVHUB 128 +#define MAX_CPUS_PER_SOCKET 64 #define ADP_SZ 64 /* hardware-provided max. */ #define UV_CPUS_PER_AS 32 /* hardware-provided max. */ #define ITEMS_PER_DESC 8 diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3c3366c2e37f..e7e9682a33e9 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -70,4 +70,23 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s) ++s->seq; } +#ifdef CONFIG_X86_64 + +#define VGETCPU_CPU_MASK 0xfff + +static inline unsigned int __getcpu(void) +{ + unsigned int p; + + /* + * Load per CPU data from GDT. LSL is faster than RDTSCP and + * works on all CPUs. + */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + + return p; +} + +#endif /* CONFIG_X86_64 */ + #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index bcbfade26d8d..45afaee9555c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_XSAVES 0x00100000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -159,6 +160,8 @@ enum vmcs_field { EOI_EXIT_BITMAP3_HIGH = 0x00002023, VMREAD_BITMAP = 0x00002026, VMWRITE_BITMAP = 0x00002028, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 2a46ca720afc..6ba66ee79710 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,15 +4,7 @@ #include <linux/seqlock.h> #include <uapi/asm/vsyscall.h> -#define VGETCPU_RDTSCP 1 -#define VGETCPU_LSL 2 - -/* kernel space (writeable) */ -extern int vgetcpu_mode; -extern struct timezone sys_tz; - -#include <asm/vvar.h> - +#ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); /* @@ -20,25 +12,12 @@ extern void map_vsyscall(void); * Returns true if handled. */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); - -#ifdef CONFIG_X86_64 - -#define VGETCPU_CPU_MASK 0xfff - -static inline unsigned int __getcpu(void) +#else +static inline void map_vsyscall(void) {} +static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { - unsigned int p; - - if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - - return p; + return false; } -#endif /* CONFIG_X86_64 */ +#endif #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 5d2b9ad2c6d2..3f32dfc2ab73 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -44,8 +44,6 @@ extern char __vvar_page; /* DECLARE_VVAR(offset, type, name) */ -DECLARE_VVAR(0, volatile unsigned long, jiffies) -DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e45e4da96bf1..f58a9c7a3c86 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -172,7 +172,6 @@ struct x86_platform_ops { struct pci_dev; struct msi_msg; -struct msi_desc; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); @@ -183,8 +182,6 @@ struct x86_msi_ops { void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); int (*setup_hpet_msi)(unsigned int irq, unsigned int id); - u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag); - u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag); }; struct IO_APIC_route_entry; diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h new file mode 100644 index 000000000000..0d809e9fc975 --- /dev/null +++ b/arch/x86/include/asm/xen/cpuid.h @@ -0,0 +1,91 @@ +/****************************************************************************** + * arch-x86/cpuid.h + * + * CPUID interface to Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007 Citrix Systems, Inc. + * + * Authors: + * Keir Fraser <keir@xen.org> + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__ +#define __XEN_PUBLIC_ARCH_X86_CPUID_H__ + +/* + * For compatibility with other hypervisor interfaces, the Xen cpuid leaves + * can be found at the first otherwise unused 0x100 aligned boundary starting + * from 0x40000000. + * + * e.g If viridian extensions are enabled for an HVM domain, the Xen cpuid + * leaves will start at 0x40000100 + */ + +#define XEN_CPUID_FIRST_LEAF 0x40000000 +#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i)) + +/* + * Leaf 1 (0x40000x00) + * EAX: Largest Xen-information leaf. All leaves up to an including @EAX + * are supported by the Xen host. + * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification + * of a Xen host. + */ +#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */ +#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */ +#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */ + +/* + * Leaf 2 (0x40000x01) + * EAX[31:16]: Xen major version. + * EAX[15: 0]: Xen minor version. + * EBX-EDX: Reserved (currently all zeroes). + */ + +/* + * Leaf 3 (0x40000x02) + * EAX: Number of hypercall transfer pages. This register is always guaranteed + * to specify one hypercall page. + * EBX: Base address of Xen-specific MSRs. + * ECX: Features 1. Unused bits are set to zero. + * EDX: Features 2. Unused bits are set to zero. + */ + +/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */ +#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0 +#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) + +/* + * Leaf 5 (0x40000x04) + * HVM-specific features + */ + +/* EAX Features */ +/* Virtualized APIC registers */ +#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) +/* Virtualized x2APIC accesses */ +#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) +/* Memory mapped from other domains has valid IOMMU entries */ +#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) + +#define XEN_CPUID_MAX_NUM_LEAVES 4 + +#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h index 7f02fe4e2c7b..acd844c017d3 100644 --- a/arch/x86/include/asm/xen/page-coherent.h +++ b/arch/x86/include/asm/xen/page-coherent.h @@ -22,8 +22,8 @@ static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, } static inline void xen_dma_map_page(struct device *hwdev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) { } + dma_addr_t dev_addr, unsigned long offset, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) { } static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, size_t size, enum dma_data_direction dir, diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c949923a5668..5eea09915a15 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -41,10 +41,12 @@ typedef struct xpaddr { extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; +extern unsigned long *xen_p2m_addr; +extern unsigned long xen_p2m_size; +extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); @@ -52,17 +54,52 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -extern int m2p_add_override(unsigned long mfn, struct page *page, - struct gnttab_map_grant_ref *kmap_op); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -extern int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn); -extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); +/* + * Helper functions to write or read unsigned long values to/from + * memory, when the access may fault. + */ +static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val) +{ + return __put_user(val, (unsigned long __user *)addr); +} + +static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val) +{ + return __get_user(*val, (unsigned long __user *)addr); +} + +/* + * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine(): + * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator + * bits (identity or foreign) are set. + * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set + * identity or foreign indicator will be still set. __pfn_to_mfn() is + * encapsulating get_phys_to_machine() which is called in special cases only. + * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special + * cases needing an extended handling. + */ +static inline unsigned long __pfn_to_mfn(unsigned long pfn) +{ + unsigned long mfn; + + if (pfn < xen_p2m_size) + mfn = xen_p2m_addr[pfn]; + else if (unlikely(pfn < xen_max_p2m_pfn)) + return get_phys_to_machine(pfn); + else + return IDENTITY_FRAME(pfn); + + if (unlikely(mfn == INVALID_P2M_ENTRY)) + return get_phys_to_machine(pfn); + + return mfn; +} + static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -70,7 +107,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - mfn = get_phys_to_machine(pfn); + mfn = __pfn_to_mfn(pfn); if (mfn != INVALID_P2M_ENTRY) mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); @@ -83,7 +120,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return 1; - return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; + return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY; } static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) @@ -102,7 +139,7 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn); if (ret < 0) return ~0; @@ -117,7 +154,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); - if (get_phys_to_machine(pfn) != mfn) { + if (__pfn_to_mfn(pfn) != mfn) { /* * If this appears to be a foreign mfn (because the pfn * doesn't map back to the mfn), then check the local override @@ -133,8 +170,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) * entry doesn't map back to the mfn and m2p_override doesn't have a * valid entry for it. */ - if (pfn == ~0 && - get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) + if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn)) pfn = mfn; return pfn; @@ -180,7 +216,7 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn(mfn); - if (get_phys_to_machine(pfn) != mfn) + if (__pfn_to_mfn(pfn) != mfn) return -1; /* force !pfn_valid() */ return pfn; } @@ -236,4 +272,11 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_remap(cookie, size) ioremap((cookie), (size)); #define xen_unmap(cookie) iounmap((cookie)) +static inline bool xen_arch_need_swiotlb(struct device *dev, + unsigned long pfn, + unsigned long mfn) +{ + return false; +} + #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 7e7a79ada658..5fa9770035dc 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,7 @@ #define XSTATE_Hi16_ZMM 0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index e21331ce368f..c8aa65d56027 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -152,6 +152,45 @@ #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 +/* Hardware P state interface */ +#define MSR_PPERF 0x0000064e +#define MSR_PERF_LIMIT_REASONS 0x0000064f +#define MSR_PM_ENABLE 0x00000770 +#define MSR_HWP_CAPABILITIES 0x00000771 +#define MSR_HWP_REQUEST_PKG 0x00000772 +#define MSR_HWP_INTERRUPT 0x00000773 +#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_STATUS 0x00000777 + +/* CPUID.6.EAX */ +#define HWP_BASE_BIT (1<<7) +#define HWP_NOTIFICATIONS_BIT (1<<8) +#define HWP_ACTIVITY_WINDOW_BIT (1<<9) +#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10) +#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) + +/* IA32_HWP_CAPABILITIES */ +#define HWP_HIGHEST_PERF(x) (x & 0xff) +#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) +#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) +#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) + +/* IA32_HWP_REQUEST */ +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) +#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) +#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) + +/* IA32_HWP_STATUS */ +#define HWP_GUARANTEED_CHANGE(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4) + +/* IA32_HWP_INTERRUPT */ +#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2) + #define MSR_AMD64_MC0_MASK 0xc0010044 #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) @@ -206,6 +245,7 @@ #define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ /* Fam 16h MSRs */ @@ -345,6 +385,8 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 +#define MSR_MISC_PWR_MGMT 0x000001aa + #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 #define ENERGY_PERF_BIAS_PERFORMANCE 0 #define ENERGY_PERF_BIAS_NORMAL 6 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 990a2fe1588d..b813bf9da1e2 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -72,6 +72,8 @@ #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -116,6 +118,8 @@ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ - { EXIT_REASON_INVPCID, "INVPCID" } + { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_XSAVES, "XSAVES" }, \ + { EXIT_REASON_XRSTORS, "XRSTORS" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f1e77440b2b..5d4502c8b983 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,8 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_X86_64) += vsyscall_64.o -obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index f04dbb3069b8..5caed1dd7ccf 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, {} @@ -30,6 +31,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, {} diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 4128b5fcb559..c2fd21fed002 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x) unsigned int id; rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U); return id; } @@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector) static void numachip_send_IPI_self(int vector) { - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); + apic_write(APIC_SELF_IPI, vector); } static int __init numachip_probe(void) @@ -153,20 +153,8 @@ static int __init numachip_probe(void) return apic == &apic_numachip; } -static void __init map_csrs(void) -{ - printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", - NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - - printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", - NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); -} - static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - if (c->phys_proc_id != node) { c->phys_proc_id = node; per_cpu(cpu_llc_id, smp_processor_id()) = node; @@ -175,19 +163,15 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) static int __init numachip_system_init(void) { - unsigned int val; - if (!numachip_system) return 0; + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; x86_init.pci.arch_init = pci_numachip_init; - map_csrs(); - - val = read_lcsr(CSR_G0_NODE_IDS); - printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); - return 0; } early_initcall(numachip_system_init); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6a1e71bde323..6873ab925d00 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,6 +18,7 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/delay.h> +#include <linux/seq_buf.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) @@ -29,14 +30,35 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +static cpumask_t printtrace_mask; + +#define NMI_BUF_SIZE 4096 + +struct nmi_seq_buf { + unsigned char buffer[NMI_BUF_SIZE]; + struct seq_buf seq; +}; + +/* Safe printing in NMI context */ +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; +static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + void arch_trigger_all_cpu_backtrace(bool include_self) { + struct nmi_seq_buf *s; + int len; + int cpu; int i; - int cpu = get_cpu(); + int this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -49,7 +71,17 @@ void arch_trigger_all_cpu_backtrace(bool include_self) cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); if (!include_self) - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + + cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); + /* + * Set up per_cpu seq_buf buffers that the NMIs running on the other + * CPUs will write to. + */ + for_each_cpu(cpu, to_cpumask(backtrace_mask)) { + s = &per_cpu(nmi_print_seq, cpu); + seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); + } if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("sending NMI to %s CPUs:\n", @@ -65,11 +97,58 @@ void arch_trigger_all_cpu_backtrace(bool include_self) touch_softlockup_watchdog(); } + /* + * Now that all the NMIs have triggered, we can dump out their + * back traces safely to the console. + */ + for_each_cpu(cpu, &printtrace_mask) { + int last_i = 0; + + s = &per_cpu(nmi_print_seq, cpu); + len = seq_buf_used(&s->seq); + if (!len) + continue; + + /* Print line by line. */ + for (i = 0; i < len; i++) { + if (s->buffer[i] == '\n') { + print_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < len) { + print_seq_line(s, last_i, len - 1); + pr_cont("\n"); + } + } + clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); put_cpu(); } +/* + * It is not safe to call printk() directly from NMI handlers. + * It may be fine if the NMI detected a lock up and we have no choice + * but to do so, but doing a NMI on all other CPUs to get a back trace + * can be done with a sysrq-l. We don't want that to lock up, which + * can happen if the NMI interrupts a printk in progress. + * + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes + * the content into a per cpu seq_buf buffer. Then when the NMIs are + * all done, we can safely dump the contents of the seq_buf to a printk() + * from a non NMI context. + */ +static int nmi_vprintk(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + unsigned int len = seq_buf_used(&s->seq); + + seq_buf_vprintf(&s->seq, fmt, args); + return seq_buf_used(&s->seq) - len; +} + static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { @@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + printk_func_t printk_func_save = this_cpu_read(printk_func); - arch_spin_lock(&lock); + /* Replace printk to write into the NMI seq */ + this_cpu_write(printk_func, nmi_vprintk); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - arch_spin_unlock(&lock); + this_cpu_write(printk_func, printk_func_save); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NMI_HANDLED; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1183d545da1e..a6745e756729 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3158,7 +3158,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - __write_msi_msg(data->msi_desc, &msg); + __pci_write_msi_msg(data->msi_desc, &msg); return IRQ_SET_MASK_OK_NOCOPY; } @@ -3169,8 +3169,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) */ static struct irq_chip msi_chip = { .name = "PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, .irq_ack = ack_apic_edge, .irq_set_affinity = msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, @@ -3196,7 +3196,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. */ if (!irq_offset) - write_msi_msg(irq, &msg); + pci_write_msi_msg(irq, &msg); setup_remapped_irq(irq, irq_cfg(irq), chip); @@ -3968,7 +3968,7 @@ bool mp_should_keep_irq(struct device *dev) { if (dev->power.is_prepared) return true; -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM if (dev->power.runtime_status == RPM_SUSPENDING) return true; #endif diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 584874451414..927ec9235947 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -378,7 +378,6 @@ static struct cpuidle_driver apm_idle_driver = { { /* entry 1 is for APM idle */ .name = "APM", .desc = "APM idle", - .flags = CPUIDLE_FLAG_TIME_VALID, .exit_latency = 250, /* WAG */ .target_residency = 500, /* WAG */ .enter = &apm_cpu_idle diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index d67c4be3e8b1..3b3b9d33ac1d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ucontext.h> #include <linux/lguest.h> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e7c798b354fa..fdcbb4d27c9f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, compat) [nr] = 1, @@ -48,7 +52,6 @@ int main(void) #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); - ENTRY(bx); ENTRY(cx); ENTRY(dx); ENTRY(sp); diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9d..f3672508b249 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_openat: return 3; case __NR_execve: + case __NR_execveat: return 5; default: return 0; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 813d29d00a17..15c5df92f74e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -566,6 +566,17 @@ static void init_amd_k8(struct cpuinfo_x86 *c) if (!c->x86_model_id[0]) strcpy(c->x86_model_id, "Hammer"); + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + msr_set_bit(MSR_K7_HWCR, 6); +#endif } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -636,18 +647,6 @@ static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; -#ifdef CONFIG_SMP - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) - msr_set_bit(MSR_K7_HWCR, 6); -#endif - early_init_amd(c); /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b4f78c9ba19..c6049650c093 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -146,6 +146,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { + if (strlen(s)) + return 0; setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); setup_clear_cpu_cap(X86_FEATURE_XSAVES); @@ -956,14 +958,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_64 -static void vgetcpu_set_mode(void) -{ - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; -} - #ifdef CONFIG_IA32_EMULATION /* May not be __init: called during resume */ static void syscall32_cpu_init(void) @@ -1006,8 +1000,6 @@ void __init identify_boot_cpu(void) #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); -#else - vgetcpu_set_mode(); #endif cpu_detect_tlb(&boot_cpu_data); } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fef..10b46906767f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -3,6 +3,8 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, MCE_AO_SEVERITY, @@ -21,7 +23,7 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg); +int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468b..8bb433043a7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -31,6 +31,7 @@ enum context { IN_KERNEL = 1, IN_USER = 2 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; static struct severity { u64 mask; @@ -40,6 +41,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char excp; unsigned char covered; char *msg; } severities[] = { @@ -48,6 +50,8 @@ static struct severity { #define USER .context = IN_USER #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP #define BITCLR(x) .mask = x, .result = 0 #define BITSET(x) .mask = x, .result = x #define MCGMASK(x, y) .mcgmask = x, .mcgres = y @@ -62,7 +66,7 @@ static struct severity { ), MCESEV( NO, "Not enabled", - BITCLR(MCI_STATUS_EN) + EXCP, BITCLR(MCI_STATUS_EN) ), MCESEV( PANIC, "Processor context corrupt", @@ -71,16 +75,20 @@ static struct severity { /* When MCIP is not set something is very confused */ MCESEV( PANIC, "MCIP not set in MCA handler", - MCGMASK(MCG_STATUS_MCIP, 0) + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ MCESEV( PANIC, "Neither restart nor error IP", - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", - KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + DEFERRED, "Deferred error", + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), MCESEV( KEEP, "Corrected error", @@ -89,7 +97,7 @@ static struct severity { /* ignore OVER for UCNA */ MCESEV( - KEEP, "Uncorrected no action required", + UCNA, "Uncorrected no action required", SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), MCESEV( @@ -178,8 +186,9 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) { + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if (s->context && ctx != s->context) continue; + if (s->excp && excp != s->excp) + continue; if (msg) *msg = s->msg; s->covered = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebfd..d2c611699cd9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -292,10 +292,10 @@ static void print_mce(struct mce *m) #define PANIC_TIMEOUT 5 /* 5 seconds */ -static atomic_t mce_paniced; +static atomic_t mce_panicked; static int fake_panic; -static atomic_t mce_fake_paniced; +static atomic_t mce_fake_panicked; /* Panic in progress. Enable interrupts and wait for final IPI */ static void wait_for_panic(void) @@ -319,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_inc_return(&mce_paniced) > 1) + if (atomic_inc_return(&mce_panicked) > 1) wait_for_panic(); barrier(); @@ -327,7 +327,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) console_verbose(); } else { /* Don't log too much for fake panic */ - if (atomic_inc_return(&mce_fake_paniced) > 1) + if (atomic_inc_return(&mce_fake_panicked) > 1) return; } /* First print corrected ones that are still unlogged */ @@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i) } } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* + * coming soon + */ + return false; + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; + int severity; int i; this_cpu_inc(mce_poll_count); @@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; + + severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + + /* + * In the cases where we don't have a valid address after all, + * do not add it into the ring buffer. + */ + if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { + if (m.status & MCI_STATUS_ADDRV) { + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_schedule_work(); + } + } + /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + if (mce_severity(m, mca_cfg.tolerant, msg, true) >= + MCE_PANIC_SEVERITY) ret = 1; } return ret; @@ -697,7 +744,7 @@ static int mce_timed_out(u64 *t) * might have been modified by someone else. */ rmb(); - if (atomic_read(&mce_paniced)) + if (atomic_read(&mce_panicked)) wait_for_panic(); if (!mca_cfg.monarch_timeout) goto out; @@ -754,7 +801,7 @@ static void mce_reign(void) for_each_possible_cpu(cpu) { int severity = mce_severity(&per_cpu(mces_seen, cpu), mca_cfg.tolerant, - &nmsg); + &nmsg, true); if (severity > global_worst) { msg = nmsg; global_worst = severity; @@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, cfg->tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL, true); /* - * When machine check was for corrected handler don't touch, - * unless we're panicing. + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicing. */ - if (severity == MCE_KEEP_SEVERITY && !no_way_out) + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; __set_bit(i, toclear); if (severity == MCE_NO_SEVERITY) { @@ -2520,7 +2568,7 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { cpu_missing = 0; - atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5d4999f95aec..f1c3769bbd64 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -212,12 +212,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1; + int offset = -1, new; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; + address = MSR_IA32_MCx_MISC(bank); else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) @@ -247,13 +247,18 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.address = address; b.interrupt_capable = lvt_interrupt_supported(bank, high); - if (b.interrupt_capable) { - int new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); - } + if (!b.interrupt_capable) + goto init; + + new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + + if ((offset == new) && + (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; +init: mce_threshold_block_init(&b, offset); - mce_threshold_vector = amd_threshold_interrupt; } } } @@ -270,18 +275,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; + int cpu = smp_processor_id(); unsigned int bank, block; struct mce m; - mce_setup(&m); - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) { - address = MSR_IA32_MC0_MISC + bank * 4; + address = MSR_IA32_MCx_MISC(bank); } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) @@ -309,21 +313,20 @@ static void amd_threshold_interrupt(void) * Log the machine check that caused the threshold * event. */ - machine_check_poll(MCP_TIMESTAMP, - this_cpu_ptr(&mce_poll_banks)); - - if (high & MASK_OVERFLOW_HI) { - rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, - m.status); - m.bank = K8_MCE_THRESHOLD_BASE - + bank * NR_BLOCKS - + block; - mce_log(&m); - return; - } + if (high & MASK_OVERFLOW_HI) + goto log; } } + return; + +log: + mce_setup(&m); + rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); + m.misc = ((u64)high << 32) | low; + m.bank = bank; + mce_log(&m); + + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } /* @@ -617,8 +620,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); + err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); if (!err) goto out; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8fffd845e22b..bfbbe6195e2d 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -376,7 +376,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -390,8 +390,8 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) #if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) /* save BSP's matching patch for early load */ - if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(smp_processor_id()); + if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(cpu); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -444,7 +444,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(c->x86, fw->data, fw->size); + ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 7aa1acc79789..737737edbd1e 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -108,12 +108,13 @@ static size_t compute_container_size(u8 *data, u32 total_size) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. */ -static void apply_ucode_in_initrd(void *ucode, size_t size) +static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) { struct equiv_cpu_entry *eq; size_t *cont_sz; u32 *header; u8 *data, **cont; + u8 (*patch)[PATCH_MAX_SIZE]; u16 eq_id = 0; int offset, left; u32 rev, eax, ebx, ecx, edx; @@ -123,10 +124,12 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); cont_sz = (size_t *)__pa_nodebug(&container_size); cont = (u8 **)__pa_nodebug(&container); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; cont_sz = &container_size; cont = &container; + patch = &amd_ucode_patch; #endif data = ucode; @@ -213,9 +216,9 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) rev = mc->hdr.patch_id; *new_rev = rev; - /* save ucode patch */ - memcpy(amd_ucode_patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); + if (save_patch) + memcpy(patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); } } @@ -246,7 +249,7 @@ void __init load_ucode_amd_bsp(void) *data = cp.data; *size = cp.size; - apply_ucode_in_initrd(cp.data, cp.size); + apply_ucode_in_initrd(cp.data, cp.size, true); } #ifdef CONFIG_X86_32 @@ -263,7 +266,7 @@ void load_ucode_amd_ap(void) size_t *usize; void **ucode; - mc = (struct microcode_amd *)__pa(amd_ucode_patch); + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { __apply_microcode_amd(mc); return; @@ -275,7 +278,7 @@ void load_ucode_amd_ap(void) if (!*ucode || !*usize) return; - apply_ucode_in_initrd(*ucode, *usize); + apply_ucode_in_initrd(*ucode, *usize, false); } static void __init collect_cpu_sig_on_bsp(void *arg) @@ -339,7 +342,7 @@ void load_ucode_amd_ap(void) * AP has a different equivalence ID than BSP, looks like * mixed-steppings silicon so go through the ucode blob anew. */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); } } #endif @@ -347,7 +350,9 @@ void load_ucode_amd_ap(void) int __init save_microcode_in_initrd_amd(void) { unsigned long cont; + int retval = 0; enum ucode_state ret; + u8 *cont_va; u32 eax; if (!container) @@ -355,13 +360,15 @@ int __init save_microcode_in_initrd_amd(void) #ifdef CONFIG_X86_32 get_bsp_sig(); - cont = (unsigned long)container; + cont = (unsigned long)container; + cont_va = __va(container); #else /* * We need the physical address of the container for both bitness since * boot_params.hdr.ramdisk_image is a physical address. */ - cont = __pa(container); + cont = __pa(container); + cont_va = container; #endif /* @@ -372,6 +379,8 @@ int __init save_microcode_in_initrd_amd(void) if (relocated_ramdisk) container = (u8 *)(__va(relocated_ramdisk) + (cont - boot_params.hdr.ramdisk_image)); + else + container = cont_va; if (ucode_new_rev) pr_info("microcode: updated early to new patch_level=0x%08x\n", @@ -380,9 +389,9 @@ int __init save_microcode_in_initrd_amd(void) eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - ret = load_microcode_amd(eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); if (ret != UCODE_OK) - return -EINVAL; + retval = -EINVAL; /* * This will be freed any msec now, stash patches for the current @@ -391,5 +400,23 @@ int __init save_microcode_in_initrd_amd(void) container = NULL; container_size = 0; - return 0; + return retval; +} + +void reload_ucode_amd(void) +{ + struct microcode_amd *mc; + u32 rev, eax; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("microcode: reload patch_level=0x%08x\n", + ucode_new_rev); + } + } } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index dd9d6190b08d..15c29096136b 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -465,6 +465,8 @@ static void mc_bp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); + else if (!uci->mc) + reload_early_microcode(); } static struct syscore_ops mc_syscore_ops = { @@ -549,7 +551,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; - if (dis_ucode_ldr) + if (paravirt_enabled() || dis_ucode_ldr) return 0; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index 5f28a64e71ea..d45df4bd16ab 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -124,7 +124,7 @@ void __init load_ucode_bsp(void) static bool check_loader_disabled_ap(void) { #ifdef CONFIG_X86_32 - return __pa_nodebug(dis_ucode_ldr); + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); #else return dis_ucode_ldr; #endif @@ -176,3 +176,24 @@ int __init save_microcode_in_initrd(void) return 0; } + +void reload_early_microcode(void) +{ + int vendor, x86; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + reload_ucode_amd(); + break; + default: + break; + } +} diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index b88343f7a3b3..ec9df6f9cd47 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -650,8 +650,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) } #endif -static int apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc_intel; unsigned int val[2]; @@ -680,7 +679,10 @@ static int apply_microcode_early(struct mc_saved_data *mc_saved_data, #endif uci->cpu_sig.rev = val[1]; - print_ucode(uci); + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc_intel->hdr.date); return 0; } @@ -715,12 +717,17 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, unsigned long initrd_end_early, struct ucode_cpu_info *uci) { + enum ucode_state ret; + collect_cpu_info_early(uci); scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, mc_saved_in_initrd, uci); - load_microcode(mc_saved_data, mc_saved_in_initrd, - initrd_start_early, uci); - apply_microcode_early(mc_saved_data, uci); + + ret = load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + + if (ret == UCODE_OK) + apply_microcode_early(uci, true); } void __init @@ -749,7 +756,8 @@ load_ucode_intel_bsp(void) initrd_end_early = initrd_start_early + ramdisk_size; _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, - initrd_start_early, initrd_end_early, &uci); + initrd_start_early, initrd_end_early, + &uci); #endif } @@ -783,5 +791,23 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, initrd_start_addr, &uci); - apply_microcode_early(mc_saved_data_p, &uci); + apply_microcode_early(&uci, true); +} + +void reload_ucode_intel(void) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + if (!mc_saved_data.mc_saved_count) + return; + + collect_cpu_info_early(&uci); + + ret = generic_load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, false); } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fc5eb390b368..4e6cdb0ddc70 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -253,6 +253,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +/* Like UEVENT_CONSTRAINT, but match flags too */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cbb1be3ed9e4..a61f5c6911da 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -565,6 +565,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + /* + * Read IbsBrTarget and IbsOpData4 separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } ibs_data.size = sizeof(u64) * size; regs = *iregs; diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c index 639d1289b1ba..97242a9242bd 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -130,10 +130,7 @@ static ssize_t _iommu_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); } static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 30790d798e6b..cc6cedb8f25d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -219,7 +219,6 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n; cpumask_t *active_mask; struct pmu *pmu = dev_get_drvdata(dev); @@ -230,10 +229,7 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, else return 0; - n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, active_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 46211bcc813e..3c895d480cd7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -552,18 +552,18 @@ int intel_pmu_drain_bts_buffer(void) * PEBS */ struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; @@ -577,36 +577,36 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -617,7 +617,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { }; struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -628,7 +628,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { }; struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), @@ -724,6 +724,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long ip = regs->ip; int is_64bit = 0; void *kaddr; + int size; /* * We don't need to fixup if the PEBS assist is fault like @@ -758,11 +759,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 1; } + size = ip - to; if (!kernel_ip(ip)) { - int size, bytes; + int bytes; u8 *buf = this_cpu_read(insn_buffer); - size = ip - to; /* Must fit our buffer, see above */ + /* 'size' must fit our buffer, see above */ bytes = copy_from_user_nmi(buf, (void __user *)to, size); if (bytes != 0) return 0; @@ -780,11 +782,20 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) #ifdef CONFIG_X86_64 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, kaddr, is_64bit); + insn_init(&insn, kaddr, size, is_64bit); insn_get_length(&insn); + /* + * Make sure there was not a problem decoding the + * instruction and getting the length. This is + * doubly important because we have an infinite + * loop if insn.length=0. + */ + if (!insn.length) + break; to += insn.length; kaddr += insn.length; + size -= insn.length; } while (to < ip); if (to == ip) { @@ -886,6 +897,29 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.bp = pebs->bp; regs.sp = pebs->sp; + if (sample_type & PERF_SAMPLE_REGS_INTR) { + regs.ax = pebs->ax; + regs.bx = pebs->bx; + regs.cx = pebs->cx; + regs.dx = pebs->dx; + regs.si = pebs->si; + regs.di = pebs->di; + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + regs.flags = pebs->flags; +#ifndef CONFIG_X86_32 + regs.r8 = pebs->r8; + regs.r9 = pebs->r9; + regs.r10 = pebs->r10; + regs.r11 = pebs->r11; + regs.r12 = pebs->r12; + regs.r13 = pebs->r13; + regs.r14 = pebs->r14; + regs.r15 = pebs->r15; +#endif + } + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { regs.ip = pebs->real_ip; regs.flags |= PERF_EFLAGS_EXACT; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 45fa730a5283..58f1a94beaf0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -465,7 +465,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; - int bytes, size = MAX_INSN_SIZE; + int bytes_read, bytes_left; int ret = X86_BR_NONE; int ext, to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; @@ -493,8 +493,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return X86_BR_NONE; /* may fail if text not present */ - bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != 0) + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) return X86_BR_NONE; addr = buf; @@ -505,10 +507,19 @@ static int branch_type(unsigned long from, unsigned long to, int abort) * Ensure we don't blindy read any address by validating it is * a known text address. */ - if (kernel_text_address(from)) + if (kernel_text_address(from)) { addr = (void *)from; - else + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { return X86_BR_NONE; + } } /* @@ -518,8 +529,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) #ifdef CONFIG_X86_64 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, addr, is64); + insn_init(&insn, addr, bytes_read, is64); insn_get_opcode(&insn); + if (!insn.opcode.got) + return X86_BR_ABORT; switch (insn.opcode.bytes[0]) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index d64f275fe274..673f930c700f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -365,11 +365,7 @@ static void rapl_pmu_event_read(struct perf_event *event) static ssize_t rapl_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 9762dbd9f3f7..08f3fed2b0f2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -647,11 +647,7 @@ static int uncore_pmu_event_init(struct perf_event *event) static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index adf138eac85c..745b158e9a65 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -449,7 +449,11 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { static struct uncore_event_desc snbep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -486,14 +490,17 @@ static struct attribute_group snbep_uncore_qpi_format_group = { .attrs = snbep_uncore_qpi_formats_attr, }; -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_msr_init_box, \ +#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ .disable_box = snbep_uncore_msr_disable_box, \ .enable_box = snbep_uncore_msr_enable_box, \ .disable_event = snbep_uncore_msr_disable_event, \ .enable_event = snbep_uncore_msr_enable_event, \ .read_counter = uncore_msr_read_counter +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ + .init_box = snbep_uncore_msr_init_box \ + static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; @@ -1919,6 +1926,30 @@ static struct intel_uncore_type hswep_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +/* + * Write SBOX Initialization register bit by bit to avoid spurious #GPs + */ +static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) { + u64 init = SNBEP_PMON_BOX_CTL_INT; + u64 flags = 0; + int i; + + for_each_set_bit(i, (unsigned long *)&init, 64) { + flags |= (1ULL << i); + wrmsrl(msr, flags); + } + } +} + +static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .init_box = hswep_uncore_sbox_msr_init_box +}; + static struct attribute *hswep_uncore_sbox_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -1944,7 +1975,7 @@ static struct intel_uncore_type hswep_uncore_sbox = { .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &snbep_uncore_msr_ops, + .ops = &hswep_uncore_sbox_msr_ops, .format_group = &hswep_uncore_sbox_format_group, }; @@ -2009,7 +2040,11 @@ static struct intel_uncore_type hswep_uncore_ha = { static struct uncore_event_desc hswep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -2025,13 +2060,27 @@ static struct intel_uncore_type hswep_uncore_imc = { SNBEP_UNCORE_PCI_COMMON_INIT(), }; +static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; + +static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + static struct intel_uncore_ops hswep_uncore_irp_ops = { .init_box = snbep_uncore_pci_init_box, .disable_box = snbep_uncore_pci_disable_box, .enable_box = snbep_uncore_pci_enable_box, .disable_event = ivbep_uncore_irp_disable_event, .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = ivbep_uncore_irp_read_counter, + .read_counter = hswep_uncore_irp_read_counter, }; static struct intel_uncore_type hswep_uncore_irp = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 5433658e598d..e7d8c7608471 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -72,7 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else - seq_printf(m, "stepping\t: unknown\n"); + seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); @@ -92,12 +92,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); - seq_printf(m, "flags\t\t:"); + seq_puts(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbugs\t\t:"); + seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; @@ -118,7 +118,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); - seq_printf(m, "power management:"); + seq_puts(m, "power management:"); for (i = 0; i < 32; i++) { if (c->x86_power & (1 << i)) { if (i < ARRAY_SIZE(x86_power_flags) && @@ -131,7 +131,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) } } - seq_printf(m, "\n\n"); + seq_puts(m, "\n\n"); return 0; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4a8013d55947..60639093d536 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, + { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, + { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3225ae6c5180..83741a71558f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -143,7 +143,7 @@ static int cpuid_device_create(int cpu) dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + return PTR_ERR_OR_ZERO(dev); } static void cpuid_device_destroy(int cpu) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 1abcb50b48ae..ff86f19b5758 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ STACKFAULT_STACK-1 ] = "#SS", [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 49f886481615..dd2f07ae9d0c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void) * at first, and assume boot_mem will not take below MAX_DMA_PFN */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); - end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); nr_pages += end_pfn - start_pfn; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2e1a6853e00c..fe9f0b79a18b 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -455,6 +455,23 @@ struct intel_stolen_funcs { u32 (*base)(int num, int slot, int func, size_t size); }; +static size_t __init gen9_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; + gmch_ctrl &= BDW_GMCH_GMS_MASK; + + if (gmch_ctrl < 0xf0) + return gmch_ctrl << 25; /* 32 MB units */ + else + /* 4MB increments starting at 0xf0 for 4MB */ + return (gmch_ctrl - 0xf0 + 1) << 22; +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { .base = i830_stolen_base, .size = i830_stolen_size, @@ -490,6 +507,11 @@ static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { .size = gen8_stolen_size, }; +static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen9_stolen_size, +}; + static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { .base = intel_stolen_base, .size = chv_stolen_size, @@ -523,6 +545,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_BDW_M_IDS(&gen8_stolen_funcs), INTEL_BDW_D_IDS(&gen8_stolen_funcs), INTEL_CHV_IDS(&chv_stolen_funcs), + INTEL_SKL_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 344b63f18d14..1cf7c97ff175 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1191,10 +1191,10 @@ ENTRY(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %edx - lea 0x4(%ebp), %eax + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %edx + subl $MCOUNT_INSN_SIZE, %eax call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index df088bb03fb3..90878aa38dbd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -652,6 +652,20 @@ ENTRY(stub_execve) CFI_ENDPROC END(stub_execve) +ENTRY(stub_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execveat) + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) CFI_ENDPROC END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execveat) + #endif /* @@ -828,9 +856,15 @@ ENTRY(native_iret) jnz native_irq_return_ldt #endif +.global native_irq_return_iret native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ iretq - _ASM_EXTABLE(native_irq_return_iret, bad_iret) #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: @@ -858,25 +892,6 @@ native_irq_return_ldt: jmp native_irq_return_iret #endif - .section .fixup,"ax" -bad_iret: - /* - * The iret traps when the %cs or %ss being restored is bogus. - * We've lost the original trap vector and error code. - * #GPF is the most likely one to get for an invalid selector. - * So pretend we completed the iret and took the #GPF in user mode. - * - * We are now running with the kernel GS after exception recovery. - * But error_entry expects us to have user GS to match the user %cs, - * so swap back. - */ - pushq $0 - - SWAPGS - jmp general_protection - - .previous - /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE @@ -922,37 +937,6 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) - /* - * If IRET takes a fault on the espfix stack, then we - * end up promoting it to a doublefault. In that case, - * modify the stack to make it look like we just entered - * the #GP handler from user space, similar to bad_iret. - */ -#ifdef CONFIG_X86_ESPFIX64 - ALIGN -__do_double_fault: - XCPT_FRAME 1 RDI+8 - movq RSP(%rdi),%rax /* Trap on the espfix stack? */ - sarq $PGDIR_SHIFT,%rax - cmpl $ESPFIX_PGD_ENTRY,%eax - jne do_double_fault /* No, just deliver the fault */ - cmpl $__KERNEL_CS,CS(%rdi) - jne do_double_fault - movq RIP(%rdi),%rax - cmpq $native_irq_return_iret,%rax - jne do_double_fault /* This shouldn't happen... */ - movq PER_CPU_VAR(kernel_stack),%rax - subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ - movq %rax,RSP(%rdi) - movq $0,(%rax) /* Missing (lost) #GP error code */ - movq $general_protection,RIP(%rdi) - retq - CFI_ENDPROC -END(__do_double_fault) -#else -# define __do_double_fault do_double_fault -#endif - /* * APIC interrupts. */ @@ -1124,7 +1108,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1289,7 +1273,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 +idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN idtentry xen_debug do_debug has_error_code=0 idtentry xen_int3 do_int3 has_error_code=0 @@ -1399,17 +1383,16 @@ error_sti: /* * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. The exception handlers after iret run with - * kernel gs again, so don't set the user space flag. B stepping K8s - * sometimes report an truncated RIP for IRET exceptions returning to - * compat mode. Check for these here too. + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) - je error_swapgs + je error_bad_iret movl %ecx,%eax /* zero extend */ cmpq %rax,RIP+8(%rsp) je bstep_iret @@ -1420,7 +1403,15 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - jmp error_swapgs + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti CFI_ENDPROC END(error_entry) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 94d857fb1033..f5d0730e7b08 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -122,9 +122,6 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { pgd_t *pgd_p; - pteval_t ptemask; - - ptemask = __supported_pte_mask; /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3386dc9aa333..2142376dc8c6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,6 +17,7 @@ #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/list.h> #include <linux/module.h> @@ -47,7 +48,7 @@ int ftrace_arch_code_modify_post_process(void) union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { - char e8; + unsigned char e8; int offset; } __attribute__((packed)); }; @@ -582,7 +583,7 @@ void ftrace_replace_code(int enable) remove_breakpoints: pr_warn("Failed on %s (%d):\n", report, count); - ftrace_bug(ret, rec ? rec->ip : 0); + ftrace_bug(ret, rec); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); /* @@ -644,13 +645,8 @@ int __init ftrace_dyn_arch_init(void) { return 0; } -#endif - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -#ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); +#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER) static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -664,6 +660,280 @@ static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) */ return calc.code; } +#endif + +/* Currently only x86_64 supports dynamic trampolines */ +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_MODULES +#include <linux/moduleloader.h> +/* Module allocation simplifies allocating memory for code */ +static inline void *alloc_tramp(unsigned long size) +{ + return module_alloc(size); +} +static inline void tramp_free(void *tramp) +{ + module_free(NULL, tramp); +} +#else +/* Trampolines can only be created if modules are supported */ +static inline void *alloc_tramp(unsigned long size) +{ + return NULL; +} +static inline void tramp_free(void *tramp) { } +#endif + +/* Defined as markers to the end of the ftrace default trampolines */ +extern void ftrace_caller_end(void); +extern void ftrace_regs_caller_end(void); +extern void ftrace_return(void); +extern void ftrace_caller_op_ptr(void); +extern void ftrace_regs_caller_op_ptr(void); + +/* movq function_trace_op(%rip), %rdx */ +/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */ +#define OP_REF_SIZE 7 + +/* + * The ftrace_ops is passed to the function callback. Since the + * trampoline only services a single ftrace_ops, we can pass in + * that ops directly. + * + * The ftrace_op_code_union is used to create a pointer to the + * ftrace_ops that will be passed to the callback function. + */ +union ftrace_op_code_union { + char code[OP_REF_SIZE]; + struct { + char op[3]; + int offset; + } __attribute__((packed)); +}; + +static unsigned long +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) +{ + unsigned const char *jmp; + unsigned long start_offset; + unsigned long end_offset; + unsigned long op_offset; + unsigned long offset; + unsigned long size; + unsigned long ip; + unsigned long *ptr; + void *trampoline; + /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ + unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + union ftrace_op_code_union op_ptr; + int ret; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + op_offset = (unsigned long)ftrace_caller_op_ptr; + } + + size = end_offset - start_offset; + + /* + * Allocate enough size to store the ftrace_caller code, + * the jmp to ftrace_return, as well as the address of + * the ftrace_ops this trampoline is used for. + */ + trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); + if (!trampoline) + return 0; + + *tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *); + + /* Copy ftrace_caller onto the trampoline memory */ + ret = probe_kernel_read(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) { + tramp_free(trampoline); + return 0; + } + + ip = (unsigned long)trampoline + size; + + /* The trampoline ends with a jmp to ftrace_return */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); + + /* + * The address of the ftrace_ops that is used for this trampoline + * is stored at the end of the trampoline. This will be used to + * load the third parameter for the callback. Basically, that + * location at the end of the trampoline takes the place of + * the global function_trace_op variable. + */ + + ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE); + *ptr = (unsigned long)ops; + + op_offset -= start_offset; + memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); + + /* Are we pointing to the reference? */ + if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) { + tramp_free(trampoline); + return 0; + } + + /* Load the contents of ptr into the callback parameter */ + offset = (unsigned long)ptr; + offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE; + + op_ptr.offset = offset; + + /* put in the new offset to the ftrace_ops */ + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + + /* ALLOC_TRAMP flags lets us know we created it */ + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + + return (unsigned long)trampoline; +} + +static unsigned long calc_trampoline_call_offset(bool save_regs) +{ + unsigned long start_offset; + unsigned long call_offset; + + if (save_regs) { + start_offset = (unsigned long)ftrace_regs_caller; + call_offset = (unsigned long)ftrace_regs_call; + } else { + start_offset = (unsigned long)ftrace_caller; + call_offset = (unsigned long)ftrace_call; + } + + return call_offset - start_offset; +} + +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ + ftrace_func_t func; + unsigned char *new; + unsigned long offset; + unsigned long ip; + unsigned int size; + int ret; + + if (ops->trampoline) { + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + } else { + ops->trampoline = create_trampoline(ops, &size); + if (!ops->trampoline) + return; + ops->trampoline_size = size; + } + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + ip = ops->trampoline + offset; + + func = ftrace_ops_get_func(ops); + + /* Do a safe modify in case the trampoline is executing */ + new = ftrace_call_replace(ip, (unsigned long)func); + ret = update_ftrace_func(ip, new); + + /* The update should never fail */ + WARN_ON(ret); +} + +/* Return the address of the function the trampoline calls */ +static void *addr_from_call(void *ptr) +{ + union ftrace_code_union calc; + int ret; + + ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE); + if (WARN_ON_ONCE(ret < 0)) + return NULL; + + /* Make sure this is a call */ + if (WARN_ON_ONCE(calc.e8 != 0xe8)) { + pr_warn("Expected e8, got %x\n", calc.e8); + return NULL; + } + + return ptr + MCOUNT_INSN_SIZE + calc.offset; +} + +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer); + +/* + * If the ops->trampoline was not allocated, then it probably + * has a static trampoline func, or is the ftrace caller itself. + */ +static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + bool save_regs = rec->flags & FTRACE_FL_REGS_EN; + void *ptr; + + if (ops && ops->trampoline) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* + * We only know about function graph tracer setting as static + * trampoline. + */ + if (ops->trampoline == FTRACE_GRAPH_ADDR) + return (void *)prepare_ftrace_return; +#endif + return NULL; + } + + offset = calc_trampoline_call_offset(save_regs); + + if (save_regs) + ptr = (void *)FTRACE_REGS_ADDR + offset; + else + ptr = (void *)FTRACE_ADDR + offset; + + return addr_from_call(ptr); +} + +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + + /* If we didn't allocate this trampoline, consider it static */ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return static_tramp_func(ops, rec); + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + return addr_from_call((void *)ops->trampoline + offset); +} + +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + tramp_free((void *)ops->trampoline); + ops->trampoline = 0; +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); static int ftrace_mod_jmp(unsigned long ip, void *func) { @@ -694,7 +964,7 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { unsigned long old; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 922d28581024..6307a0f0cf17 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -59,78 +59,78 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); + seq_puts(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); + seq_puts(p, " Local timer interrupts\n"); seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); + seq_puts(p, " Spurious interrupts\n"); seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance monitoring interrupts\n"); + seq_puts(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); - seq_printf(p, " IRQ work interrupts\n"); + seq_puts(p, " IRQ work interrupts\n"); seq_printf(p, "%*s: ", prec, "RTR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); - seq_printf(p, " APIC ICR read retries\n"); + seq_puts(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); - seq_printf(p, " Platform interrupts\n"); + seq_puts(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); + seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - irq_stats(j)->irq_tlb_count); - seq_printf(p, " Function call interrupts\n"); + seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); + seq_puts(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); + seq_puts(p, " Thermal event interrupts\n"); #endif #ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); + seq_puts(p, " Threshold APIC interrupts\n"); #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); - seq_printf(p, " Machine check exceptions\n"); + seq_puts(p, " Machine check exceptions\n"); seq_printf(p, "%*s: ", prec, "MCP"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); - seq_printf(p, " Machine check polls\n"); + seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); - seq_printf(p, " Hypervisor callback interrupts\n"); + seq_puts(p, " Hypervisor callback interrupts\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 67e6d19ef1be..f7e3cd50ece0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); - kernel_insn_init(&insn, (void *)__addr); + kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); /* @@ -330,8 +330,10 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + unsigned long recovered_insn = + recover_probed_instruction(buf, (unsigned long)src); - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) @@ -342,7 +344,7 @@ int __copy_instruction(u8 *dest, u8 *src) if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest); + kernel_insn_init(&insn, dest, insn.length); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 717b02a22e67..5f8f0b3cc674 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -27,7 +27,7 @@ static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, unsigned long orig_ip) { /* * Emulate singlestep (and also recover regs->ip) @@ -39,6 +39,8 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, p->post_handler(p, regs, 0); } __this_cpu_write(current_kprobe, NULL); + if (orig_ip) + regs->ip = orig_ip; return 1; } @@ -46,7 +48,7 @@ int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { if (kprobe_ftrace(p)) - return __skip_singlestep(p, regs, kcb); + return __skip_singlestep(p, regs, kcb, 0); else return 0; } @@ -71,13 +73,14 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { + unsigned long orig_ip = regs->ip; /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) - __skip_singlestep(p, regs, kcb); + __skip_singlestep(p, regs, kcb, orig_ip); /* * If pre_handler returns !0, it sets regs->ip and * resets current kprobe. diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f1314d0bcf0a..7c523bbf3dc8 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -251,13 +251,15 @@ static int can_optimize(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ + unsigned long recovered_insn; if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, * we can't optimize kprobe in this function. */ return 0; - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + recovered_insn = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f6945bef2cd1..94f643484300 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. + */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d9156ceecdff..42caaef897c8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -59,13 +59,12 @@ static void kvm_get_wallclock(struct timespec *now) native_write_msr(msr_kvm_wall_clock, low, high); - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, now); - preempt_enable(); + put_cpu(); } static int kvm_set_wallclock(const struct timespec *now) @@ -107,11 +106,10 @@ static unsigned long kvm_get_tsc_khz(void) int cpu; unsigned long tsc_khz; - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); src = &hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); - preempt_enable(); + put_cpu(); return tsc_khz; } @@ -263,7 +261,6 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) @@ -284,23 +281,22 @@ int __init kvm_setup_vsyscall_timeinfo(void) size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; flags = pvclock_read_flags(vcpu_time); if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - preempt_enable(); + put_cpu(); return 1; } if ((ret = pvclock_init_vsyscall(hv_clock, size))) { - preempt_enable(); + put_cpu(); return ret; } - preempt_enable(); + put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index c73aecf10d34..94ea120fa21f 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -21,40 +21,159 @@ # define function_hook mcount #endif +/* All cases save the original rbp (8 bytes) */ +#ifdef CONFIG_FRAME_POINTER +# ifdef CC_USING_FENTRY +/* Save parent and function stack frames (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16*2) +# else +/* Save just function stack frame (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16) +# endif +#else +/* No need to save a stack frame */ +# define MCOUNT_FRAME_SIZE 8 +#endif /* CONFIG_FRAME_POINTER */ + +/* Size of stack used to save mcount regs in save_mcount_regs */ +#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) + +/* + * gcc -pg option adds a call to 'mcount' in most functions. + * When -mfentry is used, the call is to 'fentry' and not 'mcount' + * and is done before the function's stack frame is set up. + * They both require a set of regs to be saved before calling + * any C code and restored before returning back to the function. + * + * On boot up, all these calls are converted into nops. When tracing + * is enabled, the call can jump to either ftrace_caller or + * ftrace_regs_caller. Callbacks (tracing functions) that require + * ftrace_regs_caller (like kprobes) need to have pt_regs passed to + * it. For this reason, the size of the pt_regs structure will be + * allocated on the stack and the required mcount registers will + * be saved in the locations that pt_regs has them in. + */ + +/* + * @added: the amount of stack added before calling this + * + * After this is called, the following registers contain: + * + * %rdi - holds the address that called the trampoline + * %rsi - holds the parent function (traced function's return address) + * %rdx - holds the original %rbp + */ +.macro save_mcount_regs added=0 + + /* Always save the original rbp */ + pushq %rbp + +#ifdef CONFIG_FRAME_POINTER + /* + * Stack traces will stop at the ftrace trampoline if the frame pointer + * is not set up properly. If fentry is used, we need to save a frame + * pointer for the parent as well as the function traced, because the + * fentry is called before the stack frame is set up, where as mcount + * is called afterward. + */ +#ifdef CC_USING_FENTRY + /* Save the parent pointer (skip orig rbp and our return address) */ + pushq \added+8*2(%rsp) + pushq %rbp + movq %rsp, %rbp + /* Save the return address (now skip orig rbp, rbp and parent) */ + pushq \added+8*3(%rsp) +#else + /* Can't assume that rip is before this (unless added was zero) */ + pushq \added+8(%rsp) +#endif + pushq %rbp + movq %rsp, %rbp +#endif /* CONFIG_FRAME_POINTER */ + + /* + * We add enough stack to save all regs. + */ + subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* + * Save the original RBP. Even though the mcount ABI does not + * require this, it helps out callers. + */ + movq MCOUNT_REG_SIZE-8(%rsp), %rdx + movq %rdx, RBP(%rsp) + + /* Copy the parent address into %rsi (second parameter) */ +#ifdef CC_USING_FENTRY + movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi +#else + /* %rdx contains original %rbp */ + movq 8(%rdx), %rsi +#endif + + /* Move RIP to its proper location */ + movq MCOUNT_REG_SIZE+\added(%rsp), %rdi + movq %rdi, RIP(%rsp) + + /* + * Now %rdi (the first parameter) has the return address of + * where ftrace_call returns. But the callbacks expect the + * address of the call itself. + */ + subq $MCOUNT_INSN_SIZE, %rdi + .endm + +.macro restore_mcount_regs + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + + /* ftrace_regs_caller can modify %rbp */ + movq RBP(%rsp), %rbp + + addq $MCOUNT_REG_SIZE, %rsp + + .endm + #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(function_hook) retq END(function_hook) -/* skip is set if stack has been adjusted */ -.macro ftrace_caller_setup skip=0 - MCOUNT_SAVE_FRAME \skip +ENTRY(ftrace_caller) + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs +GLOBAL(ftrace_caller_op_ptr) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* Load ip into the first parameter */ - movq RIP(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - /* Load the parent_ip into the second parameter */ -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif -.endm - -ENTRY(ftrace_caller) - ftrace_caller_setup /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx GLOBAL(ftrace_call) call ftrace_stub - MCOUNT_RESTORE_FRAME -ftrace_return: + restore_mcount_regs + + /* + * The copied trampoline must call ftrace_return as it + * still may need to call the function graph tracer. + */ +GLOBAL(ftrace_caller_end) + +GLOBAL(ftrace_return) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -66,11 +185,16 @@ GLOBAL(ftrace_stub) END(ftrace_caller) ENTRY(ftrace_regs_caller) - /* Save the current flags before compare (in SS location)*/ + /* Save the current flags before any operations that can change them */ pushfq - /* skip=8 to skip flags saved in SS */ - ftrace_caller_setup 8 + /* added 8 bytes to save flags */ + save_mcount_regs 8 + /* save_mcount_regs fills in first two parameters */ + +GLOBAL(ftrace_regs_caller_op_ptr) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx /* Save the rest of pt_regs */ movq %r15, R15(%rsp) @@ -79,18 +203,17 @@ ENTRY(ftrace_regs_caller) movq %r12, R12(%rsp) movq %r11, R11(%rsp) movq %r10, R10(%rsp) - movq %rbp, RBP(%rsp) movq %rbx, RBX(%rsp) /* Copy saved flags */ - movq SS(%rsp), %rcx + movq MCOUNT_REG_SIZE(%rsp), %rcx movq %rcx, EFLAGS(%rsp) /* Kernel segments */ movq $__KERNEL_DS, %rcx movq %rcx, SS(%rsp) movq $__KERNEL_CS, %rcx movq %rcx, CS(%rsp) - /* Stack - skipping return address */ - leaq SS+16(%rsp), %rcx + /* Stack - skipping return address and flags */ + leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx movq %rcx, RSP(%rsp) /* regs go into 4th parameter */ @@ -101,11 +224,11 @@ GLOBAL(ftrace_regs_call) /* Copy flags back to SS, to restore them */ movq EFLAGS(%rsp), %rax - movq %rax, SS(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) /* Handlers can change the RIP */ movq RIP(%rsp), %rax - movq %rax, SS+8(%rsp) + movq %rax, MCOUNT_REG_SIZE+8(%rsp) /* restore the rest of pt_regs */ movq R15(%rsp), %r15 @@ -113,19 +236,22 @@ GLOBAL(ftrace_regs_call) movq R13(%rsp), %r13 movq R12(%rsp), %r12 movq R10(%rsp), %r10 - movq RBP(%rsp), %rbp movq RBX(%rsp), %rbx - /* skip=8 to skip flags saved in SS */ - MCOUNT_RESTORE_FRAME 8 + restore_mcount_regs /* Restore flags */ popfq - jmp ftrace_return + /* + * As this jmp to ftrace_return can be a short jump + * it must not be copied into the trampoline. + * The trampoline will add the code to jump + * to the return. + */ +GLOBAL(ftrace_regs_caller_end) - popfq - jmp ftrace_stub + jmp ftrace_return END(ftrace_regs_caller) @@ -136,6 +262,7 @@ ENTRY(function_hook) cmpq $ftrace_stub, ftrace_trace_function jnz trace +fgraph_trace: #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -148,42 +275,35 @@ GLOBAL(ftrace_stub) retq trace: - MCOUNT_SAVE_FRAME - - movq RIP(%rsp), %rdi -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif - subq $MCOUNT_INSN_SIZE, %rdi + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs call *ftrace_trace_function - MCOUNT_RESTORE_FRAME + restore_mcount_regs - jmp ftrace_stub + jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - MCOUNT_SAVE_FRAME + /* Saves rbp into %rdx and fills first parameter */ + save_mcount_regs #ifdef CC_USING_FENTRY - leaq SS+16(%rsp), %rdi + leaq MCOUNT_REG_SIZE+8(%rsp), %rsi movq $0, %rdx /* No framepointers needed */ #else - leaq 8(%rbp), %rdi - movq (%rbp), %rdx + /* Save address of the return address of traced function */ + leaq 8(%rdx), %rsi + /* ftrace does sanity checks against frame pointers */ + movq (%rdx), %rdx #endif - movq RIP(%rsp), %rsi - subq $MCOUNT_INSN_SIZE, %rsi - call prepare_ftrace_return - MCOUNT_RESTORE_FRAME + restore_mcount_regs retq END(ftrace_graph_caller) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c9603ac80de5..113e70784854 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -22,6 +22,8 @@ * an SMP box will direct the access to CPU %d. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/types.h> @@ -50,11 +52,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig) mutex_lock(&inode->i_mutex); switch (orig) { - case 0: + case SEEK_SET: file->f_pos = offset; ret = file->f_pos; break; - case 1: + case SEEK_CUR: file->f_pos += offset; ret = file->f_pos; break; @@ -206,7 +208,7 @@ static int msr_device_create(int cpu) dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + return PTR_ERR_OR_ZERO(dev); } static void msr_device_destroy(int cpu) @@ -248,8 +250,7 @@ static int __init msr_init(void) i = 0; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { - printk(KERN_ERR "msr: unable to get major %d for msr\n", - MSR_MAJOR); + pr_err("unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; goto out; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3ed4a68d4013..5a2c02913af3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 749b0e423419..e510618b2e91 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1484,7 +1484,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) */ if (work & _TIF_NOHZ) { user_exit(); - work &= ~TIF_NOHZ; + work &= ~_TIF_NOHZ; } #ifdef CONFIG_SECCOMP diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab08aa2276fb..ab4734e5411d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; + mpx_mm_init(&init_mm); + code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; data_resource.start = __pa_symbol(_etext); @@ -1190,9 +1192,7 @@ void __init setup_arch(char **cmdline_p) tboot_probe(); -#ifdef CONFIG_X86_64 map_vsyscall(); -#endif generic_apic_probe(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5cdff0357746..e4fcb87ba7a6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -30,7 +30,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); #define BOOT_PERCPU_OFFSET 0 #endif -DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4d2128ac70bd..7a8f5845e8eb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -99,7 +99,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; @@ -1303,10 +1303,14 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } +static DEFINE_PER_CPU(struct completion, die_complete); + void cpu_disable_common(void) { int cpu = smp_processor_id(); + init_completion(&per_cpu(die_complete, smp_processor_id())); + remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1316,8 +1320,6 @@ void cpu_disable_common(void) fixup_irqs(); } -static DEFINE_PER_CPU(struct completion, die_complete); - int native_cpu_disable(void) { int ret; @@ -1327,16 +1329,21 @@ int native_cpu_disable(void) return ret; clear_local_APIC(); - init_completion(&per_cpu(die_complete, smp_processor_id())); cpu_disable_common(); return 0; } +void cpu_die_common(unsigned int cpu) +{ + wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); +} + void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ - wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); + + cpu_die_common(cpu); /* They ack this in play_dead() by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c index 193ec2ce46c7..160386e9fc17 100644 --- a/arch/x86/kernel/sysfb.c +++ b/arch/x86/kernel/sysfb.c @@ -67,7 +67,7 @@ static __init int sysfb_init(void) pd = platform_device_register_resndata(NULL, name, 0, NULL, 0, si, sizeof(*si)); - return IS_ERR(pd) ? PTR_ERR(pd) : 0; + return PTR_ERR_OR_ZERO(pd); } /* must execute after PCI subsystem for EFI quirks */ diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 86179d409893..764a29f84de7 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -88,8 +88,5 @@ __init int create_simplefb(const struct screen_info *si, pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, &res, 1, mode, sizeof(*mode)); - if (IS_ERR(pd)) - return PTR_ERR(pd); - - return 0; + return PTR_ERR_OR_ZERO(pd); } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 0fa29609b2c4..25adc0e16eaa 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; +__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09e3e3a..3e551eee87b9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,43 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + +#ifdef CONFIG_X86_64 + /* The L bit makes no sense for data. */ + if (info->lm) + return false; +#endif + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -66,6 +103,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +232,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +246,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0d0e922fafc1..a9ae20579895 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> +#include <asm/mpx.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -228,37 +229,44 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - enum ctx_state prev_state; - - prev_state = exception_enter(); - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); - } - exception_exit(prev_state); -} - dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, modify + * the stack to make it look like we just entered the #GP + * handler from user space, similar to bad_iret. + */ + if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *normal_regs = task_pt_regs(current); + + /* Fake a #GP(0) from userspace. */ + memmove(&normal_regs->ip, (void *)regs->sp, 5*8); + normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&normal_regs->orig_ax; + return; + } +#endif + exception_enter(); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -278,6 +286,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif +dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + struct xsave_struct *xsave_buf; + enum ctx_state prev_state; + struct bndcsr *bndcsr; + siginfo_t *info; + + prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "bounds", regs, error_code, + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) + goto exit; + conditional_sti(regs); + + if (!user_mode(regs)) + die("bounds", regs, error_code); + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) { + /* The exception is not from Intel MPX */ + goto exit_trap; + } + + /* + * We need to look at BNDSTATUS to resolve this exception. + * It is not directly accessible, though, so we need to + * do an xsave and then pull it out of the xsave buffer. + */ + fpu_save_init(&tsk->thread.fpu); + xsave_buf = &(tsk->thread.fpu.state->xsave); + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + goto exit_trap; + + /* + * The error code field of the BNDSTATUS register communicates status + * information of a bound range exception #BR or operation involving + * bound directory. + */ + switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { + case 2: /* Bound directory has invalid entry. */ + if (mpx_handle_bd_fault(xsave_buf)) + goto exit_trap; + break; /* Success, it was handled */ + case 1: /* Bound violation. */ + info = mpx_generate_siginfo(regs, xsave_buf); + if (PTR_ERR(info)) { + /* + * We failed to decode the MPX instruction. Act as if + * the exception was not caused by MPX. + */ + goto exit_trap; + } + /* + * Success, we decoded the instruction and retrieved + * an 'info' containing the address being accessed + * which caused the exception. This information + * allows and application to possibly handle the + * #BR exception itself. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); + kfree(info); + break; + case 0: /* No exception caused by Intel MPX operations. */ + goto exit_trap; + default: + die("bounds", regs, error_code); + } + +exit: + exception_exit(prev_state); + return; +exit_trap: + /* + * This path out is for all the cases where we could not + * handle the exception in some way (like allocating a + * table or telling userspace about it. We will also end + * up here if the kernel has MPX turned off at compile + * time.. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); + exception_exit(prev_state); +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { @@ -379,7 +470,7 @@ NOKPROBE_SYMBOL(do_int3); * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -399,6 +490,36 @@ asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } NOKPROBE_SYMBOL(sync_regs); + +struct bad_iret_stack { + void *error_entry_ret; + struct pt_regs regs; +}; + +asmlinkage __visible notrace +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +{ + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want move our stack frame to task_pt_regs + * and we want to pretend that the exception came from the + * iret target. + */ + struct bad_iret_stack *new_stack = + container_of(task_pt_regs(current), + struct bad_iret_stack, regs); + + /* Copy the IRET target to the new stack. */ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + + BUG_ON(!user_mode_vm(&new_stack->regs)); + return new_stack; +} +NOKPROBE_SYMBOL(fixup_bad_iret); #endif /* @@ -778,7 +899,7 @@ void __init trap_init(void) set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); set_intr_gate(X86_TRAP_TS, invalid_TSS); set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_SS, stack_segment); set_intr_gate(X86_TRAP_GP, general_protection); set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); set_intr_gate(X86_TRAP_MF, coprocessor_error); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5d1cbfe4ae58..8b96a947021f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -219,7 +219,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool { u32 volatile *good_insns; - insn_init(insn, auprobe->insn, x86_64); + insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); if (WARN_ON_ONCE(!insn_complete(insn))) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49edf2dd3613..00bf300fd846 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -186,6 +186,8 @@ SECTIONS * start another segment - init. */ PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) + ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, + "per-CPU data too large - increase CONFIG_PHYSICAL_START") #endif INIT_TEXT_SECTION(PAGE_SIZE) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 957779f4eb40..2dcc6ff6fdcc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -1,59 +1,43 @@ /* + * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> + * + * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * - * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. * - * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located - * at virtual address -10Mbyte+1024bytes etc... There are at max 4 - * vsyscalls. One vsyscall can reserve more than 1 slot to avoid - * jumping out of line if necessary. We cannot add more with this - * mechanism because older kernels won't return -ENOSYS. + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. * - * Note: the concept clashes with user mode linux. UML users should - * use the vDSO. + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/time.h> -#include <linux/init.h> #include <linux/kernel.h> #include <linux/timer.h> -#include <linux/seqlock.h> -#include <linux/jiffies.h> -#include <linux/sysctl.h> -#include <linux/topology.h> -#include <linux/timekeeper_internal.h> -#include <linux/getcpu.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> -#include <asm/pgtable.h> -#include <asm/compat.h> -#include <asm/page.h> #include <asm/unistd.h> #include <asm/fixmap.h> -#include <asm/errno.h> -#include <asm/io.h> -#include <asm/segment.h> -#include <asm/desc.h> -#include <asm/topology.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -DEFINE_VVAR(int, vgetcpu_mode); - static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; static int __init vsyscall_setup(char *str) @@ -222,6 +206,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) "seccomp tried to change syscall nr or ip"); do_exit(SIGSYS); } + regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ @@ -284,46 +269,54 @@ sigsegv: } /* - * Assume __initcall executes before all user space. Hopefully kmod - * doesn't violate that. We'll find out if it does. + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: */ -static void vsyscall_set_cpu(int cpu) +static const char *gate_vma_name(struct vm_area_struct *vma) { - unsigned long d; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded quickly - * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) - */ - d = 0x0f40000000000ULL; - d |= cpu; - d |= (node & 0xf) << 12; - d |= (node >> 4) << 48; - - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); + return "[vsyscall]"; } - -static void cpu_vsyscall_init(void *arg) +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - /* preemption should be already off */ - vsyscall_set_cpu(raw_smp_processor_id()); +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; } -static int -cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - long cpu = (long)arg; + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} - return NOTIFY_DONE; +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } void __init map_vsyscall(void) @@ -331,24 +324,12 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } - -static int __init vsyscall_init(void) -{ - cpu_notifier_register_begin(); - - on_each_cpu(cpu_vsyscall_init, NULL, 1); - /* notifier priority > KVM */ - __hotcpu_notifier(cpu_vsyscall_notifier, 30); - - cpu_notifier_register_done(); - - return 0; -} -__initcall(vsyscall_init); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e48b674639cc..234b0722de53 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -116,8 +116,6 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, .setup_hpet_msi = default_setup_hpet_msi, - .msi_mask_irq = default_msi_mask_irq, - .msix_mask_irq = default_msix_mask_irq, }; /* MSI arch specific hooks */ @@ -140,14 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); } -u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return x86_msi.msi_mask_irq(desc, mask, flag); -} -u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return x86_msi.msix_mask_irq(desc, flag); -} #endif struct x86_io_apic_ops x86_io_apic_ops = { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4c540c4719d8..0de1fae2bdf0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -738,3 +738,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate) return (void *)xsave + xstate_comp_offsets[feature]; } +EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 25d22b2d6509..08f790dfadc9 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,14 +7,13 @@ CFLAGS_vmx.o := -I. KVM := ../../../virt/kvm -kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ - $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ +kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o cpuid.o pmu.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c new file mode 100644 index 000000000000..6eb5c20ee373 --- /dev/null +++ b/arch/x86/kvm/assigned-dev.c @@ -0,0 +1,1052 @@ +/* + * Kernel-based Virtual Machine - device assignment support + * + * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/fs.h> +#include "irq.h" +#include "assigned-dev.h" + +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct list_head list; + int assigned_dev_id; + int host_segnr; + int host_busnr; + int host_devfn; + unsigned int entries_nr; + int host_irq; + bool host_irq_disabled; + bool pci_2_3; + struct msix_entry *host_msix_entries; + int guest_irq; + struct msix_entry *guest_msix_entries; + unsigned long irq_requested_type; + int irq_source_id; + int flags; + struct pci_dev *dev; + struct kvm *kvm; + spinlock_t intx_lock; + spinlock_t intx_mask_lock; + char irq_name[32]; + struct pci_saved_state *pci_saved_state; +}; + +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + + return index; +} + +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret; + + spin_lock(&assigned_dev->intx_lock); + if (pci_check_and_mask_intx(assigned_dev->dev)) { + assigned_dev->host_irq_disabled = true; + ret = IRQ_WAKE_THREAD; + } else + ret = IRQ_NONE; + spin_unlock(&assigned_dev->intx_lock); + + return ret; +} + +static void +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, + int vector) +{ + if (unlikely(assigned_dev->irq_requested_type & + KVM_DEV_IRQ_GUEST_INTX)) { + spin_lock(&assigned_dev->intx_mask_lock); + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1, + false); + spin_unlock(&assigned_dev->intx_mask_lock); + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + vector, 1, false); +} + +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock_irq(&assigned_dev->intx_lock); + } + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} + +#ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret = kvm_set_irq_inatomic(assigned_dev->kvm, + assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} + +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int index = find_index_from_host_irq(assigned_dev, irq); + u32 vector; + int ret = 0; + + if (index >= 0) { + vector = assigned_dev->guest_msix_entries[index].vector; + ret = kvm_set_irq_inatomic(assigned_dev->kvm, + assigned_dev->irq_source_id, + vector, 1); + } + + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} + +static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int index = find_index_from_host_irq(assigned_dev, irq); + u32 vector; + + if (index >= 0) { + vector = assigned_dev->guest_msix_entries[index].vector; + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); + } + + return IRQ_HANDLED; +} +#endif + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev = + container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); + + spin_lock(&dev->intx_mask_lock); + + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { + bool reassert = false; + + spin_lock_irq(&dev->intx_lock); + /* + * The guest IRQ may be shared so this ack can come from an + * IRQ for another guest device. + */ + if (dev->host_irq_disabled) { + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) + enable_irq(dev->host_irq); + else if (!pci_check_and_unmask_intx(dev->dev)) + reassert = true; + dev->host_irq_disabled = reassert; + } + spin_unlock_irq(&dev->intx_lock); + + if (reassert) + kvm_set_irq(dev->kvm, dev->irq_source_id, + dev->guest_irq, 1, false); + } + + spin_unlock(&dev->intx_mask_lock); +} + +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + if (assigned_dev->ack_notifier.gsi != -1) + kvm_unregister_irq_ack_notifier(kvm, + &assigned_dev->ack_notifier); + + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 0, false); + + if (assigned_dev->irq_source_id != -1) + kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); + assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} + +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + /* + * We disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * on a currently running IRQ handler. + */ + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq(assigned_dev->host_msix_entries[i].vector); + + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + assigned_dev); + + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + if ((assigned_dev->irq_requested_type & + KVM_DEV_IRQ_HOST_INTX) && + (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + pci_intx(assigned_dev->dev, false); + spin_unlock_irq(&assigned_dev->intx_lock); + synchronize_irq(assigned_dev->host_irq); + } else + disable_irq(assigned_dev->host_irq); + + free_irq(assigned_dev->host_irq, assigned_dev); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } + + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); +} + +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; +} + +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + kvm_free_assigned_irq(kvm, assigned_dev); + + pci_reset_function(assigned_dev->dev); + if (pci_load_and_free_saved_state(assigned_dev->dev, + &assigned_dev->pci_saved_state)) + printk(KERN_INFO "%s: Couldn't reload %s saved state\n", + __func__, dev_name(&assigned_dev->dev->dev)); + else + pci_restore_state(assigned_dev->dev); + + pci_clear_dev_assigned(assigned_dev->dev); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + irq_handler_t irq_handler; + unsigned long flags; + + dev->host_irq = dev->dev->irq; + + /* + * We can only share the IRQ line with other host devices if we are + * able to disable the IRQ source at device-level - independently of + * the guest driver. Otherwise host devices may suffer from unbounded + * IRQ latencies when the guest keeps the line asserted. + */ + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + irq_handler = kvm_assigned_dev_intx; + flags = IRQF_SHARED; + } else { + irq_handler = NULL; + flags = IRQF_ONESHOT; + } + if (request_threaded_irq(dev->host_irq, irq_handler, + kvm_assigned_dev_thread_intx, flags, + dev->irq_name, dev)) + return -EIO; + + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + spin_lock_irq(&dev->intx_lock); + pci_intx(dev->dev, true); + spin_unlock_irq(&dev->intx_lock); + } + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; + + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } + + dev->host_irq = dev->dev->irq; + if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, + kvm_assigned_dev_thread_msi, 0, + dev->irq_name, dev)) { + pci_disable_msi(dev->dev); + return -EIO; + } + + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int i, r = -EINVAL; + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; + + r = pci_enable_msix_exact(dev->dev, + dev->host_msix_entries, dev->entries_nr); + if (r) + return r; + + for (i = 0; i < dev->entries_nr; i++) { + r = request_threaded_irq(dev->host_msix_entries[i].vector, + kvm_assigned_dev_msix, + kvm_assigned_dev_thread_msix, + 0, dev->irq_name, dev); + if (r) + goto err; + } + + return 0; +err: + for (i -= 1; i >= 0; i--) + free_irq(dev->host_msix_entries[i].vector, dev); + pci_disable_msix(dev->dev); + return r; +} + +#endif + +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; + } + dev->host_irq_disabled = false; + + if (!r) + dev->irq_requested_type |= host_irq_type; + + return r; +} + +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; + + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) { + dev->irq_requested_type |= guest_irq_type; + if (dev->ack_notifier.gsi != -1) + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + } else { + kvm_free_irq_source_id(kvm, dev->irq_source_id); + dev->irq_source_id = -1; + } + + return r; +} + +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq *assigned_irq) +{ + int r = -EINVAL; + struct kvm_assigned_dev_kernel *match; + unsigned long host_irq_type, guest_irq_type; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); + + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; + + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + unsigned long irq_type; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | + KVM_DEV_IRQ_GUEST_MASK); + r = kvm_deassign_irq(kvm, match, irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +/* + * We want to test whether the caller has been granted permissions to + * use this device. To be able to configure and control the device, + * the user needs access to PCI configuration space and BAR resources. + * These are accessed through PCI sysfs. PCI config space is often + * passed to the process calling this ioctl via file descriptor, so we + * can't rely on access to that file. We can check for permissions + * on each of the BAR resource files, which is a pretty clear + * indicator that the user has been granted access to the device. + */ +static int probe_sysfs_permissions(struct pci_dev *dev) +{ +#ifdef CONFIG_SYSFS + int i; + bool bar_found = false; + + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { + char *kpath, *syspath; + struct path path; + struct inode *inode; + int r; + + if (!pci_resource_len(dev, i)) + continue; + + kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); + if (!kpath) + return -ENOMEM; + + /* Per sysfs-rules, sysfs is always at /sys */ + syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); + kfree(kpath); + if (!syspath) + return -ENOMEM; + + r = kern_path(syspath, LOOKUP_FOLLOW, &path); + kfree(syspath); + if (r) + return r; + + inode = path.dentry->d_inode; + + r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); + path_put(&path); + if (r) + return r; + + bar_found = true; + } + + /* If no resources, probably something special */ + if (!bar_found) + return -EPERM; + + return 0; +#else + return -EINVAL; /* No way to control the device without sysfs */ +#endif +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0, idx; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) + return -EINVAL; + + mutex_lock(&kvm->lock); + idx = srcu_read_lock(&kvm->srcu); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EEXIST; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, + assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + + /* Don't allow bridges to be assigned */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { + r = -EPERM; + goto out_put; + } + + r = probe_sysfs_permissions(dev); + if (r) + goto out_put; + + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + + pci_reset_function(dev); + pci_save_state(dev); + match->pci_saved_state = pci_store_saved_state(dev); + if (!match->pci_saved_state) + printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", + __func__, dev_name(&dev->dev)); + + if (!pci_intx_mask_supported(dev)) + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; + + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_segnr = assigned_dev->segnr; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->flags = assigned_dev->flags; + match->dev = dev; + spin_lock_init(&match->intx_lock); + spin_lock_init(&match->intx_mask_lock); + match->irq_source_id = -1; + match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); + if (r) + goto out_list_del; + } + r = kvm_assign_device(kvm, match->dev); + if (r) + goto out_list_del; + +out: + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +out_list_del: + if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) + printk(KERN_INFO "%s: Couldn't reload %s saved state\n", + __func__, dev_name(&dev->dev)); + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + printk(KERN_INFO "%s: device hasn't been assigned before, " + "so cannot be deassigned\n", __func__); + r = -EINVAL; + goto out; + } + + kvm_deassign_device(kvm, match->dev); + + kvm_free_assigned_device(kvm, match); + +out: + mutex_unlock(&kvm->lock); + return r; +} + + +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + r = -ENODEV; + goto out; + } + + spin_lock(&match->intx_mask_lock); + + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; + + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { + kvm_set_irq(match->kvm, match->irq_source_id, + match->guest_irq, 0, false); + /* + * Masking at hardware-level is performed on demand, + * i.e. when an IRQ actually arrives at the host. + */ + } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + /* + * Unmask the IRQ line if required. Unmasking at + * device level will be performed by user space. + */ + spin_lock_irq(&match->intx_lock); + if (match->host_irq_disabled) { + enable_irq(match->host_irq); + match->host_irq_disabled = false; + } + spin_unlock_irq(&match->intx_lock); + } + } + + spin_unlock(&match->intx_mask_lock); + +out: + mutex_unlock(&kvm->lock); + return r; +} + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + int r; + + switch (ioctl) { + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } + case KVM_ASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } +#endif + case KVM_ASSIGN_SET_INTX_MASK: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); + break; + } + default: + r = -ENOTTY; + break; + } +out: + return r; +} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h new file mode 100644 index 000000000000..a428c1a211b2 --- /dev/null +++ b/arch/x86/kvm/assigned-dev.h @@ -0,0 +1,32 @@ +#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H +#define ARCH_X86_KVM_ASSIGNED_DEV_H + +#include <linux/kvm_host.h> + +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT +int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); +int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); + +int kvm_iommu_map_guest(struct kvm *kvm); +int kvm_iommu_unmap_guest(struct kvm *kvm); + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg); + +void kvm_free_all_assigned_devices(struct kvm *kvm); +#else +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + return 0; +} + +static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} +#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ + +#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 976e3a57f9ea..8a80737ee6e6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -23,7 +23,7 @@ #include "mmu.h" #include "trace.h" -static u32 xstate_required_size(u64 xstate_bv) +static u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -31,9 +31,10 @@ static u32 xstate_required_size(u64 xstate_bv) xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - ret = max(ret, eax + ebx); + offset = compacted ? ret : ebx; + ret = max(ret, offset + eax); } xstate_bv >>= 1; @@ -53,6 +54,8 @@ u64 kvm_supported_xcr0(void) return xcr0; } +#define F(x) bit(X86_FEATURE_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -64,13 +67,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) /* Update OSXSAVE bit */ if (cpu_has_xsave && best->function == 0x1) { - best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + best->ecx &= ~F(OSXSAVE); if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= bit(X86_FEATURE_OSXSAVE); + best->ecx |= F(OSXSAVE); } if (apic) { - if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; @@ -85,9 +88,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) (best->eax | ((u64)best->edx << 32)) & kvm_supported_xcr0(); vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0); + xstate_required_size(vcpu->arch.xcr0, false); } + best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) + best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. @@ -122,8 +129,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) { - entry->edx &= ~bit(X86_FEATURE_NX); + if (entry && (entry->edx & F(NX)) && !is_efer_nx()) { + entry->edx &= ~F(NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -227,8 +234,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -#define F(x) bit(X86_FEATURE_##x) - static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, u32 func, u32 index, int *nent, int maxnent) { @@ -267,6 +272,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; + unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -317,7 +323,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP); + F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | + F(AVX512CD); + + /* cpuid 0xD.1.eax */ + const u32 kvm_supported_word10_x86_features = + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -453,16 +464,34 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u64 supported = kvm_supported_xcr0(); entry->eax &= supported; + entry->ebx = xstate_required_size(supported, false); + entry->ecx = entry->ebx; entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + if (!supported) + break; + for (idx = 1, i = 1; idx < 64; ++idx) { u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !(supported & mask)) - continue; + if (idx == 1) { + entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].ebx = 0; + if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) + entry[i].ebx = + xstate_required_size(supported, + true); + } else { + if (entry[i].eax == 0 || !(supported & mask)) + continue; + if (WARN_ON_ONCE(entry[i].ecx & 1)) + continue; + } + entry[i].ecx = 0; + entry[i].edx = 0; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5edf088ca51e..169b09d76ddd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -123,6 +123,7 @@ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ +#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -166,6 +167,8 @@ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ #define NoBigReal ((u64)1 << 50) /* No big real mode */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ +#define NearBranch ((u64)1 << 52) /* Near branches */ +#define No16 ((u64)1 << 53) /* No 16 bit operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -209,6 +212,7 @@ struct opcode { const struct group_dual *gdual; const struct gprefix *gprefix; const struct escape *esc; + const struct instr_dual *idual; void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); @@ -231,6 +235,11 @@ struct escape { struct opcode high[64]; }; +struct instr_dual { + struct opcode mod012; + struct opcode mod3; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -379,6 +388,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ON64(FOP2E(op##q, rax, cl)) \ FOP_END +/* 2 operand, src and dest are reversed */ +#define FASTOP2R(op, name) \ + FOP_START(name) \ + FOP2E(op##b, dl, al) \ + FOP2E(op##w, dx, ax) \ + FOP2E(op##l, edx, eax) \ + ON64(FOP2E(op##q, rdx, rax)) \ + FOP_END + #define FOP3E(op, dst, src, src2) \ FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET @@ -477,9 +495,9 @@ address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) } static inline unsigned long -register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) +register_address(struct x86_emulate_ctxt *ctxt, int reg) { - return address_mask(ctxt, reg); + return address_mask(ctxt, reg_read(ctxt, reg)); } static void masked_increment(ulong *reg, ulong mask, int inc) @@ -488,7 +506,7 @@ static void masked_increment(ulong *reg, ulong mask, int inc) } static inline void -register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) +register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { ulong mask; @@ -496,7 +514,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in mask = ~0UL; else mask = ad_mask(ctxt); - masked_increment(reg, mask, inc); + masked_increment(reg_rmw(ctxt, reg), mask, inc); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) @@ -564,40 +582,6 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } -static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - int cs_l) -{ - switch (ctxt->op_bytes) { - case 2: - ctxt->_eip = (u16)dst; - break; - case 4: - ctxt->_eip = (u32)dst; - break; -#ifdef CONFIG_X86_64 - case 8: - if ((cs_l && is_noncanonical_address(dst)) || - (!cs_l && (dst >> 32) != 0)) - return emulate_gp(ctxt, 0); - ctxt->_eip = dst; - break; -#endif - default: - WARN(1, "unsupported eip assignment size\n"); - } - return X86EMUL_CONTINUE; -} - -static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) -{ - return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); -} - -static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - return assign_eip_near(ctxt, ctxt->_eip + rel); -} - static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; @@ -641,25 +625,24 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) return true; } -static int __linearize(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - unsigned *max_size, unsigned size, - bool write, bool fetch, - ulong *linear) +static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned *max_size, unsigned size, + bool write, bool fetch, + enum x86emul_mode mode, ulong *linear) { struct desc_struct desc; bool usable; ulong la; u32 lim; u16 sel; - unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; *max_size = 0; - switch (ctxt->mode) { + switch (mode) { case X86EMUL_MODE_PROT64: - if (((signed long)la << 16) >> 16 != la) - return emulate_gp(ctxt, 0); + if (is_noncanonical_address(la)) + goto bad; *max_size = min_t(u64, ~0u, (1ull << 48) - la); if (size > *max_size) @@ -678,46 +661,20 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); - if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && - (ctxt->d & NoBigReal)) { - /* la is between zero and 0xffff */ - if (la > 0xffff) - goto bad; - *max_size = 0x10000 - la; - } else if ((desc.type & 8) || !(desc.type & 4)) { - /* expand-up segment */ - if (addr.ea > lim) - goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); - } else { + if (!(desc.type & 8) && (desc.type & 4)) { /* expand-down segment */ if (addr.ea <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; - if (addr.ea > lim) - goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); } + if (addr.ea > lim) + goto bad; + *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); if (size > *max_size) goto bad; - cpl = ctxt->ops->cpl(ctxt); - if (!(desc.type & 8)) { - /* data segment */ - if (cpl > desc.dpl) - goto bad; - } else if ((desc.type & 8) && !(desc.type & 4)) { - /* nonconforming code segment */ - if (cpl != desc.dpl) - goto bad; - } else if ((desc.type & 8) && (desc.type & 4)) { - /* conforming code segment */ - if (cpl < desc.dpl) - goto bad; - } + la &= (u32)-1; break; } - if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) - la &= (u32)-1; if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) return emulate_gp(ctxt, 0); *linear = la; @@ -735,9 +692,55 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ulong *linear) { unsigned max_size; - return __linearize(ctxt, addr, &max_size, size, write, false, linear); + return __linearize(ctxt, addr, &max_size, size, write, false, + ctxt->mode, linear); +} + +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, + enum x86emul_mode mode) +{ + ulong linear; + int rc; + unsigned max_size; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = dst }; + + if (ctxt->op_bytes != sizeof(unsigned long)) + addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + if (rc == X86EMUL_CONTINUE) + ctxt->_eip = addr.ea; + return rc; +} + +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + return assign_eip(ctxt, dst, ctxt->mode); +} + +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, + const struct desc_struct *cs_desc) +{ + enum x86emul_mode mode = ctxt->mode; + +#ifdef CONFIG_X86_64 + if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + mode = X86EMUL_MODE_PROT64; + } +#endif + if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) + mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + return assign_eip(ctxt, dst, mode); } +static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +{ + return assign_eip_near(ctxt, ctxt->_eip + rel); +} static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, @@ -776,7 +779,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) * boundary check itself. Instead, we use max_size to check * against op_size. */ - rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear); + rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode, + &linear); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; @@ -911,6 +915,8 @@ FASTOP2W(btc); FASTOP2(xadd); +FASTOP2R(cmp, cmp_r); + static u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; @@ -1221,6 +1227,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { + modrm_ea += insn_fetch(s32, ctxt); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { @@ -1229,10 +1236,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, adjust_modrm_seg(ctxt, base_reg); } switch (ctxt->modrm_mod) { - case 0: - if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, ctxt); - break; case 1: modrm_ea += insn_fetch(s8, ctxt); break; @@ -1284,7 +1287,8 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) else sv = (s64)ctxt->src.val & (s64)mask; - ctxt->dst.addr.mem.ea += (sv >> 3); + ctxt->dst.addr.mem.ea = address_mask(ctxt, + ctxt->dst.addr.mem.ea + (sv >> 3)); } /* only subword offset */ @@ -1610,6 +1614,9 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, sizeof(base3), &ctxt->exception); if (ret != X86EMUL_CONTINUE) return ret; + if (is_noncanonical_address(get_desc_base(&seg_desc) | + ((u64)base3 << 32))) + return emulate_gp(ctxt, 0); } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -1807,6 +1814,10 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt) int seg = ctxt->src2.val; ctxt->src.val = get_segment_selector(ctxt, seg); + if (ctxt->op_bytes == 4) { + rsp_increment(ctxt, -2); + ctxt->op_bytes = 2; + } return em_push(ctxt); } @@ -1850,7 +1861,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - ctxt->src.val = (unsigned long)ctxt->eflags; + ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM; return em_push(ctxt); } @@ -2035,7 +2046,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) { WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); /* assigning eip failed; restore the old cs */ @@ -2045,31 +2056,22 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return rc; } -static int em_grp45(struct x86_emulate_ctxt *ctxt) +static int em_jmp_abs(struct x86_emulate_ctxt *ctxt) { - int rc = X86EMUL_CONTINUE; + return assign_eip_near(ctxt, ctxt->src.val); +} - switch (ctxt->modrm_reg) { - case 2: /* call near abs */ { - long int old_eip; - old_eip = ctxt->_eip; - rc = assign_eip_near(ctxt, ctxt->src.val); - if (rc != X86EMUL_CONTINUE) - break; - ctxt->src.val = old_eip; - rc = em_push(ctxt); - break; - } - case 4: /* jmp abs */ - rc = assign_eip_near(ctxt, ctxt->src.val); - break; - case 5: /* jmp far */ - rc = em_jmp_far(ctxt); - break; - case 6: /* push */ - rc = em_push(ctxt); - break; - } +static int em_call_near_abs(struct x86_emulate_ctxt *ctxt) +{ + int rc; + long int old_eip; + + old_eip = ctxt->_eip; + rc = assign_eip_near(ctxt, ctxt->src.val); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->src.val = old_eip; + rc = em_push(ctxt); return rc; } @@ -2128,11 +2130,11 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, new_desc.l); + rc = assign_eip_far(ctxt, eip, &new_desc); if (rc != X86EMUL_CONTINUE) { WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); @@ -2316,6 +2318,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~msr_data; + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; #endif } else { /* legacy mode */ @@ -2349,11 +2352,9 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) && !vendor_intel(ctxt)) return emulate_ud(ctxt); - /* XXX sysenter/sysexit have not been tested in 64bit mode. - * Therefore, we inject an #UD. - */ + /* sysenter/sysexit have not been tested in 64bit mode. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return emulate_ud(ctxt); + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -2425,6 +2426,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); ss_sel = (u16)(msr_data + 24); + rcx = (u32)rcx; + rdx = (u32)rdx; break; case X86EMUL_MODE_PROT64: cs_sel = (u16)(msr_data + 32); @@ -2599,7 +2602,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss16(ctxt, &tss_seg); @@ -2607,13 +2609,11 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2624,7 +2624,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } @@ -2813,7 +2812,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, * * 1. jmp/call/int to task gate: Check against DPL of the task gate * 2. Exception/IRQ/iret: No check is performed - * 3. jmp/call to TSS: Check against DPL of the TSS + * 3. jmp/call to TSS/task-gate: No check is performed since the + * hardware checks it before exiting. */ if (reason == TASK_SWITCH_GATE) { if (idt_index != -1) { @@ -2830,13 +2830,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) return emulate_gp(ctxt, (idt_index << 3) | 0x2); } - } else if (reason != TASK_SWITCH_IRET) { - int dpl = next_tss_desc.dpl; - if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) - return emulate_gp(ctxt, tss_selector); } - desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2913,8 +2908,8 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, { int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; - register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); - op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); + register_address_increment(ctxt, reg, df * op->bytes); + op->addr.mem.ea = register_address(ctxt, reg); } static int em_das(struct x86_emulate_ctxt *ctxt) @@ -3025,7 +3020,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) goto fail; @@ -3215,6 +3210,8 @@ static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); + if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM) + ctxt->dst.bytes = 2; return X86EMUL_CONTINUE; } @@ -3317,7 +3314,7 @@ static int em_sidt(struct x86_emulate_ctxt *ctxt) return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); } -static int em_lgdt(struct x86_emulate_ctxt *ctxt) +static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) { struct desc_ptr desc_ptr; int rc; @@ -3329,12 +3326,23 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - ctxt->ops->set_gdt(ctxt, &desc_ptr); + if (ctxt->mode == X86EMUL_MODE_PROT64 && + is_noncanonical_address(desc_ptr.address)) + return emulate_gp(ctxt, 0); + if (lgdt) + ctxt->ops->set_gdt(ctxt, &desc_ptr); + else + ctxt->ops->set_idt(ctxt, &desc_ptr); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } +static int em_lgdt(struct x86_emulate_ctxt *ctxt) +{ + return em_lgdt_lidt(ctxt, true); +} + static int em_vmmcall(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3348,20 +3356,7 @@ static int em_vmmcall(struct x86_emulate_ctxt *ctxt) static int em_lidt(struct x86_emulate_ctxt *ctxt) { - struct desc_ptr desc_ptr; - int rc; - - if (ctxt->mode == X86EMUL_MODE_PROT64) - ctxt->op_bytes = 8; - rc = read_descriptor(ctxt, ctxt->src.addr.mem, - &desc_ptr.size, &desc_ptr.address, - ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - ctxt->ops->set_idt(ctxt, &desc_ptr); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; + return em_lgdt_lidt(ctxt, false); } static int em_smsw(struct x86_emulate_ctxt *ctxt) @@ -3384,7 +3379,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; - register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + register_address_increment(ctxt, VCPU_REGS_RCX, -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) rc = jmp_rel(ctxt, ctxt->src.val); @@ -3554,7 +3549,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) - rsvd = CR3_L_MODE_RESERVED_BITS; + rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; if (new_val & rsvd) return emulate_gp(ctxt, 0); @@ -3596,8 +3591,15 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); - if (check_dr7_gd(ctxt)) + if (check_dr7_gd(ctxt)) { + ulong dr6; + + ctxt->ops->get_dr(ctxt, 6, &dr6); + dr6 &= ~15; + dr6 |= DR6_BD | DR6_RTM; + ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); + } return X86EMUL_CONTINUE; } @@ -3684,6 +3686,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } +#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } @@ -3780,11 +3783,11 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | Stack, em_grp45), + I(SrcMem | NearBranch, em_call_near_abs), I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), - I(SrcMem | Stack, em_grp45), - I(SrcMemFAddr | ImplicitOps, em_grp45), - I(SrcMem | Stack, em_grp45), D(Undefined), + I(SrcMem | NearBranch, em_jmp_abs), + I(SrcMemFAddr | ImplicitOps, em_jmp_far), + I(SrcMem | Stack, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -3845,8 +3848,12 @@ static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; +static const struct instr_dual instr_dual_0f_2b = { + I(0, em_mov), N +}; + static const struct gprefix pfx_0f_2b = { - I(0, em_mov), I(0, em_mov), N, N, + ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N, }; static const struct gprefix pfx_0f_28_0f_29 = { @@ -3920,6 +3927,10 @@ static const struct escape escape_dd = { { N, N, N, N, N, N, N, N, } }; +static const struct instr_dual instr_dual_0f_c3 = { + I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -3964,7 +3975,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ - X16(D(SrcImmByte)), + X16(D(SrcImmByte | NearBranch)), /* 0x80 - 0x87 */ G(ByteOp | DstMem | SrcImm, group1), G(DstMem | SrcImm, group1), @@ -3991,20 +4002,20 @@ static const struct opcode opcode_table[256] = { I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp), + F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp), + F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), - I(ImplicitOps | Stack, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm), + I(ImplicitOps | NearBranch, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), @@ -4024,13 +4035,14 @@ static const struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ - X3(I(SrcImmByte, em_loop)), - I(SrcImmByte, em_jcxz), + X3(I(SrcImmByte | NearBranch, em_loop)), + I(SrcImmByte | NearBranch, em_jcxz), I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps), - I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), + I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch), + I(SrcImmFAddr | No64, em_jmp_far), + D(SrcImmByte | ImplicitOps | NearBranch), I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ @@ -4090,7 +4102,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ - X16(D(SrcImm)), + X16(D(SrcImm | NearBranch)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ @@ -4121,7 +4133,7 @@ static const struct opcode twobyte_table[256] = { D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), - N, D(DstMem | SrcReg | ModRM | Mov), + N, ID(0, &instr_dual_0f_c3), N, N, N, GD(0, &group9), /* 0xC8 - 0xCF */ X8(I(DstReg, em_bswap)), @@ -4134,12 +4146,20 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct instr_dual instr_dual_0f_38_f0 = { + I(DstReg | SrcMem | Mov, em_movbe), N +}; + +static const struct instr_dual instr_dual_0f_38_f1 = { + I(DstMem | SrcReg | Mov, em_movbe), N +}; + static const struct gprefix three_byte_0f_38_f0 = { - I(DstReg | SrcMem | Mov, em_movbe), N, N, N + ID(0, &instr_dual_0f_38_f0), N, N, N }; static const struct gprefix three_byte_0f_38_f1 = { - I(DstMem | SrcReg | Mov, em_movbe), N, N, N + ID(0, &instr_dual_0f_38_f1), N, N, N }; /* @@ -4152,8 +4172,8 @@ static const struct opcode opcode_map_0f_38[256] = { /* 0x80 - 0xef */ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), /* 0xf0 - 0xf1 */ - GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0), - GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1), + GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0), + GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1), /* 0xf2 - 0xff */ N, N, X4(N), X8(N) }; @@ -4275,7 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); + register_address(ctxt, VCPU_REGS_RDI); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; op->count = 1; @@ -4287,6 +4307,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_register_operand(op); break; case OpCL: + op->type = OP_IMM; op->bytes = 1; op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; @@ -4294,6 +4315,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, rc = decode_imm(ctxt, op, 1, true); break; case OpOne: + op->type = OP_IMM; op->bytes = 1; op->val = 1; break; @@ -4327,7 +4349,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); + register_address(ctxt, VCPU_REGS_RSI); op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; @@ -4336,7 +4358,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, + address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); op->addr.mem.seg = ctxt->seg_override; @@ -4352,21 +4374,27 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, ctxt->memop.bytes = ctxt->op_bytes + 2; goto mem_common; case OpES: + op->type = OP_IMM; op->val = VCPU_SREG_ES; break; case OpCS: + op->type = OP_IMM; op->val = VCPU_SREG_CS; break; case OpSS: + op->type = OP_IMM; op->val = VCPU_SREG_SS; break; case OpDS: + op->type = OP_IMM; op->val = VCPU_SREG_DS; break; case OpFS: + op->type = OP_IMM; op->val = VCPU_SREG_FS; break; case OpGS: + op->type = OP_IMM; op->val = VCPU_SREG_GS; break; case OpImplicit: @@ -4502,8 +4530,7 @@ done_prefixes: /* vex-prefix instructions are not implemented */ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && - (mode == X86EMUL_MODE_PROT64 || - (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) { + (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { ctxt->d = NotImpl; } @@ -4541,6 +4568,12 @@ done_prefixes: else opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; break; + case InstrDual: + if ((ctxt->modrm >> 6) == 3) + opcode = opcode.u.idual->mod3; + else + opcode = opcode.u.idual->mod012; + break; default: return EMULATION_FAILED; } @@ -4559,7 +4592,8 @@ done_prefixes: return EMULATION_FAILED; if (unlikely(ctxt->d & - (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) { + (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch| + No16))) { /* * These are copied unconditionally here, and checked unconditionally * in x86_emulate_insn. @@ -4570,8 +4604,12 @@ done_prefixes: if (ctxt->d & NotImpl) return EMULATION_FAILED; - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; + if (mode == X86EMUL_MODE_PROT64) { + if (ctxt->op_bytes == 4 && (ctxt->d & Stack)) + ctxt->op_bytes = 8; + else if (ctxt->d & NearBranch) + ctxt->op_bytes = 8; + } if (ctxt->d & Op3264) { if (mode == X86EMUL_MODE_PROT64) @@ -4580,6 +4618,9 @@ done_prefixes: ctxt->op_bytes = 4; } + if ((ctxt->d & No16) && ctxt->op_bytes == 2) + ctxt->op_bytes = 4; + if (ctxt->d & Sse) ctxt->op_bytes = 16; else if (ctxt->d & Mmx) @@ -4623,7 +4664,8 @@ done_prefixes: rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); if (ctxt->rip_relative) - ctxt->memopp->addr.mem.ea += ctxt->_eip; + ctxt->memopp->addr.mem.ea = address_mask(ctxt, + ctxt->memopp->addr.mem.ea + ctxt->_eip); done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; @@ -4767,6 +4809,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + /* Instruction can only be executed in protected mode */ + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { + rc = emulate_ud(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((ctxt->d & Priv) && ops->cpl(ctxt)) { if (ctxt->d & PrivUD) @@ -4776,12 +4824,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { - rc = emulate_ud(ctxt); - goto done; - } - /* Do instruction specific permission checks */ if (ctxt->d & CheckPerm) { rc = ctxt->check_perm(ctxt); @@ -4966,8 +5008,7 @@ writeback: count = ctxt->src.count; else count = ctxt->dst.count; - register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), - -count); + register_address_increment(ctxt, VCPU_REGS_RCX, -count); if (!string_insn_completed(ctxt)) { /* @@ -5045,11 +5086,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; - case 0xc3: /* movnti */ - ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val : - (u32) ctxt->src.val; - break; default: goto cannot_emulate; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c new file mode 100644 index 000000000000..b1947e0f3e10 --- /dev/null +++ b/arch/x86/kvm/ioapic.c @@ -0,0 +1,675 @@ +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Yunhong Jiang <yunhong.jiang@intel.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * Based on Xen 3.1 code. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/current.h> +#include <trace/events/kvm.h> + +#include "ioapic.h" +#include "lapic.h" +#include "irq.h" + +#if 0 +#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#else +#define ioapic_debug(fmt, arg...) +#endif +static int ioapic_service(struct kvm_ioapic *vioapic, int irq, + bool line_status); + +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, + unsigned long addr, + unsigned long length) +{ + unsigned long result = 0; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) + | (IOAPIC_VERSION_ID & 0xff)); + break; + + case IOAPIC_REG_APIC_ID: + case IOAPIC_REG_ARB_ID: + result = ((ioapic->id & 0xf) << 24); + break; + + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; + u64 redir_content; + + if (redir_index < IOAPIC_NUM_PINS) + redir_content = + ioapic->redirtbl[redir_index].bits; + else + redir_content = ~0ULL; + + result = (ioapic->ioregsel & 0x1) ? + (redir_content >> 32) & 0xffffffff : + redir_content & 0xffffffff; + break; + } + } + + return result; +} + +static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) +{ + ioapic->rtc_status.pending_eoi = 0; + bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); +} + +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); + +static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic) +{ + if (WARN_ON(ioapic->rtc_status.pending_eoi < 0)) + kvm_rtc_eoi_tracking_restore_all(ioapic); +} + +static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ + bool new_val, old_val; + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + union kvm_ioapic_redirect_entry *e; + + e = &ioapic->redirtbl[RTC_GSI]; + if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, + e->fields.dest_mode)) + return; + + new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + + if (new_val == old_val) + return; + + if (new_val) { + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi++; + } else { + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi--; + rtc_status_pending_eoi_check_valid(ioapic); + } +} + +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + + spin_lock(&ioapic->lock); + __rtc_irq_eoi_tracking_restore_one(vcpu); + spin_unlock(&ioapic->lock); +} + +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) +{ + struct kvm_vcpu *vcpu; + int i; + + if (RTC_GSI >= IOAPIC_NUM_PINS) + return; + + rtc_irq_eoi_tracking_reset(ioapic); + kvm_for_each_vcpu(i, vcpu, ioapic->kvm) + __rtc_irq_eoi_tracking_restore_one(vcpu); +} + +static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) +{ + if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { + --ioapic->rtc_status.pending_eoi; + rtc_status_pending_eoi_check_valid(ioapic); + } +} + +static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) +{ + if (ioapic->rtc_status.pending_eoi > 0) + return true; /* coalesced */ + + return false; +} + +static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, + int irq_level, bool line_status) +{ + union kvm_ioapic_redirect_entry entry; + u32 mask = 1 << irq; + u32 old_irr; + int edge, ret; + + entry = ioapic->redirtbl[irq]; + edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + + if (!irq_level) { + ioapic->irr &= ~mask; + ret = 1; + goto out; + } + + /* + * Return 0 for coalesced interrupts; for edge-triggered interrupts, + * this only happens if a previous edge has not been delivered due + * do masking. For level interrupts, the remote_irr field tells + * us if the interrupt is waiting for an EOI. + * + * RTC is special: it is edge-triggered, but userspace likes to know + * if it has been already ack-ed via EOI because coalesced RTC + * interrupts lead to time drift in Windows guests. So we track + * EOI manually for the RTC interrupt. + */ + if (irq == RTC_GSI && line_status && + rtc_irq_check_coalesced(ioapic)) { + ret = 0; + goto out; + } + + old_irr = ioapic->irr; + ioapic->irr |= mask; + if ((edge && old_irr == ioapic->irr) || + (!edge && entry.fields.remote_irr)) { + ret = 0; + goto out; + } + + ret = ioapic_service(ioapic, irq, line_status); + +out: + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); + return ret; +} + +static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) +{ + u32 idx; + + rtc_irq_eoi_tracking_reset(ioapic); + for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS) + ioapic_set_irq(ioapic, idx, 1, true); + + kvm_rtc_eoi_tracking_restore_all(ioapic); +} + + +static void update_handled_vectors(struct kvm_ioapic *ioapic) +{ + DECLARE_BITMAP(handled_vectors, 256); + int i; + + memset(handled_vectors, 0, sizeof(handled_vectors)); + for (i = 0; i < IOAPIC_NUM_PINS; ++i) + __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); + memcpy(ioapic->handled_vectors, handled_vectors, + sizeof(handled_vectors)); + smp_wmb(); +} + +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, + u32 *tmr) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + union kvm_ioapic_redirect_entry *e; + int index; + + spin_lock(&ioapic->lock); + for (index = 0; index < IOAPIC_NUM_PINS; index++) { + e = &ioapic->redirtbl[index]; + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || + kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || + index == RTC_GSI) { + if (kvm_apic_match_dest(vcpu, NULL, 0, + e->fields.dest_id, e->fields.dest_mode)) { + __set_bit(e->fields.vector, + (unsigned long *)eoi_exit_bitmap); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) + __set_bit(e->fields.vector, + (unsigned long *)tmr); + } + } + } + spin_unlock(&ioapic->lock); +} + +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (!ioapic) + return; + kvm_make_scan_ioapic_request(kvm); +} + +static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) +{ + unsigned index; + bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + /* Writes are ignored. */ + break; + + case IOAPIC_REG_APIC_ID: + ioapic->id = (val >> 24) & 0xf; + break; + + case IOAPIC_REG_ARB_ID: + break; + + default: + index = (ioapic->ioregsel - 0x10) >> 1; + + ioapic_debug("change redir index %x val %x\n", index, val); + if (index >= IOAPIC_NUM_PINS) + return; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; + if (ioapic->ioregsel & 1) { + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; + } else { + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; + } + update_handled_vectors(ioapic); + mask_after = e->fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG + && ioapic->irr & (1 << index)) + ioapic_service(ioapic, index, false); + kvm_vcpu_request_scan_ioapic(ioapic->kvm); + break; + } +} + +static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) +{ + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + struct kvm_lapic_irq irqe; + int ret; + + if (entry->fields.mask) + return -1; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + entry->fields.dest_id, entry->fields.dest_mode, + entry->fields.delivery_mode, entry->fields.vector, + entry->fields.trig_mode); + + irqe.dest_id = entry->fields.dest_id; + irqe.vector = entry->fields.vector; + irqe.dest_mode = entry->fields.dest_mode; + irqe.trig_mode = entry->fields.trig_mode; + irqe.delivery_mode = entry->fields.delivery_mode << 8; + irqe.level = 1; + irqe.shorthand = 0; + + if (irqe.trig_mode == IOAPIC_EDGE_TRIG) + ioapic->irr &= ~(1 << irq); + + if (irq == RTC_GSI && line_status) { + /* + * pending_eoi cannot ever become negative (see + * rtc_status_pending_eoi_check_valid) and the caller + * ensures that it is only called if it is >= zero, namely + * if rtc_irq_check_coalesced returns false). + */ + BUG_ON(ioapic->rtc_status.pending_eoi != 0); + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, + ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); + } else + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + + if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG) + entry->fields.remote_irr = 1; + + return ret; +} + +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level, bool line_status) +{ + int ret, irq_level; + + BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); + + spin_lock(&ioapic->lock); + irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], + irq_source_id, level); + ret = ioapic_set_irq(ioapic, irq, irq_level, line_status); + + spin_unlock(&ioapic->lock); + + return ret; +} + +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) +{ + int i; + + spin_lock(&ioapic->lock); + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &ioapic->irq_states[i]); + spin_unlock(&ioapic->lock); +} + +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) +{ + int i; + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, + eoi_inject.work); + spin_lock(&ioapic->lock); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) + continue; + + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) + ioapic_service(ioapic, i, false); + } + spin_unlock(&ioapic->lock); +} + +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 + +static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, int vector, int trigger_mode) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; + + if (i == RTC_GSI) + rtc_irq_eoi(ioapic, vcpu); + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. + */ + spin_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + spin_lock(&ioapic->lock); + + if (trigger_mode != IOAPIC_LEVEL_TRIG) + continue; + + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { + ++ioapic->irq_eoi[i]; + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the interrupt + * immediately during eoi broadcast, and this + * lets a buggy guest make slow progress + * even if it does not correctly handle a + * level-triggered interrupt. Emulate this + * behavior if we detect an interrupt storm. + */ + schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); + ioapic->irq_eoi[i] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); + } else { + ioapic_service(ioapic, i, false); + } + } else { + ioapic->irq_eoi[i] = 0; + } + } +} + +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + return test_bit(vector, ioapic->handled_vectors); +} + +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + + spin_lock(&ioapic->lock); + __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode); + spin_unlock(&ioapic->lock); +} + +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_ioapic, dev); +} + +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) +{ + return ((addr >= ioapic->base_address && + (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); +} + +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("addr %lx\n", (unsigned long)addr); + ASSERT(!(addr & 0xf)); /* check alignment */ + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + result = ioapic->ioregsel; + break; + + case IOAPIC_REG_WINDOW: + result = ioapic_read_indirect(ioapic, addr, len); + break; + + default: + result = 0; + break; + } + spin_unlock(&ioapic->lock); + + switch (len) { + case 8: + *(u64 *) val = result; + break; + case 1: + case 2: + case 4: + memcpy(val, (char *)&result, len); + break; + default: + printk(KERN_WARNING "ioapic: wrong length %d\n", len); + } + return 0; +} + +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", + (void*)addr, len, val); + ASSERT(!(addr & 0xf)); /* check alignment */ + + switch (len) { + case 8: + case 4: + data = *(u32 *) val; + break; + case 2: + data = *(u16 *) val; + break; + case 1: + data = *(u8 *) val; + break; + default: + printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); + return 0; + } + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + ioapic->ioregsel = data & 0xFF; /* 8-bit register */ + break; + + case IOAPIC_REG_WINDOW: + ioapic_write_indirect(ioapic, data); + break; + + default: + break; + } + spin_unlock(&ioapic->lock); + return 0; +} + +static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +{ + int i; + + cancel_delayed_work_sync(&ioapic->eoi_inject); + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->ioregsel = 0; + ioapic->irr = 0; + ioapic->id = 0; + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); + rtc_irq_eoi_tracking_reset(ioapic); + update_handled_vectors(ioapic); +} + +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, +}; + +int kvm_ioapic_init(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic; + int ret; + + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + if (!ioapic) + return -ENOMEM; + spin_lock_init(&ioapic->lock); + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); + kvm->arch.vioapic = ioapic; + kvm_ioapic_reset(ioapic); + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); + ioapic->kvm = kvm; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, + IOAPIC_MEM_LENGTH, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm->arch.vioapic = NULL; + kfree(ioapic); + } + + return ret; +} + +void kvm_ioapic_destroy(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + cancel_delayed_work_sync(&ioapic->eoi_inject); + if (ioapic) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); + } +} + +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); + spin_unlock(&ioapic->lock); + return 0; +} + +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + ioapic->irr = 0; + update_handled_vectors(ioapic); + kvm_vcpu_request_scan_ioapic(kvm); + kvm_ioapic_inject_all(ioapic, state->irr); + spin_unlock(&ioapic->lock); + return 0; +} diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h new file mode 100644 index 000000000000..3c9195535ffc --- /dev/null +++ b/arch/x86/kvm/ioapic.h @@ -0,0 +1,119 @@ +#ifndef __KVM_IO_APIC_H +#define __KVM_IO_APIC_H + +#include <linux/kvm_host.h> + +#include "iodev.h" + +struct kvm; +struct kvm_vcpu; + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 + +/* Indirect registers. */ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +/*ioapic delivery mode*/ +#define IOAPIC_FIXED 0x0 +#define IOAPIC_LOWEST_PRIORITY 0x1 +#define IOAPIC_PMI 0x2 +#define IOAPIC_NMI 0x4 +#define IOAPIC_INIT 0x5 +#define IOAPIC_EXTINT 0x7 + +#ifdef CONFIG_X86 +#define RTC_GSI 8 +#else +#define RTC_GSI -1U +#endif + +struct rtc_status { + int pending_eoi; + DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); +}; + +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; + unsigned long irq_states[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); + spinlock_t lock; + DECLARE_BITMAP(handled_vectors, 256); + struct rtc_status rtc_status; + struct delayed_work eoi_inject; + u32 irq_eoi[IOAPIC_NUM_PINS]; +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vioapic; +} + +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, unsigned int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, + int trigger_mode); +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_destroy(struct kvm *kvm); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level, bool line_status); +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, unsigned long *dest_map); +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, + u32 *tmr); + +#endif diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c new file mode 100644 index 000000000000..17b73eeac8a4 --- /dev/null +++ b/arch/x86/kvm/iommu.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Author: Allen M. Kay <allen.m.kay@intel.com> + * Author: Weidong Han <weidong.han@intel.com> + * Author: Ben-Ami Yassour <benami@il.ibm.com> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/stat.h> +#include <linux/dmar.h> +#include <linux/iommu.h> +#include <linux/intel-iommu.h> +#include "assigned-dev.h" + +static bool allow_unsafe_assigned_interrupts; +module_param_named(allow_unsafe_assigned_interrupts, + allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, + "Enable device assignment on platforms without interrupt remapping support."); + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, + unsigned long npages) +{ + gfn_t end_gfn; + pfn_t pfn; + + pfn = gfn_to_pfn_memslot(slot, gfn); + end_gfn = gfn + npages; + gfn += 1; + + if (is_error_noslot_pfn(pfn)) + return pfn; + + while (gfn < end_gfn) + gfn_to_pfn_memslot(slot, gfn++); + + return pfn; +} + +static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; ++i) + kvm_release_pfn_clean(pfn + i); +} + +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + gfn_t gfn, end_gfn; + pfn_t pfn; + int r = 0; + struct iommu_domain *domain = kvm->arch.iommu_domain; + int flags; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + gfn = slot->base_gfn; + end_gfn = gfn + slot->npages; + + flags = IOMMU_READ; + if (!(slot->flags & KVM_MEM_READONLY)) + flags |= IOMMU_WRITE; + if (!kvm->arch.iommu_noncoherent) + flags |= IOMMU_CACHE; + + + while (gfn < end_gfn) { + unsigned long page_size; + + /* Check if already mapped */ + if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { + gfn += 1; + continue; + } + + /* Get the page size we could use to map */ + page_size = kvm_host_page_size(kvm, gfn); + + /* Make sure the page_size does not exceed the memslot */ + while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) + page_size >>= 1; + + /* Make sure gfn is aligned to the page size we want to map */ + while ((gfn << PAGE_SHIFT) & (page_size - 1)) + page_size >>= 1; + + /* Make sure hva is aligned to the page size we want to map */ + while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) + page_size >>= 1; + + /* + * Pin all pages we are about to map in memory. This is + * important because we unmap and unpin in 4kb steps later. + */ + pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); + if (is_error_noslot_pfn(pfn)) { + gfn += 1; + continue; + } + + /* Map into IO address space */ + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), + page_size, flags); + if (r) { + printk(KERN_ERR "kvm_iommu_map_address:" + "iommu failed to map pfn=%llx\n", pfn); + kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); + goto unmap_pages; + } + + gfn += page_size >> PAGE_SHIFT; + + + } + + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int idx, r = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + if (kvm->arch.iommu_noncoherent) + kvm_arch_register_noncoherent_dma(kvm); + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + kvm_for_each_memslot(memslot, slots) { + r = kvm_iommu_map_pages(kvm, memslot); + if (r) + break; + } + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} + +int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + int r; + bool noncoherent; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + if (pdev == NULL) + return -ENODEV; + + r = iommu_attach_device(domain, &pdev->dev); + if (r) { + dev_err(&pdev->dev, "kvm assign device failed ret %d", r); + return r; + } + + noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); + + /* Check if need to update IOMMU page table for guest memory */ + if (noncoherent != kvm->arch.iommu_noncoherent) { + kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_noncoherent = noncoherent; + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + } + + pci_set_dev_assigned(pdev); + + dev_info(&pdev->dev, "kvm assign device\n"); + + return 0; +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + if (pdev == NULL) + return -ENODEV; + + iommu_detach_device(domain, &pdev->dev); + + pci_clear_dev_assigned(pdev); + + dev_info(&pdev->dev, "kvm deassign device\n"); + + return 0; +} + +int kvm_iommu_map_guest(struct kvm *kvm) +{ + int r; + + if (!iommu_present(&pci_bus_type)) { + printk(KERN_ERR "%s: iommu not found\n", __func__); + return -ENODEV; + } + + mutex_lock(&kvm->slots_lock); + + kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); + if (!kvm->arch.iommu_domain) { + r = -ENOMEM; + goto out_unlock; + } + + if (!allow_unsafe_assigned_interrupts && + !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { + printk(KERN_WARNING "%s: No interrupt remapping support," + " disallowing device assignment." + " Re-enble with \"allow_unsafe_assigned_interrupts=1\"" + " module option.\n", __func__); + iommu_domain_free(kvm->arch.iommu_domain); + kvm->arch.iommu_domain = NULL; + r = -EPERM; + goto out_unlock; + } + + r = kvm_iommu_map_memslots(kvm); + if (r) + kvm_iommu_unmap_memslots(kvm); + +out_unlock: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + struct iommu_domain *domain; + gfn_t end_gfn, gfn; + pfn_t pfn; + u64 phys; + + domain = kvm->arch.iommu_domain; + end_gfn = base_gfn + npages; + gfn = base_gfn; + + /* check if iommu exists and in use */ + if (!domain) + return; + + while (gfn < end_gfn) { + unsigned long unmap_pages; + size_t size; + + /* Get physical address */ + phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); + + if (!phys) { + gfn++; + continue; + } + + pfn = phys >> PAGE_SHIFT; + + /* Unmap address from IO address space */ + size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); + unmap_pages = 1ULL << get_order(size); + + /* Unpin all pages we just unmapped to not leak any memory */ + kvm_unpin_pages(kvm, pfn, unmap_pages); + + gfn += unmap_pages; + } +} + +void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int idx; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + kvm_for_each_memslot(memslot, slots) + kvm_iommu_unmap_pages(kvm, memslot); + + srcu_read_unlock(&kvm->srcu, idx); + + if (kvm->arch.iommu_noncoherent) + kvm_arch_unregister_noncoherent_dma(kvm); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + mutex_lock(&kvm->slots_lock); + kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_domain = NULL; + kvm->arch.iommu_noncoherent = false; + mutex_unlock(&kvm->slots_lock); + + iommu_domain_free(domain); + return 0; +} diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c new file mode 100644 index 000000000000..72298b3ac025 --- /dev/null +++ b/arch/x86/kvm/irq_comm.c @@ -0,0 +1,332 @@ +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <trace/events/kvm.h> + +#include <asm/msidef.h> + +#include "irq.h" + +#include "ioapic.h" + +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + struct kvm_pic *pic = pic_irqchip(kvm); + return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); +} + +static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, + line_status); +} + +inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) +{ + return irq->delivery_mode == APIC_DM_LOWEST; +} + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, unsigned long *dest_map) +{ + int i, r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + + if (irq->dest_mode == 0 && irq->dest_id == 0xff && + kvm_is_dm_lowest_prio(irq)) { + printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_is_dm_lowest_prio(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_lapic_enabled(vcpu)) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + +static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) +{ + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + + irq->dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + irq->vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq->delivery_mode = e->msi.data & 0x700; + irq->level = 1; + irq->shorthand = 0; + /* TODO Deal with RH bit of MSI message address */ +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct kvm_lapic_irq irq; + + if (!level) + return -1; + + kvm_set_msi_irq(e, &irq); + + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); +} + + +static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct kvm_lapic_irq irq; + int r; + + kvm_set_msi_irq(e, &irq); + + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) + return r; + else + return -EWOULDBLOCK; +} + +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + if (likely(e->type == KVM_IRQ_ROUTING_MSI)) + ret = kvm_set_msi_inatomic(e, kvm); + else + ret = -EWOULDBLOCK; + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + +int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); + + if (irq_source_id >= BITS_PER_LONG) { + printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + goto unlock; + } + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); + + return irq_source_id; +} + +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + + mutex_lock(&kvm->irq_lock); + if (irq_source_id < 0 || + irq_source_id >= BITS_PER_LONG) { + printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + goto unlock; + } + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_in_kernel(kvm)) + goto unlock; + + kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); + kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); +unlock: + mutex_unlock(&kvm->irq_lock); +} + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + kimn->irq = irq; + hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_srcu(&kvm->irq_srcu); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask) +{ + struct kvm_irq_mask_notifier *kimn; + int idx, gsi; + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) + hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) + if (kimn->irq == gsi) + kimn->func(kimn, mask); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + int delta; + unsigned max_pin; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + delta = 0; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_MASTER: + e->set = kvm_set_pic_irq; + max_pin = PIC_NUM_PINS; + break; + case KVM_IRQCHIP_PIC_SLAVE: + e->set = kvm_set_pic_irq; + max_pin = PIC_NUM_PINS; + delta = 8; + break; + case KVM_IRQCHIP_IOAPIC: + max_pin = KVM_IOAPIC_NUM_PINS; + e->set = kvm_set_ioapic_irq; + break; + default: + goto out; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin + delta; + if (e->irqchip.pin >= max_pin) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + r = 0; +out: + return r; +} + +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } +#define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +}; + +int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b8345dd41b25..4f0c0b954686 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,9 @@ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 +#define APIC_BROADCAST 0xFF +#define X2APIC_BROADCAST 0xFFFFFFFFul + #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) @@ -129,8 +132,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -#define KVM_X2APIC_CID_BITS 0 - static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -149,42 +150,56 @@ static void recalculate_apic_map(struct kvm *kvm) new->cid_shift = 8; new->cid_mask = 0; new->lid_mask = 0xff; + new->broadcast = APIC_BROADCAST; kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; - u32 ldr; if (!kvm_apic_present(vcpu)) continue; + if (apic_x2apic_mode(apic)) { + new->ldr_bits = 32; + new->cid_shift = 16; + new->cid_mask = new->lid_mask = 0xffff; + new->broadcast = X2APIC_BROADCAST; + } else if (kvm_apic_get_reg(apic, APIC_LDR)) { + if (kvm_apic_get_reg(apic, APIC_DFR) == + APIC_DFR_CLUSTER) { + new->cid_shift = 4; + new->cid_mask = 0xf; + new->lid_mask = 0xf; + } else { + new->cid_shift = 8; + new->cid_mask = 0; + new->lid_mask = 0xff; + } + } + /* * All APICs have to be configured in the same mode by an OS. * We take advatage of this while building logical id loockup - * table. After reset APICs are in xapic/flat mode, so if we - * find apic with different setting we assume this is the mode + * table. After reset APICs are in software disabled mode, so if + * we find apic with different setting we assume this is the mode * OS wants all apics to be in; build lookup table accordingly. */ - if (apic_x2apic_mode(apic)) { - new->ldr_bits = 32; - new->cid_shift = 16; - new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1; - new->lid_mask = 0xffff; - } else if (kvm_apic_sw_enabled(apic) && - !new->cid_mask /* flat mode */ && - kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { - new->cid_shift = 4; - new->cid_mask = 0xf; - new->lid_mask = 0xf; - } + if (kvm_apic_sw_enabled(apic)) + break; + } - new->phys_map[kvm_apic_id(apic)] = apic; + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvm_lapic *apic = vcpu->arch.apic; + u16 cid, lid; + u32 ldr, aid; + aid = kvm_apic_id(apic); ldr = kvm_apic_get_reg(apic, APIC_LDR); cid = apic_cluster_id(new, ldr); lid = apic_logical_id(new, ldr); - if (lid) + if (aid < ARRAY_SIZE(new->phys_map)) + new->phys_map[aid] = apic; + if (lid && cid < ARRAY_SIZE(new->logical_map)) new->logical_map[cid][ffs(lid) - 1] = apic; } out: @@ -201,11 +216,13 @@ out: static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { - u32 prev = kvm_apic_get_reg(apic, APIC_SPIV); + bool enabled = val & APIC_SPIV_APIC_ENABLED; apic_set_reg(apic, APIC_SPIV, val); - if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) { - if (val & APIC_SPIV_APIC_ENABLED) { + + if (enabled != apic->sw_enabled) { + apic->sw_enabled = enabled; + if (enabled) { static_key_slow_dec_deferred(&apic_sw_disabled); recalculate_apic_map(apic->vcpu->kvm); } else @@ -237,21 +254,17 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { - return ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT; } static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; } static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) { - return ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == - APIC_LVT_TIMER_TSCDEADLINE); + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -326,8 +339,12 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { - apic->irr_pending = true; apic_set_vector(vec, apic->regs + APIC_IRR); + /* + * irr_pending must be true if any interrupt is pending; set it after + * APIC_IRR to avoid race with apic_clear_irr + */ + apic->irr_pending = true; } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -359,13 +376,15 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - apic_clear_vector(vec, apic->regs + APIC_IRR); - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { /* try to update RVI */ + apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); - else { - vec = apic_search_irr(apic); - apic->irr_pending = (vec != -1); + } else { + apic->irr_pending = false; + apic_clear_vector(vec, apic->regs + APIC_IRR); + if (apic_search_irr(apic) != -1) + apic->irr_pending = true; } } @@ -558,16 +577,25 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) +static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +{ + return dest == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); +} + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) { - return dest == 0xff || kvm_apic_id(apic) == dest; + return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); } -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { int result = 0; u32 logical_id; + if (kvm_apic_broadcast(apic, mda)) + return 1; + if (apic_x2apic_mode(apic)) { logical_id = kvm_apic_get_reg(apic, APIC_LDR); return logical_id & mda; @@ -595,7 +623,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) } int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, int dest, int dest_mode) + int short_hand, unsigned int dest, int dest_mode) { int result = 0; struct kvm_lapic *target = vcpu->arch.apic; @@ -657,15 +685,24 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (!map) goto out; + if (irq->dest_id == map->broadcast) + goto out; + + ret = true; + if (irq->dest_mode == 0) { /* physical mode */ - if (irq->delivery_mode == APIC_DM_LOWEST || - irq->dest_id == 0xff) + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) goto out; - dst = &map->phys_map[irq->dest_id & 0xff]; + + dst = &map->phys_map[irq->dest_id]; } else { u32 mda = irq->dest_id << (32 - map->ldr_bits); + u16 cid = apic_cluster_id(map, mda); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; - dst = map->logical_map[apic_cluster_id(map, mda)]; + dst = map->logical_map[cid]; bitmap = apic_logical_id(map, mda); @@ -691,8 +728,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = 0; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - - ret = true; out: rcu_read_unlock(); return ret; @@ -1034,6 +1069,26 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } +static void apic_timer_expired(struct kvm_lapic *apic) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + wait_queue_head_t *q = &vcpu->wq; + + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. + */ + if (atomic_read(&apic->lapic_timer.pending)) + return; + + atomic_inc(&apic->lapic_timer.pending); + /* FIXME: this code should not know anything about vcpus */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + + if (waitqueue_active(q)) + wake_up_interruptible(q); +} + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1096,9 +1151,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); - } - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + } else + apic_timer_expired(apic); local_irq_restore(flags); } @@ -1203,17 +1259,20 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; - case APIC_LVTT: - if ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) != - (val & apic->lapic_timer.timer_mode_mask)) + case APIC_LVTT: { + u32 timer_mode = val & apic->lapic_timer.timer_mode_mask; + + if (apic->lapic_timer.timer_mode != timer_mode) { + apic->lapic_timer.timer_mode = timer_mode; hrtimer_cancel(&apic->lapic_timer.timer); + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); break; + } case APIC_TMICT: if (apic_lvtt_tscdeadline(apic)) @@ -1320,7 +1379,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_key_slow_dec_deferred(&apic_hw_disabled); - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + if (!apic->sw_enabled) static_key_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) @@ -1355,9 +1414,6 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) return; hrtimer_cancel(&apic->lapic_timer.timer); - /* Inject here so clearing tscdeadline won't override new value */ - if (apic_has_pending_timer(vcpu)) - kvm_inject_apic_timer_irqs(vcpu); apic->lapic_timer.tscdeadline = data; start_apic_timer(apic); } @@ -1422,6 +1478,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; + if ((value & MSR_IA32_APICBASE_ENABLE) && + apic->base_address != APIC_DEFAULT_PHYS_BASE) + pr_warn_once("APIC base relocation is unsupported by KVM"); + /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); @@ -1447,6 +1507,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic->lapic_timer.timer_mode = 0; apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); @@ -1538,23 +1599,8 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) { struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); - struct kvm_vcpu *vcpu = apic->vcpu; - wait_queue_head_t *q = &vcpu->wq; - - /* - * There is a race window between reading and incrementing, but we do - * not care about potentially losing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. - */ - if (!atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - } - if (waitqueue_active(q)) - wake_up_interruptible(q); + apic_timer_expired(apic); if (lapic_is_periodic(apic)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); @@ -1693,6 +1739,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + apic_find_highest_irr(apic)); kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); @@ -1837,8 +1886,11 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) return 1; + if (reg == APIC_ICR2) + return 1; + /* if this is ICR write vector before command */ - if (msr == 0x830) + if (reg == APIC_ICR) apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); return apic_reg_write(apic, reg, (u32)data); } @@ -1851,9 +1903,15 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) return 1; + if (reg == APIC_DFR || reg == APIC_ICR2) { + apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n", + reg); + return 1; + } + if (apic_reg_read(apic, reg, 4, &low)) return 1; - if (msr == 0x830) + if (reg == APIC_ICR) apic_reg_read(apic, APIC_ICR2, 4, &high); *data = (((u64)high) << 32) | low; @@ -1908,7 +1966,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) void kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - unsigned int sipi_vector; + u8 sipi_vector; unsigned long pe; if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6a11845fd8b9..c674fce53cf9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -11,6 +11,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ @@ -22,6 +23,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool sw_enabled; bool irr_pending; /* Number of bits set in ISR. */ s16 isr_count; @@ -55,8 +57,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -119,11 +121,11 @@ static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) extern struct static_key_deferred apic_sw_disabled; -static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic) +static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) { if (static_key_false(&apic_sw_disabled.key)) - return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; - return APIC_SPIV_APIC_ENABLED; + return apic->sw_enabled; + return true; } static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) @@ -152,8 +154,6 @@ static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) ldr >>= 32 - map->ldr_bits; cid = (ldr >> map->cid_shift) & map->cid_mask; - BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); - return cid; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac1c4de3a484..10fbed126b11 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -214,13 +214,12 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); #define MMIO_GEN_LOW_SHIFT 10 #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) -#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) static u64 generation_mmio_spte_mask(unsigned int gen) { u64 mask; - WARN_ON(gen > MMIO_MAX_GEN); + WARN_ON(gen & ~MMIO_GEN_MASK); mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; @@ -263,13 +262,13 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) & ~PAGE_MASK; } @@ -630,7 +629,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) * kvm mmu, before reclaiming the page, we should * unmap it from mmu first. */ - WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); + WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); @@ -2461,7 +2460,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + kvm_is_reserved_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; @@ -2737,7 +2736,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 5aaf35641768..ce463a9cc8fb 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -22,7 +22,7 @@ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ - const u32 saved_len = p->len; \ + const char *saved_ptr = trace_seq_buffer_ptr(p); \ static const char *access_str[] = { \ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ }; \ @@ -41,7 +41,7 @@ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ - p->buffer + saved_len; \ + saved_ptr; \ }) #define kvm_mmu_trace_pferr_flags \ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7527cefc5a43..41dd0387cccb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1056,9 +1056,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho { struct vcpu_svm *svm = to_svm(vcpu); - WARN_ON(adjustment < 0); - if (host) - adjustment = svm_scale_tsc(vcpu, adjustment); + if (host) { + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + WARN_ON(adjustment < 0); + adjustment = svm_scale_tsc(vcpu, (u64)adjustment); + } svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) @@ -2999,7 +3001,6 @@ static int dr_interception(struct vcpu_svm *svm) { int reg, dr; unsigned long val; - int err; if (svm->vcpu.guest_debug == 0) { /* @@ -3019,12 +3020,15 @@ static int dr_interception(struct vcpu_svm *svm) dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ + if (!kvm_require_dr(&svm->vcpu, dr - 16)) + return 1; val = kvm_register_read(&svm->vcpu, reg); kvm_set_dr(&svm->vcpu, dr - 16, val); } else { - err = kvm_get_dr(&svm->vcpu, dr, &val); - if (!err) - kvm_register_write(&svm->vcpu, reg, val); + if (!kvm_require_dr(&svm->vcpu, dr)) + return 1; + kvm_get_dr(&svm->vcpu, dr, &val); + kvm_register_write(&svm->vcpu, reg, val); } skip_emulated_instruction(&svm->vcpu); @@ -4123,6 +4127,11 @@ static bool svm_mpx_supported(void) return false; } +static bool svm_xsaves_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4410,6 +4419,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, .mpx_supported = svm_mpx_supported, + .xsaves_supported = svm_xsaves_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6b06ab8748dd..c2a34bb5ad93 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -5,6 +5,7 @@ #include <asm/vmx.h> #include <asm/svm.h> #include <asm/clocksource.h> +#include <asm/pvclock-abi.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -877,6 +878,42 @@ TRACE_EVENT(kvm_ple_window, #define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ trace_kvm_ple_window(false, vcpu_id, new, old) +TRACE_EVENT(kvm_pvclock_update, + TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), + TP_ARGS(vcpu_id, pvclock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( __u32, version ) + __field( __u64, tsc_timestamp ) + __field( __u64, system_time ) + __field( __u32, tsc_to_system_mul ) + __field( __s8, tsc_shift ) + __field( __u8, flags ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->version = pvclock->version; + __entry->tsc_timestamp = pvclock->tsc_timestamp; + __entry->system_time = pvclock->system_time; + __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul; + __entry->tsc_shift = pvclock->tsc_shift; + __entry->flags = pvclock->flags; + ), + + TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, " + "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, " + "flags 0x%x }", + __entry->vcpu_id, + __entry->version, + __entry->tsc_timestamp, + __entry->system_time, + __entry->tsc_to_system_mul, + __entry->tsc_shift, + __entry->flags) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3e556c68351b..feb852b04598 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); +static u64 __read_mostly host_xss; + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT) + | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -214,6 +216,7 @@ struct __packed vmcs12 { u64 virtual_apic_page_addr; u64 apic_access_addr; u64 ept_pointer; + u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; u64 guest_ia32_debugctl; @@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), @@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); static inline short vmcs_field_to_offset(unsigned long field) { - if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) - return -1; + BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || + vmcs_field_to_offset_table[field] == 0) + return -ENOENT; + return vmcs_field_to_offset_table[field]; } @@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); +static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); } +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && + vmx_xsaves_supported(); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); - /* On ept, can't emulate nx, and must switch nx atomically */ - if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + + /* + * On EPT, we can't emulate NX, so we must switch EFER atomically. + * On CPUs that support "load IA32_EFER", always switch EFER + * atomically, since it's faster than switching it manually. + */ + if (cpu_has_load_ia32_efer || + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer); return false; } @@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; @@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_get_vmx_msr(vcpu, msr_index, pdata); + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + data = vcpu->arch.ia32_xss; + break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; @@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return 1; /* they are read-only */ + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + /* + * The only supported bit as of Skylake is bit 8, but + * it is not supported on KVM. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + if (vcpu->arch.ia32_xss != host_xss) + add_atomic_switch_msr(vmx, MSR_IA32_XSS, + vcpu->arch.ia32_xss, host_xss); + else + clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS; + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_XSAVES; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } + if (cpu_has_xsaves) + rdmsrl(MSR_IA32_XSS, host_xss); + return 0; } @@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void) return 0; } -static __init int hardware_setup(void) -{ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (!cpu_has_vmx_vpid()) - enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { - enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } - - if (!cpu_has_vmx_ept_ad_bits()) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest()) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) { - flexpriority_enabled = 0; - - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ - kvm_x86_ops->set_apic_access_page_addr = NULL; - } - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) - ple_gap = 0; - - if (!cpu_has_vmx_apicv()) - enable_apicv = 0; - - if (enable_apicv) - kvm_x86_ops->update_cr8_intercept = NULL; - else { - kvm_x86_ops->hwapic_irr_update = NULL; - kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; - } - - if (nested) - nested_vmx_setup_ctls_msrs(); - - return alloc_kvm_area(); -} - -static __exit void hardware_unsetup(void) -{ - free_kvm_area(); -} - static bool emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !guest_state_valid(vcpu); @@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } +#define VMX_XSS_EXIT_BITMAP 0 /* * Sets up the vmcs for emulated real mode. */ @@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); + if (vmx_xsaves_supported()) + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + return 0; } @@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int dr, reg; + int dr, dr7, reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + + /* First, if DR does not exist, trigger UD */ + if (!kvm_require_dr(vcpu, dr)) + return 1; /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; - dr = vmcs_readl(GUEST_DR7); - if (dr & DR7_GD) { + dr7 = vmcs_readl(GUEST_DR7); + if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the @@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu) */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr; - vcpu->run->debug.arch.pc = - vmcs_readl(GUEST_CS_BASE) + - vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.dr7 = dr7; + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; } - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; @@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsaves(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + +static int handle_xrstors(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); /* * TODO: What about debug traps on tss switch? @@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ - error_code = exit_qualification & (1U << 1); + error_code = exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ - error_code |= (exit_qualification & (1U << 2)) << 2; + error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & 0x1; + error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; vcpu->arch.exit_qualification = exit_qualification; @@ -5785,6 +5783,204 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +static __init int hardware_setup(void) +{ + int r = -ENOMEM, i, msr; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_a) + return r; + + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_b) + goto out; + + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) + goto out1; + + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) + goto out2; + + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) + goto out3; + + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out5; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) + goto out6; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } + + if (enable_ept) { + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + update_ple_window_actual_max(); + + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; + goto out7; + } + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { + enable_ept = 0; + enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; + } + + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) { + flexpriority_enabled = 0; + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if the + * processor does not have the APIC_ACCESS_ADDR VMCS field. + */ + kvm_x86_ops->set_apic_access_page_addr = NULL; + } + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + + if (!cpu_has_vmx_ple()) + ple_gap = 0; + + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; + + if (enable_apicv) + kvm_x86_ops->update_cr8_intercept = NULL; + else { + kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } + + if (nested) + nested_vmx_setup_ctls_msrs(); + + return alloc_kvm_area(); + +out7: + free_page((unsigned long)vmx_vmwrite_bitmap); +out6: + free_page((unsigned long)vmx_vmread_bitmap); +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +out4: + free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +out2: + free_page((unsigned long)vmx_msr_bitmap_legacy); +out1: + free_page((unsigned long)vmx_io_bitmap_b); +out: + free_page((unsigned long)vmx_io_bitmap_a); + + return r; +} + +static __exit void hardware_unsetup(void) +{ + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); + + free_kvm_area(); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -6361,58 +6557,60 @@ static inline int vmcs_field_readonly(unsigned long field) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) +static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); char *p; if (offset < 0) - return 0; + return offset; p = ((char *)(get_vmcs12(vcpu))) + offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_NATURAL_WIDTH: *ret = *((natural_width *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U16: *ret = *((u16 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U32: *ret = *((u32 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U64: *ret = *((u64 *)p); - return 1; + return 0; default: - return 0; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } -static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 field_value){ +static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); char *p = ((char *) get_vmcs12(vcpu)) + offset; if (offset < 0) - return false; + return offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: *(u16 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U32: *(u32 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U64: *(u64 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_NATURAL_WIDTH: *(natural_width *)p = field_value; - return true; + return 0; default: - return false; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } @@ -6445,6 +6643,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: field_value = vmcs_readl(field); break; + default: + WARN_ON(1); + continue; } vmcs12_write_any(&vmx->vcpu, field, field_value); } @@ -6490,6 +6691,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: vmcs_writel(field, (long)field_value); break; + default: + WARN_ON(1); + break; } } } @@ -6528,7 +6732,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (!vmcs12_read_any(vcpu, field, &field_value)) { + if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6598,7 +6802,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; } - if (!vmcs12_write_any(vcpu, field, field_value)) { + if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6802,6 +7006,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_XSAVES] = handle_xsaves, + [EXIT_REASON_XRSTORS] = handle_xrstors, }; static const int kvm_vmx_max_exit_handlers = @@ -7089,6 +7295,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return 1; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); default: return 1; } @@ -7277,6 +7491,9 @@ static void vmx_set_rvi(int vector) u16 status; u8 old; + if (vector == -1) + vector = 0; + status = vmcs_read16(GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { @@ -7288,22 +7505,23 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + if (max_irr == -1) return; /* - * If a vmexit is needed, vmx_check_nested_events handles it. + * In guest mode. If a vmexit is needed, vmx_check_nested_events + * handles it. */ - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + if (nested_exit_on_intr(vcpu)) return; - if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); - return; - } - /* - * Fall back to pre-APICv interrupt injection since L2 + * Else, fall back to pre-APICv interrupt injection since L2 * is run without virtual interrupt delivery. */ if (!kvm_event_needs_reinjection(vcpu) && @@ -7400,6 +7618,12 @@ static bool vmx_mpx_supported(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); } +static bool vmx_xsaves_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8135,6 +8359,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; @@ -8775,6 +9001,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (vmx_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); /* update exit information fields: */ @@ -9176,6 +9404,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, + .xsaves_supported = vmx_xsaves_supported, .check_nested_events = vmx_check_nested_events, @@ -9184,150 +9413,21 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r, i, msr; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return -ENOMEM; - - r = -ENOMEM; - - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out3; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out5; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out6; - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out7; + return r; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } - - if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - - update_ple_window_actual_max(); - return 0; - -out7: - free_page((unsigned long)vmx_vmwrite_bitmap); -out6: - free_page((unsigned long)vmx_vmread_bitmap); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); -out: - free_page((unsigned long)vmx_io_bitmap_a); - return r; } static void __exit vmx_exit(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); - #ifdef CONFIG_KEXEC RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0033df32a745..c259814200bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,6 +27,7 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" +#include "assigned-dev.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -353,6 +354,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending) { queue: + if (has_error && !is_protmode(vcpu)) + has_error = false; vcpu->arch.exception.pending = true; vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; @@ -455,6 +458,16 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) } EXPORT_SYMBOL_GPL(kvm_require_cpl); +bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) +{ + if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return true; + + kvm_queue_exception(vcpu, UD_VECTOR); + return false; +} +EXPORT_SYMBOL_GPL(kvm_require_dr); + /* * This function will be used to read from the physical memory of the currently * running guest. The difference to kvm_read_guest_page is that this function @@ -656,6 +669,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) return 1; + if (xcr0 & XSTATE_AVX512) { + if (!(xcr0 & XSTATE_YMM)) + return 1; + if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + return 1; + } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; @@ -732,6 +751,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { +#ifdef CONFIG_X86_64 + cr3 &= ~CR3_PCID_INVD; +#endif + if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -811,8 +834,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ /* fall through */ case 6: if (val & 0xffffffff00000000ULL) @@ -821,8 +842,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) kvm_update_dr6(vcpu); break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ /* fall through */ default: /* 7 */ if (val & 0xffffffff00000000ULL) @@ -837,27 +856,21 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { - int res; - - res = __kvm_set_dr(vcpu, dr, val); - if (res > 0) - kvm_queue_exception(vcpu, UD_VECTOR); - else if (res < 0) + if (__kvm_set_dr(vcpu, dr, val)) { kvm_inject_gp(vcpu, 0); - - return res; + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(kvm_set_dr); -static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) @@ -866,23 +879,11 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) *val = kvm_x86_ops->get_dr6(vcpu); break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; break; } - - return 0; -} - -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) -{ - if (_kvm_get_dr(vcpu, dr, val)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } return 0; } EXPORT_SYMBOL_GPL(kvm_get_dr); @@ -1237,21 +1238,22 @@ void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; - bool do_request = false; struct kvm_arch *ka = &vcpu->kvm->arch; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&vcpu->kvm->online_vcpus)); - if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) - if (!ka->use_master_clock) - do_request = 1; - - if (!vcpus_matched && ka->use_master_clock) - do_request = 1; - - if (do_request) + /* + * Once the masterclock is enabled, always perform request in + * order to update it. + * + * In order to enable masterclock, the host clocksource must be TSC + * and the vcpus need to have matched TSCs. When that happens, + * perform request to enable masterclock. + */ + if (ka->use_master_clock || + (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -1637,16 +1639,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return 0; + /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate * state, we just increase by 2 at the end. */ - vcpu->hv_clock.version += 2; - - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return 0; + vcpu->hv_clock.version = guest_hv_clock.version + 2; /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); @@ -1662,6 +1664,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); @@ -2140,7 +2144,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: if (guest_cpuid_has_tsc_adjust(vcpu)) { if (!msr_info->host_initiated) { - u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); } vcpu->arch.ia32_tsc_adjust_msr = data; @@ -3106,7 +3110,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, unsigned long val; memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - _kvm_get_dr(vcpu, 6, &val); + kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; @@ -3128,15 +3132,89 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +#define XSTATE_COMPACTION_ENABLED (1ULL << 63) + +static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) +{ + struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + u64 xstate_bv = xsave->xsave_hdr.xstate_bv; + u64 valid; + + /* + * Copy legacy XSAVE area, to avoid complications with CPUID + * leaves 0 and 1 in the loop below. + */ + memcpy(dest, xsave, XSAVE_HDR_OFFSET); + + /* Set XSTATE_BV */ + *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; + + /* + * Copy each region from the possibly compacted offset to the + * non-compacted offset. + */ + valid = xstate_bv & ~XSTATE_FPSSE; + while (valid) { + u64 feature = valid & -valid; + int index = fls64(feature) - 1; + void *src = get_xsave_addr(xsave, feature); + + if (src) { + u32 size, offset, ecx, edx; + cpuid_count(XSTATE_CPUID, index, + &size, &offset, &ecx, &edx); + memcpy(dest + offset, src, size); + } + + valid -= feature; + } +} + +static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) +{ + struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); + u64 valid; + + /* + * Copy legacy XSAVE area, to avoid complications with CPUID + * leaves 0 and 1 in the loop below. + */ + memcpy(xsave, src, XSAVE_HDR_OFFSET); + + /* Set XSTATE_BV and possibly XCOMP_BV. */ + xsave->xsave_hdr.xstate_bv = xstate_bv; + if (cpu_has_xsaves) + xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; + + /* + * Copy each region from the non-compacted offset to the + * possibly compacted offset. + */ + valid = xstate_bv & ~XSTATE_FPSSE; + while (valid) { + u64 feature = valid & -valid; + int index = fls64(feature) - 1; + void *dest = get_xsave_addr(xsave, feature); + + if (dest) { + u32 size, offset, ecx, edx; + cpuid_count(XSTATE_CPUID, index, + &size, &offset, &ecx, &edx); + memcpy(dest, src + offset, size); + } else + WARN_ON_ONCE(1); + + valid -= feature; + } +} + static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (cpu_has_xsave) { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->xsave, - vcpu->arch.guest_xstate_size); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= - vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; + memset(guest_xsave, 0, sizeof(struct kvm_xsave)); + fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, @@ -3160,8 +3238,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, */ if (xstate_bv & ~kvm_supported_xcr0()) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, vcpu->arch.guest_xstate_size); + load_xsave(vcpu, (u8 *)guest_xsave->region); } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; @@ -4004,7 +4081,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } default: - ; + r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } out: return r; @@ -4667,7 +4744,7 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) @@ -5211,21 +5288,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { - struct kvm_run *kvm_run = vcpu->run; - unsigned long eip = vcpu->arch.emulate_ctxt.eip; - u32 dr6 = 0; - if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { - dr6 = kvm_vcpu_check_hw_bp(eip, 0, + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = kvm_get_linear_rip(vcpu); + u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.guest_debug_dr7, vcpu->arch.eff_db); if (dr6 != 0) { kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; - kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); - + kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; @@ -5235,7 +5308,8 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { - dr6 = kvm_vcpu_check_hw_bp(eip, 0, + unsigned long eip = kvm_get_linear_rip(vcpu); + u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); @@ -5365,7 +5439,9 @@ restart: kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); - __kvm_set_rflags(vcpu, ctxt->eflags); + if (!ctxt->have_exception || + exception_type(ctxt->exception.vector) == EXCPT_TRAP) + __kvm_set_rflags(vcpu, ctxt->eflags); /* * For STI, interrupts are shadowed; so KVM_REQ_EVENT will @@ -5965,6 +6041,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); + if (vcpu->arch.exception.nr == DB_VECTOR && + (vcpu->arch.dr7 & DR7_GD)) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); + } + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, @@ -6873,6 +6955,9 @@ int fx_init(struct kvm_vcpu *vcpu) return err; fpu_finit(&vcpu->arch.guest_fpu); + if (cpu_has_xsaves) + vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = + host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Ensure guest xcr0 is valid for loading @@ -7024,7 +7109,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_reset(vcpu); } -void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { struct kvm_segment cs; @@ -7256,6 +7341,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type) return -EINVAL; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -7536,12 +7622,18 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { - unsigned long current_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + if (is_64_bit_mode(vcpu)) + return kvm_rip_read(vcpu); + return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + + kvm_rip_read(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_get_linear_rip); - return current_rip == linear_rip; +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +{ + return kvm_get_linear_rip(vcpu) == linear_rip; } EXPORT_SYMBOL_GPL(kvm_is_linear_rip); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7cb9c45a5fe0..cc1d61af6140 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR) + | XSTATE_BNDREGS | XSTATE_BNDCSR \ + | XSTATE_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index db92793b7e23..1530afb07c85 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o hash.o +obj-y += msr.o msr-reg.o msr-reg-export.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 7609e0e421ec..1318f75d56e4 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -41,9 +41,8 @@ csum_partial_copy_from_user(const void __user *src, void *dst, while (((unsigned long)src & 6) && len >= 2) { __u16 val16; - *errp = __get_user(val16, (const __u16 __user *)src); - if (*errp) - return isum; + if (__get_user(val16, (const __u16 __user *)src)) + goto out_err; *(__u16 *)dst = val16; isum = (__force __wsum)add32_with_carry( diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c deleted file mode 100644 index ff4fa51a5b1f..000000000000 --- a/arch/x86/lib/hash.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Some portions derived from code covered by the following notice: - * - * Copyright (c) 2010-2013 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/hash.h> -#include <linux/init.h> - -#include <asm/processor.h> -#include <asm/cpufeature.h> -#include <asm/hash.h> - -static inline u32 crc32_u32(u32 crc, u32 val) -{ -#ifdef CONFIG_AS_CRC32 - asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val)); -#else - asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val)); -#endif - return crc; -} - -static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed) -{ - const u32 *p32 = (const u32 *) data; - u32 i, tmp = 0; - - for (i = 0; i < len / 4; i++) - seed = crc32_u32(seed, *p32++); - - switch (len & 3) { - case 3: - tmp |= *((const u8 *) p32 + 2) << 16; - /* fallthrough */ - case 2: - tmp |= *((const u8 *) p32 + 1) << 8; - /* fallthrough */ - case 1: - tmp |= *((const u8 *) p32); - seed = crc32_u32(seed, tmp); - break; - } - - return seed; -} - -static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed) -{ - const u32 *p32 = (const u32 *) data; - u32 i; - - for (i = 0; i < len; i++) - seed = crc32_u32(seed, *p32++); - - return seed; -} - -void __init setup_arch_fast_hash(struct fast_hash_ops *ops) -{ - if (cpu_has_xmm4_2) { - ops->hash = intel_crc4_2_hash; - ops->hash2 = intel_crc4_2_hash2; - } -} diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 54fcffed28ed..2480978b31cc 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -28,7 +28,7 @@ /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ - ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) + ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) @@ -50,10 +50,11 @@ * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @x86_64: !0 for 64-bit kernel or 64-bit app */ -void insn_init(struct insn *insn, const void *kaddr, int x86_64) +void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; + insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64 ? 1 : 0; insn->opnd_bytes = 4; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 6a19ad9f370d..ecfdc46a024a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -30,3 +30,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMTEST) += memtest.o + +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 95a427e57887..f0cedf3395af 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -76,6 +76,9 @@ static struct addr_marker address_markers[] = { # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif +# ifdef CONFIG_EFI + { EFI_VA_END, "EFI Runtime Services" }, +# endif { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -126,7 +129,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) if (!pgprot_val(prot)) { /* Not present */ - pt_dump_cont_printf(m, dmsg, " "); + pt_dump_cont_printf(m, dmsg, " "); } else { if (pr & _PAGE_USER) pt_dump_cont_printf(m, dmsg, "USR "); @@ -145,18 +148,16 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) else pt_dump_cont_printf(m, dmsg, " "); - /* Bit 9 has a different meaning on level 3 vs 4 */ - if (level <= 3) { - if (pr & _PAGE_PSE) - pt_dump_cont_printf(m, dmsg, "PSE "); - else - pt_dump_cont_printf(m, dmsg, " "); - } else { - if (pr & _PAGE_PAT) - pt_dump_cont_printf(m, dmsg, "pat "); - else - pt_dump_cont_printf(m, dmsg, " "); - } + /* Bit 7 has a different meaning on level 3 vs 4 */ + if (level <= 3 && pr & _PAGE_PSE) + pt_dump_cont_printf(m, dmsg, "PSE "); + else + pt_dump_cont_printf(m, dmsg, " "); + if ((level == 4 && pr & _PAGE_PAT) || + ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + pt_dump_cont_printf(m, dmsg, "pat "); + else + pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) pt_dump_cont_printf(m, dmsg, "GLB "); else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d973e61e450d..38dcec403b46 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -844,11 +844,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; int code = BUS_ADRERR; - up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); @@ -879,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { - up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, 0, 0); return; } @@ -887,14 +883,11 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { - up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; } - up_read(¤t->mm->mmap_sem); - /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got @@ -1062,7 +1055,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - int fault; + int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; @@ -1237,47 +1230,50 @@ good_area: * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); + major |= fault & VM_FAULT_MAJOR; /* - * If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because it - * would already be released in __lock_page_or_retry in mm/filemap.c. + * If we need to retry the mmap_sem has already been released, + * and if there is a fatal signal pending there is no guarantee + * that we made any progress. Handle this case first. */ - if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + if (unlikely(fault & VM_FAULT_RETRY)) { + /* Retry at most once */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + if (!fatal_signal_pending(tsk)) + goto retry; + } + + /* User mode? Just return to handle the fatal exception */ + if (flags & FAULT_FLAG_USER) + return; + + /* Not returning to user mode? Handle exceptions or die: */ + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; + } + up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. + * Major/minor page fault accounting. If any of the events + * returned VM_FAULT_MAJOR, we account it as a major fault. */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (major) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } check_v8086_mode(regs, address, tsk); - - up_read(&mm->mmap_sem); } NOKPROBE_SYMBOL(__do_page_fault); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 66dba36f2343..a97ee0801475 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -27,6 +27,35 @@ #include "mm_internal.h" +/* + * Tables translating between page_cache_type_t and pte encoding. + * Minimal supported modes are defined statically, modified if more supported + * cache modes are available. + * Index into __cachemode2pte_tbl is the cachemode. + * Index into __pte2cachemode_tbl are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. + */ +uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { + [_PAGE_CACHE_MODE_WB] = 0, + [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, + [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, + [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, + [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, + [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, +}; +EXPORT_SYMBOL_GPL(__cachemode2pte_tbl); +uint8_t __pte2cachemode_tbl[8] = { + [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, +}; +EXPORT_SYMBOL_GPL(__pte2cachemode_tbl); + static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; @@ -674,10 +703,10 @@ void __init zone_sizes_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM @@ -687,3 +716,11 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) +{ + /* entry 0 MUST be WB (hardwired to speed up translations) */ + BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); + + __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); + __pte2cachemode_tbl[entry] = cache; +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868fc..30eb05ae7061 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,7 +52,6 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> -#include <asm/uv/uv.h> #include <asm/setup.h> #include "mm_internal.h" @@ -338,12 +337,15 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) * Create large page table mappings for a range of physical addresses. */ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, - pgprot_t prot) + enum page_cache_mode cache) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + pgprot_t prot; + pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | + pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); @@ -366,12 +368,12 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); } void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); } /* @@ -1123,7 +1125,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); - unsigned long all_end = PFN_ALIGN(&_end); + unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1134,7 +1136,16 @@ void mark_rodata_ro(void) /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -1193,66 +1204,15 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - static unsigned long probe_memory_block_size(void) { /* start from 2g */ unsigned long bz = 1UL<<31; -#ifdef CONFIG_X86_UV - if (is_uv_system()) { - printk(KERN_INFO "UV: memory block size 2GB\n"); + if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { + pr_info("Using 2GB memory block size for large-memory system\n"); return 2UL * 1024 * 1024 * 1024; } -#endif /* less than 64g installed */ if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 7b179b499fa3..9ca35fc60cfe 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -33,17 +33,17 @@ static int is_io_mapping_possible(resource_size_t base, unsigned long size) int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) { - unsigned long flag = _PAGE_CACHE_WC; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC; int ret; if (!is_io_mapping_possible(base, size)) return -EINVAL; - ret = io_reserve_memtype(base, base + size, &flag); + ret = io_reserve_memtype(base, base + size, &pcm); if (ret) return ret; - *prot = __pgprot(__PAGE_KERNEL | flag); + *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); return 0; } EXPORT_SYMBOL_GPL(iomap_create_wc); @@ -82,8 +82,10 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) * MTRR is UC or WC. UC_MINUS gets the real intention, of the * user, which is "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) - prot = PAGE_KERNEL_UC_MINUS; + if (!pat_enabled && pgprot_val(prot) == + (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + prot = __pgprot(__PAGE_KERNEL | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index af78e50ca6ce..fdf617c00e2f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -29,20 +29,20 @@ * conflicts. */ int ioremap_change_attr(unsigned long vaddr, unsigned long size, - unsigned long prot_val) + enum page_cache_mode pcm) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (prot_val) { - case _PAGE_CACHE_UC: + switch (pcm) { + case _PAGE_CACHE_MODE_UC: default: err = _set_memory_uc(vaddr, nrpages); break; - case _PAGE_CACHE_WC: + case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; - case _PAGE_CACHE_WB: + case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; } @@ -75,14 +75,14 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, * caller shouldn't need to know that small detail. */ static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, unsigned long prot_val, void *caller) + unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; - unsigned long new_prot_val; + enum page_cache_mode new_pcm; pgprot_t prot; int retval; void __iomem *ret_addr; @@ -134,38 +134,40 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, - prot_val, &new_prot_val); + pcm, &new_pcm); if (retval) { printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } - if (prot_val != new_prot_val) { - if (!is_new_memtype_allowed(phys_addr, size, - prot_val, new_prot_val)) { + if (pcm != new_pcm) { + if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) { printk(KERN_ERR - "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), - prot_val, new_prot_val); + pcm, new_pcm); goto err_free_memtype; } - prot_val = new_prot_val; + pcm = new_pcm; } - switch (prot_val) { - case _PAGE_CACHE_UC: + prot = PAGE_KERNEL_IO; + switch (pcm) { + case _PAGE_CACHE_MODE_UC: default: - prot = PAGE_KERNEL_IO_NOCACHE; + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC)); break; - case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_IO_UC_MINUS; + case _PAGE_CACHE_MODE_UC_MINUS: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); break; - case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_IO_WC; + case _PAGE_CACHE_MODE_WC: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); break; - case _PAGE_CACHE_WB: - prot = PAGE_KERNEL_IO; + case _PAGE_CACHE_MODE_WB: break; } @@ -178,7 +180,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + if (kernel_map_sync_memtype(phys_addr, size, pcm)) goto err_free_area; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) @@ -227,14 +229,14 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use * UC MINUS. */ - unsigned long val = _PAGE_CACHE_UC_MINUS; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; - return __ioremap_caller(phys_addr, size, val, + return __ioremap_caller(phys_addr, size, pcm, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); @@ -252,7 +254,7 @@ EXPORT_SYMBOL(ioremap_nocache); void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); else return ioremap_nocache(phys_addr, size); @@ -261,7 +263,7 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); @@ -269,7 +271,8 @@ EXPORT_SYMBOL(ioremap_cache); void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { - return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + return __ioremap_caller(phys_addr, size, + pgprot2cachemode(__pgprot(prot_val)), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); @@ -327,7 +330,7 @@ EXPORT_SYMBOL(iounmap); * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access */ -void *xlate_dev_mem_ptr(unsigned long phys) +void *xlate_dev_mem_ptr(phys_addr_t phys) { void *addr; unsigned long start = phys & PAGE_MASK; @@ -343,7 +346,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) return addr; } -void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { if (page_is_ram(phys >> PAGE_SHIFT)) return; diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 6b563a118891..62474ba66c8e 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -16,4 +16,6 @@ void zone_sizes_init(void); extern int after_bootmem; +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c new file mode 100644 index 000000000000..67ebf5751222 --- /dev/null +++ b/arch/x86/mm/mpx.c @@ -0,0 +1,928 @@ +/* + * mpx.c - Memory Protection eXtensions + * + * Copyright (c) 2014, Intel Corporation. + * Qiaowei Ren <qiaowei.ren@intel.com> + * Dave Hansen <dave.hansen@intel.com> + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/sched/sysctl.h> + +#include <asm/i387.h> +#include <asm/insn.h> +#include <asm/mman.h> +#include <asm/mmu_context.h> +#include <asm/mpx.h> +#include <asm/processor.h> +#include <asm/fpu-internal.h> + +static const char *mpx_mapping_name(struct vm_area_struct *vma) +{ + return "[mpx]"; +} + +static struct vm_operations_struct mpx_vma_ops = { + .name = mpx_mapping_name, +}; + +static int is_mpx_vma(struct vm_area_struct *vma) +{ + return (vma->vm_ops == &mpx_vma_ops); +} + +/* + * This is really a simplified "vm_mmap". it only handles MPX + * bounds tables (the bounds directory is user-allocated). + * + * Later on, we use the vma->vm_ops to uniquely identify these + * VMAs. + */ +static unsigned long mpx_mmap(unsigned long len) +{ + unsigned long ret; + unsigned long addr, pgoff; + struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; + struct vm_area_struct *vma; + + /* Only bounds table and bounds directory can be allocated here */ + if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) + return -EINVAL; + + down_write(&mm->mmap_sem); + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) { + ret = -ENOMEM; + goto out; + } + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); + if (addr & ~PAGE_MASK) { + ret = addr; + goto out; + } + + vm_flags = VM_READ | VM_WRITE | VM_MPX | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* Set pgoff according to addr for anon_vma */ + pgoff = addr >> PAGE_SHIFT; + + ret = mmap_region(NULL, addr, len, vm_flags, pgoff); + if (IS_ERR_VALUE(ret)) + goto out; + + vma = find_vma(mm, ret); + if (!vma) { + ret = -ENOMEM; + goto out; + } + vma->vm_ops = &mpx_vma_ops; + + if (vm_flags & VM_LOCKED) { + up_write(&mm->mmap_sem); + mm_populate(ret, len); + return ret; + } + +out: + up_write(&mm->mmap_sem); + return ret; +} + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. + */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + default: + pr_err("invalid register type"); + BUG(); + break; + } + + if (regno > nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/* + * return the address being referenced be instruction + * for rm=3 returning the content of the rm reg + * for rm!=3 calculates the address using SIB and Disp + */ +static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + unsigned long addr, base, indx; + int addr_offset, base_offset, indx_offset; + insn_byte_t sib; + + insn_get_modrm(insn); + insn_get_sib(insn); + sib = insn->sib.value; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } else { + if (insn->sib.nbytes) { + base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + if (base_offset < 0) + goto out_err; + + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + if (indx_offset < 0) + goto out_err; + + base = regs_get_register(regs, base_offset); + indx = regs_get_register(regs, indx_offset); + addr = base + indx * (1 << X86_SIB_SCALE(sib)); + } else { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } + addr += insn->displacement.value; + } + return (void __user *)addr; +out_err: + return (void __user *)-1; +} + +static int mpx_insn_decode(struct insn *insn, + struct pt_regs *regs) +{ + unsigned char buf[MAX_INSN_SIZE]; + int x86_64 = !test_thread_flag(TIF_IA32); + int not_copied; + int nr_copied; + + not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + /* + * The decoder _should_ fail nicely if we pass it a short buffer. + * But, let's not depend on that implementation detail. If we + * did not get anything, just error out now. + */ + if (!nr_copied) + return -EFAULT; + insn_init(insn, buf, nr_copied, x86_64); + insn_get_length(insn); + /* + * copy_from_user() tries to get as many bytes as we could see in + * the largest possible instruction. If the instruction we are + * after is shorter than that _and_ we attempt to copy from + * something unreadable, we might get a short read. This is OK + * as long as the read did not stop in the middle of the + * instruction. Check to see if we got a partial instruction. + */ + if (nr_copied < insn->length) + return -EFAULT; + + insn_get_opcode(insn); + /* + * We only _really_ need to decode bndcl/bndcn/bndcu + * Error out on anything else. + */ + if (insn->opcode.bytes[0] != 0x0f) + goto bad_opcode; + if ((insn->opcode.bytes[1] != 0x1a) && + (insn->opcode.bytes[1] != 0x1b)) + goto bad_opcode; + + return 0; +bad_opcode: + return -EINVAL; +} + +/* + * If a bounds overflow occurs then a #BR is generated. This + * function decodes MPX instructions to get violation address + * and set this address into extended struct siginfo. + * + * Note that this is not a super precise way of doing this. + * Userspace could have, by the time we get here, written + * anything it wants in to the instructions. We can not + * trust anything about it. They might not be valid + * instructions or might encode invalid registers, etc... + * + * The caller is expected to kfree() the returned siginfo_t. + */ +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + struct bndreg *bndregs, *bndreg; + siginfo_t *info = NULL; + struct insn insn; + uint8_t bndregno; + int err; + + err = mpx_insn_decode(&insn, regs); + if (err) + goto err_out; + + /* + * We know at this point that we are only dealing with + * MPX instructions. + */ + insn_get_modrm(&insn); + bndregno = X86_MODRM_REG(insn.modrm.value); + if (bndregno > 3) { + err = -EINVAL; + goto err_out; + } + /* get the bndregs _area_ of the xsave structure */ + bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + if (!bndregs) { + err = -EINVAL; + goto err_out; + } + /* now go select the individual register in the set of 4 */ + bndreg = &bndregs[bndregno]; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto err_out; + } + /* + * The registers are always 64-bit, but the upper 32 + * bits are ignored in 32-bit mode. Also, note that the + * upper bounds are architecturally represented in 1's + * complement form. + * + * The 'unsigned long' cast is because the compiler + * complains when casting from integers to different-size + * pointers. + */ + info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; + info->si_addr_lsb = 0; + info->si_signo = SIGSEGV; + info->si_errno = 0; + info->si_code = SEGV_BNDERR; + info->si_addr = mpx_get_addr_ref(&insn, regs); + /* + * We were not able to extract an address from the instruction, + * probably because there was something invalid in it. + */ + if (info->si_addr == (void *)-1) { + err = -EINVAL; + goto err_out; + } + return info; +err_out: + /* info might be NULL, but kfree() handles that */ + kfree(info); + return ERR_PTR(err); +} + +static __user void *task_get_bounds_dir(struct task_struct *tsk) +{ + struct bndcsr *bndcsr; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * The bounds directory pointer is stored in a register + * only accessible if we first do an xsave. + */ + fpu_save_init(&tsk->thread.fpu); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + if (!bndcsr) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Make sure the register looks valid by checking the + * enable bit. + */ + if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Lastly, mask off the low bits used for configuration + * flags, and return the address of the bounds table. + */ + return (void __user *)(unsigned long) + (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); +} + +int mpx_enable_management(struct task_struct *tsk) +{ + void __user *bd_base = MPX_INVALID_BOUNDS_DIR; + struct mm_struct *mm = tsk->mm; + int ret = 0; + + /* + * runtime in the userspace will be responsible for allocation of + * the bounds directory. Then, it will save the base of the bounds + * directory into XSAVE/XRSTOR Save Area and enable MPX through + * XRSTOR instruction. + * + * fpu_xsave() is expected to be very expensive. Storing the bounds + * directory here means that we do not have to do xsave in the unmap + * path; we can just use mm->bd_addr instead. + */ + bd_base = task_get_bounds_dir(tsk); + down_write(&mm->mmap_sem); + mm->bd_addr = bd_base; + if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + ret = -ENXIO; + + up_write(&mm->mmap_sem); + return ret; +} + +int mpx_disable_management(struct task_struct *tsk) +{ + struct mm_struct *mm = current->mm; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return -ENXIO; + + down_write(&mm->mmap_sem); + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + up_write(&mm->mmap_sem); + return 0; +} + +/* + * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * and the size of each bounds table is 4MB. + */ +static int allocate_bt(long __user *bd_entry) +{ + unsigned long expected_old_val = 0; + unsigned long actual_old_val = 0; + unsigned long bt_addr; + int ret = 0; + + /* + * Carve the virtual space out of userspace for the new + * bounds table: + */ + bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + if (IS_ERR((void *)bt_addr)) + return PTR_ERR((void *)bt_addr); + /* + * Set the valid flag (kinda like _PAGE_PRESENT in a pte) + */ + bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + + /* + * Go poke the address of the new bounds table in to the + * bounds directory entry out in userspace memory. Note: + * we may race with another CPU instantiating the same table. + * In that case the cmpxchg will see an unexpected + * 'actual_old_val'. + * + * This can fault, but that's OK because we do not hold + * mmap_sem at this point, unlike some of the other part + * of the MPX code that have to pagefault_disable(). + */ + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, bt_addr); + if (ret) + goto out_unmap; + + /* + * The user_atomic_cmpxchg_inatomic() will only return nonzero + * for faults, *not* if the cmpxchg itself fails. Now we must + * verify that the cmpxchg itself completed successfully. + */ + /* + * We expected an empty 'expected_old_val', but instead found + * an apparently valid entry. Assume we raced with another + * thread to instantiate this table and desclare succecss. + */ + if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { + ret = 0; + goto out_unmap; + } + /* + * We found a non-empty bd_entry but it did not have the + * VALID_FLAG set. Return an error which will result in + * a SEGV since this probably means that somebody scribbled + * some invalid data in to a bounds table. + */ + if (expected_old_val != actual_old_val) { + ret = -EINVAL; + goto out_unmap; + } + return 0; +out_unmap: + vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + return ret; +} + +/* + * When a BNDSTX instruction attempts to save bounds to a bounds + * table, it will first attempt to look up the table in the + * first-level bounds directory. If it does not find a table in + * the directory, a #BR is generated and we get here in order to + * allocate a new table. + * + * With 32-bit mode, the size of BD is 4MB, and the size of each + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, + * and the size of each bound table is 4MB. + */ +static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +{ + unsigned long bd_entry, bd_base; + struct bndcsr *bndcsr; + + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + return -EINVAL; + /* + * Mask off the preserve and enable bits + */ + bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; + /* + * The hardware provides the address of the missing or invalid + * entry via BNDSTATUS, so we don't have to go look it up. + */ + bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; + /* + * Make sure the directory entry is within where we think + * the directory is. + */ + if ((bd_entry < bd_base) || + (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + return -EINVAL; + + return allocate_bt((long __user *)bd_entry); +} + +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + /* + * Userspace never asked us to manage the bounds tables, + * so refuse to help. + */ + if (!kernel_managing_mpx_tables(current->mm)) + return -EINVAL; + + if (do_mpx_bt_fault(xsave_buf)) { + force_sig(SIGSEGV, current); + /* + * The force_sig() is essentially "handling" this + * exception, so we do not pass up the error + * from do_mpx_bt_fault(). + */ + } + return 0; +} + +/* + * A thin wrapper around get_user_pages(). Returns 0 if the + * fault was resolved or -errno if not. + */ +static int mpx_resolve_fault(long __user *addr, int write) +{ + long gup_ret; + int nr_pages = 1; + int force = 0; + + gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, + nr_pages, write, force, NULL, NULL); + /* + * get_user_pages() returns number of pages gotten. + * 0 means we failed to fault in and get anything, + * probably because 'addr' is bad. + */ + if (!gup_ret) + return -EFAULT; + /* Other error, return it */ + if (gup_ret < 0) + return gup_ret; + /* must have gup'd a page and gup_ret>0, success */ + return 0; +} + +/* + * Get the base of bounds tables pointed by specific bounds + * directory entry. + */ +static int get_bt_addr(struct mm_struct *mm, + long __user *bd_entry, unsigned long *bt_addr) +{ + int ret; + int valid_bit; + + if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry))) + return -EFAULT; + + while (1) { + int need_write = 0; + + pagefault_disable(); + ret = get_user(*bt_addr, bd_entry); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + + valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG; + *bt_addr &= MPX_BT_ADDR_MASK; + + /* + * When the kernel is managing bounds tables, a bounds directory + * entry will either have a valid address (plus the valid bit) + * *OR* be completely empty. If we see a !valid entry *and* some + * data in the address field, we know something is wrong. This + * -EINVAL return will cause a SIGSEGV. + */ + if (!valid_bit && *bt_addr) + return -EINVAL; + /* + * Do we have an completely zeroed bt entry? That is OK. It + * just means there was no bounds table for this memory. Make + * sure to distinguish this from -EINVAL, which will cause + * a SEGV. + */ + if (!valid_bit) + return -ENOENT; + + return 0; +} + +/* + * Free the backing physical pages of bounds table 'bt_addr'. + * Assume start...end is within that bounds table. + */ +static int zap_bt_entries(struct mm_struct *mm, + unsigned long bt_addr, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long addr, len; + + /* + * Find the first overlapping vma. If vma->vm_start > start, there + * will be a hole in the bounds table. This -EINVAL return will + * cause a SIGSEGV. + */ + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EINVAL; + + /* + * A NUMA policy on a VM_MPX VMA could cause this bouds table to + * be split. So we need to look across the entire 'start -> end' + * range of this bounds table, find all of the VM_MPX VMAs, and + * zap only those. + */ + addr = start; + while (vma && vma->vm_start < end) { + /* + * We followed a bounds directory entry down + * here. If we find a non-MPX VMA, that's bad, + * so stop immediately and return an error. This + * probably results in a SIGSEGV. + */ + if (!is_mpx_vma(vma)) + return -EINVAL; + + len = min(vma->vm_end, end) - addr; + zap_page_range(vma, addr, len, NULL); + + vma = vma->vm_next; + addr = vma->vm_start; + } + + return 0; +} + +static int unmap_single_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long bt_addr) +{ + unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + unsigned long actual_old_val = 0; + int ret; + + while (1) { + int need_write = 1; + + pagefault_disable(); + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, 0); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + /* + * The cmpxchg was performed, check the results. + */ + if (actual_old_val != expected_old_val) { + /* + * Someone else raced with us to unmap the table. + * There was no bounds table pointed to by the + * directory, so declare success. Somebody freed + * it. + */ + if (!actual_old_val) + return 0; + /* + * Something messed with the bounds directory + * entry. We hold mmap_sem for read or write + * here, so it could not be a _new_ bounds table + * that someone just allocated. Something is + * wrong, so pass up the error and SIGSEGV. + */ + return -EINVAL; + } + + /* + * Note, we are likely being called under do_munmap() already. To + * avoid recursion, do_munmap() will check whether it comes + * from one bounds table through VM_MPX flag. + */ + return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); +} + +/* + * If the bounds table pointed by bounds directory 'bd_entry' is + * not shared, unmap this whole bounds table. Otherwise, only free + * those backing physical pages of bounds table entries covered + * in this virtual address region start...end. + */ +static int unmap_shared_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long start, + unsigned long end, bool prev_shared, bool next_shared) +{ + unsigned long bt_addr; + int ret; + + ret = get_bt_addr(mm, bd_entry, &bt_addr); + /* + * We could see an "error" ret for not-present bounds + * tables (not really an error), or actual errors, but + * stop unmapping either way. + */ + if (ret) + return ret; + + if (prev_shared && next_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else if (prev_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_BT_SIZE_BYTES); + else if (next_shared) + ret = zap_bt_entries(mm, bt_addr, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else + ret = unmap_single_bt(mm, bd_entry, bt_addr); + + return ret; +} + +/* + * A virtual address region being munmap()ed might share bounds table + * with adjacent VMAs. We only need to free the backing physical + * memory of these shared bounds tables entries covered in this virtual + * address region. + */ +static int unmap_edge_bts(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bde_start, *bde_end; + struct vm_area_struct *prev, *next; + bool prev_shared = false, next_shared = false; + + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + + /* + * Check whether bde_start and bde_end are shared with adjacent + * VMAs. + * + * We already unliked the VMAs from the mm's rbtree so 'start' + * is guaranteed to be in a hole. This gets us the first VMA + * before the hole in to 'prev' and the next VMA after the hole + * in to 'next'. + */ + next = find_vma_prev(mm, start, &prev); + if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) + == bde_start) + prev_shared = true; + if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) + == bde_end) + next_shared = true; + + /* + * This virtual address region being munmap()ed is only + * covered by one bounds table. + * + * In this case, if this table is also shared with adjacent + * VMAs, only part of the backing physical memory of the bounds + * table need be freeed. Otherwise the whole bounds table need + * be unmapped. + */ + if (bde_start == bde_end) { + return unmap_shared_bt(mm, bde_start, start, end, + prev_shared, next_shared); + } + + /* + * If more than one bounds tables are covered in this virtual + * address region being munmap()ed, we need to separately check + * whether bde_start and bde_end are shared with adjacent VMAs. + */ + ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); + if (ret) + return ret; + ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); + if (ret) + return ret; + + return 0; +} + +static int mpx_unmap_tables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bd_entry, *bde_start, *bde_end; + unsigned long bt_addr; + + /* + * "Edge" bounds tables are those which are being used by the region + * (start -> end), but that may be shared with adjacent areas. If they + * turn out to be completely unshared, they will be freed. If they are + * shared, we will free the backing store (like an MADV_DONTNEED) for + * areas used by this region. + */ + ret = unmap_edge_bts(mm, start, end); + switch (ret) { + /* non-present tables are OK */ + case 0: + case -ENOENT: + /* Success, or no tables to unmap */ + break; + case -EINVAL: + case -EFAULT: + default: + return ret; + } + + /* + * Only unmap the bounds table that are + * 1. fully covered + * 2. not at the edges of the mapping, even if full aligned + */ + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { + ret = get_bt_addr(mm, bd_entry, &bt_addr); + switch (ret) { + case 0: + break; + case -ENOENT: + /* No table here, try the next one */ + continue; + case -EINVAL: + case -EFAULT: + default: + /* + * Note: we are being strict here. + * Any time we run in to an issue + * unmapping tables, we stop and + * SIGSEGV. + */ + return ret; + } + + ret = unmap_single_bt(mm, bd_entry, bt_addr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Free unused bounds tables covered in a virtual address region being + * munmap()ed. Assume end > start. + * + * This function will be called by do_munmap(), and the VMAs covering + * the virtual address region start...end have already been split if + * necessary, and the 'vma' is the first vma in this range (start -> end). + */ +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int ret; + + /* + * Refuse to do anything unless userspace has asked + * the kernel to help manage the bounds tables, + */ + if (!kernel_managing_mpx_tables(current->mm)) + return; + /* + * This will look across the entire 'start -> end' range, + * and find all of the non-VM_MPX VMAs. + * + * To avoid recursion, if a VM_MPX vma is found in the range + * (start->end), we will not continue follow-up work. This + * recursion represents having bounds tables for bounds tables, + * which should not occur normally. Being strict about it here + * helps ensure that we do not have an exploitable stack overflow. + */ + do { + if (vma->vm_flags & VM_MPX) + return; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + + ret = mpx_unmap_tables(mm, start, end); + if (ret) + force_sig(SIGSEGV, current); +} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 36de293caf25..536ea2fb6e33 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, } /* + * Lookup the PMD entry for a virtual address. Return a pointer to the entry + * or NULL if not present. + */ +pmd_t *lookup_pmd_address(unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + return NULL; + + return pmd_offset(pud, address); +} + +/* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can @@ -485,14 +505,23 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, /* * We are safe now. Check whether the new pgprot is the same: + * Convert protection attributes to 4k-format, as cpa->mask* are set + * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pte_pgprot(old_pte); + old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* + * req_prot is in format of 4k pages. It must be converted to large + * page format: the caching mode includes the PAT bit located at + * different bit positions in the two formats. + */ + req_prot = pgprot_4k_2_large(req_prot); + + /* * Set the PSE and GLOBAL flags only if the PRESENT flag is * set otherwise pmd_present/pmd_huge will return true even on * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL @@ -585,13 +614,10 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* - * If we ever want to utilize the PAT bit, we need to - * update this function to make sure it's converted from - * bit 12 to bit 7 when we cross from the 2MB level to - * the 4K level: - */ - WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); + + /* promote PAT bit to correct position */ + if (level == PG_LEVEL_2M) + ref_prot = pgprot_large_2_4k(ref_prot); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { @@ -879,6 +905,7 @@ static int populate_pmd(struct cpa_data *cpa, { unsigned int cur_pages = 0; pmd_t *pmd; + pgprot_t pmd_pgprot; /* * Not on a 2M boundary? @@ -910,6 +937,8 @@ static int populate_pmd(struct cpa_data *cpa, if (num_pages == cur_pages) return cur_pages; + pmd_pgprot = pgprot_4k_2_large(pgprot); + while (end - start >= PMD_SIZE) { /* @@ -921,7 +950,8 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | + massage_pgprot(pmd_pgprot))); start += PMD_SIZE; cpa->pfn += PMD_SIZE; @@ -949,6 +979,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, pud_t *pud; unsigned long end; int cur_pages = 0; + pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -986,12 +1017,14 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, return cur_pages; pud = pud_offset(pgd, start); + pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | + massage_pgprot(pud_pgprot))); start += PUD_SIZE; cpa->pfn += PUD_SIZE; @@ -1304,12 +1337,6 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) return 0; } -static inline int cache_attr(pgprot_t attr) -{ - return pgprot_val(attr) & - (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); -} - static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, @@ -1390,7 +1417,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * No need to flush, when we did not set any of the caching * attributes: */ - cache = cache_attr(mask_set); + cache = !!pgprot2cachemode(mask_set); /* * On success we use CLFLUSH, when the CPU supports it to @@ -1445,7 +1472,8 @@ int _set_memory_uc(unsigned long addr, int numpages) * for now UC MINUS. see comments in ioremap_nocache() */ return change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), 0); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -1456,7 +1484,7 @@ int set_memory_uc(unsigned long addr, int numpages) * for now UC MINUS. see comments in ioremap_nocache() */ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL); + _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; @@ -1474,7 +1502,7 @@ out_err: EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, - unsigned long new_type) + enum page_cache_mode new_type) { int i, j; int ret; @@ -1490,11 +1518,13 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, } ret = change_page_attr_set(addr, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS), 1); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 1); - if (!ret && new_type == _PAGE_CACHE_WC) + if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_ARRAY, NULL); if (ret) @@ -1511,13 +1541,13 @@ out_free: int set_memory_array_uc(unsigned long *addr, int addrinarray) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS); + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_memory_array_uc); int set_memory_array_wc(unsigned long *addr, int addrinarray) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC); + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_memory_array_wc); @@ -1527,10 +1557,12 @@ int _set_memory_wc(unsigned long addr, int numpages) unsigned long addr_copy = addr; ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), 0); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); if (!ret) { ret = change_page_attr_set_clr(&addr_copy, numpages, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } @@ -1545,7 +1577,7 @@ int set_memory_wc(unsigned long addr, int numpages) return set_memory_uc(addr, numpages); ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_WC, NULL); + _PAGE_CACHE_MODE_WC, NULL); if (ret) goto out_err; @@ -1564,6 +1596,7 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { + /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } @@ -1586,6 +1619,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) int i; int ret; + /* WB cache mode is hard wired to all cache attribute bits being 0 */ ret = change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); if (ret) @@ -1648,7 +1682,7 @@ int set_pages_uc(struct page *page, int numpages) EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int addrinarray, - unsigned long new_type) + enum page_cache_mode new_type) { unsigned long start; unsigned long end; @@ -1666,10 +1700,11 @@ static int _set_pages_array(struct page **pages, int addrinarray, } ret = cpa_set_pages_array(pages, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS)); - if (!ret && new_type == _PAGE_CACHE_WC) + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) @@ -1689,13 +1724,13 @@ err_out: int set_pages_array_uc(struct page **pages, int addrinarray) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS); + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int addrinarray) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC); + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); @@ -1714,6 +1749,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray) unsigned long end; int i; + /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, addrinarray, __pgprot(_PAGE_CACHE_MASK)); if (retval) @@ -1801,7 +1837,7 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 657438858e83..edf299c8ff6c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -31,6 +31,7 @@ #include <asm/io.h> #include "pat_internal.h" +#include "mm_internal.h" #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; @@ -66,6 +67,75 @@ __setup("debugpat", pat_debug_setup); static u64 __read_mostly boot_pat_state; +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and + * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_MODE_UC here. + */ + +#define _PGMT_DEFAULT 0 +#define _PGMT_WC (1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_uncached) +#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) + +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + unsigned long pg_flags = pg->flags & _PGMT_MASK; + + if (pg_flags == _PGMT_DEFAULT) + return -1; + else if (pg_flags == _PGMT_WC) + return _PAGE_CACHE_MODE_WC; + else if (pg_flags == _PGMT_UC_MINUS) + return _PAGE_CACHE_MODE_UC_MINUS; + else + return _PAGE_CACHE_MODE_WB; +} + +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ + unsigned long memtype_flags; + unsigned long old_flags; + unsigned long new_flags; + + switch (memtype) { + case _PAGE_CACHE_MODE_WC: + memtype_flags = _PGMT_WC; + break; + case _PAGE_CACHE_MODE_UC_MINUS: + memtype_flags = _PGMT_UC_MINUS; + break; + case _PAGE_CACHE_MODE_WB: + memtype_flags = _PGMT_WB; + break; + default: + memtype_flags = _PGMT_DEFAULT; + break; + } + + do { + old_flags = pg->flags; + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; + } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); +} +#else +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + return -1; +} +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ +} +#endif + enum { PAT_UC = 0, /* uncached */ PAT_WC = 1, /* Write combining */ @@ -75,6 +145,52 @@ enum { PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ }; +#define CM(c) (_PAGE_CACHE_MODE_ ## c) + +static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) +{ + enum page_cache_mode cache; + char *cache_mode; + + switch (pat_val) { + case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; + case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; + case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; + case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; + case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; + case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; + default: cache = CM(WB); cache_mode = "WB "; break; + } + + memcpy(msg, cache_mode, 4); + + return cache; +} + +#undef CM + +/* + * Update the cache mode to pgprot translation tables according to PAT + * configuration. + * Using lower indices is preferred, so we start with highest index. + */ +void pat_init_cache_modes(void) +{ + int i; + enum page_cache_mode cache; + char pat_msg[33]; + u64 pat; + + rdmsrl(MSR_IA32_CR_PAT, pat); + pat_msg[32] = 0; + for (i = 7; i >= 0; i--) { + cache = pat_get_cache_mode((pat >> (i * 8)) & 7, + pat_msg + 4 * i); + update_cache_mode_entry(i, cache); + } + pr_info("PAT configuration [0-7]: %s\n", pat_msg); +} + #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) void pat_init(void) @@ -124,8 +240,7 @@ void pat_init(void) wrmsrl(MSR_IA32_CR_PAT, pat); if (boot_cpu) - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); + pat_init_cache_modes(); } #undef PAT @@ -139,20 +254,21 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ * The intersection is based on "Effective Memory Type" tables in IA-32 * SDM vol 3a */ -static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) +static unsigned long pat_x_mtrr_type(u64 start, u64 end, + enum page_cache_mode req_type) { /* * Look for MTRR hint to get the effective type in case where PAT * request is for WB. */ - if (req_type == _PAGE_CACHE_WB) { + if (req_type == _PAGE_CACHE_MODE_WB) { u8 mtrr_type; mtrr_type = mtrr_type_lookup(start, end); if (mtrr_type != MTRR_TYPE_WRBACK) - return _PAGE_CACHE_UC_MINUS; + return _PAGE_CACHE_MODE_UC_MINUS; - return _PAGE_CACHE_WB; + return _PAGE_CACHE_MODE_WB; } return req_type; @@ -207,25 +323,26 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) * - Find the memtype of all the pages in the range, look for any conflicts * - In case of no conflicts, set the new memtype for pages in the range */ -static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) +static int reserve_ram_pages_type(u64 start, u64 end, + enum page_cache_mode req_type, + enum page_cache_mode *new_type) { struct page *page; u64 pfn; - if (req_type == _PAGE_CACHE_UC) { + if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); - req_type = _PAGE_CACHE_UC_MINUS; + req_type = _PAGE_CACHE_MODE_UC_MINUS; } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - unsigned long type; + enum page_cache_mode type; page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -258,21 +375,21 @@ static int free_ram_pages_type(u64 start, u64 end) /* * req_type typically has one of the: - * - _PAGE_CACHE_WB - * - _PAGE_CACHE_WC - * - _PAGE_CACHE_UC_MINUS - * - _PAGE_CACHE_UC + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_UC * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ -int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) +int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, + enum page_cache_mode *new_type) { struct memtype *new; - unsigned long actual_type; + enum page_cache_mode actual_type; int is_range_ram; int err = 0; @@ -281,10 +398,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == _PAGE_CACHE_WC) - *new_type = _PAGE_CACHE_UC_MINUS; + if (req_type == _PAGE_CACHE_MODE_WC) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; else - *new_type = req_type & _PAGE_CACHE_MASK; + *new_type = req_type; } return 0; } @@ -292,7 +409,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Low ISA region is always mapped WB in page table. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) - *new_type = _PAGE_CACHE_WB; + *new_type = _PAGE_CACHE_MODE_WB; return 0; } @@ -302,7 +419,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, * tools and ACPI tools). Use WB request for WB memory and use * UC_MINUS otherwise. */ - actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + actual_type = pat_x_mtrr_type(start, end, req_type); if (new_type) *new_type = actual_type; @@ -394,12 +511,12 @@ int free_memtype(u64 start, u64 end) * * Only to be called when PAT is enabled * - * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or - * _PAGE_CACHE_UC + * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS + * or _PAGE_CACHE_MODE_UC */ -static unsigned long lookup_memtype(u64 paddr) +static enum page_cache_mode lookup_memtype(u64 paddr) { - int rettype = _PAGE_CACHE_WB; + enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; struct memtype *entry; if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) @@ -414,7 +531,7 @@ static unsigned long lookup_memtype(u64 paddr) * default state and not reserved, and hence of type WB */ if (rettype == -1) - rettype = _PAGE_CACHE_WB; + rettype = _PAGE_CACHE_MODE_WB; return rettype; } @@ -425,7 +542,7 @@ static unsigned long lookup_memtype(u64 paddr) if (entry != NULL) rettype = entry->type; else - rettype = _PAGE_CACHE_UC_MINUS; + rettype = _PAGE_CACHE_MODE_UC_MINUS; spin_unlock(&memtype_lock); return rettype; @@ -442,11 +559,11 @@ static unsigned long lookup_memtype(u64 paddr) * On failure, returns non-zero */ int io_reserve_memtype(resource_size_t start, resource_size_t end, - unsigned long *type) + enum page_cache_mode *type) { resource_size_t size = end - start; - unsigned long req_type = *type; - unsigned long new_type; + enum page_cache_mode req_type = *type; + enum page_cache_mode new_type; int ret; WARN_ON_ONCE(iomem_map_sanity_check(start, size)); @@ -520,13 +637,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { - unsigned long flags = _PAGE_CACHE_WB; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_DSYNC) - flags = _PAGE_CACHE_UC_MINUS; + pcm = _PAGE_CACHE_MODE_UC_MINUS; #ifdef CONFIG_X86_32 /* @@ -543,12 +660,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, boot_cpu_has(X86_FEATURE_CYRIX_ARR) || boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - flags = _PAGE_CACHE_UC; + pcm = _PAGE_CACHE_MODE_UC; } #endif *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | - flags); + cachemode2protval(pcm)); return 1; } @@ -556,7 +673,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * Change the memory type for the physial address range in kernel identity * mapping space if that range is a part of identity map. */ -int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +int kernel_map_sync_memtype(u64 base, unsigned long size, + enum page_cache_mode pcm) { unsigned long id_sz; @@ -574,11 +692,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) __pa(high_memory) - base : size; - if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, - cattr_name(flags), + cattr_name(pcm), base, (unsigned long long)(base + size-1)); return -EINVAL; } @@ -595,8 +713,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, { int is_ram = 0; int ret; - unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); - unsigned long flags = want_flags; + enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); + enum page_cache_mode pcm = want_pcm; is_ram = pat_pagerange_is_ram(paddr, paddr + size); @@ -609,36 +727,36 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, if (!pat_enabled) return 0; - flags = lookup_memtype(paddr); - if (want_flags != flags) { + pcm = lookup_memtype(paddr); + if (want_pcm != pcm) { printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, - cattr_name(want_flags), + cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), - cattr_name(flags)); + cattr_name(pcm)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & - (~_PAGE_CACHE_MASK)) | - flags); + (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); } return 0; } - ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm); if (ret) return ret; - if (flags != want_flags) { + if (pcm != want_pcm) { if (strict_prot || - !is_new_memtype_allowed(paddr, size, want_flags, flags)) { + !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, - cattr_name(want_flags), + cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), - cattr_name(flags)); + cattr_name(pcm)); return -EINVAL; } /* @@ -647,10 +765,10 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, */ *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | - flags); + cachemode2protval(pcm)); } - if (kernel_map_sync_memtype(paddr, size, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, pcm) < 0) { free_memtype(paddr, paddr + size); return -EINVAL; } @@ -709,7 +827,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; - unsigned long flags; + enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { @@ -728,18 +846,18 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, * For anything smaller than the vma size we set prot based on the * lookup. */ - flags = lookup_memtype(paddr); + pcm = lookup_memtype(paddr); /* Check memtype for the remaining pages */ while (size > PAGE_SIZE) { size -= PAGE_SIZE; paddr += PAGE_SIZE; - if (flags != lookup_memtype(paddr)) + if (pcm != lookup_memtype(paddr)) return -EINVAL; } *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | - flags); + cachemode2protval(pcm)); return 0; } @@ -747,15 +865,15 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn) { - unsigned long flags; + enum page_cache_mode pcm; if (!pat_enabled) return 0; /* Set prot based on lookup */ - flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | - flags); + cachemode2protval(pcm)); return 0; } @@ -791,7 +909,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { if (pat_enabled) - return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); else return pgprot_noncached(prot); } @@ -824,7 +943,7 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos == 0) { ++*pos; - seq_printf(seq, "PAT memtype list:\n"); + seq_puts(seq, "PAT memtype list:\n"); } return memtype_get_idx(*pos); diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index 77e5ba153fac..f6411620305d 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -10,30 +10,32 @@ struct memtype { u64 start; u64 end; u64 subtree_max_end; - unsigned long type; + enum page_cache_mode type; struct rb_node rb; }; -static inline char *cattr_name(unsigned long flags) +static inline char *cattr_name(enum page_cache_mode pcm) { - switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; + switch (pcm) { + case _PAGE_CACHE_MODE_UC: return "uncached"; + case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_MODE_WB: return "write-back"; + case _PAGE_CACHE_MODE_WC: return "write-combining"; + case _PAGE_CACHE_MODE_WT: return "write-through"; + case _PAGE_CACHE_MODE_WP: return "write-protected"; + default: return "broken"; } } #ifdef CONFIG_X86_PAT extern int rbt_memtype_check_insert(struct memtype *new, - unsigned long *new_type); + enum page_cache_mode *new_type); extern struct memtype *rbt_memtype_erase(u64 start, u64 end); extern struct memtype *rbt_memtype_lookup(u64 addr); extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); #else static inline int rbt_memtype_check_insert(struct memtype *new, - unsigned long *new_type) + enum page_cache_mode *new_type) { return 0; } static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) { return NULL; } diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 415f6c4ced36..6582adcc8bd9 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -122,11 +122,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, static int memtype_rb_check_conflict(struct rb_root *root, u64 start, u64 end, - unsigned long reqtype, unsigned long *newtype) + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) { struct rb_node *node; struct memtype *match; - int found_type = reqtype; + enum page_cache_mode found_type = reqtype; match = memtype_rb_lowest_match(&memtype_rbroot, start, end); if (match == NULL) @@ -187,7 +188,8 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } -int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) +int rbt_memtype_check_insert(struct memtype *new, + enum page_cache_mode *ret_type) { int err = 0; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3f627345d51c..987514396c1e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -24,7 +24,7 @@ extern u8 sk_load_byte_positive_offset[]; extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[]; extern u8 sk_load_byte_negative_offset[]; -static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) +static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { if (len == 1) *ptr = bytes; @@ -52,12 +52,12 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT4_off32(b1, b2, b3, b4, off) \ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) -static inline bool is_imm8(int value) +static bool is_imm8(int value) { return value <= 127 && value >= -128; } -static inline bool is_simm32(s64 value) +static bool is_simm32(s64 value) { return value == (s64) (s32) value; } @@ -94,7 +94,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JGE 0x7D #define X86_JG 0x7F -static inline void bpf_flush_icache(void *start, void *end) +static void bpf_flush_icache(void *start, void *end) { mm_segment_t old_fs = get_fs(); @@ -133,24 +133,24 @@ static const int reg2hex[] = { * which need extra byte of encoding. * rax,rcx,...,rbp have simpler encoding */ -static inline bool is_ereg(u32 reg) +static bool is_ereg(u32 reg) { - if (reg == BPF_REG_5 || reg == AUX_REG || - (reg >= BPF_REG_7 && reg <= BPF_REG_9)) - return true; - else - return false; + return (1 << reg) & (BIT(BPF_REG_5) | + BIT(AUX_REG) | + BIT(BPF_REG_7) | + BIT(BPF_REG_8) | + BIT(BPF_REG_9)); } /* add modifiers if 'reg' maps to x64 registers r8..r15 */ -static inline u8 add_1mod(u8 byte, u32 reg) +static u8 add_1mod(u8 byte, u32 reg) { if (is_ereg(reg)) byte |= 1; return byte; } -static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) +static u8 add_2mod(u8 byte, u32 r1, u32 r2) { if (is_ereg(r1)) byte |= 1; @@ -160,13 +160,13 @@ static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) } /* encode 'dst_reg' register into x64 opcode 'byte' */ -static inline u8 add_1reg(u8 byte, u32 dst_reg) +static u8 add_1reg(u8 byte, u32 dst_reg) { return byte + reg2hex[dst_reg]; } /* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */ -static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) +static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) { return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } @@ -178,7 +178,7 @@ static void jit_fill_hole(void *area, unsigned int size) } struct jit_context { - unsigned int cleanup_addr; /* epilogue code offset */ + int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; }; @@ -192,6 +192,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i; int proglen = 0; @@ -854,10 +855,11 @@ common_load: goto common_load; case BPF_JMP | BPF_EXIT: - if (i != insn_cnt - 1) { + if (seen_exit) { jmp_offset = ctx->cleanup_addr - addrs[i]; goto emit_jmp; } + seen_exit = true; /* update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp-X] */ diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 37c1435889ce..9b18ef315a55 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -433,14 +433,14 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; if (pat_enabled && write_combine) - prot |= _PAGE_CACHE_WC; + prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); else if (pat_enabled || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here * as well. */ - prot |= _PAGE_CACHE_UC_MINUS; + prot |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); vma->vm_page_prot = __pgprot(prot); diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c index 7307d9d12d15..2e565e65c893 100644 --- a/arch/x86/pci/numachip.c +++ b/arch/x86/pci/numachip.c @@ -103,7 +103,7 @@ static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus, return 0; } -const struct pci_raw_ops pci_mmcfg_numachip = { +static const struct pci_raw_ops pci_mmcfg_numachip = { .read = pci_mmcfg_read_numachip, .write = pci_mmcfg_write_numachip, }; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 093f5f4272d3..c489ef2c1a39 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -23,6 +23,8 @@ #include <xen/features.h> #include <xen/events.h> #include <asm/xen/pci.h> +#include <asm/xen/cpuid.h> +#include <asm/apic.h> #include <asm/i8259.h> static int xen_pcifront_enable_irq(struct pci_dev *dev) @@ -229,7 +231,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; list_for_each_entry(msidesc, &dev->msi_list, list) { - __read_msi_msg(msidesc, &msg); + __pci_read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); if (msg.data != XEN_PIRQ_MSI_DATA || @@ -240,7 +242,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; } xen_msi_compose_msg(dev, pirq, &msg); - __write_msi_msg(msidesc, &msg); + __pci_write_msi_msg(msidesc, &msg); dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); } else { dev_dbg(&dev->dev, @@ -394,14 +396,7 @@ static void xen_teardown_msi_irq(unsigned int irq) { xen_destroy_irq(irq); } -static u32 xen_nop_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return 0; -} -static u32 xen_nop_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return 0; -} + #endif int __init pci_xen_init(void) @@ -425,12 +420,33 @@ int __init pci_xen_init(void) x86_msi.setup_msi_irqs = xen_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; - x86_msi.msi_mask_irq = xen_nop_msi_mask_irq; - x86_msi.msix_mask_irq = xen_nop_msix_mask_irq; + pci_msi_ignore_mask = 1; #endif return 0; } +#ifdef CONFIG_PCI_MSI +void __init xen_msi_init(void) +{ + if (!disable_apic) { + /* + * If hardware supports (x2)APIC virtualization (as indicated + * by hypervisor's leaf 4) then we don't need to use pirqs/ + * event channels for MSI handling and instead use regular + * APIC processing + */ + uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); + + if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || + ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic)) + return; + } + + x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; +} +#endif + int __init pci_xen_hvm_init(void) { if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) @@ -445,8 +461,11 @@ int __init pci_xen_hvm_init(void) #endif #ifdef CONFIG_PCI_MSI - x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + /* + * We need to wait until after x2apic is initialized + * before we can set MSI IRQ ops. + */ + x86_platform.apic_post_init = xen_msi_init; #endif return 0; } @@ -506,8 +525,7 @@ int __init pci_xen_initial_domain(void) x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; - x86_msi.msi_mask_irq = xen_nop_msi_mask_irq; - x86_msi.msix_mask_irq = xen_nop_msix_mask_irq; + pci_msi_ignore_mask = 1; #endif xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 35aecb6042fb..17e80d829df0 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -48,8 +48,7 @@ static unsigned long efi_flags __initdata; * We allocate runtime services regions bottom-up, starting from -4G, i.e. * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ -static u64 efi_va = -4 * (1UL << 30); -#define EFI_VA_END (-68 * (1UL << 30)) +static u64 efi_va = EFI_VA_START; /* * Scratch space used for switching the pagetable in the EFI stub diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 4d171e8640ef..735ba21efe91 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -86,7 +86,6 @@ static int iris_remove(struct platform_device *pdev) static struct platform_driver iris_driver = { .driver = { .name = "iris", - .owner = THIS_MODULE, }, .probe = iris_probe, .remove = iris_remove, diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index a9acde72d4ed..c5350fd27d70 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -170,7 +170,6 @@ static int xo1_pm_remove(struct platform_device *pdev) static struct platform_driver cs5535_pms_driver = { .driver = { .name = "cs5535-pms", - .owner = THIS_MODULE, }, .probe = xo1_pm_probe, .remove = xo1_pm_remove, @@ -179,7 +178,6 @@ static struct platform_driver cs5535_pms_driver = { static struct platform_driver cs5535_acpi_driver = { .driver = { .name = "olpc-xo1-pm-acpi", - .owner = THIS_MODULE, }, .probe = xo1_pm_probe, .remove = xo1_pm_remove, diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3968d67d366b..994798548b1a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1367,23 +1367,25 @@ static int ptc_seq_show(struct seq_file *file, void *data) cpu = *(loff_t *)data; if (!cpu) { - seq_printf(file, - "# cpu bauoff sent stime self locals remotes ncpus localhub "); - seq_printf(file, - "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); - seq_printf(file, - "rok resetp resett giveup sto bz throt disable "); - seq_printf(file, - "enable wars warshw warwaits enters ipidis plugged "); - seq_printf(file, - "ipiover glim cong swack recv rtime all one mult "); - seq_printf(file, - "none retry canc nocan reset rcan\n"); + seq_puts(file, + "# cpu bauoff sent stime self locals remotes ncpus localhub "); + seq_puts(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); + seq_puts(file, + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); + seq_puts(file, + "rok resetp resett giveup sto bz throt disable "); + seq_puts(file, + "enable wars warshw warwaits enters ipidis plugged "); + seq_puts(file, + "ipiover glim cong swack recv rtime all one mult "); + seq_puts(file, "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { bcp = &per_cpu(bau_control, cpu); + if (bcp->nobau) { + seq_printf(file, "cpu %d bau disabled\n", cpu); + return 0; + } stat = bcp->statp; /* source side statistics */ seq_printf(file, diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index f52e033557c9..2c835e356349 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -24,6 +24,7 @@ quiet_cmd_bin2c = BIN2C $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) + @: obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 9fe1b5d002f0..b3560ece1c9f 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -364,3 +364,4 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf +358 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 281150b539a2..8d656fbb57aa 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -328,6 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf +322 64 execveat stub_execveat # # x32-specific system call numbers start at 512 to avoid cache impact @@ -366,3 +367,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit +545 x32 execveat stub_x32_execveat diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl new file mode 100644 index 000000000000..23210baade2d --- /dev/null +++ b/arch/x86/tools/calc_run_size.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl +# +# Calculate the amount of space needed to run the kernel, including room for +# the .bss and .brk sections. +# +# Usage: +# objdump -h a.out | perl calc_run_size.pl +use strict; + +my $mem_size = 0; +my $file_offset = 0; + +my $sections=" *[0-9]+ \.(?:bss|brk) +"; +while (<>) { + if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) { + my $size = hex($1); + my $offset = hex($2); + $mem_size += $size; + if ($file_offset == 0) { + $file_offset = $offset; + } elsif ($file_offset != $offset) { + # BFD linker shows the same file offset in ELF. + # Gold linker shows them as consecutive. + next if ($file_offset + $mem_size == $offset + $size); + + printf STDERR "file_offset: 0x%lx\n", $file_offset; + printf STDERR "mem_size: 0x%lx\n", $mem_size; + printf STDERR "offset: 0x%lx\n", $offset; + printf STDERR "size: 0x%lx\n", $size; + + die ".bss and .brk are non-contiguous\n"; + } + } +} + +if ($file_offset == 0) { + die "Never found .bss or .brk file offset\n"; +} +printf("%d\n", $mem_size + $file_offset); diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 872eb60e7806..ba70ff232917 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -254,7 +254,7 @@ int main(int argc, char **argv) continue; /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); insn_get_length(&insn); if (insn.next_byte <= insn.kaddr || diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index a5efb21d5228..0c2fae8d929d 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -20,7 +20,10 @@ struct relocs { static struct relocs relocs16; static struct relocs relocs32; +#if ELF_BITS == 64 +static struct relocs relocs32neg; static struct relocs relocs64; +#endif struct section { Elf_Shdr shdr; @@ -762,11 +765,16 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, switch (r_type) { case R_X86_64_NONE: + /* NONE can be ignored. */ + break; + case R_X86_64_PC32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * PC relative relocations don't need to be adjusted unless + * referencing a percpu symbol. */ + if (is_percpu_sym(sym, symname)) + add_reloc(&relocs32neg, offset); break; case R_X86_64_32: @@ -986,7 +994,10 @@ static void emit_relocs(int as_text, int use_real_mode) /* Order the relocations for more efficient processing */ sort_relocs(&relocs16); sort_relocs(&relocs32); +#if ELF_BITS == 64 + sort_relocs(&relocs32neg); sort_relocs(&relocs64); +#endif /* Print the relocations */ if (as_text) { @@ -1007,14 +1018,21 @@ static void emit_relocs(int as_text, int use_real_mode) for (i = 0; i < relocs32.count; i++) write_reloc(relocs32.offset[i], stdout); } else { - if (ELF_BITS == 64) { - /* Print a stop */ - write_reloc(0, stdout); +#if ELF_BITS == 64 + /* Print a stop */ + write_reloc(0, stdout); - /* Now print each relocation */ - for (i = 0; i < relocs64.count; i++) - write_reloc(relocs64.offset[i], stdout); - } + /* Now print each relocation */ + for (i = 0; i < relocs64.count; i++) + write_reloc(relocs64.offset[i], stdout); + + /* Print a stop */ + write_reloc(0, stdout); + + /* Now print each inverse 32-bit relocation */ + for (i = 0; i < relocs32neg.count; i++) + write_reloc(relocs32neg.offset[i], stdout); +#endif /* Print a stop */ write_reloc(0, stdout); diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 13403fc95a96..56f04db0c9c0 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -149,7 +149,7 @@ int main(int argc, char **argv) break; } /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); insn_get_length(&insn); if (insn.length != nb) { warnings++; diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index cc04e67bfd05..2d7d9a1f5b53 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -29,20 +29,18 @@ #endif /* CONFIG_X86_32 */ -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP - -#define smp_mb() mb() #ifdef CONFIG_X86_PPRO_FENCE -#define smp_rmb() rmb() +#define dma_rmb() rmb() #else /* CONFIG_X86_PPRO_FENCE */ -#define smp_rmb() barrier() +#define dma_rmb() barrier() #endif /* CONFIG_X86_PPRO_FENCE */ +#define dma_wmb() barrier() -#define smp_wmb() barrier() +#ifdef CONFIG_SMP -#define smp_read_barrier_depends() read_barrier_depends() +#define smp_mb() mb() +#define smp_rmb() dma_rmb() +#define smp_wmb() barrier() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* CONFIG_SMP */ @@ -50,11 +48,13 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) #endif /* CONFIG_SMP */ +#define read_barrier_depends() do { } while (0) +#define smp_read_barrier_depends() do { } while (0) + /* * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723070ca..20c3649d0691 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,6 +31,7 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve +#define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 2f94b039e55b..8ec3d1f4ce9a 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -7,9 +7,7 @@ #include <linux/kernel.h> #include <linux/getcpu.h> -#include <linux/jiffies.h> #include <linux/time.h> -#include <asm/vsyscall.h> #include <asm/vgtod.h> notrace long diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 970463b566cf..009495b9ab4b 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -1,7 +1,8 @@ /* - * Set up the VMAs to tell the VM about the vDSO. * Copyright 2007 Andi Kleen, SUSE Labs. * Subject to the GPL, v.2 + * + * This contains most of the x86 vDSO kernel-side code. */ #include <linux/mm.h> #include <linux/err.h> @@ -10,17 +11,17 @@ #include <linux/init.h> #include <linux/random.h> #include <linux/elf.h> -#include <asm/vsyscall.h> +#include <linux/cpu.h> #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> +#include <asm/vvar.h> #include <asm/page.h> #include <asm/hpet.h> +#include <asm/desc.h> #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; - -extern unsigned short vdso_sync_cpuid; #endif void __init init_vdso_image(const struct vdso_image *image) @@ -38,20 +39,6 @@ void __init init_vdso_image(const struct vdso_image *image) image->alt_len)); } -#if defined(CONFIG_X86_64) -static int __init init_vdso(void) -{ - init_vdso_image(&vdso_image_64); - -#ifdef CONFIG_X86_X32_ABI - init_vdso_image(&vdso_image_x32); -#endif - - return 0; -} -subsys_initcall(init_vdso); -#endif - struct linux_binprm; /* Put the vdso above the (randomized) stack with another randomized offset. @@ -238,3 +225,63 @@ static __init int vdso_setup(char *s) } __setup("vdso=", vdso_setup); #endif + +#ifdef CONFIG_X86_64 +static void vgetcpu_cpu_init(void *arg) +{ + int cpu = smp_processor_id(); + struct desc_struct d = { }; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded + * quickly in user space in vgetcpu. (12 bits for the CPU + * and 8 bits for the node) + */ + d.limit0 = cpu | ((node & 0xf) << 12); + d.limit = node >> 4; + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static int +vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); + + return NOTIFY_DONE; +} + +static int __init init_vdso(void) +{ + init_vdso_image(&vdso_image_64); + +#ifdef CONFIG_X86_X32_ABI + init_vdso_image(&vdso_image_x32); +#endif + + cpu_notifier_register_begin(); + + on_each_cpu(vgetcpu_cpu_init, NULL, 1); + /* notifier priority > KVM */ + __hotcpu_notifier(vgetcpu_cpu_notifier, 30); + + cpu_notifier_register_done(); + + return 0; +} +subsys_initcall(init_vdso); +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fac5e4f9607c..6bf3a13e3e0f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1100,12 +1100,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Fast syscall setup is all done in hypercalls, so these are all ignored. Stub them out here to stop Xen console noise. */ - break; - - case MSR_IA32_CR_PAT: - if (smp_processor_id() == 0) - xen_set_pat(((u64)high << 32) | low); - break; default: ret = native_write_msr_safe(msr, low, high); @@ -1561,10 +1555,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; -#if 0 - if (!xen_initial_domain()) -#endif - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* * Prevent page tables from being allocated in highmem, even @@ -1618,14 +1608,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif -#ifdef CONFIG_X86_PAT - /* - * For right now disable the PAT. We should remove this once - * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1 - * (xen/pat: Disable PAT support for now) is reverted. - */ - pat_enabled = 0; -#endif /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1636,6 +1618,13 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + /* + * Modify the cache mode translation tables to match Xen's PAT + * configuration. + */ + + pat_init_cache_modes(); + /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a8a1a3d08d4d..5c1f9ace7ae7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -387,7 +387,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) unsigned long mfn; if (!xen_feature(XENFEAT_auto_translated_physmap)) - mfn = get_phys_to_machine(pfn); + mfn = __pfn_to_mfn(pfn); else mfn = pfn; /* @@ -410,13 +410,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) __visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; -#if 0 - /* If this is a WC pte, convert back from Xen WC to Linux WC */ - if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { - WARN_ON(!pat_enabled); - pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; - } -#endif + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -427,47 +421,8 @@ __visible pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); -/* - * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 - * are reserved for now, to correspond to the Intel-reserved PAT - * types. - * - * We expect Linux's PAT set as follows: - * - * Idx PTE flags Linux Xen Default - * 0 WB WB WB - * 1 PWT WC WT WT - * 2 PCD UC- UC- UC- - * 3 PCD PWT UC UC UC - * 4 PAT WB WC WB - * 5 PAT PWT WC WP WT - * 6 PAT PCD UC- rsv UC- - * 7 PAT PCD PWT UC rsv UC - */ - -void xen_set_pat(u64 pat) -{ - /* We expect Linux to use a PAT setting of - * UC UC- WC WB (ignoring the PAT flag) */ - WARN_ON(pat != 0x0007010600070106ull); -} - __visible pte_t xen_make_pte(pteval_t pte) { -#if 0 - /* If Linux is trying to set a WC pte, then map to the Xen WC. - * If _PAGE_PAT is set, then it probably means it is really - * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope - * things work out OK... - * - * (We should never see kernel mappings with _PAGE_PSE set, - * but we could see hugetlbfs mappings, I think.). - */ - if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { - if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) - pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; - } -#endif pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); @@ -1158,20 +1113,16 @@ static void __init xen_cleanhighmap(unsigned long vaddr, * instead of somewhere later and be confusing. */ xen_mc_flush(); } -static void __init xen_pagetable_p2m_copy(void) + +static void __init xen_pagetable_p2m_free(void) { unsigned long size; unsigned long addr; - unsigned long new_mfn_list; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - new_mfn_list = xen_revector_p2m_tree(); /* No memory or already called. */ - if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) + if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) return; /* using __ka address and sticking INVALID_P2M_ENTRY! */ @@ -1189,8 +1140,6 @@ static void __init xen_pagetable_p2m_copy(void) size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1214,17 +1163,35 @@ static void __init xen_pagetable_p2m_copy(void) } #endif -static void __init xen_pagetable_init(void) +static void __init xen_pagetable_p2m_setup(void) { - paging_init(); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + xen_vmalloc_p2m_tree(); + #ifdef CONFIG_X86_64 - xen_pagetable_p2m_copy(); + xen_pagetable_p2m_free(); #endif + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; +} + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_post_allocator_init(); + + xen_pagetable_p2m_setup(); + /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); + /* Remap memory freed due to conflicts with E820 map */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_remap_memory(); + xen_setup_shared_info(); - xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) { @@ -1457,8 +1424,10 @@ static int xen_pgd_alloc(struct mm_struct *mm) page->private = (unsigned long)user_pgd; if (user_pgd != NULL) { +#ifdef CONFIG_X86_VSYSCALL_EMULATION user_pgd[pgd_index(VSYSCALL_ADDR)] = __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); +#endif ret = 0; } @@ -2021,7 +1990,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # ifdef CONFIG_HIGHMEM case FIX_KMAP_BEGIN ... FIX_KMAP_END: # endif -#else +#elif defined(CONFIG_X86_VSYSCALL_EMULATION) case VSYSCALL_PAGE: #endif case FIX_TEXT_POKE0: @@ -2060,7 +2029,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) __native_set_fixmap(idx, pte); -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_VSYSCALL_EMULATION /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ if (idx == VSYSCALL_PAGE) { diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b456b048eca9..edbc7a63fd73 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -3,21 +3,22 @@ * guests themselves, but it must also access and update the p2m array * during suspend/resume when all the pages are reallocated. * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. + * The logical flat p2m table is mapped to a linear kernel memory area. + * For accesses by Xen a three-level tree linked via mfns only is set up to + * allow the address space to be sparse. * - * Xen - * | - * p2m_top p2m_top_mfn - * / \ / \ - * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn - * / \ / \ / / - * p2m p2m p2m p2m p2m p2m p2m ... + * Xen + * | + * p2m_top_mfn + * / \ + * p2m_mid_mfn p2m_mid_mfn + * / / + * p2m p2m p2m ... * * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: + * The p2m_top_mfn level is limited to 1 page, so the maximum representable + * pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages * * P2M_PER_PAGE depends on the architecture, as a mfn is always @@ -30,6 +31,9 @@ * leaf entries, or for the top root, or middle one, for which there is a void * entry, we assume it is "missing". So (for example) * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * We have a dedicated page p2m_missing with all entries being + * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m + * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns. * * We also have the possibility of setting 1-1 mappings on certain regions, so * that: @@ -39,122 +43,20 @@ * PCI BARs, or ACPI spaces), we can create mappings easily because we * get the PFN value to match the MFN. * - * For this to work efficiently we have one new page p2m_identity and - * allocate (via reserved_brk) any other pages we need to cover the sides - * (1GB or 4MB boundary violations). All entries in p2m_identity are set to - * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, - * no other fancy value). + * For this to work efficiently we have one new page p2m_identity. All entries + * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only + * recognizes that and MFNs, no other fancy value). * * On lookup we spot that the entry points to p2m_identity and return the * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. * If the entry points to an allocated page, we just proceed as before and - * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in * appropriate functions (pfn_to_mfn). * * The reason for having the IDENTITY_FRAME_BIT instead of just returning the * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a * non-identity pfn. To protect ourselves against we elect to set (and get) the * IDENTITY_FRAME_BIT on all identity mapped PFNs. - * - * This simplistic diagram is used to explain the more subtle piece of code. - * There is also a digram of the P2M at the end that can help. - * Imagine your E820 looking as so: - * - * 1GB 2GB 4GB - * /-------------------+---------\/----\ /----------\ /---+-----\ - * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | - * \-------------------+---------/\----/ \----------/ \---+-----/ - * ^- 1029MB ^- 2001MB - * - * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), - * 2048MB = 524288 (0x80000)] - * - * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB - * is actually not present (would have to kick the balloon driver to put it in). - * - * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: - * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start - * of the PFN and the end PFN (263424 and 512256 respectively). The first step - * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page - * covers 512^2 of page estate (1GB) and in case the start or end PFN is not - * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as - * required to split any existing p2m_mid_missing middle pages. - * - * With the E820 example above, 263424 is not 1GB aligned so we allocate a - * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. - * Each entry in the allocate page is "missing" (points to p2m_missing). - * - * Next stage is to determine if we need to do a more granular boundary check - * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. - * We check if the start pfn and end pfn violate that boundary check, and if - * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer - * granularity of setting which PFNs are missing and which ones are identity. - * In our example 263424 and 512256 both fail the check so we reserve_brk two - * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" - * values) and assign them to p2m[1][2] and p2m[1][488] respectively. - * - * At this point we would at minimum reserve_brk one page, but could be up to - * three. Each call to set_phys_range_identity has at maximum a three page - * cost. If we were to query the P2M at this stage, all those entries from - * start PFN through end PFN (so 1029MB -> 2001MB) would return - * INVALID_P2M_ENTRY ("missing"). - * - * The next step is to walk from the start pfn to the end pfn setting - * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. - * If we find that the middle entry is pointing to p2m_missing we can swap it - * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and - * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). - * At this point we do not need to worry about boundary aligment (so no need to - * reserve_brk a middle page, figure out which PFNs are "missing" and which - * ones are identity), as that has been done earlier. If we find that the - * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference - * that page (which covers 512 PFNs) and set the appropriate PFN with - * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we - * set from p2m[1][2][256->511] and p2m[1][488][0->256] with - * IDENTITY_FRAME_BIT set. - * - * All other regions that are void (or not filled) either point to p2m_missing - * (considered missing) or have the default value of INVALID_P2M_ENTRY (also - * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] - * contain the INVALID_P2M_ENTRY value and are considered "missing." - * - * Finally, the region beyond the end of of the E820 (4 GB in this example) - * is set to be identity (in case there are MMIO regions placed here). - * - * This is what the p2m ends up looking (for the E820 above) with this - * fabulous drawing: - * - * p2m /--------------\ - * /-----\ | &mfn_list[0],| /-----------------\ - * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | - * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | - * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | - * |-----| \ | [p2m_identity]+\\ | .... | - * | 2 |--\ \-------------------->| ... | \\ \----------------/ - * |-----| \ \---------------/ \\ - * | 3 |-\ \ \\ p2m_identity [1] - * |-----| \ \-------------------->/---------------\ /-----------------\ - * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | - * \-----/ | | | [p2m_identity]+-->| ..., ~0 | - * | | | .... | \-----------------/ - * | | +-[x], ~0, ~0.. +\ - * | | \---------------/ \ - * | | \-> /---------------\ - * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | - * | /-----------------\ /------------\ | IDENTITY[@256]| - * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | - * | | [p2m_missing] +---->| ..., ~0 | \---------------/ - * | | ... | \------------/ - * | \-----------------/ - * | - * | p2m_mid_identity - * | /-----------------\ - * \-->| [p2m_identity] +---->[1] - * | [p2m_identity] +---->[1] - * | ... | - * \-----------------/ - * - * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ #include <linux/init.h> @@ -164,9 +66,11 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/bootmem.h> +#include <linux/slab.h> #include <asm/cache.h> #include <asm/setup.h> +#include <asm/uaccess.h> #include <asm/xen/page.h> #include <asm/xen/hypercall.h> @@ -178,31 +82,26 @@ #include "multicalls.h" #include "xen-ops.h" +#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) + static void __init m2p_override_init(void); +unsigned long *xen_p2m_addr __read_mostly; +EXPORT_SYMBOL_GPL(xen_p2m_addr); +unsigned long xen_p2m_size __read_mostly; +EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; +EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); + +static DEFINE_SPINLOCK(p2m_update_lock); static unsigned long *p2m_mid_missing_mfn; static unsigned long *p2m_top_mfn; static unsigned long **p2m_top_mfn_p; - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -/* For each I/O range remapped we may lose up to two leaf pages for the boundary - * violations and three mid pages to cover up to 3GB. With - * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the - * remapped region. - */ -RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES); +static unsigned long *p2m_missing; +static unsigned long *p2m_identity; +static pte_t *p2m_missing_pte; +static pte_t *p2m_identity_pte; static inline unsigned p2m_top_index(unsigned long pfn) { @@ -220,14 +119,6 @@ static inline unsigned p2m_index(unsigned long pfn) return pfn % P2M_PER_PAGE; } -static void p2m_top_init(unsigned long ***top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing; -} - static void p2m_top_mfn_init(unsigned long *top) { unsigned i; @@ -244,28 +135,43 @@ static void p2m_top_mfn_p_init(unsigned long **top) top[i] = p2m_mid_missing_mfn; } -static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = leaf; + mid[i] = virt_to_mfn(leaf); } -static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) +static void p2m_init(unsigned long *p2m) { unsigned i; - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(leaf); + for (i = 0; i < P2M_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; } -static void p2m_init(unsigned long *p2m) +static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) { unsigned i; - for (i = 0; i < P2M_MID_PER_PAGE; i++) - p2m[i] = INVALID_P2M_ENTRY; + for (i = 0; i < P2M_PER_PAGE; i++) + p2m[i] = IDENTITY_FRAME(pfn + i); +} + +static void * __ref alloc_p2m_page(void) +{ + if (unlikely(!slab_is_available())) + return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +/* Only to be called in case of a race for a page just allocated! */ +static void free_p2m_page(void *p) +{ + BUG_ON(!slab_is_available()); + free_page((unsigned long)p); } /* @@ -280,40 +186,46 @@ static void p2m_init(unsigned long *p2m) */ void __ref xen_build_mfn_list_list(void) { - unsigned long pfn; + unsigned long pfn, mfn; + pte_t *ptep; + unsigned int level, topidx, mididx; + unsigned long *mid_mfn_p; if (xen_feature(XENFEAT_auto_translated_physmap)) return; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + p2m_mid_missing_mfn = alloc_p2m_page(); p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p = alloc_p2m_page(); p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn = alloc_p2m_page(); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); } - for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned long **mid; - unsigned long *mid_mfn_p; + for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN; + pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); - mid = p2m_top[topidx]; mid_mfn_p = p2m_top_mfn_p[topidx]; + ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), + &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); + mfn = pte_mfn(*ptep); + ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); /* Don't bother allocating any mfn mid levels if * they're just missing, just update the stored mfn, * since all could have changed over a migrate. */ - if (mid == p2m_mid_missing) { + if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) { BUG_ON(mididx); BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); @@ -322,19 +234,14 @@ void __ref xen_build_mfn_list_list(void) } if (mid_mfn_p == p2m_mid_missing_mfn) { - /* - * XXX boot-time only! We should never find - * missing parts of the mfn tree after - * runtime. - */ - mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + mid_mfn_p = alloc_p2m_page(); p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; } p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + mid_mfn_p[mididx] = mfn; } } @@ -353,171 +260,235 @@ void xen_setup_mfn_list_list(void) /* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned long *mfn_list; - unsigned long max_pfn; unsigned long pfn; if (xen_feature(XENFEAT_auto_translated_physmap)) return; - mfn_list = (unsigned long *)xen_start_info->mfn_list; - max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - xen_max_p2m_pfn = max_pfn; + xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list; + xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE); - p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_missing); - p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_identity); + for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++) + xen_p2m_addr[pfn] = INVALID_P2M_ENTRY; - p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing, p2m_missing); - p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_identity, p2m_identity); + xen_max_p2m_pfn = xen_p2m_size; +} - p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_init(p2m_top); +#define P2M_TYPE_IDENTITY 0 +#define P2M_TYPE_MISSING 1 +#define P2M_TYPE_PFN 2 +#define P2M_TYPE_UNKNOWN 3 - /* - * The domain builder gives us a pre-constructed p2m array in - * mfn_list for all the pages initially given to us, so we just - * need to graft that into our tree structure. - */ - for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); +static int xen_p2m_elem_type(unsigned long pfn) +{ + unsigned long mfn; - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid, p2m_missing); + if (pfn >= xen_p2m_size) + return P2M_TYPE_IDENTITY; - p2m_top[topidx] = mid; - } + mfn = xen_p2m_addr[pfn]; - /* - * As long as the mfn_list has enough entries to completely - * fill a p2m page, pointing into the array is ok. But if - * not the entries beyond the last pfn will be undefined. - */ - if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { - unsigned long p2midx; + if (mfn == INVALID_P2M_ENTRY) + return P2M_TYPE_MISSING; - p2midx = max_pfn % P2M_PER_PAGE; - for ( ; p2midx < P2M_PER_PAGE; p2midx++) - mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; - } - p2m_top[topidx][mididx] = &mfn_list[pfn]; - } + if (mfn & IDENTITY_FRAME_BIT) + return P2M_TYPE_IDENTITY; - m2p_override_init(); + return P2M_TYPE_PFN; } -#ifdef CONFIG_X86_64 -unsigned long __init xen_revector_p2m_tree(void) + +static void __init xen_rebuild_p2m_list(unsigned long *p2m) { - unsigned long va_start; - unsigned long va_end; + unsigned int i, chunk; unsigned long pfn; - unsigned long pfn_free = 0; - unsigned long *mfn_list = NULL; - unsigned long size; - - va_start = xen_start_info->mfn_list; - /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), - * so make sure it is rounded up to that */ - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - va_end = va_start + size; - - /* If we were revectored already, don't do it again. */ - if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) - return 0; + unsigned long *mfns; + pte_t *ptep; + pmd_t *pmdp; + int type; - mfn_list = alloc_bootmem_align(size, PAGE_SIZE); - if (!mfn_list) { - pr_warn("Could not allocate space for a new P2M tree!\n"); - return xen_start_info->mfn_list; - } - /* Fill it out with INVALID_P2M_ENTRY value */ - memset(mfn_list, 0xFF, size); + p2m_missing = alloc_p2m_page(); + p2m_init(p2m_missing); + p2m_identity = alloc_p2m_page(); + p2m_init(p2m_identity); - for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx; - unsigned long *mid_p; + p2m_missing_pte = alloc_p2m_page(); + paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT); + p2m_identity_pte = alloc_p2m_page(); + paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT); + for (i = 0; i < PTRS_PER_PTE; i++) { + set_pte(p2m_missing_pte + i, + pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO)); + set_pte(p2m_identity_pte + i, + pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO)); + } - if (!p2m_top[topidx]) + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) { + /* + * Try to map missing/identity PMDs or p2m-pages if possible. + * We have to respect the structure of the mfn_list_list + * which will be built just afterwards. + * Chunk size to test is one p2m page if we are in the middle + * of a mfn_list_list mid page and the complete mid page area + * if we are at index 0 of the mid page. Please note that a + * mid page might cover more than one PMD, e.g. on 32 bit PAE + * kernels. + */ + chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ? + P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE; + + type = xen_p2m_elem_type(pfn); + i = 0; + if (type != P2M_TYPE_PFN) + for (i = 1; i < chunk; i++) + if (xen_p2m_elem_type(pfn + i) != type) + break; + if (i < chunk) + /* Reset to minimal chunk size. */ + chunk = P2M_PER_PAGE; + + if (type == P2M_TYPE_PFN || i < chunk) { + /* Use initial p2m page contents. */ +#ifdef CONFIG_X86_64 + mfns = alloc_p2m_page(); + copy_page(mfns, xen_p2m_addr + pfn); +#else + mfns = xen_p2m_addr + pfn; +#endif + ptep = populate_extra_pte((unsigned long)(p2m + pfn)); + set_pte(ptep, + pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL)); continue; + } - if (p2m_top[topidx] == p2m_mid_missing) + if (chunk == P2M_PER_PAGE) { + /* Map complete missing or identity p2m-page. */ + mfns = (type == P2M_TYPE_MISSING) ? + p2m_missing : p2m_identity; + ptep = populate_extra_pte((unsigned long)(p2m + pfn)); + set_pte(ptep, + pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO)); continue; + } - mididx = p2m_mid_index(pfn); - mid_p = p2m_top[topidx][mididx]; - if (!mid_p) - continue; - if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) - continue; + /* Complete missing or identity PMD(s) can be mapped. */ + ptep = (type == P2M_TYPE_MISSING) ? + p2m_missing_pte : p2m_identity_pte; + for (i = 0; i < PMDS_PER_MID_PAGE; i++) { + pmdp = populate_extra_pmd( + (unsigned long)(p2m + pfn + i * PTRS_PER_PTE)); + set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE)); + } + } +} - if ((unsigned long)mid_p == INVALID_P2M_ENTRY) - continue; +void __init xen_vmalloc_p2m_tree(void) +{ + static struct vm_struct vm; - /* The old va. Rebase it on mfn_list */ - if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { - unsigned long *new; + vm.flags = VM_ALLOC; + vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn, + PMD_SIZE * PMDS_PER_MID_PAGE); + vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE); + pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size); - if (pfn_free > (size / sizeof(unsigned long))) { - WARN(1, "Only allocated for %ld pages, but we want %ld!\n", - size / sizeof(unsigned long), pfn_free); - return 0; - } - new = &mfn_list[pfn_free]; + xen_max_p2m_pfn = vm.size / sizeof(unsigned long); - copy_page(new, mid_p); - p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + xen_rebuild_p2m_list(vm.addr); - pfn_free += P2M_PER_PAGE; + xen_p2m_addr = vm.addr; + xen_p2m_size = xen_max_p2m_pfn; - } - /* This should be the leafs allocated for identity from _brk. */ - } - return (unsigned long)mfn_list; + xen_inv_extra_mem(); + m2p_override_init(); } -#else -unsigned long __init xen_revector_p2m_tree(void) -{ - return 0; -} -#endif + unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, mididx, idx; + pte_t *ptep; + unsigned int level; + + if (unlikely(pfn >= xen_p2m_size)) { + if (pfn < xen_max_p2m_pfn) + return xen_chk_extra_mem(pfn); - if (unlikely(pfn >= MAX_P2M_PFN)) return IDENTITY_FRAME(pfn); + } - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); + ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); /* * The INVALID_P2M_ENTRY is filled in both p2m_*identity * and in p2m_*missing, so returning the INVALID_P2M_ENTRY * would be wrong. */ - if (p2m_top[topidx][mididx] == p2m_identity) + if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity))) return IDENTITY_FRAME(pfn); - return p2m_top[topidx][mididx][idx]; + return xen_p2m_addr[pfn]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -static void *alloc_p2m_page(void) +/* + * Allocate new pmd(s). It is checked whether the old pmd is still in place. + * If not, nothing is changed. This is okay as the only reason for allocating + * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual + * pmd. In case of PAE/x86-32 there are multiple pmds to allocate! + */ +static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg) { - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} + pte_t *ptechk; + pte_t *pteret = ptep; + pte_t *pte_newpg[PMDS_PER_MID_PAGE]; + pmd_t *pmdp; + unsigned int level; + unsigned long flags; + unsigned long vaddr; + int i; -static void free_p2m_page(void *p) -{ - free_page((unsigned long)p); + /* Do all allocations first to bail out in error case. */ + for (i = 0; i < PMDS_PER_MID_PAGE; i++) { + pte_newpg[i] = alloc_p2m_page(); + if (!pte_newpg[i]) { + for (i--; i >= 0; i--) + free_p2m_page(pte_newpg[i]); + + return NULL; + } + } + + vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1); + + for (i = 0; i < PMDS_PER_MID_PAGE; i++) { + copy_page(pte_newpg[i], pte_pg); + paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT); + + pmdp = lookup_pmd_address(vaddr); + BUG_ON(!pmdp); + + spin_lock_irqsave(&p2m_update_lock, flags); + + ptechk = lookup_address(vaddr, &level); + if (ptechk == pte_pg) { + set_pmd(pmdp, + __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + if (vaddr == (addr & ~(PMD_SIZE - 1))) + pteret = pte_offset_kernel(pmdp, addr); + pte_newpg[i] = NULL; + } + + spin_unlock_irqrestore(&p2m_update_lock, flags); + + if (pte_newpg[i]) { + paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT); + free_p2m_page(pte_newpg[i]); + } + + vaddr += PMD_SIZE; + } + + return pteret; } /* @@ -530,58 +501,62 @@ static void free_p2m_page(void *p) static bool alloc_p2m(unsigned long pfn) { unsigned topidx, mididx; - unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; - unsigned long *p2m_orig; + pte_t *ptep, *pte_pg; + unsigned int level; + unsigned long flags; + unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); + unsigned long p2m_pfn; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); - top_p = &p2m_top[topidx]; - mid = ACCESS_ONCE(*top_p); + ptep = lookup_address(addr, &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); + pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); - if (mid == p2m_mid_missing) { - /* Mid level is missing, allocate a new one */ - mid = alloc_p2m_page(); - if (!mid) + if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) { + /* PMD level is missing, allocate a new one */ + ptep = alloc_p2m_pmd(addr, ptep, pte_pg); + if (!ptep) return false; - - p2m_mid_init(mid, p2m_missing); - - if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) - free_p2m_page(mid); } - top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); + if (p2m_top_mfn) { + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); - BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - if (mid_mfn == p2m_mid_missing_mfn) { - /* Separately check the mid mfn level */ - unsigned long missing_mfn; - unsigned long mid_mfn_mfn; - unsigned long old_mfn; + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + unsigned long old_mfn; - mid_mfn = alloc_p2m_page(); - if (!mid_mfn) - return false; + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; - p2m_mid_mfn_init(mid_mfn, p2m_missing); + p2m_mid_mfn_init(mid_mfn, p2m_missing); - missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); - mid_mfn_mfn = virt_to_mfn(mid_mfn); - old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); - if (old_mfn != missing_mfn) { - free_p2m_page(mid_mfn); - mid_mfn = mfn_to_virt(old_mfn); - } else { - p2m_top_mfn_p[topidx] = mid_mfn; + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); + if (old_mfn != missing_mfn) { + free_p2m_page(mid_mfn); + mid_mfn = mfn_to_virt(old_mfn); + } else { + p2m_top_mfn_p[topidx] = mid_mfn; + } } + } else { + mid_mfn = NULL; } - p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); - if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { + p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep)); + if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) || + p2m_pfn == PFN_DOWN(__pa(p2m_missing))) { /* p2m leaf page is missing */ unsigned long *p2m; @@ -589,183 +564,36 @@ static bool alloc_p2m(unsigned long pfn) if (!p2m) return false; - p2m_init(p2m); - - if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) - free_p2m_page(p2m); + if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) + p2m_init(p2m); else - mid_mfn[mididx] = virt_to_mfn(p2m); - } - - return true; -} - -static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) -{ - unsigned topidx, mididx, idx; - unsigned long *p2m; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - /* Pfff.. No boundary cross-over, lets get out. */ - if (!idx && check_boundary) - return false; - - WARN(p2m_top[topidx][mididx] == p2m_identity, - "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", - topidx, mididx); - - /* - * Could be done by xen_build_dynamic_phys_to_machine.. - */ - if (p2m_top[topidx][mididx] != p2m_missing) - return false; - - /* Boundary cross-over for the edges: */ - p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_init(p2m); + p2m_init_identity(p2m, pfn); - p2m_top[topidx][mididx] = p2m; + spin_lock_irqsave(&p2m_update_lock, flags); - return true; -} - -static bool __init early_alloc_p2m_middle(unsigned long pfn) -{ - unsigned topidx = p2m_top_index(pfn); - unsigned long **mid; - - mid = p2m_top[topidx]; - if (mid == p2m_mid_missing) { - mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_mid_init(mid, p2m_missing); - - p2m_top[topidx] = mid; - } - return true; -} - -/* - * Skim over the P2M tree looking at pages that are either filled with - * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and - * replace the P2M leaf with a p2m_missing or p2m_identity. - * Stick the old page in the new P2M tree location. - */ -static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) -{ - unsigned topidx; - unsigned mididx; - unsigned ident_pfns; - unsigned inv_pfns; - unsigned long *p2m; - unsigned idx; - unsigned long pfn; - - /* We only look when this entails a P2M middle layer */ - if (p2m_index(set_pfn)) - return false; - - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { - topidx = p2m_top_index(pfn); - - if (!p2m_top[topidx]) - continue; - - if (p2m_top[topidx] == p2m_mid_missing) - continue; - - mididx = p2m_mid_index(pfn); - p2m = p2m_top[topidx][mididx]; - if (!p2m) - continue; - - if ((p2m == p2m_missing) || (p2m == p2m_identity)) - continue; - - if ((unsigned long)p2m == INVALID_P2M_ENTRY) - continue; - - ident_pfns = 0; - inv_pfns = 0; - for (idx = 0; idx < P2M_PER_PAGE; idx++) { - /* IDENTITY_PFNs are 1:1 */ - if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) - ident_pfns++; - else if (p2m[idx] == INVALID_P2M_ENTRY) - inv_pfns++; - else - break; + if (pte_pfn(*ptep) == p2m_pfn) { + set_pte(ptep, + pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + if (mid_mfn) + mid_mfn[mididx] = virt_to_mfn(p2m); + p2m = NULL; } - if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) - goto found; - } - return false; -found: - /* Found one, replace old with p2m_identity or p2m_missing */ - p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); - - /* Reset where we want to stick the old page in. */ - topidx = p2m_top_index(set_pfn); - mididx = p2m_mid_index(set_pfn); - - /* This shouldn't happen */ - if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) - early_alloc_p2m_middle(set_pfn); - - if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) - return false; - - p2m_init(p2m); - p2m_top[topidx][mididx] = p2m; - return true; -} -bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!early_alloc_p2m_middle(pfn)) - return false; - - if (early_can_reuse_p2m_middle(pfn)) - return __set_phys_to_machine(pfn, mfn); - - if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) - return false; + spin_unlock_irqrestore(&p2m_update_lock, flags); - if (!__set_phys_to_machine(pfn, mfn)) - return false; + if (p2m) + free_p2m_page(p2m); } return true; } -static void __init early_split_p2m(unsigned long pfn) -{ - unsigned long mididx, idx; - - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - /* - * Allocate new middle and leaf pages if this pfn lies in the - * middle of one. - */ - if (mididx || idx) - early_alloc_p2m_middle(pfn); - if (idx) - early_alloc_p2m(pfn, false); -} - unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { unsigned long pfn; - if (unlikely(pfn_s >= MAX_P2M_PFN)) + if (unlikely(pfn_s >= xen_p2m_size)) return 0; if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) @@ -774,101 +602,51 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (pfn_s > pfn_e) return 0; - if (pfn_e > MAX_P2M_PFN) - pfn_e = MAX_P2M_PFN; - - early_split_p2m(pfn_s); - early_split_p2m(pfn_e); - - for (pfn = pfn_s; pfn < pfn_e;) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - - if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) - break; - pfn++; - - /* - * If the PFN was set to a middle or leaf identity - * page the remainder must also be identity, so skip - * ahead to the next middle or leaf entry. - */ - if (p2m_top[topidx] == p2m_mid_identity) - pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); - else if (p2m_top[topidx][mididx] == p2m_identity) - pfn = ALIGN(pfn, P2M_PER_PAGE); - } + if (pfn_e > xen_p2m_size) + pfn_e = xen_p2m_size; - WARN((pfn - pfn_s) != (pfn_e - pfn_s), - "Identity mapping failed. We are %ld short of 1-1 mappings!\n", - (pfn_e - pfn_s) - (pfn - pfn_s)); + for (pfn = pfn_s; pfn < pfn_e; pfn++) + xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn); return pfn - pfn_s; } -/* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, mididx, idx; + pte_t *ptep; + unsigned int level; /* don't track P2M changes in autotranslate guests */ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) return true; - if (unlikely(pfn >= MAX_P2M_PFN)) { + if (unlikely(pfn >= xen_p2m_size)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - /* For sparse holes were the p2m leaf has real PFN along with - * PCI holes, stick in the PFN as the MFN value. - * - * set_phys_range_identity() will have allocated new middle - * and leaf pages as required so an existing p2m_mid_missing - * or p2m_missing mean that whole range will be identity so - * these can be switched to p2m_mid_identity or p2m_identity. - */ - if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { - if (p2m_top[topidx] == p2m_mid_identity) - return true; - - if (p2m_top[topidx] == p2m_mid_missing) { - WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, - p2m_mid_identity) != p2m_mid_missing); - return true; - } - - if (p2m_top[topidx][mididx] == p2m_identity) - return true; + if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) + return true; - /* Swap over from MISSING to IDENTITY if needed. */ - if (p2m_top[topidx][mididx] == p2m_missing) { - WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, - p2m_identity) != p2m_missing); - return true; - } - } + ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); - if (p2m_top[topidx][mididx] == p2m_missing) + if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing))) return mfn == INVALID_P2M_ENTRY; - p2m_top[topidx][mididx][idx] = mfn; + if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity))) + return mfn == IDENTITY_FRAME(pfn); - return true; + return false; } bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; - if (!__set_phys_to_machine(pfn, mfn)) - return false; + return __set_phys_to_machine(pfn, mfn); } return true; @@ -877,15 +655,16 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) #define M2P_OVERRIDE_HASH_SHIFT 10 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) -static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static struct list_head *m2p_overrides; static DEFINE_SPINLOCK(m2p_override_lock); static void __init m2p_override_init(void) { unsigned i; - m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, - sizeof(unsigned long)); + m2p_overrides = alloc_bootmem_align( + sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); for (i = 0; i < M2P_OVERRIDE_HASH; i++) INIT_LIST_HEAD(&m2p_overrides[i]); @@ -896,68 +675,9 @@ static unsigned long mfn_hash(unsigned long mfn) return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); } -int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) -{ - int i, ret = 0; - bool lazy = false; - pte_t *pte; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - if (kmap_ops && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - unsigned long mfn, pfn; - - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - pfn = page_to_pfn(pages[i]); - - WARN_ON(PagePrivate(pages[i])); - SetPagePrivate(pages[i]); - set_page_private(pages[i], mfn); - pages[i]->index = pfn_to_mfn(pfn); - - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { - ret = -ENOMEM; - goto out; - } - - if (kmap_ops) { - ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); - if (ret) - goto out; - } - } - -out: - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; -} -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); - /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page, - struct gnttab_map_grant_ref *kmap_op) +static int m2p_add_override(unsigned long mfn, struct page *page, + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; unsigned long pfn; @@ -970,7 +690,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_add_override: pfn %lx not mapped", pfn)) + "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } @@ -1004,19 +724,19 @@ int m2p_add_override(unsigned long mfn, struct page *page, * because mfn_to_pfn (that ends up being called by GUPF) will * return the backend pfn rather than the frontend pfn. */ pfn = mfn_to_pfn_no_overrides(mfn); - if (get_phys_to_machine(pfn) == mfn) + if (__pfn_to_mfn(pfn) == mfn) set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); return 0; } -EXPORT_SYMBOL_GPL(m2p_add_override); -int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) +int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { int i, ret = 0; bool lazy = false; + pte_t *pte; if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; @@ -1029,35 +749,75 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, } for (i = 0; i < count; i++) { - unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); - unsigned long pfn = page_to_pfn(pages[i]); + unsigned long mfn, pfn; - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { - ret = -EINVAL; - goto out; + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + } else { + mfn = PFN_DOWN(map_ops[i].dev_bus_addr); } + pfn = page_to_pfn(pages[i]); - set_page_private(pages[i], INVALID_P2M_ENTRY); - WARN_ON(!PagePrivate(pages[i])); - ClearPagePrivate(pages[i]); - set_phys_to_machine(pfn, pages[i]->index); + WARN_ON(PagePrivate(pages[i])); + SetPagePrivate(pages[i]); + set_page_private(pages[i], mfn); + pages[i]->index = pfn_to_mfn(pfn); - if (kmap_ops) - ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); - if (ret) + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { + ret = -ENOMEM; goto out; + } + + if (kmap_ops) { + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); + if (ret) + goto out; + } } out: if (lazy) arch_leave_lazy_mmu_mode(); + return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); +EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); -int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn) +static struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket; + struct page *p, *ret; + + if (unlikely(!m2p_overrides)) + return NULL; + + ret = NULL; + bucket = &m2p_overrides[mfn_hash(mfn)]; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (page_private(p) == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +static int m2p_remove_override(struct page *page, + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn) { unsigned long flags; unsigned long pfn; @@ -1072,7 +832,7 @@ int m2p_remove_override(struct page *page, ptep = lookup_address(address, &level); if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_remove_override: pfn %lx not mapped", pfn)) + "m2p_remove_override: pfn %lx not mapped", pfn)) return -EINVAL; } @@ -1102,9 +862,8 @@ int m2p_remove_override(struct page *page, * hypercall actually returned an error. */ if (kmap_op->handle == GNTST_general_error) { - printk(KERN_WARNING "m2p_remove_override: " - "pfn %lx mfn %lx, failed to modify kernel mappings", - pfn, mfn); + pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings", + pfn, mfn); put_balloon_scratch_page(); return -1; } @@ -1112,14 +871,14 @@ int m2p_remove_override(struct page *page, xen_mc_batch(); mcs = __xen_mc_entry( - sizeof(struct gnttab_unmap_and_replace)); + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_and_replace, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); mcs = __xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, scratch_page_address, @@ -1145,35 +904,56 @@ int m2p_remove_override(struct page *page, * pfn again. */ mfn &= ~FOREIGN_FRAME_BIT; pfn = mfn_to_pfn_no_overrides(mfn); - if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) && m2p_find_override(mfn) == NULL) set_phys_to_machine(pfn, mfn); return 0; } -EXPORT_SYMBOL_GPL(m2p_remove_override); -struct page *m2p_find_override(unsigned long mfn) +int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { - unsigned long flags; - struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; - struct page *p, *ret; + int i, ret = 0; + bool lazy = false; - ret = NULL; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; - spin_lock_irqsave(&m2p_override_lock, flags); + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } - list_for_each_entry(p, bucket, lru) { - if (page_private(p) == mfn) { - ret = p; - break; + for (i = 0; i < count; i++) { + unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); + unsigned long pfn = page_to_pfn(pages[i]); + + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + ret = -EINVAL; + goto out; } - } - spin_unlock_irqrestore(&m2p_override_lock, flags); + set_page_private(pages[i], INVALID_P2M_ENTRY); + WARN_ON(!PagePrivate(pages[i])); + ClearPagePrivate(pages[i]); + set_phys_to_machine(pfn, pages[i]->index); + + if (kmap_ops) + ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); + if (ret) + goto out; + } +out: + if (lazy) + arch_leave_lazy_mmu_mode(); return ret; } +EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) { @@ -1192,79 +972,29 @@ EXPORT_SYMBOL_GPL(m2p_find_override_pfn); #include "debugfs.h" static int p2m_dump_show(struct seq_file *m, void *v) { - static const char * const level_name[] = { "top", "middle", - "entry", "abnormal", "error"}; -#define TYPE_IDENTITY 0 -#define TYPE_MISSING 1 -#define TYPE_PFN 2 -#define TYPE_UNKNOWN 3 static const char * const type_name[] = { - [TYPE_IDENTITY] = "identity", - [TYPE_MISSING] = "missing", - [TYPE_PFN] = "pfn", - [TYPE_UNKNOWN] = "abnormal"}; - unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; - unsigned int uninitialized_var(prev_level); - unsigned int uninitialized_var(prev_type); - - if (!p2m_top) - return 0; - - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned idx = p2m_index(pfn); - unsigned lvl, type; - - lvl = 4; - type = TYPE_UNKNOWN; - if (p2m_top[topidx] == p2m_mid_missing) { - lvl = 0; type = TYPE_MISSING; - } else if (p2m_top[topidx] == NULL) { - lvl = 0; type = TYPE_UNKNOWN; - } else if (p2m_top[topidx][mididx] == NULL) { - lvl = 1; type = TYPE_UNKNOWN; - } else if (p2m_top[topidx][mididx] == p2m_identity) { - lvl = 1; type = TYPE_IDENTITY; - } else if (p2m_top[topidx][mididx] == p2m_missing) { - lvl = 1; type = TYPE_MISSING; - } else if (p2m_top[topidx][mididx][idx] == 0) { - lvl = 2; type = TYPE_UNKNOWN; - } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { - lvl = 2; type = TYPE_IDENTITY; - } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { - lvl = 2; type = TYPE_MISSING; - } else if (p2m_top[topidx][mididx][idx] == pfn) { - lvl = 2; type = TYPE_PFN; - } else if (p2m_top[topidx][mididx][idx] != pfn) { - lvl = 2; type = TYPE_PFN; - } - if (pfn == 0) { - prev_level = lvl; - prev_type = type; - } - if (pfn == MAX_DOMAIN_PAGES-1) { - lvl = 3; - type = TYPE_UNKNOWN; - } - if (prev_type != type) { - seq_printf(m, " [0x%lx->0x%lx] %s\n", - prev_pfn_type, pfn, type_name[prev_type]); - prev_pfn_type = pfn; + [P2M_TYPE_IDENTITY] = "identity", + [P2M_TYPE_MISSING] = "missing", + [P2M_TYPE_PFN] = "pfn", + [P2M_TYPE_UNKNOWN] = "abnormal"}; + unsigned long pfn, first_pfn; + int type, prev_type; + + prev_type = xen_p2m_elem_type(0); + first_pfn = 0; + + for (pfn = 0; pfn < xen_p2m_size; pfn++) { + type = xen_p2m_elem_type(pfn); + if (type != prev_type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn, + type_name[prev_type]); prev_type = type; - } - if (prev_level != lvl) { - seq_printf(m, " [0x%lx->0x%lx] level %s\n", - prev_pfn_level, pfn, level_name[prev_level]); - prev_pfn_level = pfn; - prev_level = lvl; + first_pfn = pfn; } } + seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn, + type_name[prev_type]); return 0; -#undef TYPE_IDENTITY -#undef TYPE_MISSING -#undef TYPE_PFN -#undef TYPE_UNKNOWN } static int p2m_dump_open(struct inode *inode, struct file *filp) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 29834b3fd87f..dfd77dec8e2b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -30,6 +30,7 @@ #include "xen-ops.h" #include "vdso.h" #include "p2m.h" +#include "mmu.h" /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -47,8 +48,19 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; -/* Buffer used to remap identity mapped pages */ -unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; +/* + * Buffer used to remap identity mapped pages. We only need the virtual space. + * The physical page behind this address is remapped as needed to different + * buffer pages. + */ +#define REMAP_SIZE (P2M_PER_PAGE - 3) +static struct { + unsigned long next_area_mfn; + unsigned long target_pfn; + unsigned long size; + unsigned long mfns[REMAP_SIZE]; +} xen_remap_buf __initdata __aligned(PAGE_SIZE); +static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; /* * The maximum amount of extra memory compared to the base size. The @@ -64,7 +76,6 @@ unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; static void __init xen_add_extra_mem(u64 start, u64 size) { - unsigned long pfn; int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { @@ -84,75 +95,76 @@ static void __init xen_add_extra_mem(u64 start, u64 size) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); memblock_reserve(start, size); +} - xen_max_p2m_pfn = PFN_DOWN(start + size); - for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { - unsigned long mfn = pfn_to_mfn(pfn); - - if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) - continue; - WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", - pfn, mfn); +static void __init xen_del_extra_mem(u64 start, u64 size) +{ + int i; + u64 start_r, size_r; - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + start_r = xen_extra_mem[i].start; + size_r = xen_extra_mem[i].size; + + /* Start of region. */ + if (start_r == start) { + BUG_ON(size > size_r); + xen_extra_mem[i].start += size; + xen_extra_mem[i].size -= size; + break; + } + /* End of region. */ + if (start_r + size_r == start + size) { + BUG_ON(size > size_r); + xen_extra_mem[i].size -= size; + break; + } + /* Mid of region. */ + if (start > start_r && start < start_r + size_r) { + BUG_ON(start + size > start_r + size_r); + xen_extra_mem[i].size = start - start_r; + /* Calling memblock_reserve() again is okay. */ + xen_add_extra_mem(start + size, start_r + size_r - + (start + size)); + break; + } } + memblock_free(start, size); } -static unsigned long __init xen_do_chunk(unsigned long start, - unsigned long end, bool release) +/* + * Called during boot before the p2m list can take entries beyond the + * hypervisor supplied p2m list. Entries in extra mem are to be regarded as + * invalid. + */ +unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { - struct xen_memory_reservation reservation = { - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - unsigned long len = 0; - unsigned long pfn; - int ret; + int i; + unsigned long addr = PFN_PHYS(pfn); - for (pfn = start; pfn < end; pfn++) { - unsigned long frame; - unsigned long mfn = pfn_to_mfn(pfn); + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + if (addr >= xen_extra_mem[i].start && + addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + return INVALID_P2M_ENTRY; + } - if (release) { - /* Make sure pfn exists to start with */ - if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) - continue; - frame = mfn; - } else { - if (mfn != INVALID_P2M_ENTRY) - continue; - frame = pfn; - } - set_xen_guest_handle(reservation.extent_start, &frame); - reservation.nr_extents = 1; + return IDENTITY_FRAME(pfn); +} - ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, - &reservation); - WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", - release ? "release" : "populate", pfn, ret); +/* + * Mark all pfns of extra mem as invalid in p2m list. + */ +void __init xen_inv_extra_mem(void) +{ + unsigned long pfn, pfn_s, pfn_e; + int i; - if (ret == 1) { - if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { - if (release) - break; - set_xen_guest_handle(reservation.extent_start, &frame); - reservation.nr_extents = 1; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - break; - } - len++; - } else - break; + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + pfn_s = PFN_DOWN(xen_extra_mem[i].start); + pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + for (pfn = pfn_s; pfn < pfn_e; pfn++) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } - if (len) - printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", - release ? "Freeing" : "Populating", - start, end, len, - release ? "freed" : "added"); - - return len; } /* @@ -198,26 +210,62 @@ static unsigned long __init xen_find_pfn_range( return done; } +static int __init xen_free_mfn(unsigned long mfn) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); +} + /* - * This releases a chunk of memory and then does the identity map. It's used as + * This releases a chunk of memory and then does the identity map. It's used * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, unsigned long *released) { + unsigned long len = 0; + unsigned long pfn, end; + int ret; + WARN_ON(start_pfn > end_pfn); + end = min(end_pfn, nr_pages); + for (pfn = start_pfn; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + ret = xen_free_mfn(mfn); + WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); + + if (ret == 1) { + if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) + break; + len++; + } else + break; + } + /* Need to release pages first */ - *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); + *released += len; *identity += set_phys_range_identity(start_pfn, end_pfn); } /* - * Helper function to update both the p2m and m2p tables. + * Helper function to update the p2m and m2p tables and kernel mapping. */ -static unsigned long __init xen_update_mem_tables(unsigned long pfn, - unsigned long mfn) +static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) { struct mmu_update update = { .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, @@ -225,161 +273,88 @@ static unsigned long __init xen_update_mem_tables(unsigned long pfn, }; /* Update p2m */ - if (!early_set_phys_to_machine(pfn, mfn)) { + if (!set_phys_to_machine(pfn, mfn)) { WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", pfn, mfn); - return false; + BUG(); } /* Update m2p */ if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", mfn, pfn); - return false; + BUG(); } - return true; + /* Update kernel mapping, but not for highmem. */ + if ((pfn << PAGE_SHIFT) >= __pa(high_memory)) + return; + + if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(mfn, PAGE_KERNEL), 0)) { + WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n", + mfn, pfn); + BUG(); + } } /* * This function updates the p2m and m2p tables with an identity map from - * start_pfn to start_pfn+size and remaps the underlying RAM of the original - * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks - * to not exhaust the reserved brk space. Doing it in properly aligned blocks - * ensures we only allocate the minimum required leaf pages in the p2m table. It - * copies the existing mfns from the p2m table under the 1:1 map, overwrites - * them with the identity map and then updates the p2m and m2p tables with the - * remapped memory. + * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the + * original allocation at remap_pfn. The information needed for remapping is + * saved in the memory itself to avoid the need for allocating buffers. The + * complete remap information is contained in a list of MFNs each containing + * up to REMAP_SIZE MFNs and the start target PFN for doing the remap. + * This enables us to preserve the original mfn sequence while doing the + * remapping at a time when the memory management is capable of allocating + * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and + * its callers. */ -static unsigned long __init xen_do_set_identity_and_remap_chunk( +static void __init xen_do_set_identity_and_remap_chunk( unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) { + unsigned long buf = (unsigned long)&xen_remap_buf; + unsigned long mfn_save, mfn; unsigned long ident_pfn_iter, remap_pfn_iter; - unsigned long ident_start_pfn_align, remap_start_pfn_align; - unsigned long ident_end_pfn_align, remap_end_pfn_align; - unsigned long ident_boundary_pfn, remap_boundary_pfn; - unsigned long ident_cnt = 0; - unsigned long remap_cnt = 0; + unsigned long ident_end_pfn = start_pfn + size; unsigned long left = size; - unsigned long mod; - int i; + unsigned long ident_cnt = 0; + unsigned int i, chunk; WARN_ON(size == 0); BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); - /* - * Determine the proper alignment to remap memory in P2M_PER_PAGE sized - * blocks. We need to keep track of both the existing pfn mapping and - * the new pfn remapping. - */ - mod = start_pfn % P2M_PER_PAGE; - ident_start_pfn_align = - mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn; - mod = remap_pfn % P2M_PER_PAGE; - remap_start_pfn_align = - mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn; - mod = (start_pfn + size) % P2M_PER_PAGE; - ident_end_pfn_align = start_pfn + size - mod; - mod = (remap_pfn + size) % P2M_PER_PAGE; - remap_end_pfn_align = remap_pfn + size - mod; - - /* Iterate over each p2m leaf node in each range */ - for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; - ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; - ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { - /* Check we aren't past the end */ - BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size); - BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size); - - /* Save p2m mappings */ - for (i = 0; i < P2M_PER_PAGE; i++) - xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); - - /* Set identity map which will free a p2m leaf */ - ident_cnt += set_phys_range_identity(ident_pfn_iter, - ident_pfn_iter + P2M_PER_PAGE); + mfn_save = virt_to_mfn(buf); -#ifdef DEBUG - /* Helps verify a p2m leaf has been freed */ - for (i = 0; i < P2M_PER_PAGE; i++) { - unsigned int pfn = ident_pfn_iter + i; - BUG_ON(pfn_to_mfn(pfn) != pfn); - } -#endif - /* Now remap memory */ - for (i = 0; i < P2M_PER_PAGE; i++) { - unsigned long mfn = xen_remap_buf[i]; - - /* This will use the p2m leaf freed above */ - if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { - WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", - remap_pfn_iter + i, mfn); - return 0; - } - - remap_cnt++; - } - - left -= P2M_PER_PAGE; - } - - /* Max boundary space possible */ - BUG_ON(left > (P2M_PER_PAGE - 1) * 2); + for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; + ident_pfn_iter < ident_end_pfn; + ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) { + chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE; - /* Now handle the boundary conditions */ - ident_boundary_pfn = start_pfn; - remap_boundary_pfn = remap_pfn; - for (i = 0; i < left; i++) { - unsigned long mfn; + /* Map first pfn to xen_remap_buf */ + mfn = pfn_to_mfn(ident_pfn_iter); + set_pte_mfn(buf, mfn, PAGE_KERNEL); - /* These two checks move from the start to end boundaries */ - if (ident_boundary_pfn == ident_start_pfn_align) - ident_boundary_pfn = ident_pfn_iter; - if (remap_boundary_pfn == remap_start_pfn_align) - remap_boundary_pfn = remap_pfn_iter; + /* Save mapping information in page */ + xen_remap_buf.next_area_mfn = xen_remap_mfn; + xen_remap_buf.target_pfn = remap_pfn_iter; + xen_remap_buf.size = chunk; + for (i = 0; i < chunk; i++) + xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i); - /* Check we aren't past the end */ - BUG_ON(ident_boundary_pfn >= start_pfn + size); - BUG_ON(remap_boundary_pfn >= remap_pfn + size); - - mfn = pfn_to_mfn(ident_boundary_pfn); - - if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) { - WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", - remap_pfn_iter + i, mfn); - return 0; - } - remap_cnt++; + /* Put remap buf into list. */ + xen_remap_mfn = mfn; - ident_boundary_pfn++; - remap_boundary_pfn++; - } + /* Set identity map */ + ident_cnt += set_phys_range_identity(ident_pfn_iter, + ident_pfn_iter + chunk); - /* Finish up the identity map */ - if (ident_start_pfn_align >= ident_end_pfn_align) { - /* - * In this case we have an identity range which does not span an - * aligned block so everything needs to be identity mapped here. - * If we didn't check this we might remap too many pages since - * the align boundaries are not meaningful in this case. - */ - ident_cnt += set_phys_range_identity(start_pfn, - start_pfn + size); - } else { - /* Remapped above so check each end of the chunk */ - if (start_pfn < ident_start_pfn_align) - ident_cnt += set_phys_range_identity(start_pfn, - ident_start_pfn_align); - if (start_pfn + size > ident_pfn_iter) - ident_cnt += set_phys_range_identity(ident_pfn_iter, - start_pfn + size); + left -= chunk; } - BUG_ON(ident_cnt != size); - BUG_ON(remap_cnt != size); - - return size; + /* Restore old xen_remap_buf mapping */ + set_pte_mfn(buf, mfn_save, PAGE_KERNEL); } /* @@ -396,8 +371,7 @@ static unsigned long __init xen_do_set_identity_and_remap_chunk( static unsigned long __init xen_set_identity_and_remap_chunk( const struct e820entry *list, size_t map_size, unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *identity, unsigned long *remapped, - unsigned long *released) + unsigned long *identity, unsigned long *released) { unsigned long pfn; unsigned long i = 0; @@ -431,19 +405,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (size > remap_range_size) size = remap_range_size; - if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { - WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n", - cur_pfn, size, remap_pfn); - xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, identity, released); - break; - } + xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn); /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; *identity += size; - *remapped += size; } /* @@ -458,13 +425,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static unsigned long __init xen_set_identity_and_remap( +static void __init xen_set_identity_and_remap( const struct e820entry *list, size_t map_size, unsigned long nr_pages, unsigned long *released) { phys_addr_t start = 0; unsigned long identity = 0; - unsigned long remapped = 0; unsigned long last_pfn = nr_pages; const struct e820entry *entry; unsigned long num_released = 0; @@ -494,8 +460,7 @@ static unsigned long __init xen_set_identity_and_remap( last_pfn = xen_set_identity_and_remap_chunk( list, map_size, start_pfn, end_pfn, nr_pages, last_pfn, - &identity, &remapped, - &num_released); + &identity, &num_released); start = end; } } @@ -503,12 +468,63 @@ static unsigned long __init xen_set_identity_and_remap( *released = num_released; pr_info("Set %ld page(s) to 1-1 mapping\n", identity); - pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped, - last_pfn); pr_info("Released %ld page(s)\n", num_released); +} + +/* + * Remap the memory prepared in xen_do_set_identity_and_remap_chunk(). + * The remap information (which mfn remap to which pfn) is contained in the + * to be remapped memory itself in a linked list anchored at xen_remap_mfn. + * This scheme allows to remap the different chunks in arbitrary order while + * the resulting mapping will be independant from the order. + */ +void __init xen_remap_memory(void) +{ + unsigned long buf = (unsigned long)&xen_remap_buf; + unsigned long mfn_save, mfn, pfn; + unsigned long remapped = 0; + unsigned int i; + unsigned long pfn_s = ~0UL; + unsigned long len = 0; + + mfn_save = virt_to_mfn(buf); + + while (xen_remap_mfn != INVALID_P2M_ENTRY) { + /* Map the remap information */ + set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL); - return last_pfn; + BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]); + + pfn = xen_remap_buf.target_pfn; + for (i = 0; i < xen_remap_buf.size; i++) { + mfn = xen_remap_buf.mfns[i]; + xen_update_mem_tables(pfn, mfn); + remapped++; + pfn++; + } + if (pfn_s == ~0UL || pfn == pfn_s) { + pfn_s = xen_remap_buf.target_pfn; + len += xen_remap_buf.size; + } else if (pfn_s + len == xen_remap_buf.target_pfn) { + len += xen_remap_buf.size; + } else { + xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + pfn_s = xen_remap_buf.target_pfn; + len = xen_remap_buf.size; + } + + mfn = xen_remap_mfn; + xen_remap_mfn = xen_remap_buf.next_area_mfn; + } + + if (pfn_s != ~0UL && len) + xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + + set_pte_mfn(buf, mfn_save, PAGE_KERNEL); + + pr_info("Remapped %ld page(s)\n", remapped); } + static unsigned long __init xen_get_max_pages(void) { unsigned long max_pages = MAX_DOMAIN_PAGES; @@ -569,7 +585,6 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long max_pages; - unsigned long last_pfn = 0; unsigned long extra_pages = 0; int i; int op; @@ -616,17 +631,14 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Set identity map on non-RAM pages and remap the underlying RAM. + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. */ - last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages); + xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, + &xen_released_pages); extra_pages += xen_released_pages; - if (last_pfn > max_pfn) { - max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); - mem_end = PFN_PHYS(max_pfn); - } /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base @@ -653,6 +665,7 @@ char * __init xen_memory_setup(void) size = min(size, (u64)extra_pages * PAGE_SIZE); extra_pages -= size / PAGE_SIZE; xen_add_extra_mem(addr, size); + xen_max_p2m_pfn = PFN_DOWN(addr + size); } else type = E820_UNUSABLE; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8650cdb53209..4c071aeb8417 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -510,6 +510,9 @@ static void xen_cpu_die(unsigned int cpu) current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } + + cpu_die_common(cpu); + xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 28c7e0be56e4..5686bd9d58cc 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -29,12 +29,13 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); -extern unsigned long xen_max_p2m_pfn; void xen_mm_pin_all(void); void xen_mm_unpin_all(void); -void xen_set_pat(u64); +unsigned long __ref xen_chk_extra_mem(unsigned long pfn); +void __init xen_inv_extra_mem(void); +void __init xen_remap_memory(void); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); @@ -47,7 +48,7 @@ void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -unsigned long __init xen_revector_p2m_tree(void); +void __init xen_vmalloc_p2m_tree(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); |