diff options
194 files changed, 2364 insertions, 1613 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2ad01cad7f1c..bcc974d276dc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -526,6 +526,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit + /sys/devices/system/cpu/vulnerabilities/mmio_stale_data Date: January 2018 Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 8cbc711cda93..4df436e7c417 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -17,3 +17,4 @@ are configurable at compile, boot or run time. special-register-buffer-data-sampling.rst core-scheduling.rst l1d_flush.rst + processor_mmio_stale_data.rst diff --git a/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst new file mode 100644 index 000000000000..9393c50b5afc --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst @@ -0,0 +1,246 @@ +========================================= +Processor MMIO Stale Data Vulnerabilities +========================================= + +Processor MMIO Stale Data Vulnerabilities are a class of memory-mapped I/O +(MMIO) vulnerabilities that can expose data. The sequences of operations for +exposing data range from simple to very complex. Because most of the +vulnerabilities require the attacker to have access to MMIO, many environments +are not affected. System environments using virtualization where MMIO access is +provided to untrusted guests may need mitigation. These vulnerabilities are +not transient execution attacks. However, these vulnerabilities may propagate +stale data into core fill buffers where the data can subsequently be inferred +by an unmitigated transient execution attack. Mitigation for these +vulnerabilities includes a combination of microcode update and software +changes, depending on the platform and usage model. Some of these mitigations +are similar to those used to mitigate Microarchitectural Data Sampling (MDS) or +those used to mitigate Special Register Buffer Data Sampling (SRBDS). + +Data Propagators +================ +Propagators are operations that result in stale data being copied or moved from +one microarchitectural buffer or register to another. Processor MMIO Stale Data +Vulnerabilities are operations that may result in stale data being directly +read into an architectural, software-visible state or sampled from a buffer or +register. + +Fill Buffer Stale Data Propagator (FBSDP) +----------------------------------------- +Stale data may propagate from fill buffers (FB) into the non-coherent portion +of the uncore on some non-coherent writes. Fill buffer propagation by itself +does not make stale data architecturally visible. Stale data must be propagated +to a location where it is subject to reading or sampling. + +Sideband Stale Data Propagator (SSDP) +------------------------------------- +The sideband stale data propagator (SSDP) is limited to the client (including +Intel Xeon server E3) uncore implementation. The sideband response buffer is +shared by all client cores. For non-coherent reads that go to sideband +destinations, the uncore logic returns 64 bytes of data to the core, including +both requested data and unrequested stale data, from a transaction buffer and +the sideband response buffer. As a result, stale data from the sideband +response and transaction buffers may now reside in a core fill buffer. + +Primary Stale Data Propagator (PSDP) +------------------------------------ +The primary stale data propagator (PSDP) is limited to the client (including +Intel Xeon server E3) uncore implementation. Similar to the sideband response +buffer, the primary response buffer is shared by all client cores. For some +processors, MMIO primary reads will return 64 bytes of data to the core fill +buffer including both requested data and unrequested stale data. This is +similar to the sideband stale data propagator. + +Vulnerabilities +=============== +Device Register Partial Write (DRPW) (CVE-2022-21166) +----------------------------------------------------- +Some endpoint MMIO registers incorrectly handle writes that are smaller than +the register size. Instead of aborting the write or only copying the correct +subset of bytes (for example, 2 bytes for a 2-byte write), more bytes than +specified by the write transaction may be written to the register. On +processors affected by FBSDP, this may expose stale data from the fill buffers +of the core that created the write transaction. + +Shared Buffers Data Sampling (SBDS) (CVE-2022-21125) +---------------------------------------------------- +After propagators may have moved data around the uncore and copied stale data +into client core fill buffers, processors affected by MFBDS can leak data from +the fill buffer. It is limited to the client (including Intel Xeon server E3) +uncore implementation. + +Shared Buffers Data Read (SBDR) (CVE-2022-21123) +------------------------------------------------ +It is similar to Shared Buffer Data Sampling (SBDS) except that the data is +directly read into the architectural software-visible state. It is limited to +the client (including Intel Xeon server E3) uncore implementation. + +Affected Processors +=================== +Not all the CPUs are affected by all the variants. For instance, most +processors for the server market (excluding Intel Xeon E3 processors) are +impacted by only Device Register Partial Write (DRPW). + +Below is the list of affected Intel processors [#f1]_: + + =================== ============ ========= + Common name Family_Model Steppings + =================== ============ ========= + HASWELL_X 06_3FH 2,4 + SKYLAKE_L 06_4EH 3 + BROADWELL_X 06_4FH All + SKYLAKE_X 06_55H 3,4,6,7,11 + BROADWELL_D 06_56H 3,4,5 + SKYLAKE 06_5EH 3 + ICELAKE_X 06_6AH 4,5,6 + ICELAKE_D 06_6CH 1 + ICELAKE_L 06_7EH 5 + ATOM_TREMONT_D 06_86H All + LAKEFIELD 06_8AH 1 + KABYLAKE_L 06_8EH 9 to 12 + ATOM_TREMONT 06_96H 1 + ATOM_TREMONT_L 06_9CH 0 + KABYLAKE 06_9EH 9 to 13 + COMETLAKE 06_A5H 2,3,5 + COMETLAKE_L 06_A6H 0,1 + ROCKETLAKE 06_A7H 1 + =================== ============ ========= + +If a CPU is in the affected processor list, but not affected by a variant, it +is indicated by new bits in MSR IA32_ARCH_CAPABILITIES. As described in a later +section, mitigation largely remains the same for all the variants, i.e. to +clear the CPU fill buffers via VERW instruction. + +New bits in MSRs +================ +Newer processors and microcode update on existing affected processors added new +bits to IA32_ARCH_CAPABILITIES MSR. These bits can be used to enumerate +specific variants of Processor MMIO Stale Data vulnerabilities and mitigation +capability. + +MSR IA32_ARCH_CAPABILITIES +-------------------------- +Bit 13 - SBDR_SSDP_NO - When set, processor is not affected by either the + Shared Buffers Data Read (SBDR) vulnerability or the sideband stale + data propagator (SSDP). +Bit 14 - FBSDP_NO - When set, processor is not affected by the Fill Buffer + Stale Data Propagator (FBSDP). +Bit 15 - PSDP_NO - When set, processor is not affected by Primary Stale Data + Propagator (PSDP). +Bit 17 - FB_CLEAR - When set, VERW instruction will overwrite CPU fill buffer + values as part of MD_CLEAR operations. Processors that do not + enumerate MDS_NO (meaning they are affected by MDS) but that do + enumerate support for both L1D_FLUSH and MD_CLEAR implicitly enumerate + FB_CLEAR as part of their MD_CLEAR support. +Bit 18 - FB_CLEAR_CTRL - Processor supports read and write to MSR + IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]. On such processors, the FB_CLEAR_DIS + bit can be set to cause the VERW instruction to not perform the + FB_CLEAR action. Not all processors that support FB_CLEAR will support + FB_CLEAR_CTRL. + +MSR IA32_MCU_OPT_CTRL +--------------------- +Bit 3 - FB_CLEAR_DIS - When set, VERW instruction does not perform the FB_CLEAR +action. This may be useful to reduce the performance impact of FB_CLEAR in +cases where system software deems it warranted (for example, when performance +is more critical, or the untrusted software has no MMIO access). Note that +FB_CLEAR_DIS has no impact on enumeration (for example, it does not change +FB_CLEAR or MD_CLEAR enumeration) and it may not be supported on all processors +that enumerate FB_CLEAR. + +Mitigation +========== +Like MDS, all variants of Processor MMIO Stale Data vulnerabilities have the +same mitigation strategy to force the CPU to clear the affected buffers before +an attacker can extract the secrets. + +This is achieved by using the otherwise unused and obsolete VERW instruction in +combination with a microcode update. The microcode clears the affected CPU +buffers when the VERW instruction is executed. + +Kernel reuses the MDS function to invoke the buffer clearing: + + mds_clear_cpu_buffers() + +On MDS affected CPUs, the kernel already invokes CPU buffer clear on +kernel/userspace, hypervisor/guest and C-state (idle) transitions. No +additional mitigation is needed on such CPUs. + +For CPUs not affected by MDS or TAA, mitigation is needed only for the attacker +with MMIO capability. Therefore, VERW is not required for kernel/userspace. For +virtualization case, VERW is only needed at VMENTER for a guest with MMIO +capability. + +Mitigation points +----------------- +Return to user space +^^^^^^^^^^^^^^^^^^^^ +Same mitigation as MDS when affected by MDS/TAA, otherwise no mitigation +needed. + +C-State transition +^^^^^^^^^^^^^^^^^^ +Control register writes by CPU during C-state transition can propagate data +from fill buffer to uncore buffers. Execute VERW before C-state transition to +clear CPU fill buffers. + +Guest entry point +^^^^^^^^^^^^^^^^^ +Same mitigation as MDS when processor is also affected by MDS/TAA, otherwise +execute VERW at VMENTER only for MMIO capable guests. On CPUs not affected by +MDS/TAA, guest without MMIO access cannot extract secrets using Processor MMIO +Stale Data vulnerabilities, so there is no need to execute VERW for such guests. + +Mitigation control on the kernel command line +--------------------------------------------- +The kernel command line allows to control the Processor MMIO Stale Data +mitigations at boot time with the option "mmio_stale_data=". The valid +arguments for this option are: + + ========== ================================================================= + full If the CPU is vulnerable, enable mitigation; CPU buffer clearing + on exit to userspace and when entering a VM. Idle transitions are + protected as well. It does not automatically disable SMT. + full,nosmt Same as full, with SMT disabled on vulnerable CPUs. This is the + complete mitigation. + off Disables mitigation completely. + ========== ================================================================= + +If the CPU is affected and mmio_stale_data=off is not supplied on the kernel +command line, then the kernel selects the appropriate mitigation. + +Mitigation status information +----------------------------- +The Linux kernel provides a sysfs interface to enumerate the current +vulnerability status of the system: whether the system is vulnerable, and +which mitigations are active. The relevant sysfs file is: + + /sys/devices/system/cpu/vulnerabilities/mmio_stale_data + +The possible values in this file are: + + .. list-table:: + + * - 'Not affected' + - The processor is not vulnerable + * - 'Vulnerable' + - The processor is vulnerable, but no mitigation enabled + * - 'Vulnerable: Clear CPU buffers attempted, no microcode' + - The processor is vulnerable, but microcode is not updated. The + mitigation is enabled on a best effort basis. + * - 'Mitigation: Clear CPU buffers' + - The processor is vulnerable and the CPU buffer clearing mitigation is + enabled. + +If the processor is vulnerable then the following information is appended to +the above information: + + ======================== =========================================== + 'SMT vulnerable' SMT is enabled + 'SMT disabled' SMT is disabled + 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown + ======================== =========================================== + +References +---------- +.. [#f1] Affected Processors + https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8090130b544b..2522b11e593f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2469,7 +2469,6 @@ protected: nVHE-based mode with support for guests whose state is kept private from the host. - Not valid if the kernel is running in EL2. Defaults to VHE/nVHE based on hardware support. Setting mode to "protected" will disable kexec and hibernation @@ -3176,6 +3175,7 @@ srbds=off [X86,INTEL] no_entry_flush [PPC] no_uaccess_flush [PPC] + mmio_stale_data=off [X86] Exceptions: This does not have any effect on @@ -3197,6 +3197,7 @@ Equivalent to: l1tf=flush,nosmt [X86] mds=full,nosmt [X86] tsx_async_abort=full,nosmt [X86] + mmio_stale_data=full,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this @@ -3206,6 +3207,40 @@ log everything. Information is printed at KERN_DEBUG so loglevel=8 may also need to be specified. + mmio_stale_data= + [X86,INTEL] Control mitigation for the Processor + MMIO Stale Data vulnerabilities. + + Processor MMIO Stale Data is a class of + vulnerabilities that may expose data after an MMIO + operation. Exposed data could originate or end in + the same CPU buffers as affected by MDS and TAA. + Therefore, similar to MDS and TAA, the mitigation + is to clear the affected CPU buffers. + + This parameter controls the mitigation. The + options are: + + full - Enable mitigation on vulnerable CPUs + + full,nosmt - Enable mitigation and disable SMT on + vulnerable CPUs. + + off - Unconditionally disable mitigation + + On MDS or TAA affected machines, + mmio_stale_data=off can be prevented by an active + MDS or TAA mitigation as these vulnerabilities are + mitigated with the same mechanism so in order to + disable this mitigation, you need to specify + mds=off and tsx_async_abort=off too. + + Not specifying this option is equivalent to + mmio_stale_data=full. + + For details see: + Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst + module.sig_enforce [KNL] When CONFIG_MODULE_SIG is set, this means that modules without (valid) signatures will fail to load. diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 3276c3d55142..4d19b19bcc08 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -79,7 +79,7 @@ To help deal with the per-inode context, a number helper functions are provided. Firstly, a function to perform basic initialisation on a context and set the operations table pointer:: - void netfs_inode_init(struct inode *inode, + void netfs_inode_init(struct netfs_inode *ctx, const struct netfs_request_ops *ops); then a function to cast from the VFS inode structure to the netfs context:: @@ -89,7 +89,7 @@ then a function to cast from the VFS inode structure to the netfs context:: and finally, a function to get the cache cookie pointer from the context attached to an inode (or NULL if fscache is disabled):: - struct fscache_cookie *netfs_i_cookie(struct inode *inode); + struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx); Buffered Read Helpers @@ -136,8 +136,9 @@ Three read helpers are provided:: void netfs_readahead(struct readahead_control *ractl); int netfs_read_folio(struct file *file, - struct folio *folio); - int netfs_write_begin(struct file *file, + struct folio *folio); + int netfs_write_begin(struct netfs_inode *ctx, + struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, @@ -157,9 +158,10 @@ The helpers manage the read request, calling back into the network filesystem through the suppplied table of operations. Waits will be performed as necessary before returning for helpers that are meant to be synchronous. -If an error occurs and netfs_priv is non-NULL, ops->cleanup() will be called to -deal with it. If some parts of the request are in progress when an error -occurs, the request will get partially completed if sufficient data is read. +If an error occurs, the ->free_request() will be called to clean up the +netfs_io_request struct allocated. If some parts of the request are in +progress when an error occurs, the request will get partially completed if +sufficient data is read. Additionally, there is:: @@ -207,8 +209,7 @@ The above fields are the ones the netfs can use. They are: * ``netfs_priv`` The network filesystem's private data. The value for this can be passed in - to the helper functions or set during the request. The ->cleanup() op will - be called if this is non-NULL at the end. + to the helper functions or set during the request. * ``start`` * ``len`` @@ -293,6 +294,7 @@ through which it can issue requests and negotiate:: struct netfs_request_ops { void (*init_request)(struct netfs_io_request *rreq, struct file *file); + void (*free_request)(struct netfs_io_request *rreq); int (*begin_cache_operation)(struct netfs_io_request *rreq); void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); @@ -301,7 +303,6 @@ through which it can issue requests and negotiate:: int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio *folio, void **_fsdata); void (*done)(struct netfs_io_request *rreq); - void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; The operations are as follows: @@ -309,7 +310,12 @@ The operations are as follows: * ``init_request()`` [Optional] This is called to initialise the request structure. It is given - the file for reference and can modify the ->netfs_priv value. + the file for reference. + + * ``free_request()`` + + [Optional] This is called as the request is being deallocated so that the + filesystem can clean up any state it has attached there. * ``begin_cache_operation()`` @@ -383,11 +389,6 @@ The operations are as follows: [Optional] This is called after the folios in the request have all been unlocked (and marked uptodate if applicable). - * ``cleanup`` - - [Optional] This is called as the request is being deallocated so that the - filesystem can clean up ->netfs_priv. - Read Helper Procedure diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 34415ae1af1b..19c286c23786 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -32,6 +32,7 @@ you probably needn't concern yourself with pcmciautils. GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version GNU make 3.81 make --version +bash 4.2 bash --version binutils 2.23 ld -v flex 2.5.35 flex --version bison 2.0 bison --version @@ -84,6 +85,12 @@ Make You will need GNU make 3.81 or later to build the kernel. +Bash +---- + +Some bash scripts are used for the kernel build. +Bash 4.2 or newer is needed. + Binutils -------- @@ -362,6 +369,11 @@ Make - <ftp://ftp.gnu.org/gnu/make/> +Bash +---- + +- <ftp://ftp.gnu.org/gnu/bash/> + Binutils -------- diff --git a/MAINTAINERS b/MAINTAINERS index 795f7e40230c..15a2341936ea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7668,6 +7668,7 @@ F: include/uapi/scsi/fc/ FILE LOCKING (flock() and fcntl()/lockf()) M: Jeff Layton <jlayton@kernel.org> +M: Chuck Lever <chuck.lever@oracle.com> L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/fcntl.c @@ -10760,6 +10761,7 @@ W: http://kernelnewbies.org/KernelJanitors KERNEL NFSD, SUNRPC, AND LOCKD SERVERS M: Chuck Lever <chuck.lever@oracle.com> +M: Jeff Layton <jlayton@kernel.org> L: linux-nfs@vger.kernel.org S: Supported W: http://nfs.sourceforge.net/ @@ -10884,7 +10886,6 @@ F: arch/riscv/include/asm/kvm* F: arch/riscv/include/uapi/asm/kvm* F: arch/riscv/kvm/ F: tools/testing/selftests/kvm/*/riscv/ -F: tools/testing/selftests/kvm/riscv/ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger <borntraeger@linux.ibm.com> @@ -13799,6 +13800,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git F: Documentation/devicetree/bindings/net/ F: drivers/connector/ F: drivers/net/ +F: include/dt-bindings/net/ F: include/linux/etherdevice.h F: include/linux/fcdevice.h F: include/linux/fddidevice.h @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Superb Owl # *DOCUMENTATION* diff --git a/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts b/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts index 443e8b022897..14af1fd6d247 100644 --- a/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts +++ b/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts @@ -120,26 +120,31 @@ port@0 { reg = <0>; label = "lan1"; + phy-mode = "internal"; }; port@1 { reg = <1>; label = "lan2"; + phy-mode = "internal"; }; port@2 { reg = <2>; label = "lan3"; + phy-mode = "internal"; }; port@3 { reg = <3>; label = "lan4"; + phy-mode = "internal"; }; port@4 { reg = <4>; label = "lan5"; + phy-mode = "internal"; }; port@5 { diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 47a1e25e25bb..de32152cea04 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -363,11 +363,6 @@ struct kvm_vcpu_arch { struct kvm_pmu pmu; /* - * Anything that is not used directly from assembly code goes - * here. - */ - - /* * Guest registers we preserve during guest debugging. * * These shadow registers are updated by the kvm_handle_sys_reg diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 3c8af033a997..0e80db4327b6 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -113,6 +113,9 @@ static __always_inline bool has_vhe(void) /* * Code only run in VHE/NVHE hyp context can assume VHE is present or * absent. Otherwise fall back to caps. + * This allows the compiler to discard VHE-specific code from the + * nVHE object, reducing the number of external symbol references + * needed to link. */ if (is_vhe_hyp_code()) return true; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 42ea2bd856c6..79fac13ab2ef 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1974,15 +1974,7 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) #ifdef CONFIG_KVM static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) { - if (kvm_get_mode() != KVM_MODE_PROTECTED) - return false; - - if (is_kernel_in_hyp_mode()) { - pr_warn("Protected KVM not available with VHE\n"); - return false; - } - - return true; + return kvm_get_mode() == KVM_MODE_PROTECTED; } #endif /* CONFIG_KVM */ diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 4e39ace073af..3b8d062e30ea 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1230,6 +1230,9 @@ bool kvm_arch_timer_get_input_level(int vintid) struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); struct arch_timer_context *timer; + if (WARN(!vcpu, "No vcpu context!\n")) + return false; + if (vintid == vcpu_vtimer(vcpu)->irq.irq) timer = vcpu_vtimer(vcpu); else if (vintid == vcpu_ptimer(vcpu)->irq.irq) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 400bb0fe2745..a0188144a122 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -150,8 +150,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_free_stage2_pgd; - if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { + ret = -ENOMEM; goto out_free_stage2_pgd; + } cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); kvm_vgic_early_init(kvm); @@ -2271,7 +2273,11 @@ static int __init early_kvm_mode_cfg(char *arg) return -EINVAL; if (strcmp(arg, "protected") == 0) { - kvm_mode = KVM_MODE_PROTECTED; + if (!is_kernel_in_hyp_mode()) + kvm_mode = KVM_MODE_PROTECTED; + else + pr_warn_once("Protected KVM not available with VHE\n"); + return 0; } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 3d251a4d2cf7..6012b08ecb14 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -80,6 +80,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED; vcpu->arch.flags |= KVM_ARM64_FP_HOST; + vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED; if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED; @@ -93,6 +94,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) * operations. Do this for ZA as well for now for simplicity. */ if (system_supports_sme()) { + vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED; if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 78edf077fa3b..1e78acf9662e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -314,15 +314,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot) { - hyp_assert_lock_held(&host_kvm.lock); - return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - hyp_assert_lock_held(&host_kvm.lock); - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, addr, size, &host_s2_pool, owner_id); } diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index b6d86e423319..35a4331ba5f3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -243,15 +243,9 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) case SYS_ID_AA64MMFR2_EL1: return get_pvm_id_aa64mmfr2(vcpu); default: - /* - * Should never happen because all cases are covered in - * pvm_sys_reg_descs[]. - */ - WARN_ON(1); - break; + /* Unhandled ID register, RAZ */ + return 0; } - - return 0; } static u64 read_id_reg(const struct kvm_vcpu *vcpu, @@ -332,6 +326,16 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, /* Mark the specified system register as an AArch64 feature id register. */ #define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 } +/* + * sys_reg_desc initialiser for architecturally unallocated cpufeature ID + * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 + * (1 <= crm < 8, 0 <= Op2 < 8). + */ +#define ID_UNALLOCATED(crm, op2) { \ + Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ + .access = pvm_access_id_aarch64, \ +} + /* Mark the specified system register as Read-As-Zero/Write-Ignored */ #define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi } @@ -375,24 +379,46 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { AARCH32(SYS_MVFR0_EL1), AARCH32(SYS_MVFR1_EL1), AARCH32(SYS_MVFR2_EL1), + ID_UNALLOCATED(3,3), AARCH32(SYS_ID_PFR2_EL1), AARCH32(SYS_ID_DFR1_EL1), AARCH32(SYS_ID_MMFR5_EL1), + ID_UNALLOCATED(3,7), /* AArch64 ID registers */ /* CRm=4 */ AARCH64(SYS_ID_AA64PFR0_EL1), AARCH64(SYS_ID_AA64PFR1_EL1), + ID_UNALLOCATED(4,2), + ID_UNALLOCATED(4,3), AARCH64(SYS_ID_AA64ZFR0_EL1), + ID_UNALLOCATED(4,5), + ID_UNALLOCATED(4,6), + ID_UNALLOCATED(4,7), AARCH64(SYS_ID_AA64DFR0_EL1), AARCH64(SYS_ID_AA64DFR1_EL1), + ID_UNALLOCATED(5,2), + ID_UNALLOCATED(5,3), AARCH64(SYS_ID_AA64AFR0_EL1), AARCH64(SYS_ID_AA64AFR1_EL1), + ID_UNALLOCATED(5,6), + ID_UNALLOCATED(5,7), AARCH64(SYS_ID_AA64ISAR0_EL1), AARCH64(SYS_ID_AA64ISAR1_EL1), + AARCH64(SYS_ID_AA64ISAR2_EL1), + ID_UNALLOCATED(6,3), + ID_UNALLOCATED(6,4), + ID_UNALLOCATED(6,5), + ID_UNALLOCATED(6,6), + ID_UNALLOCATED(6,7), AARCH64(SYS_ID_AA64MMFR0_EL1), AARCH64(SYS_ID_AA64MMFR1_EL1), AARCH64(SYS_ID_AA64MMFR2_EL1), + ID_UNALLOCATED(7,3), + ID_UNALLOCATED(7,4), + ID_UNALLOCATED(7,5), + ID_UNALLOCATED(7,6), + ID_UNALLOCATED(7,7), /* Scalable Vector Registers are restricted. */ diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index 77a67e9d3d14..e070cda86e12 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -429,11 +429,11 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, vgic_mmio_read_pending, vgic_mmio_write_spending, - NULL, vgic_uaccess_write_spending, 1, + vgic_uaccess_read_pending, vgic_uaccess_write_spending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, vgic_mmio_read_pending, vgic_mmio_write_cpending, - NULL, vgic_uaccess_write_cpending, 1, + vgic_uaccess_read_pending, vgic_uaccess_write_cpending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, vgic_mmio_read_active, vgic_mmio_write_sactive, diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index f7aa7bcd6fb8..f15e29cc63ce 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -353,42 +353,6 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, return 0; } -static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* - * pending state of interrupt is latched in pending_latch variable. - * Userspace will save and restore pending state and line_level - * separately. - * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst - * for handling of ISPENDR and ICPENDR. - */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - bool state = irq->pending_latch; - - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - int err; - - err = irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &state); - WARN_ON(err); - } - - if (state) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) @@ -666,7 +630,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, + vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, vgic_mmio_read_pending, vgic_mmio_write_cpending, @@ -750,7 +714,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, + vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, vgic_mmio_read_pending, vgic_mmio_write_cpending, diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 49837d3a3ef5..997d0fce2088 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -226,8 +226,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, return 0; } -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) +static unsigned long __read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; @@ -239,6 +240,15 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, unsigned long flags; bool val; + /* + * When used from userspace with a GICv3 model: + * + * Pending state of interrupt is latched in pending_latch + * variable. Userspace will save and restore pending state + * and line_level separately. + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst + * for handling of ISPENDR and ICPENDR. + */ raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { int err; @@ -248,10 +258,20 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, IRQCHIP_STATE_PENDING, &val); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - } else if (vgic_irq_is_mapped_level(irq)) { + } else if (!is_user && vgic_irq_is_mapped_level(irq)) { val = vgic_get_phys_line_level(irq); } else { - val = irq_is_pending(irq); + switch (vcpu->kvm->arch.vgic.vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + if (is_user) { + val = irq->pending_latch; + break; + } + fallthrough; + default: + val = irq_is_pending(irq); + break; + } } value |= ((u32)val << i); @@ -263,6 +283,18 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, return value; } +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __read_pending(vcpu, addr, len, false); +} + +unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __read_pending(vcpu, addr, len, true); +} + static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { return (vgic_irq_is_sgi(irq->intid) && diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 3fa696f198a3..6082d4b66d39 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -149,6 +149,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); +unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index 8d5f0506fd87..d78ae63d7c15 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -66,7 +66,7 @@ static void flush_context(void) * the next context-switch, we broadcast TLB flush + I-cache * invalidation over the inner shareable domain on rollover. */ - kvm_call_hyp(__kvm_flush_vm_context); + kvm_call_hyp(__kvm_flush_vm_context); } static bool check_update_reserved_vmid(u64 vmid, u64 newvmid) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 80657bf83b05..1920d52653b4 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -343,6 +343,7 @@ config NR_CPUS config NUMA bool "NUMA Support" + select SMP select ACPI_NUMA if ACPI help Say Y to compile the kernel with NUMA (Non-Uniform Memory Access) diff --git a/arch/loongarch/include/asm/hardirq.h b/arch/loongarch/include/asm/hardirq.h index befe8184aa08..0ef3b18f8980 100644 --- a/arch/loongarch/include/asm/hardirq.h +++ b/arch/loongarch/include/asm/hardirq.h @@ -19,7 +19,7 @@ typedef struct { unsigned int __softirq_pending; } ____cacheline_aligned irq_cpustat_t; -DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); +DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h index 34f15a6fb1e7..e6569f18c6dd 100644 --- a/arch/loongarch/include/asm/percpu.h +++ b/arch/loongarch/include/asm/percpu.h @@ -6,6 +6,7 @@ #define __ASM_PERCPU_H #include <asm/cmpxchg.h> +#include <asm/loongarch.h> /* Use r21 for fast access */ register unsigned long __my_cpu_offset __asm__("$r21"); diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h index 551e1f37c705..71189b28bfb2 100644 --- a/arch/loongarch/include/asm/smp.h +++ b/arch/loongarch/include/asm/smp.h @@ -9,10 +9,16 @@ #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/linkage.h> -#include <linux/smp.h> #include <linux/threads.h> #include <linux/cpumask.h> +extern int smp_num_siblings; +extern int num_processors; +extern int disabled_cpus; +extern cpumask_t cpu_sibling_map[]; +extern cpumask_t cpu_core_map[]; +extern cpumask_t cpu_foreign_map[]; + void loongson3_smp_setup(void); void loongson3_prepare_cpus(unsigned int max_cpus); void loongson3_boot_secondary(int cpu, struct task_struct *idle); @@ -25,26 +31,11 @@ int loongson3_cpu_disable(void); void loongson3_cpu_die(unsigned int cpu); #endif -#ifdef CONFIG_SMP - static inline void plat_smp_setup(void) { loongson3_smp_setup(); } -#else /* !CONFIG_SMP */ - -static inline void plat_smp_setup(void) { } - -#endif /* !CONFIG_SMP */ - -extern int smp_num_siblings; -extern int num_processors; -extern int disabled_cpus; -extern cpumask_t cpu_sibling_map[]; -extern cpumask_t cpu_core_map[]; -extern cpumask_t cpu_foreign_map[]; - static inline int raw_smp_processor_id(void) { #if defined(__VDSO__) diff --git a/arch/loongarch/include/asm/timex.h b/arch/loongarch/include/asm/timex.h index d3ed99a4fdbd..fb41e9e7a222 100644 --- a/arch/loongarch/include/asm/timex.h +++ b/arch/loongarch/include/asm/timex.h @@ -12,13 +12,6 @@ #include <asm/cpu.h> #include <asm/cpu-features.h> -/* - * Standard way to access the cycle counter. - * Currently only used on SMP for scheduling. - * - * We know that all SMP capable CPUs have cycle counters. - */ - typedef unsigned long cycles_t; #define get_cycles get_cycles diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index b16c3dea5eeb..bb729ee8a237 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -138,6 +138,7 @@ void __init acpi_boot_table_init(void) } } +#ifdef CONFIG_SMP static int set_processor_mask(u32 id, u32 flags) { @@ -166,15 +167,18 @@ static int set_processor_mask(u32 id, u32 flags) return cpu; } +#endif static void __init acpi_process_madt(void) { +#ifdef CONFIG_SMP int i; for (i = 0; i < NR_CPUS; i++) { __cpu_number_map[i] = -1; __cpu_logical_map[i] = -1; } +#endif loongson_sysconf.nr_cpus = num_processors; } diff --git a/arch/loongarch/kernel/cacheinfo.c b/arch/loongarch/kernel/cacheinfo.c index 8c9fe29e98f0..b38f5489d094 100644 --- a/arch/loongarch/kernel/cacheinfo.c +++ b/arch/loongarch/kernel/cacheinfo.c @@ -4,6 +4,7 @@ * * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ +#include <asm/cpu-info.h> #include <linux/cacheinfo.h> /* Populates leaf and increments to next leaf */ diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c index 4b671d305ede..b34b8d792aa4 100644 --- a/arch/loongarch/kernel/irq.c +++ b/arch/loongarch/kernel/irq.c @@ -22,6 +22,8 @@ #include <asm/setup.h> DEFINE_PER_CPU(unsigned long, irq_stack); +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); struct irq_domain *cpu_domain; struct irq_domain *liointc_domain; @@ -56,8 +58,11 @@ int arch_show_interrupts(struct seq_file *p, int prec) void __init init_IRQ(void) { - int i, r, ipi_irq; + int i; +#ifdef CONFIG_SMP + int r, ipi_irq; static int ipi_dummy_dev; +#endif unsigned int order = get_order(IRQ_STACK_SIZE); struct page *page; diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 6d944d65f600..bfa0dfe8b7d7 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -120,10 +120,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long childksp; + unsigned long tls = args->tls; + unsigned long usp = args->stack; + unsigned long clone_flags = args->flags; struct pt_regs *childregs, *regs = current_pt_regs(); childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE - 32; @@ -136,12 +138,12 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, p->thread.csr_crmd = csr_read32(LOONGARCH_CSR_CRMD); p->thread.csr_prmd = csr_read32(LOONGARCH_CSR_PRMD); p->thread.csr_ecfg = csr_read32(LOONGARCH_CSR_ECFG); - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ - p->thread.reg23 = usp; /* fn */ - p->thread.reg24 = kthread_arg; p->thread.reg03 = childksp; - p->thread.reg01 = (unsigned long) ret_from_kernel_thread; + p->thread.reg23 = (unsigned long)args->fn; + p->thread.reg24 = (unsigned long)args->fn_arg; + p->thread.reg01 = (unsigned long)ret_from_kernel_thread; memset(childregs, 0, sizeof(struct pt_regs)); childregs->csr_euen = p->thread.csr_euen; childregs->csr_crmd = p->thread.csr_crmd; diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 185e4035811a..c74860b53375 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -39,7 +39,6 @@ #include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/smp.h> #include <asm/time.h> #define SMBIOS_BIOSSIZE_OFFSET 0x09 @@ -349,8 +348,6 @@ static void __init prefill_possible_map(void) nr_cpu_ids = possible; } -#else -static inline void prefill_possible_map(void) {} #endif void __init setup_arch(char **cmdline_p) @@ -367,8 +364,10 @@ void __init setup_arch(char **cmdline_p) arch_mem_init(cmdline_p); resource_init(); +#ifdef CONFIG_SMP plat_smp_setup(); prefill_possible_map(); +#endif paging_init(); } diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index b8c53b755a25..73cec62504fb 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -66,8 +66,6 @@ static cpumask_t cpu_core_setup_map; struct secondary_data cpuboot_data; static DEFINE_PER_CPU(int, cpu_state); -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); enum ipi_msg_type { IPI_RESCHEDULE, diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 9f764df125db..6cd93995fb65 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -97,7 +97,7 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu) * We ran out of VMIDs so we increment vmid_version and * start assigning VMIDs from 1. * - * This also means existing VMIDs assignement to all Guest + * This also means existing VMIDs assignment to all Guest * instances is invalid and we have force VMID re-assignement * for all Guest instances. The Guest instances that were not * running will automatically pick-up new VMIDs because will diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 5c092a9153ea..027847023184 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -544,6 +544,8 @@ static int um_pci_init_vqs(struct um_pci_device *dev) dev->cmd_vq = vqs[0]; dev->irq_vq = vqs[1]; + virtio_device_ready(dev->vdev); + for (i = 0; i < NUM_IRQ_MSGS; i++) { void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); @@ -587,7 +589,7 @@ static int um_pci_virtio_probe(struct virtio_device *vdev) dev->irq = irq_alloc_desc(numa_node_id()); if (dev->irq < 0) { err = dev->irq; - goto error; + goto err_reset; } um_pci_devices[free].dev = dev; vdev->priv = dev; @@ -604,6 +606,9 @@ static int um_pci_virtio_probe(struct virtio_device *vdev) um_pci_rescan(); return 0; +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); error: mutex_unlock(&um_pci_mtx); kfree(dev); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 393f2bbb5e3a..03acc823838a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -446,5 +446,6 @@ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3a240a64ac68..9217bd6cf0d1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1047,14 +1047,77 @@ struct kvm_x86_msr_filter { }; enum kvm_apicv_inhibit { + + /********************************************************************/ + /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */ + /********************************************************************/ + + /* + * APIC acceleration is disabled by a module parameter + * and/or not supported in hardware. + */ APICV_INHIBIT_REASON_DISABLE, + + /* + * APIC acceleration is inhibited because AutoEOI feature is + * being used by a HyperV guest. + */ APICV_INHIBIT_REASON_HYPERV, + + /* + * APIC acceleration is inhibited because the userspace didn't yet + * enable the kernel/split irqchip. + */ + APICV_INHIBIT_REASON_ABSENT, + + /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ + * (out of band, debug measure of blocking all interrupts on this vCPU) + * was enabled, to avoid AVIC/APICv bypassing it. + */ + APICV_INHIBIT_REASON_BLOCKIRQ, + + /* + * For simplicity, the APIC acceleration is inhibited + * first time either APIC ID or APIC base are changed by the guest + * from their reset values. + */ + APICV_INHIBIT_REASON_APIC_ID_MODIFIED, + APICV_INHIBIT_REASON_APIC_BASE_MODIFIED, + + /******************************************************/ + /* INHIBITs that are relevant only to the AMD's AVIC. */ + /******************************************************/ + + /* + * AVIC is inhibited on a vCPU because it runs a nested guest. + * + * This is needed because unlike APICv, the peers of this vCPU + * cannot use the doorbell mechanism to signal interrupts via AVIC when + * a vCPU runs nested. + */ APICV_INHIBIT_REASON_NESTED, + + /* + * On SVM, the wait for the IRQ window is implemented with pending vIRQ, + * which cannot be injected when the AVIC is enabled, thus AVIC + * is inhibited while KVM waits for IRQ window. + */ APICV_INHIBIT_REASON_IRQWIN, + + /* + * PIT (i8254) 're-inject' mode, relies on EOI intercept, + * which AVIC doesn't support for edge triggered interrupts. + */ APICV_INHIBIT_REASON_PIT_REINJ, + + /* + * AVIC is inhibited because the guest has x2apic in its CPUID. + */ APICV_INHIBIT_REASON_X2APIC, - APICV_INHIBIT_REASON_BLOCKIRQ, - APICV_INHIBIT_REASON_ABSENT, + + /* + * AVIC is disabled because SEV doesn't support it. + */ APICV_INHIBIT_REASON_SEV, }; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 403e83b4adc8..d27e0581b777 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -116,6 +116,30 @@ * Not susceptible to * TSX Async Abort (TAA) vulnerabilities. */ +#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /* + * Not susceptible to SBDR and SSDP + * variants of Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_FBSDP_NO BIT(14) /* + * Not susceptible to FBSDP variant of + * Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_PSDP_NO BIT(15) /* + * Not susceptible to PSDP variant of + * Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_FB_CLEAR BIT(17) /* + * VERW clears CPU fill buffer + * even on MDS_NO CPUs. + */ +#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /* + * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS] + * bit available to control VERW + * behavior. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -133,6 +157,7 @@ #define MSR_IA32_MCU_OPT_CTRL 0x00000123 #define RNGDS_MITG_DIS BIT(0) /* SRBDS support */ #define RTM_ALLOW BIT(1) /* TSX development mode */ +#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */ #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index acbaeaf83b61..da251a5645b0 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -269,6 +269,8 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); +DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); + #include <asm/segment.h> /** diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d879a6c93609..74c62cc47a5f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -41,8 +41,10 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); -static void __init mds_print_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); @@ -85,6 +87,10 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); + void __init check_bugs(void) { identify_boot_cpu(); @@ -117,17 +123,10 @@ void __init check_bugs(void) spectre_v2_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); + md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); - /* - * As MDS and TAA mitigations are inter-related, print MDS - * mitigation until after TAA mitigation selection is done. - */ - mds_print_mitigation(); - arch_smt_update(); #ifdef CONFIG_X86_32 @@ -267,14 +266,6 @@ static void __init mds_select_mitigation(void) } } -static void __init mds_print_mitigation(void) -{ - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) - return; - - pr_info("%s\n", mds_strings[mds_mitigation]); -} - static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) @@ -329,7 +320,7 @@ static void __init taa_select_mitigation(void) /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; - goto out; + return; } if (cpu_mitigations_off()) { @@ -343,7 +334,7 @@ static void __init taa_select_mitigation(void) */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) - goto out; + return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; @@ -375,18 +366,6 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); - - /* - * Update MDS mitigation, if necessary, as the mds_user_clear is - * now enabled for TAA mitigation. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } -out: - pr_info("%s\n", taa_strings[taa_mitigation]); } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -411,6 +390,151 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. + */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. + */ + md_clear_update_mitigation(); +} + +#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { @@ -478,11 +602,13 @@ static void __init srbds_select_mitigation(void) return; /* - * Check to see if this is one of the MDS_NO systems supporting - * TSX that are only exposed to SRBDS when TSX is enabled. + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. */ ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; @@ -1116,6 +1242,8 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { + u64 ia32_cap = x86_read_arch_cap_msr(); + /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -1127,14 +1255,17 @@ static void update_mds_branch_idle(void) if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; - if (sched_smt_active()) + if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); - else + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); + } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void cpu_bugs_smt_update(void) { @@ -1179,6 +1310,16 @@ void cpu_bugs_smt_update(void) break; } + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1781,6 +1922,20 @@ static ssize_t tsx_async_abort_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) @@ -1881,6 +2036,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SRBDS: return srbds_show_state(buf); + case X86_BUG_MMIO_STALE_DATA: + return mmio_stale_data_show_state(buf); + default: break; } @@ -1932,4 +2090,9 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * { return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); } + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c296cb1c0113..4730b0a58f24 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1211,18 +1211,42 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { X86_FEATURE_ANY, issues) #define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) | + BIT(7) | BIT(0xB), MMIO), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO), + VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS), {} }; @@ -1243,6 +1267,13 @@ u64 x86_read_arch_cap_msr(void) return ia32_cap; } +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -1296,12 +1327,27 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * SRBDS affects CPUs which support RDRAND or RDSEED and are listed * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. */ if ((cpu_has(c, X86_FEATURE_RDRAND) || cpu_has(c, X86_FEATURE_RDSEED)) && - cpu_matches(cpu_vuln_blacklist, SRBDS)) + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) setup_force_cpu_bug(X86_BUG_SRBDS); + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. + */ + if (cpu_matches(cpu_vuln_blacklist, MMIO) && + !arch_cap_mmio_immune(ia32_cap)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f1bdac3f5aa8..0e68b4c937fc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2039,6 +2039,19 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } +static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) +{ + struct kvm *kvm = apic->vcpu->kvm; + + if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm)) + return; + + if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id) + return; + + kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); +} + static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2047,10 +2060,12 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ - if (!apic_x2apic_mode(apic)) + if (!apic_x2apic_mode(apic)) { kvm_apic_set_xapic_id(apic, val >> 24); - else + kvm_lapic_xapic_id_updated(apic); + } else { ret = 1; + } break; case APIC_TASKPRI: @@ -2336,8 +2351,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) MSR_IA32_APICBASE_BASE; if ((value & MSR_IA32_APICBASE_ENABLE) && - apic->base_address != APIC_DEFAULT_PHYS_BASE) - pr_warn_once("APIC base relocation is unsupported by KVM"); + apic->base_address != APIC_DEFAULT_PHYS_BASE) { + kvm_set_apicv_inhibit(apic->vcpu->kvm, + APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); + } } void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) @@ -2648,6 +2665,8 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); } + } else { + kvm_lapic_xapic_id_updated(vcpu->arch.apic); } return 0; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e826ee9138fa..17252f39bd7c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3411,7 +3411,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); mmu->pae_root[i] = root | PT_PRESENT_MASK | - shadow_me_mask; + shadow_me_value; } mmu->root.hpa = __pa(mmu->pae_root); } else { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 54fe03714f8a..d1bc5820ea46 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -291,58 +291,91 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { - u32 dest, apic_id; - struct kvm_vcpu *vcpu; + u32 l1_physical_id, dest; + struct kvm_vcpu *target_vcpu; int dest_mode = icrl & APIC_DEST_MASK; int shorthand = icrl & APIC_SHORT_MASK; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); - u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); if (shorthand != APIC_DEST_NOSHORT) return -EINVAL; - /* - * The AVIC incomplete IPI #vmexit info provides index into - * the physical APIC ID table, which can be used to derive - * guest physical APIC ID. - */ + if (apic_x2apic_mode(source)) + dest = icrh; + else + dest = GET_APIC_DEST_FIELD(icrh); + if (dest_mode == APIC_DEST_PHYSICAL) { - apic_id = index; + /* broadcast destination, use slow path */ + if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST) + return -EINVAL; + if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) + return -EINVAL; + + l1_physical_id = dest; + + if (WARN_ON_ONCE(l1_physical_id != index)) + return -EINVAL; + } else { - if (!apic_x2apic_mode(source)) { - /* For xAPIC logical mode, the index is for logical APIC table. */ - apic_id = avic_logical_id_table[index] & 0x1ff; + u32 bitmap, cluster; + int logid_index; + + if (apic_x2apic_mode(source)) { + /* 16 bit dest mask, 16 bit cluster id */ + bitmap = dest & 0xFFFF0000; + cluster = (dest >> 16) << 4; + } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { + /* 8 bit dest mask*/ + bitmap = dest; + cluster = 0; } else { - return -EINVAL; + /* 4 bit desk mask, 4 bit cluster id */ + bitmap = dest & 0xF; + cluster = (dest >> 4) << 2; } - } - /* - * Assuming vcpu ID is the same as physical apic ID, - * and use it to retrieve the target vCPU. - */ - vcpu = kvm_get_vcpu_by_id(kvm, apic_id); - if (!vcpu) - return -EINVAL; + if (unlikely(!bitmap)) + /* guest bug: nobody to send the logical interrupt to */ + return 0; - if (apic_x2apic_mode(vcpu->arch.apic)) - dest = icrh; - else - dest = GET_APIC_DEST_FIELD(icrh); + if (!is_power_of_2(bitmap)) + /* multiple logical destinations, use slow path */ + return -EINVAL; - /* - * Try matching the destination APIC ID with the vCPU. - */ - if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) { - vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); - return 0; + logid_index = cluster + __ffs(bitmap); + + if (apic_x2apic_mode(source)) { + l1_physical_id = logid_index; + } else { + u32 *avic_logical_id_table = + page_address(kvm_svm->avic_logical_id_table_page); + + u32 logid_entry = avic_logical_id_table[logid_index]; + + if (WARN_ON_ONCE(index != logid_index)) + return -EINVAL; + + /* guest bug: non existing/reserved logical destination */ + if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) + return 0; + + l1_physical_id = logid_entry & + AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } } - return -EINVAL; + target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id); + if (unlikely(!target_vcpu)) + /* guest bug: non existing vCPU is a target of this IPI*/ + return 0; + + target_vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(target_vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + return 0; } static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, @@ -508,35 +541,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) return ret; } -static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) -{ - u64 *old, *new; - struct vcpu_svm *svm = to_svm(vcpu); - u32 id = kvm_xapic_id(vcpu->arch.apic); - - if (vcpu->vcpu_id == id) - return 0; - - old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); - new = avic_get_physical_id_entry(vcpu, id); - if (!new || !old) - return 1; - - /* We need to move physical_id_entry to new offset */ - *new = *old; - *old = 0ULL; - to_svm(vcpu)->avic_physical_id_cache = new; - - /* - * Also update the guest physical APIC ID in the logical - * APIC ID table entry if already setup the LDR. - */ - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - - return 0; -} - static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -555,10 +559,6 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) AVIC_UNACCEL_ACCESS_OFFSET_MASK; switch (offset) { - case APIC_ID: - if (avic_handle_apic_id_update(vcpu)) - return 0; - break; case APIC_LDR: if (avic_handle_ldr_update(vcpu)) return 0; @@ -650,8 +650,6 @@ int avic_init_vcpu(struct vcpu_svm *svm) void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) { - if (avic_handle_apic_id_update(vcpu) != 0) - return; avic_handle_dfr_update(vcpu); avic_handle_ldr_update(vcpu); } @@ -910,7 +908,9 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_PIT_REINJ) | BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_SEV); + BIT(APICV_INHIBIT_REASON_SEV) | + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); return supported & BIT(reason); } @@ -946,7 +946,7 @@ out: return ret; } -void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; int h_physical_id = kvm_cpu_get_apicid(cpu); @@ -978,7 +978,7 @@ void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } -void __avic_vcpu_put(struct kvm_vcpu *vcpu) +void avic_vcpu_put(struct kvm_vcpu *vcpu) { u64 entry; struct vcpu_svm *svm = to_svm(vcpu); @@ -997,25 +997,6 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -static void avic_vcpu_load(struct kvm_vcpu *vcpu) -{ - int cpu = get_cpu(); - - WARN_ON(cpu != vcpu->cpu); - - __avic_vcpu_load(vcpu, cpu); - - put_cpu(); -} - -static void avic_vcpu_put(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - - __avic_vcpu_put(vcpu); - - preempt_enable(); -} void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { @@ -1042,7 +1023,7 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) vmcb_mark_dirty(vmcb, VMCB_AVIC); if (activated) - avic_vcpu_load(vcpu); + avic_vcpu_load(vcpu, vcpu->cpu); else avic_vcpu_put(vcpu); @@ -1075,5 +1056,5 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - avic_vcpu_load(vcpu); + avic_vcpu_load(vcpu, vcpu->cpu); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3361258640a2..ba7cd26f438f 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + u32 pause_count12; + u32 pause_thresh12; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, @@ -671,27 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0; + pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0; if (kvm_pause_in_guest(svm->vcpu.kvm)) { - /* use guest values since host doesn't use them */ - vmcb02->control.pause_filter_count = - svm->pause_filter_enabled ? - svm->nested.ctl.pause_filter_count : 0; + /* use guest values since host doesn't intercept PAUSE */ + vmcb02->control.pause_filter_count = pause_count12; + vmcb02->control.pause_filter_thresh = pause_thresh12; - vmcb02->control.pause_filter_thresh = - svm->pause_threshold_enabled ? - svm->nested.ctl.pause_filter_thresh : 0; - - } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { - /* use host values when guest doesn't use them */ + } else { + /* start from host values otherwise */ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; - } else { - /* - * Intercept every PAUSE otherwise and - * ignore both host and guest values - */ - vmcb02->control.pause_filter_count = 0; - vmcb02->control.pause_filter_thresh = 0; + + /* ... but ensure filtering is disabled if so requested. */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { + if (!pause_count12) + vmcb02->control.pause_filter_count = 0; + if (!pause_thresh12) + vmcb02->control.pause_filter_thresh = 0; + } } nested_svm_transition_tlb_flush(vcpu); @@ -951,8 +951,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count) + if (!kvm_pause_in_guest(vcpu->kvm)) { vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); + + } nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1dc02cdf6960..87da90360bc7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -921,7 +921,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm) || !old) + if (kvm_pause_in_guest(vcpu->kvm)) return; control->pause_filter_count = __grow_ple_window(old, @@ -942,7 +942,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm) || !old) + if (kvm_pause_in_guest(vcpu->kvm)) return; control->pause_filter_count = @@ -1400,13 +1400,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) indirect_branch_prediction_barrier(); } if (kvm_vcpu_apicv_active(vcpu)) - __avic_vcpu_load(vcpu, cpu); + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { if (kvm_vcpu_apicv_active(vcpu)) - __avic_vcpu_put(vcpu); + avic_vcpu_put(vcpu); svm_prepare_host_switch(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 500348c1cb35..1bddd336a27e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -610,8 +610,8 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); -void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void __avic_vcpu_put(struct kvm_vcpu *vcpu); +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9bd86ecccdab..3a919e49129b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -229,6 +229,9 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; +/* Control for disabling CPU Fill buffer clear */ +static bool __read_mostly vmx_fb_clear_ctrl_available; + static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; @@ -360,6 +363,60 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +static void vmx_setup_fb_clear_ctrl(void) +{ + u64 msr; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA)) { + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_FB_CLEAR_CTRL) + vmx_fb_clear_ctrl_available = true; + } +} + +static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) +{ + u64 msr; + + if (!vmx->disable_fb_clear) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + msr |= FB_CLEAR_DIS; + wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + /* Cache the MSR value to avoid reading it later */ + vmx->msr_ia32_mcu_opt_ctrl = msr; +} + +static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) +{ + if (!vmx->disable_fb_clear) + return; + + vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; + wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); +} + +static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +{ + vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + + /* + * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS + * at VMEntry. Skip the MSR read/write when a guest has no use case to + * execute VERW. + */ + if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || + ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) + vmx->disable_fb_clear = false; +} + static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, @@ -2252,6 +2309,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ret = kvm_set_msr_common(vcpu, msr_info); } + /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ + if (msr_index == MSR_IA32_ARCH_CAPABILITIES) + vmx_update_fb_clear_dis(vcpu, vmx); + return ret; } @@ -4553,6 +4614,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); + + vmx_update_fb_clear_dis(vcpu, vmx); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -6772,6 +6835,11 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&mds_user_clear)) mds_clear_cpu_buffers(); + else if (static_branch_unlikely(&mmio_stale_data_clear) && + kvm_arch_has_assigned_device(vcpu->kvm)) + mds_clear_cpu_buffers(); + + vmx_disable_fb_clear(vmx); if (vcpu->arch.cr2 != native_read_cr2()) native_write_cr2(vcpu->arch.cr2); @@ -6781,6 +6849,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = native_read_cr2(); + vmx_enable_fb_clear(vmx); + guest_state_exit_irqoff(); } @@ -7709,7 +7779,9 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ); + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); return supported & BIT(reason); } @@ -8212,6 +8284,8 @@ static int __init vmx_init(void) return r; } + vmx_setup_fb_clear_ctrl(); + for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index b98c7e96697a..8d2342ede0c5 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -348,6 +348,8 @@ struct vcpu_vmx { u64 msr_ia32_feature_control_valid_bits; /* SGX Launch Control public key hash */ u64 msr_ia32_sgxlepubkeyhash[4]; + u64 msr_ia32_mcu_opt_ctrl; + bool disable_fb_clear; struct pt_desc pt_desc; struct lbr_desc lbr_desc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03fbfbbec460..1910e1e78b15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1617,6 +1617,9 @@ static u64 kvm_get_arch_capabilities(void) */ } + /* Guests don't need to know "Fill buffer clear control" exists */ + data &= ~ARCH_CAP_FB_CLEAR_CTRL; + return data; } @@ -9850,6 +9853,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) return; down_read(&vcpu->kvm->arch.apicv_update_lock); + preempt_disable(); activate = kvm_vcpu_apicv_activated(vcpu); @@ -9870,6 +9874,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); out: + preempt_enable(); up_read(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); diff --git a/block/bio.c b/block/bio.c index f92d0223247b..51c99f2c5c90 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1747,26 +1747,6 @@ bad: } EXPORT_SYMBOL(bioset_init); -/* - * Initialize and setup a new bio_set, based on the settings from - * another bio_set. - */ -int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) -{ - int flags; - - flags = 0; - if (src->bvec_pool.min_nr) - flags |= BIOSET_NEED_BVECS; - if (src->rescue_workqueue) - flags |= BIOSET_NEED_RESCUER; - if (src->cache) - flags |= BIOSET_PERCPU_CACHE; - - return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); -} -EXPORT_SYMBOL(bioset_init_from_src); - static int __init init_bio(void) { int i; diff --git a/certs/.gitignore b/certs/.gitignore index 56637aceaf81..cec5465f31c1 100644 --- a/certs/.gitignore +++ b/certs/.gitignore @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -/blacklist_hashes_checked +/blacklist_hash_list /extract-cert /x509_certificate_list /x509_revocation_list diff --git a/certs/Makefile b/certs/Makefile index cb1a9da3fc58..a8d628fd5f7b 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -7,22 +7,22 @@ obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o c obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o common.o obj-$(CONFIG_SYSTEM_REVOCATION_LIST) += revocation_certificates.o ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),) -quiet_cmd_check_blacklist_hashes = CHECK $(patsubst "%",%,$(2)) - cmd_check_blacklist_hashes = $(AWK) -f $(srctree)/scripts/check-blacklist-hashes.awk $(2); touch $@ -$(eval $(call config_filename,SYSTEM_BLACKLIST_HASH_LIST)) +$(obj)/blacklist_hashes.o: $(obj)/blacklist_hash_list +CFLAGS_blacklist_hashes.o := -I $(obj) -$(obj)/blacklist_hashes.o: $(obj)/blacklist_hashes_checked +quiet_cmd_check_and_copy_blacklist_hash_list = GEN $@ + cmd_check_and_copy_blacklist_hash_list = \ + $(AWK) -f $(srctree)/scripts/check-blacklist-hashes.awk $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) >&2; \ + cat $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) > $@ -CFLAGS_blacklist_hashes.o += -I$(srctree) - -targets += blacklist_hashes_checked -$(obj)/blacklist_hashes_checked: $(SYSTEM_BLACKLIST_HASH_LIST_SRCPREFIX)$(SYSTEM_BLACKLIST_HASH_LIST_FILENAME) scripts/check-blacklist-hashes.awk FORCE - $(call if_changed,check_blacklist_hashes,$(SYSTEM_BLACKLIST_HASH_LIST_SRCPREFIX)$(CONFIG_SYSTEM_BLACKLIST_HASH_LIST)) +$(obj)/blacklist_hash_list: $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) FORCE + $(call if_changed,check_and_copy_blacklist_hash_list) obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_hashes.o else obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_nohashes.o endif +targets += blacklist_hash_list quiet_cmd_extract_certs = CERT $@ cmd_extract_certs = $(obj)/extract-cert $(extract-cert-in) $@ @@ -33,7 +33,7 @@ $(obj)/system_certificates.o: $(obj)/x509_certificate_list $(obj)/x509_certificate_list: $(CONFIG_SYSTEM_TRUSTED_KEYS) $(obj)/extract-cert FORCE $(call if_changed,extract_certs) -targets += x509_certificate_list blacklist_hashes_checked +targets += x509_certificate_list # If module signing is requested, say by allyesconfig, but a key has not been # supplied, then one will need to be generated to make sure the build does not diff --git a/certs/blacklist_hashes.c b/certs/blacklist_hashes.c index 344892337be0..86d66fe11348 100644 --- a/certs/blacklist_hashes.c +++ b/certs/blacklist_hashes.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "blacklist.h" -const char __initdata *const blacklist_hashes[] = { -#include CONFIG_SYSTEM_BLACKLIST_HASH_LIST +const char __initconst *const blacklist_hashes[] = { +#include "blacklist_hash_list" , NULL }; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 2ef23fce0860..a97776ea9d99 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -564,6 +564,12 @@ ssize_t __weak cpu_show_srbds(struct device *dev, return sysfs_emit(buf, "Not affected\n"); } +ssize_t __weak cpu_show_mmio_stale_data(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -573,6 +579,7 @@ static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); +static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -584,6 +591,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_tsx_async_abort.attr, &dev_attr_itlb_multihit.attr, &dev_attr_srbds.attr, + &dev_attr_mmio_stale_data.attr, NULL }; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 69fd31ffb847..0b6c03643ddc 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -429,28 +429,40 @@ config ADI driver include crash and makedumpfile. config RANDOM_TRUST_CPU - bool "Trust the CPU manufacturer to initialize Linux's CRNG" + bool "Initialize RNG using CPU RNG instructions" + default y depends on ARCH_RANDOM - default n help - Assume that CPU manufacturer (e.g., Intel or AMD for RDSEED or - RDRAND, IBM for the S390 and Power PC architectures) is trustworthy - for the purposes of initializing Linux's CRNG. Since this is not - something that can be independently audited, this amounts to trusting - that CPU manufacturer (perhaps with the insistence or mandate - of a Nation State's intelligence or law enforcement agencies) - has not installed a hidden back door to compromise the CPU's - random number generation facilities. This can also be configured - at boot with "random.trust_cpu=on/off". + Initialize the RNG using random numbers supplied by the CPU's + RNG instructions (e.g. RDRAND), if supported and available. These + random numbers are never used directly, but are rather hashed into + the main input pool, and this happens regardless of whether or not + this option is enabled. Instead, this option controls whether the + they are credited and hence can initialize the RNG. Additionally, + other sources of randomness are always used, regardless of this + setting. Enabling this implies trusting that the CPU can supply high + quality and non-backdoored random numbers. + + Say Y here unless you have reason to mistrust your CPU or believe + its RNG facilities may be faulty. This may also be configured at + boot time with "random.trust_cpu=on/off". config RANDOM_TRUST_BOOTLOADER - bool "Trust the bootloader to initialize Linux's CRNG" - help - Some bootloaders can provide entropy to increase the kernel's initial - device randomness. Say Y here to assume the entropy provided by the - booloader is trustworthy so it will be added to the kernel's entropy - pool. Otherwise, say N here so it will be regarded as device input that - only mixes the entropy pool. This can also be configured at boot with - "random.trust_bootloader=on/off". + bool "Initialize RNG using bootloader-supplied seed" + default y + help + Initialize the RNG using a seed supplied by the bootloader or boot + environment (e.g. EFI or a bootloader-generated device tree). This + seed is not used directly, but is rather hashed into the main input + pool, and this happens regardless of whether or not this option is + enabled. Instead, this option controls whether the seed is credited + and hence can initialize the RNG. Additionally, other sources of + randomness are always used, regardless of this setting. Enabling + this implies trusting that the bootloader can supply high quality and + non-backdoored seeds. + + Say Y here unless you have reason to mistrust your bootloader or + believe its RNG facilities may be faulty. This may also be configured + at boot time with "random.trust_bootloader=on/off". endmenu diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index e856df7e285c..a6f3a8a2aca6 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -159,6 +159,8 @@ static int probe_common(struct virtio_device *vdev) goto err_find; } + virtio_device_ready(vdev); + /* we always have a pending entropy request */ request_entropy(vi); diff --git a/drivers/char/random.c b/drivers/char/random.c index b691b9d59503..655e327d425e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -650,7 +650,8 @@ static void __cold _credit_init_bits(size_t bits) if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(); /* Sets crng_init to CRNG_READY under base_crng.lock. */ - execute_in_process_context(crng_set_ready, &set_ready); + if (static_key_initialized) + execute_in_process_context(crng_set_ready, &set_ready); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); pr_notice("crng init done\n"); @@ -724,9 +725,8 @@ static void __cold _credit_init_bits(size_t bits) * **********************************************************************/ -static bool used_arch_random; -static bool trust_cpu __ro_after_init = IS_ENABLED(CONFIG_RANDOM_TRUST_CPU); -static bool trust_bootloader __ro_after_init = IS_ENABLED(CONFIG_RANDOM_TRUST_BOOTLOADER); +static bool trust_cpu __initdata = IS_ENABLED(CONFIG_RANDOM_TRUST_CPU); +static bool trust_bootloader __initdata = IS_ENABLED(CONFIG_RANDOM_TRUST_BOOTLOADER); static int __init parse_trust_cpu(char *arg) { return kstrtobool(arg, &trust_cpu); @@ -776,7 +776,7 @@ static struct notifier_block pm_notifier = { .notifier_call = random_pm_notifica int __init random_init(const char *command_line) { ktime_t now = ktime_get_real(); - unsigned int i, arch_bytes; + unsigned int i, arch_bits; unsigned long entropy; #if defined(LATENT_ENTROPY_PLUGIN) @@ -784,12 +784,12 @@ int __init random_init(const char *command_line) _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed)); #endif - for (i = 0, arch_bytes = BLAKE2S_BLOCK_SIZE; + for (i = 0, arch_bits = BLAKE2S_BLOCK_SIZE * 8; i < BLAKE2S_BLOCK_SIZE; i += sizeof(entropy)) { if (!arch_get_random_seed_long_early(&entropy) && !arch_get_random_long_early(&entropy)) { entropy = random_get_entropy(); - arch_bytes -= sizeof(entropy); + arch_bits -= sizeof(entropy) * 8; } _mix_pool_bytes(&entropy, sizeof(entropy)); } @@ -798,11 +798,18 @@ int __init random_init(const char *command_line) _mix_pool_bytes(command_line, strlen(command_line)); add_latent_entropy(); + /* + * If we were initialized by the bootloader before jump labels are + * initialized, then we should enable the static branch here, where + * it's guaranteed that jump labels have been initialized. + */ + if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) + crng_set_ready(NULL); + if (crng_ready()) crng_reseed(); else if (trust_cpu) - credit_init_bits(arch_bytes * 8); - used_arch_random = arch_bytes * 8 >= POOL_READY_BITS; + _credit_init_bits(arch_bits); WARN_ON(register_pm_notifier(&pm_notifier)); @@ -812,17 +819,6 @@ int __init random_init(const char *command_line) } /* - * Returns whether arch randomness has been mixed into the initial - * state of the RNG, regardless of whether or not that randomness - * was credited. Knowing this is only good for a very limited set - * of uses, such as early init printk pointer obfuscation. - */ -bool rng_has_arch_random(void) -{ - return used_arch_random; -} - -/* * Add device- or boot-specific data to the input pool to help * initialize it. * @@ -865,13 +861,12 @@ EXPORT_SYMBOL_GPL(add_hwgenerator_randomness); * Handle random seed passed by bootloader, and credit it if * CONFIG_RANDOM_TRUST_BOOTLOADER is set. */ -void __cold add_bootloader_randomness(const void *buf, size_t len) +void __init add_bootloader_randomness(const void *buf, size_t len) { mix_pool_bytes(buf, len); if (trust_bootloader) credit_init_bits(len * 8); } -EXPORT_SYMBOL_GPL(add_bootloader_randomness); #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); diff --git a/drivers/gpio/gpio-crystalcove.c b/drivers/gpio/gpio-crystalcove.c index b55c74a5e064..1ee62cd58582 100644 --- a/drivers/gpio/gpio-crystalcove.c +++ b/drivers/gpio/gpio-crystalcove.c @@ -15,6 +15,7 @@ #include <linux/platform_device.h> #include <linux/regmap.h> #include <linux/seq_file.h> +#include <linux/types.h> #define CRYSTALCOVE_GPIO_NUM 16 #define CRYSTALCOVE_VGPIO_NUM 95 @@ -110,8 +111,7 @@ static inline int to_reg(int gpio, enum ctrl_register reg_type) return reg + gpio % 8; } -static void crystalcove_update_irq_mask(struct crystalcove_gpio *cg, - int gpio) +static void crystalcove_update_irq_mask(struct crystalcove_gpio *cg, int gpio) { u8 mirqs0 = gpio < 8 ? MGPIO0IRQS0 : MGPIO1IRQS0; int mask = BIT(gpio % 8); @@ -140,8 +140,7 @@ static int crystalcove_gpio_dir_in(struct gpio_chip *chip, unsigned int gpio) return regmap_write(cg->regmap, reg, CTLO_INPUT_SET); } -static int crystalcove_gpio_dir_out(struct gpio_chip *chip, unsigned int gpio, - int value) +static int crystalcove_gpio_dir_out(struct gpio_chip *chip, unsigned int gpio, int value) { struct crystalcove_gpio *cg = gpiochip_get_data(chip); int reg = to_reg(gpio, CTRL_OUT); @@ -168,8 +167,7 @@ static int crystalcove_gpio_get(struct gpio_chip *chip, unsigned int gpio) return val & 0x1; } -static void crystalcove_gpio_set(struct gpio_chip *chip, - unsigned int gpio, int value) +static void crystalcove_gpio_set(struct gpio_chip *chip, unsigned int gpio, int value) { struct crystalcove_gpio *cg = gpiochip_get_data(chip); int reg = to_reg(gpio, CTRL_OUT); @@ -185,10 +183,10 @@ static void crystalcove_gpio_set(struct gpio_chip *chip, static int crystalcove_irq_type(struct irq_data *data, unsigned int type) { - struct crystalcove_gpio *cg = - gpiochip_get_data(irq_data_get_irq_chip_data(data)); + struct crystalcove_gpio *cg = gpiochip_get_data(irq_data_get_irq_chip_data(data)); + irq_hw_number_t hwirq = irqd_to_hwirq(data); - if (data->hwirq >= CRYSTALCOVE_GPIO_NUM) + if (hwirq >= CRYSTALCOVE_GPIO_NUM) return 0; switch (type) { @@ -215,22 +213,20 @@ static int crystalcove_irq_type(struct irq_data *data, unsigned int type) static void crystalcove_bus_lock(struct irq_data *data) { - struct crystalcove_gpio *cg = - gpiochip_get_data(irq_data_get_irq_chip_data(data)); + struct crystalcove_gpio *cg = gpiochip_get_data(irq_data_get_irq_chip_data(data)); mutex_lock(&cg->buslock); } static void crystalcove_bus_sync_unlock(struct irq_data *data) { - struct crystalcove_gpio *cg = - gpiochip_get_data(irq_data_get_irq_chip_data(data)); - int gpio = data->hwirq; + struct crystalcove_gpio *cg = gpiochip_get_data(irq_data_get_irq_chip_data(data)); + irq_hw_number_t hwirq = irqd_to_hwirq(data); if (cg->update & UPDATE_IRQ_TYPE) - crystalcove_update_irq_ctrl(cg, gpio); + crystalcove_update_irq_ctrl(cg, hwirq); if (cg->update & UPDATE_IRQ_MASK) - crystalcove_update_irq_mask(cg, gpio); + crystalcove_update_irq_mask(cg, hwirq); cg->update = 0; mutex_unlock(&cg->buslock); @@ -238,34 +234,43 @@ static void crystalcove_bus_sync_unlock(struct irq_data *data) static void crystalcove_irq_unmask(struct irq_data *data) { - struct crystalcove_gpio *cg = - gpiochip_get_data(irq_data_get_irq_chip_data(data)); + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct crystalcove_gpio *cg = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(data); - if (data->hwirq < CRYSTALCOVE_GPIO_NUM) { - cg->set_irq_mask = false; - cg->update |= UPDATE_IRQ_MASK; - } + if (hwirq >= CRYSTALCOVE_GPIO_NUM) + return; + + gpiochip_enable_irq(gc, hwirq); + + cg->set_irq_mask = false; + cg->update |= UPDATE_IRQ_MASK; } static void crystalcove_irq_mask(struct irq_data *data) { - struct crystalcove_gpio *cg = - gpiochip_get_data(irq_data_get_irq_chip_data(data)); + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct crystalcove_gpio *cg = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(data); - if (data->hwirq < CRYSTALCOVE_GPIO_NUM) { - cg->set_irq_mask = true; - cg->update |= UPDATE_IRQ_MASK; - } + if (hwirq >= CRYSTALCOVE_GPIO_NUM) + return; + + cg->set_irq_mask = true; + cg->update |= UPDATE_IRQ_MASK; + + gpiochip_disable_irq(gc, hwirq); } -static struct irq_chip crystalcove_irqchip = { +static const struct irq_chip crystalcove_irqchip = { .name = "Crystal Cove", .irq_mask = crystalcove_irq_mask, .irq_unmask = crystalcove_irq_unmask, .irq_set_type = crystalcove_irq_type, .irq_bus_lock = crystalcove_bus_lock, .irq_bus_sync_unlock = crystalcove_bus_sync_unlock, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, }; static irqreturn_t crystalcove_gpio_irq_handler(int irq, void *data) @@ -293,8 +298,7 @@ static irqreturn_t crystalcove_gpio_irq_handler(int irq, void *data) return IRQ_HANDLED; } -static void crystalcove_gpio_dbg_show(struct seq_file *s, - struct gpio_chip *chip) +static void crystalcove_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) { struct crystalcove_gpio *cg = gpiochip_get_data(chip); int gpio, offset; @@ -353,7 +357,7 @@ static int crystalcove_gpio_probe(struct platform_device *pdev) cg->regmap = pmic->regmap; girq = &cg->chip.irq; - girq->chip = &crystalcove_irqchip; + gpio_irq_chip_set_chip(girq, &crystalcove_irqchip); /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; girq->num_parents = 0; diff --git a/drivers/gpio/gpio-dln2.c b/drivers/gpio/gpio-dln2.c index 08b9e2cf4f2d..71fa437b491f 100644 --- a/drivers/gpio/gpio-dln2.c +++ b/drivers/gpio/gpio-dln2.c @@ -46,7 +46,6 @@ struct dln2_gpio { struct platform_device *pdev; struct gpio_chip gpio; - struct irq_chip irqchip; /* * Cache pin direction to save us one transfer, since the hardware has @@ -306,6 +305,7 @@ static void dln2_irq_unmask(struct irq_data *irqd) struct dln2_gpio *dln2 = gpiochip_get_data(gc); int pin = irqd_to_hwirq(irqd); + gpiochip_enable_irq(gc, pin); set_bit(pin, dln2->unmasked_irqs); } @@ -316,6 +316,7 @@ static void dln2_irq_mask(struct irq_data *irqd) int pin = irqd_to_hwirq(irqd); clear_bit(pin, dln2->unmasked_irqs); + gpiochip_disable_irq(gc, pin); } static int dln2_irq_set_type(struct irq_data *irqd, unsigned type) @@ -384,6 +385,17 @@ static void dln2_irq_bus_unlock(struct irq_data *irqd) mutex_unlock(&dln2->irq_lock); } +static const struct irq_chip dln2_irqchip = { + .name = "dln2-irq", + .irq_mask = dln2_irq_mask, + .irq_unmask = dln2_irq_unmask, + .irq_set_type = dln2_irq_set_type, + .irq_bus_lock = dln2_irq_bus_lock, + .irq_bus_sync_unlock = dln2_irq_bus_unlock, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static void dln2_gpio_event(struct platform_device *pdev, u16 echo, const void *data, int len) { @@ -465,15 +477,8 @@ static int dln2_gpio_probe(struct platform_device *pdev) dln2->gpio.direction_output = dln2_gpio_direction_output; dln2->gpio.set_config = dln2_gpio_set_config; - dln2->irqchip.name = "dln2-irq", - dln2->irqchip.irq_mask = dln2_irq_mask, - dln2->irqchip.irq_unmask = dln2_irq_unmask, - dln2->irqchip.irq_set_type = dln2_irq_set_type, - dln2->irqchip.irq_bus_lock = dln2_irq_bus_lock, - dln2->irqchip.irq_bus_sync_unlock = dln2_irq_bus_unlock, - girq = &dln2->gpio.irq; - girq->chip = &dln2->irqchip; + gpio_irq_chip_set_chip(girq, &dln2_irqchip); /* The event comes from the outside so no parent handler */ girq->parent_handler = NULL; girq->num_parents = 0; diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c index 04afe728e187..c22fcaa44a61 100644 --- a/drivers/gpio/gpio-dwapb.c +++ b/drivers/gpio/gpio-dwapb.c @@ -662,10 +662,9 @@ static int dwapb_get_clks(struct dwapb_gpio *gpio) gpio->clks[1].id = "db"; err = devm_clk_bulk_get_optional(gpio->dev, DWAPB_NR_CLOCKS, gpio->clks); - if (err) { - dev_err(gpio->dev, "Cannot get APB/Debounce clocks\n"); - return err; - } + if (err) + return dev_err_probe(gpio->dev, err, + "Cannot get APB/Debounce clocks\n"); err = clk_bulk_prepare_enable(DWAPB_NR_CLOCKS, gpio->clks); if (err) { diff --git a/drivers/gpio/gpio-merrifield.c b/drivers/gpio/gpio-merrifield.c index f3d1baeacbe9..72ac09a59702 100644 --- a/drivers/gpio/gpio-merrifield.c +++ b/drivers/gpio/gpio-merrifield.c @@ -220,10 +220,8 @@ static void mrfld_irq_ack(struct irq_data *d) raw_spin_unlock_irqrestore(&priv->lock, flags); } -static void mrfld_irq_unmask_mask(struct irq_data *d, bool unmask) +static void mrfld_irq_unmask_mask(struct mrfld_gpio *priv, u32 gpio, bool unmask) { - struct mrfld_gpio *priv = irq_data_get_irq_chip_data(d); - u32 gpio = irqd_to_hwirq(d); void __iomem *gimr = gpio_reg(&priv->chip, gpio, GIMR); unsigned long flags; u32 value; @@ -241,12 +239,20 @@ static void mrfld_irq_unmask_mask(struct irq_data *d, bool unmask) static void mrfld_irq_mask(struct irq_data *d) { - mrfld_irq_unmask_mask(d, false); + struct mrfld_gpio *priv = irq_data_get_irq_chip_data(d); + u32 gpio = irqd_to_hwirq(d); + + mrfld_irq_unmask_mask(priv, gpio, false); + gpiochip_disable_irq(&priv->chip, gpio); } static void mrfld_irq_unmask(struct irq_data *d) { - mrfld_irq_unmask_mask(d, true); + struct mrfld_gpio *priv = irq_data_get_irq_chip_data(d); + u32 gpio = irqd_to_hwirq(d); + + gpiochip_enable_irq(&priv->chip, gpio); + mrfld_irq_unmask_mask(priv, gpio, true); } static int mrfld_irq_set_type(struct irq_data *d, unsigned int type) @@ -329,13 +335,15 @@ static int mrfld_irq_set_wake(struct irq_data *d, unsigned int on) return 0; } -static struct irq_chip mrfld_irqchip = { +static const struct irq_chip mrfld_irqchip = { .name = "gpio-merrifield", .irq_ack = mrfld_irq_ack, .irq_mask = mrfld_irq_mask, .irq_unmask = mrfld_irq_unmask, .irq_set_type = mrfld_irq_set_type, .irq_set_wake = mrfld_irq_set_wake, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, }; static void mrfld_irq_handler(struct irq_desc *desc) @@ -482,7 +490,7 @@ static int mrfld_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *id return retval; girq = &priv->chip.irq; - girq->chip = &mrfld_irqchip; + gpio_irq_chip_set_chip(girq, &mrfld_irqchip); girq->init_hw = mrfld_irq_init_hw; girq->parent_handler = mrfld_irq_handler; girq->num_parents = 1; diff --git a/drivers/gpio/gpio-sch.c b/drivers/gpio/gpio-sch.c index acda4c5052d3..8a83f7bf4382 100644 --- a/drivers/gpio/gpio-sch.c +++ b/drivers/gpio/gpio-sch.c @@ -38,7 +38,6 @@ struct sch_gpio { struct gpio_chip chip; - struct irq_chip irqchip; spinlock_t lock; unsigned short iobase; unsigned short resume_base; @@ -218,11 +217,9 @@ static void sch_irq_ack(struct irq_data *d) spin_unlock_irqrestore(&sch->lock, flags); } -static void sch_irq_mask_unmask(struct irq_data *d, int val) +static void sch_irq_mask_unmask(struct gpio_chip *gc, irq_hw_number_t gpio_num, int val) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct sch_gpio *sch = gpiochip_get_data(gc); - irq_hw_number_t gpio_num = irqd_to_hwirq(d); unsigned long flags; spin_lock_irqsave(&sch->lock, flags); @@ -232,14 +229,32 @@ static void sch_irq_mask_unmask(struct irq_data *d, int val) static void sch_irq_mask(struct irq_data *d) { - sch_irq_mask_unmask(d, 0); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + + sch_irq_mask_unmask(gc, gpio_num, 0); + gpiochip_disable_irq(gc, gpio_num); } static void sch_irq_unmask(struct irq_data *d) { - sch_irq_mask_unmask(d, 1); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + + gpiochip_enable_irq(gc, gpio_num); + sch_irq_mask_unmask(gc, gpio_num, 1); } +static const struct irq_chip sch_irqchip = { + .name = "sch_gpio", + .irq_ack = sch_irq_ack, + .irq_mask = sch_irq_mask, + .irq_unmask = sch_irq_unmask, + .irq_set_type = sch_irq_type, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static u32 sch_gpio_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context) { struct sch_gpio *sch = context; @@ -367,14 +382,8 @@ static int sch_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sch); - sch->irqchip.name = "sch_gpio"; - sch->irqchip.irq_ack = sch_irq_ack; - sch->irqchip.irq_mask = sch_irq_mask; - sch->irqchip.irq_unmask = sch_irq_unmask; - sch->irqchip.irq_set_type = sch_irq_type; - girq = &sch->chip.irq; - girq->chip = &sch->irqchip; + gpio_irq_chip_set_chip(girq, &sch_irqchip); girq->num_parents = 0; girq->parents = NULL; girq->parent_handler = NULL; diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c index 16a0fae1e32e..c18b6b47384f 100644 --- a/drivers/gpio/gpio-wcove.c +++ b/drivers/gpio/gpio-wcove.c @@ -299,6 +299,8 @@ static void wcove_irq_unmask(struct irq_data *data) if (gpio >= WCOVE_GPIO_NUM) return; + gpiochip_enable_irq(chip, gpio); + wg->set_irq_mask = false; wg->update |= UPDATE_IRQ_MASK; } @@ -314,15 +316,19 @@ static void wcove_irq_mask(struct irq_data *data) wg->set_irq_mask = true; wg->update |= UPDATE_IRQ_MASK; + + gpiochip_disable_irq(chip, gpio); } -static struct irq_chip wcove_irqchip = { +static const struct irq_chip wcove_irqchip = { .name = "Whiskey Cove", .irq_mask = wcove_irq_mask, .irq_unmask = wcove_irq_unmask, .irq_set_type = wcove_irq_type, .irq_bus_lock = wcove_bus_lock, .irq_bus_sync_unlock = wcove_bus_sync_unlock, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, }; static irqreturn_t wcove_gpio_irq_handler(int irq, void *data) @@ -452,7 +458,7 @@ static int wcove_gpio_probe(struct platform_device *pdev) } girq = &wg->chip.irq; - girq->chip = &wcove_irqchip; + gpio_irq_chip_set_chip(girq, &wcove_irqchip); /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; girq->num_parents = 0; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index d21648a923ea..54c0473a51dd 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -33,6 +33,14 @@ struct dm_kobject_holder { * access their members! */ +/* + * For mempools pre-allocation at the table loading time. + */ +struct dm_md_mempools { + struct bio_set bs; + struct bio_set io_bs; +}; + struct mapped_device { struct mutex suspend_lock; @@ -110,8 +118,7 @@ struct mapped_device { /* * io objects are allocated from here. */ - struct bio_set io_bs; - struct bio_set bs; + struct dm_md_mempools *mempools; /* kobject and completion */ struct dm_kobject_holder kobj_holder; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 6087cdcaad46..a83b98a8d2a9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -319,7 +319,7 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; - r = blk_rq_prep_clone(clone, rq, &tio->md->bs, gfp_mask, + r = blk_rq_prep_clone(clone, rq, &tio->md->mempools->bs, gfp_mask, dm_rq_bio_constructor, tio); if (r) return r; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0e833a154b31..bd539afbfe88 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1038,17 +1038,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * return 0; } -void dm_table_free_md_mempools(struct dm_table *t) -{ - dm_free_md_mempools(t->mempools); - t->mempools = NULL; -} - -struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) -{ - return t->mempools; -} - static int setup_indexes(struct dm_table *t) { int i; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfb0a551bd88..d8f16183bf27 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -136,14 +136,6 @@ static int get_swap_bios(void) return latch; } -/* - * For mempools pre-allocation at the table loading time. - */ -struct dm_md_mempools { - struct bio_set bs; - struct bio_set io_bs; -}; - struct table_device { struct list_head list; refcount_t count; @@ -581,7 +573,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs); + clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); /* Set default bdev, but target must bio_set_dev() before issuing IO */ clone->bi_bdev = md->disk->part0; @@ -628,7 +620,8 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, } else { struct mapped_device *md = ci->io->md; - clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs); + clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, + &md->mempools->bs); if (!clone) return NULL; /* Set default bdev, but target must bio_set_dev() before issuing IO */ @@ -1023,23 +1016,19 @@ static void clone_endio(struct bio *bio) struct dm_io *io = tio->io; struct mapped_device *md = io->md; - if (likely(bio->bi_bdev != md->disk->part0)) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - - if (unlikely(error == BLK_STS_TARGET)) { - if (bio_op(bio) == REQ_OP_DISCARD && - !bdev_max_discard_sectors(bio->bi_bdev)) - disable_discard(md); - else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !q->limits.max_write_zeroes_sectors) - disable_write_zeroes(md); - } - - if (static_branch_unlikely(&zoned_enabled) && - unlikely(blk_queue_is_zoned(q))) - dm_zone_endio(io, bio); + if (unlikely(error == BLK_STS_TARGET)) { + if (bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev)) + disable_discard(md); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !bdev_write_zeroes_sectors(bio->bi_bdev)) + disable_write_zeroes(md); } + if (static_branch_unlikely(&zoned_enabled) && + unlikely(blk_queue_is_zoned(bdev_get_queue(bio->bi_bdev)))) + dm_zone_endio(io, bio); + if (endio) { int r = endio(ti, bio, &error); switch (r) { @@ -1876,8 +1865,7 @@ static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) destroy_workqueue(md->wq); - bioset_exit(&md->bs); - bioset_exit(&md->io_bs); + dm_free_md_mempools(md->mempools); if (md->dax_dev) { dax_remove_host(md->disk); @@ -2049,48 +2037,6 @@ static void free_dev(struct mapped_device *md) kvfree(md); } -static int __bind_mempools(struct mapped_device *md, struct dm_table *t) -{ - struct dm_md_mempools *p = dm_table_get_md_mempools(t); - int ret = 0; - - if (dm_table_bio_based(t)) { - /* - * The md may already have mempools that need changing. - * If so, reload bioset because front_pad may have changed - * because a different table was loaded. - */ - bioset_exit(&md->bs); - bioset_exit(&md->io_bs); - - } else if (bioset_initialized(&md->bs)) { - /* - * There's no need to reload with request-based dm - * because the size of front_pad doesn't change. - * Note for future: If you are to reload bioset, - * prep-ed requests in the queue may refer - * to bio from the old bioset, so you must walk - * through the queue to unprep. - */ - goto out; - } - - BUG_ON(!p || - bioset_initialized(&md->bs) || - bioset_initialized(&md->io_bs)); - - ret = bioset_init_from_src(&md->bs, &p->bs); - if (ret) - goto out; - ret = bioset_init_from_src(&md->io_bs, &p->io_bs); - if (ret) - bioset_exit(&md->bs); -out: - /* mempool bind completed, no longer need any mempools in the table */ - dm_table_free_md_mempools(t); - return ret; -} - /* * Bind a table to the device. */ @@ -2144,12 +2090,28 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * immutable singletons - used to optimize dm_mq_queue_rq. */ md->immutable_target = dm_table_get_immutable_target(t); - } - ret = __bind_mempools(md, t); - if (ret) { - old_map = ERR_PTR(ret); - goto out; + /* + * There is no need to reload with request-based dm because the + * size of front_pad doesn't change. + * + * Note for future: If you are to reload bioset, prep-ed + * requests in the queue may refer to bio from the old bioset, + * so you must walk through the queue to unprep. + */ + if (!md->mempools) { + md->mempools = t->mempools; + t->mempools = NULL; + } + } else { + /* + * The md may already have mempools that need changing. + * If so, reload bioset because front_pad may have changed + * because a different table was loaded. + */ + dm_free_md_mempools(md->mempools); + md->mempools = t->mempools; + t->mempools = NULL; } ret = dm_table_set_restrictions(t, md->queue, limits); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 3f89664fea01..a8405ce305a9 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -71,8 +71,6 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_bio_based(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); -void dm_table_free_md_mempools(struct dm_table *t); -struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index e6f48786949c..02bd3cf9a260 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -332,7 +332,6 @@ static void bgmac_remove(struct bcma_device *core) bcma_mdio_mii_unregister(bgmac->mii_bus); bgmac_enet_remove(bgmac); bcma_set_drvdata(core, NULL); - kfree(bgmac); } static struct bcma_driver bgmac_bcma_driver = { diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index e1cae253412c..c1ac2f746714 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5763,25 +5763,38 @@ static netdev_features_t ice_fix_features(struct net_device *netdev, netdev_features_t features) { struct ice_netdev_priv *np = netdev_priv(netdev); - netdev_features_t supported_vlan_filtering; - netdev_features_t requested_vlan_filtering; - struct ice_vsi *vsi = np->vsi; - - requested_vlan_filtering = features & NETIF_VLAN_FILTERING_FEATURES; - - /* make sure supported_vlan_filtering works for both SVM and DVM */ - supported_vlan_filtering = NETIF_F_HW_VLAN_CTAG_FILTER; - if (ice_is_dvm_ena(&vsi->back->hw)) - supported_vlan_filtering |= NETIF_F_HW_VLAN_STAG_FILTER; - - if (requested_vlan_filtering && - requested_vlan_filtering != supported_vlan_filtering) { - if (requested_vlan_filtering & NETIF_F_HW_VLAN_CTAG_FILTER) { - netdev_warn(netdev, "cannot support requested VLAN filtering settings, enabling all supported VLAN filtering settings\n"); - features |= supported_vlan_filtering; + netdev_features_t req_vlan_fltr, cur_vlan_fltr; + bool cur_ctag, cur_stag, req_ctag, req_stag; + + cur_vlan_fltr = netdev->features & NETIF_VLAN_FILTERING_FEATURES; + cur_ctag = cur_vlan_fltr & NETIF_F_HW_VLAN_CTAG_FILTER; + cur_stag = cur_vlan_fltr & NETIF_F_HW_VLAN_STAG_FILTER; + + req_vlan_fltr = features & NETIF_VLAN_FILTERING_FEATURES; + req_ctag = req_vlan_fltr & NETIF_F_HW_VLAN_CTAG_FILTER; + req_stag = req_vlan_fltr & NETIF_F_HW_VLAN_STAG_FILTER; + + if (req_vlan_fltr != cur_vlan_fltr) { + if (ice_is_dvm_ena(&np->vsi->back->hw)) { + if (req_ctag && req_stag) { + features |= NETIF_VLAN_FILTERING_FEATURES; + } else if (!req_ctag && !req_stag) { + features &= ~NETIF_VLAN_FILTERING_FEATURES; + } else if ((!cur_ctag && req_ctag && !cur_stag) || + (!cur_stag && req_stag && !cur_ctag)) { + features |= NETIF_VLAN_FILTERING_FEATURES; + netdev_warn(netdev, "802.1Q and 802.1ad VLAN filtering must be either both on or both off. VLAN filtering has been enabled for both types.\n"); + } else if ((cur_ctag && !req_ctag && cur_stag) || + (cur_stag && !req_stag && cur_ctag)) { + features &= ~NETIF_VLAN_FILTERING_FEATURES; + netdev_warn(netdev, "802.1Q and 802.1ad VLAN filtering must be either both on or both off. VLAN filtering has been disabled for both types.\n"); + } } else { - netdev_warn(netdev, "cannot support requested VLAN filtering settings, clearing all supported VLAN filtering settings\n"); - features &= ~supported_vlan_filtering; + if (req_vlan_fltr & NETIF_F_HW_VLAN_STAG_FILTER) + netdev_warn(netdev, "cannot support requested 802.1ad filtering setting in SVM mode\n"); + + if (req_vlan_fltr & NETIF_F_HW_VLAN_CTAG_FILTER) + features |= NETIF_F_HW_VLAN_CTAG_FILTER; } } diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 662947c882e8..ef9344ef0d8e 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -2271,7 +2271,7 @@ static int ice_ptp_init_tx_e822(struct ice_pf *pf, struct ice_ptp_tx *tx, u8 port) { tx->quad = port / ICE_PORTS_PER_QUAD; - tx->quad_offset = tx->quad * INDEX_PER_PORT; + tx->quad_offset = (port % ICE_PORTS_PER_QUAD) * INDEX_PER_PORT; tx->len = INDEX_PER_PORT; return ice_ptp_alloc_tx_tracker(tx); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.h b/drivers/net/ethernet/intel/ice/ice_ptp.h index afd048d69959..10e396abf130 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp.h @@ -49,6 +49,37 @@ struct ice_perout_channel { * To allow multiple ports to access the shared register block independently, * the blocks are split up so that indexes are assigned to each port based on * hardware logical port number. + * + * The timestamp blocks are handled differently for E810- and E822-based + * devices. In E810 devices, each port has its own block of timestamps, while in + * E822 there is a need to logically break the block of registers into smaller + * chunks based on the port number to avoid collisions. + * + * Example for port 5 in E810: + * +--------+--------+--------+--------+--------+--------+--------+--------+ + * |register|register|register|register|register|register|register|register| + * | block | block | block | block | block | block | block | block | + * | for | for | for | for | for | for | for | for | + * | port 0 | port 1 | port 2 | port 3 | port 4 | port 5 | port 6 | port 7 | + * +--------+--------+--------+--------+--------+--------+--------+--------+ + * ^^ + * || + * |--- quad offset is always 0 + * ---- quad number + * + * Example for port 5 in E822: + * +-----------------------------+-----------------------------+ + * | register block for quad 0 | register block for quad 1 | + * |+------+------+------+------+|+------+------+------+------+| + * ||port 0|port 1|port 2|port 3|||port 0|port 1|port 2|port 3|| + * |+------+------+------+------+|+------+------+------+------+| + * +-----------------------------+-------^---------------------+ + * ^ | + * | --- quad offset* + * ---- quad number + * + * * PHY port 5 is port 1 in quad 1 + * */ /** diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c index cd8e6b50968c..7adf9ddf129e 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c @@ -504,6 +504,11 @@ int ice_reset_vf(struct ice_vf *vf, u32 flags) } if (ice_is_vf_disabled(vf)) { + vsi = ice_get_vf_vsi(vf); + if (WARN_ON(!vsi)) + return -EINVAL; + ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); + ice_vsi_stop_all_rx_rings(vsi); dev_dbg(dev, "VF is already disabled, there is no need for resetting it, telling VM, all is fine %d\n", vf->vf_id); return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 1d9b84c3937a..4547bc1f7cee 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -1569,35 +1569,27 @@ error_param: */ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) { - enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; struct virtchnl_vsi_queue_config_info *qci = (struct virtchnl_vsi_queue_config_info *)msg; struct virtchnl_queue_pair_info *qpi; struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; - int i, q_idx; + int i = -1, q_idx; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) goto error_param; - } - if (!ice_vc_isvalid_vsi_id(vf, qci->vsi_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + if (!ice_vc_isvalid_vsi_id(vf, qci->vsi_id)) goto error_param; - } vsi = ice_get_vf_vsi(vf); - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + if (!vsi) goto error_param; - } if (qci->num_queue_pairs > ICE_MAX_RSS_QS_PER_VF || qci->num_queue_pairs > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { dev_err(ice_pf_to_dev(pf), "VF-%d requesting more than supported number of queues: %d\n", vf->vf_id, min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } @@ -1610,7 +1602,6 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) !ice_vc_isvalid_ring_len(qpi->txq.ring_len) || !ice_vc_isvalid_ring_len(qpi->rxq.ring_len) || !ice_vc_isvalid_q_id(vf, qci->vsi_id, qpi->txq.queue_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } @@ -1620,7 +1611,6 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) * for selected "vsi" */ if (q_idx >= vsi->alloc_txq || q_idx >= vsi->alloc_rxq) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } @@ -1630,14 +1620,13 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) vsi->tx_rings[i]->count = qpi->txq.ring_len; /* Disable any existing queue first */ - if (ice_vf_vsi_dis_single_txq(vf, vsi, q_idx)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + if (ice_vf_vsi_dis_single_txq(vf, vsi, q_idx)) goto error_param; - } /* Configure a queue with the requested settings */ if (ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_warn(ice_pf_to_dev(pf), "VF-%d failed to configure TX queue %d\n", + vf->vf_id, i); goto error_param; } } @@ -1651,17 +1640,13 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) if (qpi->rxq.databuffer_size != 0 && (qpi->rxq.databuffer_size > ((16 * 1024) - 128) || - qpi->rxq.databuffer_size < 1024)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + qpi->rxq.databuffer_size < 1024)) goto error_param; - } vsi->rx_buf_len = qpi->rxq.databuffer_size; vsi->rx_rings[i]->rx_buf_len = vsi->rx_buf_len; if (qpi->rxq.max_pkt_size > max_frame_size || - qpi->rxq.max_pkt_size < 64) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + qpi->rxq.max_pkt_size < 64) goto error_param; - } vsi->max_frame = qpi->rxq.max_pkt_size; /* add space for the port VLAN since the VF driver is @@ -1672,16 +1657,30 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) vsi->max_frame += VLAN_HLEN; if (ice_vsi_cfg_single_rxq(vsi, q_idx)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_warn(ice_pf_to_dev(pf), "VF-%d failed to configure RX queue %d\n", + vf->vf_id, i); goto error_param; } } } + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, + VIRTCHNL_STATUS_SUCCESS, NULL, 0); error_param: + /* disable whatever we can */ + for (; i >= 0; i--) { + if (ice_vsi_ctrl_one_rx_ring(vsi, false, i, true)) + dev_err(ice_pf_to_dev(pf), "VF-%d could not disable RX queue %d\n", + vf->vf_id, i); + if (ice_vf_vsi_dis_single_txq(vf, vsi, i)) + dev_err(ice_pf_to_dev(pf), "VF-%d could not disable TX queue %d\n", + vf->vf_id, i); + } + /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, v_ret, - NULL, 0); + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, + VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); } /** diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index fa7bcd2c1892..1760930ec0c4 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2039,6 +2039,7 @@ static int axienet_probe(struct platform_device *pdev) } if (!IS_ENABLED(CONFIG_64BIT) && lp->features & XAE_FEATURE_DMA_64BIT) { dev_err(&pdev->dev, "64-bit addressable DMA is not compatible with 32-bit archecture\n"); + ret = -EINVAL; goto cleanup_clk; } diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 45c3c4a1101b..9fb567524220 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -99,6 +99,7 @@ struct sixpack { unsigned int rx_count; unsigned int rx_count_cooked; + spinlock_t rxlock; int mtu; /* Our mtu (to spot changes!) */ int buffsize; /* Max buffers sizes */ @@ -565,6 +566,7 @@ static int sixpack_open(struct tty_struct *tty) sp->dev = dev; spin_lock_init(&sp->lock); + spin_lock_init(&sp->rxlock); refcount_set(&sp->refcnt, 1); init_completion(&sp->dead); @@ -913,6 +915,7 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd) sp->led_state = 0x60; /* fill trailing bytes with zeroes */ sp->tty->ops->write(sp->tty, &sp->led_state, 1); + spin_lock_bh(&sp->rxlock); rest = sp->rx_count; if (rest != 0) for (i = rest; i <= 3; i++) @@ -930,6 +933,7 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd) sp_bump(sp, 0); } sp->rx_count_cooked = 0; + spin_unlock_bh(&sp->rxlock); } break; case SIXP_TX_URUN: printk(KERN_DEBUG "6pack: TX underrun\n"); @@ -959,8 +963,11 @@ sixpack_decode(struct sixpack *sp, const unsigned char *pre_rbuff, int count) decode_prio_command(sp, inbyte); else if ((inbyte & SIXP_STD_CMD_MASK) != 0) decode_std_command(sp, inbyte); - else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) + else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) { + spin_lock_bh(&sp->rxlock); decode_data(sp, inbyte); + spin_unlock_bh(&sp->rxlock); + } } } diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c index a8db1a19011b..c7047f5d7a9b 100644 --- a/drivers/net/phy/aquantia_main.c +++ b/drivers/net/phy/aquantia_main.c @@ -34,6 +34,8 @@ #define MDIO_AN_VEND_PROV 0xc400 #define MDIO_AN_VEND_PROV_1000BASET_FULL BIT(15) #define MDIO_AN_VEND_PROV_1000BASET_HALF BIT(14) +#define MDIO_AN_VEND_PROV_5000BASET_FULL BIT(11) +#define MDIO_AN_VEND_PROV_2500BASET_FULL BIT(10) #define MDIO_AN_VEND_PROV_DOWNSHIFT_EN BIT(4) #define MDIO_AN_VEND_PROV_DOWNSHIFT_MASK GENMASK(3, 0) #define MDIO_AN_VEND_PROV_DOWNSHIFT_DFLT 4 @@ -231,9 +233,20 @@ static int aqr_config_aneg(struct phy_device *phydev) phydev->advertising)) reg |= MDIO_AN_VEND_PROV_1000BASET_HALF; + /* Handle the case when the 2.5G and 5G speeds are not advertised */ + if (linkmode_test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + phydev->advertising)) + reg |= MDIO_AN_VEND_PROV_2500BASET_FULL; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + phydev->advertising)) + reg |= MDIO_AN_VEND_PROV_5000BASET_FULL; + ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_VEND_PROV, MDIO_AN_VEND_PROV_1000BASET_HALF | - MDIO_AN_VEND_PROV_1000BASET_FULL, reg); + MDIO_AN_VEND_PROV_1000BASET_FULL | + MDIO_AN_VEND_PROV_2500BASET_FULL | + MDIO_AN_VEND_PROV_5000BASET_FULL, reg); if (ret < 0) return ret; if (ret > 0) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 7a8c11a26eb5..4704ed6f00ef 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1750,7 +1750,7 @@ static const struct driver_info ax88179_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1763,7 +1763,7 @@ static const struct driver_info ax88178a_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1776,7 +1776,7 @@ static const struct driver_info cypress_GX3_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1789,7 +1789,7 @@ static const struct driver_info dlink_dub1312_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1802,7 +1802,7 @@ static const struct driver_info sitecom_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1815,7 +1815,7 @@ static const struct driver_info samsung_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1828,7 +1828,7 @@ static const struct driver_info lenovo_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1841,7 +1841,7 @@ static const struct driver_info belkin_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1854,7 +1854,7 @@ static const struct driver_info toshiba_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1867,7 +1867,7 @@ static const struct driver_info mct_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1880,7 +1880,7 @@ static const struct driver_info at_umc2000_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1893,7 +1893,7 @@ static const struct driver_info at_umc200_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; @@ -1906,7 +1906,7 @@ static const struct driver_info at_umc2000sp_info = { .link_reset = ax88179_link_reset, .reset = ax88179_reset, .stop = ax88179_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_SEND_ZLP, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, }; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 466da01ba2e3..2cb833b3006a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -312,6 +312,7 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct netdev_queue *queue = NULL; struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; @@ -329,6 +330,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; + queue = netdev_get_tx_queue(dev, rxq); /* The napi pointer is available when an XDP program is * attached or when GRO is enabled @@ -340,6 +342,8 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { + if (queue) + txq_trans_cond_update(queue); if (!use_napi) dev_lstats_add(dev, length); } else { diff --git a/drivers/platform/mellanox/Kconfig b/drivers/platform/mellanox/Kconfig index 72df4b8f4dd8..09c7829e95c4 100644 --- a/drivers/platform/mellanox/Kconfig +++ b/drivers/platform/mellanox/Kconfig @@ -85,7 +85,7 @@ config NVSW_SN2201 depends on I2C depends on REGMAP_I2C help - This driver provides support for the Nvidia SN2201 platfom. + This driver provides support for the Nvidia SN2201 platform. The SN2201 is a highly integrated for one rack unit system with L3 management switches. It has 48 x 1Gbps RJ45 + 4 x 100G QSFP28 ports in a compact 1RU form factor. The system also including a diff --git a/drivers/platform/mellanox/nvsw-sn2201.c b/drivers/platform/mellanox/nvsw-sn2201.c index 0bcdc7c75007..2923daf63b75 100644 --- a/drivers/platform/mellanox/nvsw-sn2201.c +++ b/drivers/platform/mellanox/nvsw-sn2201.c @@ -326,7 +326,7 @@ static struct resource nvsw_sn2201_lpc_res[] = { }; /* SN2201 I2C platform data. */ -struct mlxreg_core_hotplug_platform_data nvsw_sn2201_i2c_data = { +static struct mlxreg_core_hotplug_platform_data nvsw_sn2201_i2c_data = { .irq = NVSW_SN2201_CPLD_SYSIRQ, }; diff --git a/drivers/platform/x86/barco-p50-gpio.c b/drivers/platform/x86/barco-p50-gpio.c index 05534287bc26..8dd672339485 100644 --- a/drivers/platform/x86/barco-p50-gpio.c +++ b/drivers/platform/x86/barco-p50-gpio.c @@ -405,11 +405,14 @@ MODULE_DEVICE_TABLE(dmi, dmi_ids); static int __init p50_module_init(void) { struct resource res = DEFINE_RES_IO(P50_GPIO_IO_PORT_BASE, P50_PORT_CMD + 1); + int ret; if (!dmi_first_match(dmi_ids)) return -ENODEV; - platform_driver_register(&p50_gpio_driver); + ret = platform_driver_register(&p50_gpio_driver); + if (ret) + return ret; gpio_pdev = platform_device_register_simple(DRIVER_NAME, PLATFORM_DEVID_NONE, &res, 1); if (IS_ERR(gpio_pdev)) { diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index 1ef606e3ef80..497ad2f64a51 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -140,6 +140,7 @@ static u8 gigabyte_wmi_detect_sensor_usability(struct wmi_device *wdev) }} static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M DS3H-CF"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M S2H V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE AX V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE"), @@ -156,6 +157,7 @@ static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 GAMING X"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 I AORUS PRO WIFI"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 UD"), + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("Z690M AORUS ELITE AX DDR4"), { } }; diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 667f94bba905..0d8cb22e30df 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -38,6 +38,7 @@ MODULE_ALIAS("wmi:5FB7F034-2C63-45e9-BE91-3D44E2C707E4"); #define HPWMI_EVENT_GUID "95F24279-4D7B-4334-9387-ACCDC67EF61C" #define HPWMI_BIOS_GUID "5FB7F034-2C63-45e9-BE91-3D44E2C707E4" #define HP_OMEN_EC_THERMAL_PROFILE_OFFSET 0x95 +#define zero_if_sup(tmp) (zero_insize_support?0:sizeof(tmp)) // use when zero insize is required /* DMI board names of devices that should use the omen specific path for * thermal profiles. @@ -220,6 +221,7 @@ static struct input_dev *hp_wmi_input_dev; static struct platform_device *hp_wmi_platform_dev; static struct platform_profile_handler platform_profile_handler; static bool platform_profile_support; +static bool zero_insize_support; static struct rfkill *wifi_rfkill; static struct rfkill *bluetooth_rfkill; @@ -290,14 +292,16 @@ static int hp_wmi_perform_query(int query, enum hp_wmi_command command, struct bios_return *bios_return; union acpi_object *obj = NULL; struct bios_args *args = NULL; - int mid, actual_outsize, ret; + int mid, actual_insize, actual_outsize; size_t bios_args_size; + int ret; mid = encode_outsize_for_pvsz(outsize); if (WARN_ON(mid < 0)) return mid; - bios_args_size = struct_size(args, data, insize); + actual_insize = max(insize, 128); + bios_args_size = struct_size(args, data, actual_insize); args = kmalloc(bios_args_size, GFP_KERNEL); if (!args) return -ENOMEM; @@ -374,7 +378,7 @@ static int hp_wmi_read_int(int query) int val = 0, ret; ret = hp_wmi_perform_query(query, HPWMI_READ, &val, - 0, sizeof(val)); + zero_if_sup(val), sizeof(val)); if (ret) return ret < 0 ? ret : -EINVAL; @@ -410,7 +414,8 @@ static int hp_wmi_get_tablet_mode(void) return -ENODEV; ret = hp_wmi_perform_query(HPWMI_SYSTEM_DEVICE_MODE, HPWMI_READ, - system_device_mode, 0, sizeof(system_device_mode)); + system_device_mode, zero_if_sup(system_device_mode), + sizeof(system_device_mode)); if (ret < 0) return ret; @@ -497,7 +502,7 @@ static int hp_wmi_fan_speed_max_get(void) int val = 0, ret; ret = hp_wmi_perform_query(HPWMI_FAN_SPEED_MAX_GET_QUERY, HPWMI_GM, - &val, 0, sizeof(val)); + &val, zero_if_sup(val), sizeof(val)); if (ret) return ret < 0 ? ret : -EINVAL; @@ -509,7 +514,7 @@ static int __init hp_wmi_bios_2008_later(void) { int state = 0; int ret = hp_wmi_perform_query(HPWMI_FEATURE_QUERY, HPWMI_READ, &state, - 0, sizeof(state)); + zero_if_sup(state), sizeof(state)); if (!ret) return 1; @@ -520,7 +525,7 @@ static int __init hp_wmi_bios_2009_later(void) { u8 state[128]; int ret = hp_wmi_perform_query(HPWMI_FEATURE2_QUERY, HPWMI_READ, &state, - 0, sizeof(state)); + zero_if_sup(state), sizeof(state)); if (!ret) return 1; @@ -598,7 +603,7 @@ static int hp_wmi_rfkill2_refresh(void) int err, i; err = hp_wmi_perform_query(HPWMI_WIRELESS2_QUERY, HPWMI_READ, &state, - 0, sizeof(state)); + zero_if_sup(state), sizeof(state)); if (err) return err; @@ -1007,7 +1012,7 @@ static int __init hp_wmi_rfkill2_setup(struct platform_device *device) int err, i; err = hp_wmi_perform_query(HPWMI_WIRELESS2_QUERY, HPWMI_READ, &state, - 0, sizeof(state)); + zero_if_sup(state), sizeof(state)); if (err) return err < 0 ? err : -EINVAL; @@ -1483,11 +1488,15 @@ static int __init hp_wmi_init(void) { int event_capable = wmi_has_guid(HPWMI_EVENT_GUID); int bios_capable = wmi_has_guid(HPWMI_BIOS_GUID); - int err; + int err, tmp = 0; if (!bios_capable && !event_capable) return -ENODEV; + if (hp_wmi_perform_query(HPWMI_HARDWARE_QUERY, HPWMI_READ, &tmp, + sizeof(tmp), sizeof(tmp)) == HPWMI_RET_INVALID_PARAMETERS) + zero_insize_support = true; + if (event_capable) { err = hp_wmi_input_setup(); if (err) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 216d31e3403d..79cff1fc675c 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -122,6 +122,12 @@ static const struct dmi_system_id dmi_vgbs_allow_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Spectre x360 Convertible 15-df0xxx"), }, }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Surface Go"), + }, + }, { } }; diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index edaf22e5ae98..40183bda7894 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -1912,6 +1912,7 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &tgl_reg_map), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &tgl_reg_map), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_reg_map), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &tgl_reg_map), {} }; diff --git a/drivers/platform/x86/intel/pmt/crashlog.c b/drivers/platform/x86/intel/pmt/crashlog.c index 34daf9df168b..ace1239bc0a0 100644 --- a/drivers/platform/x86/intel/pmt/crashlog.c +++ b/drivers/platform/x86/intel/pmt/crashlog.c @@ -282,7 +282,7 @@ static int pmt_crashlog_probe(struct auxiliary_device *auxdev, auxiliary_set_drvdata(auxdev, priv); for (i = 0; i < intel_vsec_dev->num_resources; i++) { - struct intel_pmt_entry *entry = &priv->entry[i].entry; + struct intel_pmt_entry *entry = &priv->entry[priv->num_entries].entry; ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, intel_vsec_dev, i); if (ret < 0) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 256ec6d08c16..9d01a3e3c26a 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -9795,7 +9795,7 @@ static int ipr_alloc_mem(struct ipr_ioa_cfg *ioa_cfg) GFP_KERNEL); if (!ioa_cfg->hrrq[i].host_rrq) { - while (--i > 0) + while (--i >= 0) dma_free_coherent(&pdev->dev, sizeof(u32) * ioa_cfg->hrrq[i].size, ioa_cfg->hrrq[i].host_rrq, @@ -10068,7 +10068,7 @@ static int ipr_request_other_msi_irqs(struct ipr_ioa_cfg *ioa_cfg, ioa_cfg->vectors_info[i].desc, &ioa_cfg->hrrq[i]); if (rc) { - while (--i >= 0) + while (--i > 0) free_irq(pci_irq_vector(pdev, i), &ioa_cfg->hrrq[i]); return rc; diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index b1be0dd0337a..f5d74958b664 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -420,8 +420,6 @@ int lpfc_sli_issue_iocb_wait(struct lpfc_hba *, uint32_t, uint32_t); void lpfc_sli_abort_fcp_cmpl(struct lpfc_hba *, struct lpfc_iocbq *, struct lpfc_iocbq *); -void lpfc_sli4_abort_fcp_cmpl(struct lpfc_hba *h, struct lpfc_iocbq *i, - struct lpfc_wcqe_complete *w); void lpfc_sli_free_hbq(struct lpfc_hba *, struct hbq_dmabuf *); @@ -630,7 +628,7 @@ void lpfc_nvmet_invalidate_host(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp); void lpfc_nvme_abort_fcreq_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, - struct lpfc_wcqe_complete *abts_cmpl); + struct lpfc_iocbq *rspiocb); void lpfc_create_multixri_pools(struct lpfc_hba *phba); void lpfc_create_destroy_pools(struct lpfc_hba *phba); void lpfc_move_xri_pvt_to_pbl(struct lpfc_hba *phba, u32 hwqid); diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 9d36b20fb878..13dfe285493d 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -197,7 +197,7 @@ lpfc_ct_reject_event(struct lpfc_nodelist *ndlp, memset(bpl, 0, sizeof(struct ulp_bde64)); bpl->addrHigh = le32_to_cpu(putPaddrHigh(mp->phys)); bpl->addrLow = le32_to_cpu(putPaddrLow(mp->phys)); - bpl->tus.f.bdeFlags = BUFF_TYPE_BLP_64; + bpl->tus.f.bdeFlags = BUFF_TYPE_BDE_64; bpl->tus.f.bdeSize = (LPFC_CT_PREAMBLE - 4); bpl->tus.w = le32_to_cpu(bpl->tus.w); diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 07f9a6e61e10..3fababb7c181 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -2998,10 +2998,7 @@ lpfc_cmpl_els_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, ndlp->nlp_DID, ulp_status, ulp_word4); - /* Call NLP_EVT_DEVICE_RM if link is down or LOGO is aborted */ if (lpfc_error_lost_link(ulp_status, ulp_word4)) { - lpfc_disc_state_machine(vport, ndlp, cmdiocb, - NLP_EVT_DEVICE_RM); skip_recovery = 1; goto out; } @@ -3021,18 +3018,10 @@ lpfc_cmpl_els_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, spin_unlock_irq(&ndlp->lock); lpfc_disc_state_machine(vport, ndlp, cmdiocb, NLP_EVT_DEVICE_RM); - lpfc_els_free_iocb(phba, cmdiocb); - lpfc_nlp_put(ndlp); - - /* Presume the node was released. */ - return; + goto out_rsrc_free; } out: - /* Driver is done with the IO. */ - lpfc_els_free_iocb(phba, cmdiocb); - lpfc_nlp_put(ndlp); - /* At this point, the LOGO processing is complete. NOTE: For a * pt2pt topology, we are assuming the NPortID will only change * on link up processing. For a LOGO / PLOGI initiated by the @@ -3059,6 +3048,10 @@ out: ndlp->nlp_DID, ulp_status, ulp_word4, tmo, vport->num_disc_nodes); + + lpfc_els_free_iocb(phba, cmdiocb); + lpfc_nlp_put(ndlp); + lpfc_disc_start(vport); return; } @@ -3075,6 +3068,10 @@ out: lpfc_disc_state_machine(vport, ndlp, cmdiocb, NLP_EVT_DEVICE_RM); } +out_rsrc_free: + /* Driver is done with the I/O. */ + lpfc_els_free_iocb(phba, cmdiocb); + lpfc_nlp_put(ndlp); } /** diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index 8511369d2cf8..f024415731ac 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4487,6 +4487,9 @@ struct wqe_common { #define wqe_sup_SHIFT 6 #define wqe_sup_MASK 0x00000001 #define wqe_sup_WORD word11 +#define wqe_ffrq_SHIFT 6 +#define wqe_ffrq_MASK 0x00000001 +#define wqe_ffrq_WORD word11 #define wqe_wqec_SHIFT 7 #define wqe_wqec_MASK 0x00000001 #define wqe_wqec_WORD word11 diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 93b94c64518d..750dd1e9f2cc 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -12188,7 +12188,7 @@ lpfc_sli_enable_msi(struct lpfc_hba *phba) rc = pci_enable_msi(phba->pcidev); if (!rc) lpfc_printf_log(phba, KERN_INFO, LOG_INIT, - "0462 PCI enable MSI mode success.\n"); + "0012 PCI enable MSI mode success.\n"); else { lpfc_printf_log(phba, KERN_INFO, LOG_INIT, "0471 PCI enable MSI mode failed (%d)\n", rc); diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 639f86635127..b86ff9fcdf0c 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -834,7 +834,8 @@ lpfc_rcv_logo(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, lpfc_nvmet_invalidate_host(phba, ndlp); if (ndlp->nlp_DID == Fabric_DID) { - if (vport->port_state <= LPFC_FDISC) + if (vport->port_state <= LPFC_FDISC || + vport->fc_flag & FC_PT2PT) goto out; lpfc_linkdown_port(vport); spin_lock_irq(shost->host_lock); diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 335e90633933..cd10ee6482fc 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -1065,25 +1065,37 @@ lpfc_nvme_io_cmd_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn, nCmd->rcv_rsplen = wcqe->parameter; nCmd->status = 0; + /* Get the NVME cmd details for this unique error. */ + cp = (struct nvme_fc_cmd_iu *)nCmd->cmdaddr; + ep = (struct nvme_fc_ersp_iu *)nCmd->rspaddr; + /* Check if this is really an ERSP */ if (nCmd->rcv_rsplen == LPFC_NVME_ERSP_LEN) { lpfc_ncmd->status = IOSTAT_SUCCESS; lpfc_ncmd->result = 0; lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME, - "6084 NVME Completion ERSP: " - "xri %x placed x%x\n", - lpfc_ncmd->cur_iocbq.sli4_xritag, - wcqe->total_data_placed); + "6084 NVME FCP_ERR ERSP: " + "xri %x placed x%x opcode x%x cmd_id " + "x%x cqe_status x%x\n", + lpfc_ncmd->cur_iocbq.sli4_xritag, + wcqe->total_data_placed, + cp->sqe.common.opcode, + cp->sqe.common.command_id, + ep->cqe.status); break; } lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, "6081 NVME Completion Protocol Error: " "xri %x status x%x result x%x " - "placed x%x\n", + "placed x%x opcode x%x cmd_id x%x, " + "cqe_status x%x\n", lpfc_ncmd->cur_iocbq.sli4_xritag, lpfc_ncmd->status, lpfc_ncmd->result, - wcqe->total_data_placed); + wcqe->total_data_placed, + cp->sqe.common.opcode, + cp->sqe.common.command_id, + ep->cqe.status); break; case IOSTAT_LOCAL_REJECT: /* Let fall through to set command final state. */ @@ -1195,7 +1207,8 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, { struct lpfc_hba *phba = vport->phba; struct nvmefc_fcp_req *nCmd = lpfc_ncmd->nvmeCmd; - struct lpfc_iocbq *pwqeq = &(lpfc_ncmd->cur_iocbq); + struct nvme_common_command *sqe; + struct lpfc_iocbq *pwqeq = &lpfc_ncmd->cur_iocbq; union lpfc_wqe128 *wqe = &pwqeq->wqe; uint32_t req_len; @@ -1252,8 +1265,14 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, cstat->control_requests++; } - if (pnode->nlp_nvme_info & NLP_NVME_NSLER) + if (pnode->nlp_nvme_info & NLP_NVME_NSLER) { bf_set(wqe_erp, &wqe->generic.wqe_com, 1); + sqe = &((struct nvme_fc_cmd_iu *) + nCmd->cmdaddr)->sqe.common; + if (sqe->opcode == nvme_admin_async_event) + bf_set(wqe_ffrq, &wqe->generic.wqe_com, 1); + } + /* * Finish initializing those WQE fields that are independent * of the nvme_cmnd request_buffer @@ -1787,7 +1806,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport, * lpfc_nvme_abort_fcreq_cmpl - Complete an NVME FCP abort request. * @phba: Pointer to HBA context object * @cmdiocb: Pointer to command iocb object. - * @abts_cmpl: Pointer to wcqe complete object. + * @rspiocb: Pointer to response iocb object. * * This is the callback function for any NVME FCP IO that was aborted. * @@ -1796,8 +1815,10 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport, **/ void lpfc_nvme_abort_fcreq_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, - struct lpfc_wcqe_complete *abts_cmpl) + struct lpfc_iocbq *rspiocb) { + struct lpfc_wcqe_complete *abts_cmpl = &rspiocb->wcqe_cmpl; + lpfc_printf_log(phba, KERN_INFO, LOG_NVME, "6145 ABORT_XRI_CN completing on rpi x%x " "original iotag x%x, abort cmd iotag x%x " @@ -1840,6 +1861,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport, struct lpfc_nvme_fcpreq_priv *freqpriv; unsigned long flags; int ret_val; + struct nvme_fc_cmd_iu *cp; /* Validate pointers. LLDD fault handling with transport does * have timing races. @@ -1963,10 +1985,16 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport, return; } + /* + * Get Command Id from cmd to plug into response. This + * code is not needed in the next NVME Transport drop. + */ + cp = (struct nvme_fc_cmd_iu *)lpfc_nbuf->nvmeCmd->cmdaddr; lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS, "6138 Transport Abort NVME Request Issued for " - "ox_id x%x\n", - nvmereq_wqe->sli4_xritag); + "ox_id x%x nvme opcode x%x nvme cmd_id x%x\n", + nvmereq_wqe->sli4_xritag, cp->sqe.common.opcode, + cp->sqe.common.command_id); return; out_unlock: diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index d43968203248..ba5e4016262e 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -6062,6 +6062,9 @@ lpfc_device_reset_handler(struct scsi_cmnd *cmnd) int status; u32 logit = LOG_FCP; + if (!rport) + return FAILED; + rdata = rport->dd_data; if (!rdata || !rdata->pnode) { lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, @@ -6140,6 +6143,9 @@ lpfc_target_reset_handler(struct scsi_cmnd *cmnd) unsigned long flags; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq); + if (!rport) + return FAILED; + rdata = rport->dd_data; if (!rdata || !rdata->pnode) { lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 6ed696c4602a..80ac3a051c19 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -1930,7 +1930,7 @@ lpfc_issue_cmf_sync_wqe(struct lpfc_hba *phba, u32 ms, u64 total) sync_buf = __lpfc_sli_get_iocbq(phba); if (!sync_buf) { lpfc_printf_log(phba, KERN_ERR, LOG_CGN_MGMT, - "6213 No available WQEs for CMF_SYNC_WQE\n"); + "6244 No available WQEs for CMF_SYNC_WQE\n"); ret_val = ENOMEM; goto out_unlock; } @@ -3805,7 +3805,7 @@ lpfc_sli_process_sol_iocb(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, set_job_ulpword4(cmdiocbp, IOERR_ABORT_REQUESTED); /* - * For SLI4, irsiocb contains + * For SLI4, irspiocb contains * NO_XRI in sli_xritag, it * shall not affect releasing * sgl (xri) process. @@ -3823,7 +3823,7 @@ lpfc_sli_process_sol_iocb(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, } } } - (cmdiocbp->cmd_cmpl) (phba, cmdiocbp, saveq); + cmdiocbp->cmd_cmpl(phba, cmdiocbp, saveq); } else lpfc_sli_release_iocbq(phba, cmdiocbp); } else { @@ -4063,8 +4063,7 @@ lpfc_sli_handle_fast_ring_event(struct lpfc_hba *phba, cmdiocbq->cmd_flag &= ~LPFC_DRIVER_ABORTED; if (cmdiocbq->cmd_cmpl) { spin_unlock_irqrestore(&phba->hbalock, iflag); - (cmdiocbq->cmd_cmpl)(phba, cmdiocbq, - &rspiocbq); + cmdiocbq->cmd_cmpl(phba, cmdiocbq, &rspiocbq); spin_lock_irqsave(&phba->hbalock, iflag); } break; @@ -10288,7 +10287,7 @@ __lpfc_sli_issue_iocb_s3(struct lpfc_hba *phba, uint32_t ring_number, * @flag: Flag indicating if this command can be put into txq. * * __lpfc_sli_issue_fcp_io_s3 is wrapper function to invoke lockless func to - * send an iocb command to an HBA with SLI-4 interface spec. + * send an iocb command to an HBA with SLI-3 interface spec. * * This function takes the hbalock before invoking the lockless version. * The function will return success after it successfully submit the wqe to @@ -12740,7 +12739,7 @@ lpfc_sli_wake_iocb_wait(struct lpfc_hba *phba, cmdiocbq->cmd_cmpl = cmdiocbq->wait_cmd_cmpl; cmdiocbq->wait_cmd_cmpl = NULL; if (cmdiocbq->cmd_cmpl) - (cmdiocbq->cmd_cmpl)(phba, cmdiocbq, NULL); + cmdiocbq->cmd_cmpl(phba, cmdiocbq, NULL); else lpfc_sli_release_iocbq(phba, cmdiocbq); return; @@ -12754,9 +12753,9 @@ lpfc_sli_wake_iocb_wait(struct lpfc_hba *phba, /* Set the exchange busy flag for task management commands */ if ((cmdiocbq->cmd_flag & LPFC_IO_FCP) && - !(cmdiocbq->cmd_flag & LPFC_IO_LIBDFC)) { + !(cmdiocbq->cmd_flag & LPFC_IO_LIBDFC)) { lpfc_cmd = container_of(cmdiocbq, struct lpfc_io_buf, - cur_iocbq); + cur_iocbq); if (rspiocbq && (rspiocbq->cmd_flag & LPFC_EXCHANGE_BUSY)) lpfc_cmd->flags |= LPFC_SBUF_XBUSY; else @@ -13896,7 +13895,7 @@ void lpfc_sli4_els_xri_abort_event_proc(struct lpfc_hba *phba) * @irspiocbq: Pointer to work-queue completion queue entry. * * This routine handles an ELS work-queue completion event and construct - * a pseudo response ELS IODBQ from the SLI4 ELS WCQE for the common + * a pseudo response ELS IOCBQ from the SLI4 ELS WCQE for the common * discovery engine to handle. * * Return: Pointer to the receive IOCBQ, NULL otherwise. @@ -13940,7 +13939,7 @@ lpfc_sli4_els_preprocess_rspiocbq(struct lpfc_hba *phba, if (bf_get(lpfc_wcqe_c_xb, wcqe)) { spin_lock_irqsave(&phba->hbalock, iflags); - cmdiocbq->cmd_flag |= LPFC_EXCHANGE_BUSY; + irspiocbq->cmd_flag |= LPFC_EXCHANGE_BUSY; spin_unlock_irqrestore(&phba->hbalock, iflags); } @@ -14799,7 +14798,7 @@ lpfc_sli4_fp_handle_fcp_wcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, /* Pass the cmd_iocb and the wcqe to the upper layer */ memcpy(&cmdiocbq->wcqe_cmpl, wcqe, sizeof(struct lpfc_wcqe_complete)); - (cmdiocbq->cmd_cmpl)(phba, cmdiocbq, cmdiocbq); + cmdiocbq->cmd_cmpl(phba, cmdiocbq, cmdiocbq); } else { lpfc_printf_log(phba, KERN_WARNING, LOG_SLI, "0375 FCP cmdiocb not callback function " @@ -18956,7 +18955,7 @@ lpfc_sli4_send_seq_to_ulp(struct lpfc_vport *vport, /* Free iocb created in lpfc_prep_seq */ list_for_each_entry_safe(curr_iocb, next_iocb, - &iocbq->list, list) { + &iocbq->list, list) { list_del_init(&curr_iocb->list); lpfc_sli_release_iocbq(phba, curr_iocb); } diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 4fab79ed58ed..2ab6f7db64d8 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. * *******************************************************************/ -#define LPFC_DRIVER_VERSION "14.2.0.3" +#define LPFC_DRIVER_VERSION "14.2.0.4" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 37d46ae5c61d..9a1ae52bb621 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -5369,6 +5369,7 @@ static int _base_assign_fw_reported_qd(struct MPT3SAS_ADAPTER *ioc) Mpi2ConfigReply_t mpi_reply; Mpi2SasIOUnitPage1_t *sas_iounit_pg1 = NULL; Mpi26PCIeIOUnitPage1_t pcie_iounit_pg1; + u16 depth; int sz; int rc = 0; @@ -5380,7 +5381,7 @@ static int _base_assign_fw_reported_qd(struct MPT3SAS_ADAPTER *ioc) goto out; /* sas iounit page 1 */ sz = offsetof(Mpi2SasIOUnitPage1_t, PhyData); - sas_iounit_pg1 = kzalloc(sz, GFP_KERNEL); + sas_iounit_pg1 = kzalloc(sizeof(Mpi2SasIOUnitPage1_t), GFP_KERNEL); if (!sas_iounit_pg1) { pr_err("%s: failure at %s:%d/%s()!\n", ioc->name, __FILE__, __LINE__, __func__); @@ -5393,16 +5394,16 @@ static int _base_assign_fw_reported_qd(struct MPT3SAS_ADAPTER *ioc) ioc->name, __FILE__, __LINE__, __func__); goto out; } - ioc->max_wideport_qd = - (le16_to_cpu(sas_iounit_pg1->SASWideMaxQueueDepth)) ? - le16_to_cpu(sas_iounit_pg1->SASWideMaxQueueDepth) : - MPT3SAS_SAS_QUEUE_DEPTH; - ioc->max_narrowport_qd = - (le16_to_cpu(sas_iounit_pg1->SASNarrowMaxQueueDepth)) ? - le16_to_cpu(sas_iounit_pg1->SASNarrowMaxQueueDepth) : - MPT3SAS_SAS_QUEUE_DEPTH; - ioc->max_sata_qd = (sas_iounit_pg1->SATAMaxQDepth) ? - sas_iounit_pg1->SATAMaxQDepth : MPT3SAS_SATA_QUEUE_DEPTH; + + depth = le16_to_cpu(sas_iounit_pg1->SASWideMaxQueueDepth); + ioc->max_wideport_qd = (depth ? depth : MPT3SAS_SAS_QUEUE_DEPTH); + + depth = le16_to_cpu(sas_iounit_pg1->SASNarrowMaxQueueDepth); + ioc->max_narrowport_qd = (depth ? depth : MPT3SAS_SAS_QUEUE_DEPTH); + + depth = sas_iounit_pg1->SATAMaxQDepth; + ioc->max_sata_qd = (depth ? depth : MPT3SAS_SATA_QUEUE_DEPTH); + /* pcie iounit page 1 */ rc = mpt3sas_config_get_pcie_iounit_pg1(ioc, &mpi_reply, &pcie_iounit_pg1, sizeof(Mpi26PCIeIOUnitPage1_t)); diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index bfce60183a6e..836ddc476764 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -4031,7 +4031,7 @@ pmcraid_register_interrupt_handler(struct pmcraid_instance *pinstance) return 0; out_unwind: - while (--i > 0) + while (--i >= 0) free_irq(pci_irq_vector(pdev, i), &pinstance->hrrq_vector[i]); pci_free_irq_vectors(pdev); return rc; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 895b56c8f25e..a1a2ac09066f 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3072,7 +3072,7 @@ static void sd_read_cpr(struct scsi_disk *sdkp) goto out; /* We must have at least a 64B header and one 32B range descriptor */ - vpd_len = get_unaligned_be16(&buffer[2]) + 3; + vpd_len = get_unaligned_be16(&buffer[2]) + 4; if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Ranges VPD page\n"); diff --git a/drivers/scsi/vmw_pvscsi.h b/drivers/scsi/vmw_pvscsi.h index 51a82f7803d3..9d16cf925483 100644 --- a/drivers/scsi/vmw_pvscsi.h +++ b/drivers/scsi/vmw_pvscsi.h @@ -331,8 +331,8 @@ struct PVSCSIRingReqDesc { u8 tag; u8 bus; u8 target; - u8 vcpuHint; - u8 unused[59]; + u16 vcpuHint; + u8 unused[58]; } __packed; /* diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b7a955479156..1b6d46b86f81 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -107,7 +107,7 @@ struct mlx5_vdpa_virtqueue { /* Resources for implementing the notification channel from the device * to the driver. fwqp is the firmware end of an RC connection; the - * other end is vqqp used by the driver. cq is is where completions are + * other end is vqqp used by the driver. cq is where completions are * reported. */ struct mlx5_vdpa_cq cq; @@ -1814,12 +1814,13 @@ static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd) id = mlx5vdpa16_to_cpu(mvdev, vlan); mac_vlan_del(ndev, ndev->config.mac, id, true); + status = VIRTIO_NET_OK; break; default: - break; -} + break; + } -return status; + return status; } static void mlx5_cvq_kick_handler(struct work_struct *work) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index d503848b3b6e..776ad7496f53 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1345,9 +1345,9 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->minor = ret; dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT; - dev->dev = device_create(vduse_class, NULL, - MKDEV(MAJOR(vduse_major), dev->minor), - dev, "%s", config->name); + dev->dev = device_create_with_groups(vduse_class, NULL, + MKDEV(MAJOR(vduse_major), dev->minor), + dev, vduse_dev_groups, "%s", config->name); if (IS_ERR(dev->dev)) { ret = PTR_ERR(dev->dev); goto err_dev; @@ -1596,7 +1596,6 @@ static int vduse_init(void) return PTR_ERR(vduse_class); vduse_class->devnode = vduse_devnode; - vduse_class->dev_groups = vduse_dev_groups; ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse"); if (ret) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 935a1d0ddb97..5ad2596c6e8a 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -499,6 +499,8 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, ops->set_vq_ready(vdpa, idx, s.num); return 0; case VHOST_VDPA_GET_VRING_GROUP: + if (!ops->get_vq_group) + return -EOPNOTSUPP; s.index = idx; s.num = ops->get_vq_group(vdpa, idx); if (s.num >= vdpa->ngroups) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 14e2043d7685..eab55accf381 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -292,7 +292,7 @@ __vringh_iov(struct vringh *vrh, u16 i, int (*copy)(const struct vringh *vrh, void *dst, const void *src, size_t len)) { - int err, count = 0, up_next, desc_max; + int err, count = 0, indirect_count = 0, up_next, desc_max; struct vring_desc desc, *descs; struct vringh_range range = { -1ULL, 0 }, slowrange; bool slow = false; @@ -349,7 +349,12 @@ __vringh_iov(struct vringh *vrh, u16 i, continue; } - if (count++ == vrh->vring.num) { + if (up_next == -1) + count++; + else + indirect_count++; + + if (count > vrh->vring.num || indirect_count > desc_max) { vringh_bad("Descriptor loop in %p", descs); err = -ELOOP; goto fail; @@ -411,6 +416,7 @@ __vringh_iov(struct vringh *vrh, u16 i, i = return_from_indirect(vrh, &up_next, &descs, &desc_max); slow = false; + indirect_count = 0; } else break; } diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index f9a36bc7ac27..c9bec3813e94 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -255,7 +255,7 @@ static void vm_set_status(struct virtio_device *vdev, u8 status) /* * Per memory-barriers.txt, wmb() is not needed to guarantee - * that the the cache coherent memory writes have completed + * that the cache coherent memory writes have completed * before writing to the MMIO region. */ writel(status, vm_dev->base + VIRTIO_MMIO_STATUS); @@ -701,6 +701,7 @@ static int vm_cmdline_set(const char *device, if (!vm_cmdline_parent_registered) { err = device_register(&vm_cmdline_parent); if (err) { + put_device(&vm_cmdline_parent); pr_err("Failed to register parent device!\n"); return err; } diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index a0fa14f28a7f..b790f30b2b56 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -469,7 +469,7 @@ void vp_modern_set_status(struct virtio_pci_modern_device *mdev, /* * Per memory-barriers.txt, wmb() is not needed to guarantee - * that the the cache coherent memory writes have completed + * that the cache coherent memory writes have completed * before writing to the MMIO region. */ vp_iowrite8(status, &cfg->device_status); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 1b219c21d15e..6acabc2e7dc9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -124,7 +124,7 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode) static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return netfs_i_cookie(&v9inode->netfs.inode); + return netfs_i_cookie(&v9inode->netfs); #else return NULL; #endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 90c6c1ba03ab..a8f512b44a85 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -66,13 +66,12 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) } /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request - * @mapping: unused mapping of request to cleanup - * @priv: private data to cleanup, a fid, guaranted non-null. + * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq + * @rreq: The I/O request to clean up */ -static void v9fs_req_cleanup(struct address_space *mapping, void *priv) +static void v9fs_free_request(struct netfs_io_request *rreq) { - struct p9_fid *fid = priv; + struct p9_fid *fid = rreq->netfs_priv; p9_client_clunk(fid); } @@ -94,9 +93,9 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, + .free_request = v9fs_free_request, .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, - .cleanup = v9fs_req_cleanup, }; /** @@ -274,7 +273,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata); + retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); if (retval < 0) return retval; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e660c6348b9d..419d2f3cf2c2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode) */ static void v9fs_set_netfs_context(struct inode *inode) { - netfs_inode_init(inode, &v9fs_req_ops); + struct v9fs_inode *v9inode = V9FS_I(inode); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); } int v9fs_init_inode(struct v9fs_session_info *v9ses, diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 3a5bbffdf053..d7d9402ff718 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_inode_init(inode, NULL); + netfs_inode_init(&vnode->netfs, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index 4de7af7c2f09..42118a4f3383 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -382,17 +382,17 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; } -static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +static void afs_free_request(struct netfs_io_request *rreq) { - key_put(netfs_priv); + key_put(rreq->netfs_priv); } const struct netfs_request_ops afs_req_ops = { .init_request = afs_init_request, + .free_request = afs_free_request, .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, .issue_read = afs_issue_read, - .cleanup = afs_priv_cleanup, }; int afs_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 22811e9eacf5..89630acbc2cc 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_inode_init(&vnode->netfs.inode, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 984b113a9107..a6f25d9e75b5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -670,7 +670,7 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return netfs_i_cookie(&vnode->netfs.inode); + return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 94a3d247924b..cc665cef0abe 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -9,8 +9,7 @@ #include <linux/slab.h> #include "internal.h" -unsigned __read_mostly afs_volume_gc_delay = 10; -unsigned __read_mostly afs_volume_record_life = 60 * 60; +static unsigned __read_mostly afs_volume_record_life = 60 * 60; /* * Insert a volume into a cell. If there's an existing volume record, that is diff --git a/fs/afs/write.c b/fs/afs/write.c index f80a6096d91c..2c885b22de34 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata); + ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); if (ret < 0) return ret; diff --git a/fs/attr.c b/fs/attr.c index 66899b6e9bd8..dbe996b0dedf 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, const struct inode *inode, kgid_t gid) { kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && - (in_group_p(gid) || gid_eq(gid, inode->i_gid))) - return true; + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { + kgid_t mapped_gid; + + if (gid_eq(gid, inode->i_gid)) + return true; + mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); + if (in_group_p(mapped_gid)) + return true; + } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; if (gid_eq(kgid, INVALID_GID) && @@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { + kgid_t mapped_gid; + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; + + if (ia_valid & ATTR_GID) + mapped_gid = mapped_kgid_fs(mnt_userns, + i_user_ns(inode), attr->ia_gid); + else + mapped_gid = i_gid_into_mnt(mnt_userns, inode); + /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - i_gid_into_mnt(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_p(mapped_gid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f5f116ed1b9e..6dee88815491 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -394,11 +394,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) return 0; } -static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +static void ceph_netfs_free_request(struct netfs_io_request *rreq) { - struct inode *inode = mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - int got = (uintptr_t)priv; + struct ceph_inode_info *ci = ceph_inode(rreq->inode); + int got = (uintptr_t)rreq->netfs_priv; if (got) ceph_put_cap_refs(ci, got); @@ -406,12 +405,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, + .free_request = ceph_netfs_free_request, .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, - .cleanup = ceph_readahead_cleanup, }; #ifdef CONFIG_CEPH_FSCACHE @@ -1322,10 +1321,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; int r; - r = netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); if (r == 0) folio_wait_fscache(folio); if (r < 0) { diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 26c6ae06e2f4..dc502daac49a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write); static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - return netfs_i_cookie(&ci->netfs.inode); + return netfs_i_cookie(&ci->netfs); } static inline void ceph_fscache_resize(struct inode *inode, loff_t to) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 650746b3ba99..56c53ab3618e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -460,7 +460,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) dout("alloc_inode %p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_inode_init(&ci->netfs.inode, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops); spin_lock_init(&ci->i_ceph_lock); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c85d9a378325..8f2e003e0590 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1086,7 +1086,7 @@ struct file_system_type cifs_fs_type = { }; MODULE_ALIAS_FS("cifs"); -static struct file_system_type smb3_fs_type = { +struct file_system_type smb3_fs_type = { .owner = THIS_MODULE, .name = "smb3", .init_fs_context = smb3_init_fs_context, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index dd7e070ca243..b17be47a8e59 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry) return (unsigned long) dentry->d_fsdata; } -extern struct file_system_type cifs_fs_type; +extern struct file_system_type cifs_fs_type, smb3_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d46702f5a663..1849e3411487 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) if (!server->hostname) return -EINVAL; + /* if server hostname isn't populated, there's nothing to do here */ + if (server->hostname[0] == '\0') + return 0; + len = strlen(server->hostname) + 3; unc = kmalloc(len, GFP_KERNEL); diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index ab9a51d0125c..aa3b941a5555 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -61,7 +61,7 @@ void cifs_fscache_fill_coherency(struct inode *inode, static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - return netfs_i_cookie(inode); + return netfs_i_cookie(&CIFS_I(inode)->netfs); } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index cbc3b433f502..c69e1240d730 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1211,18 +1211,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void .data = data, .sb = NULL, }; + struct file_system_type **fs_type = (struct file_system_type *[]) { + &cifs_fs_type, &smb3_fs_type, NULL, + }; - iterate_supers_type(&cifs_fs_type, f, &sd); - - if (!sd.sb) - return ERR_PTR(-EINVAL); - /* - * Grab an active reference in order to prevent automounts (DFS links) - * of expiring and then freeing up our cifs superblock pointer while - * we're doing failover. - */ - cifs_sb_active(sd.sb); - return sd.sb; + for (; *fs_type; fs_type++) { + iterate_supers_type(*fs_type, f, &sd); + if (sd.sb) { + /* + * Grab an active reference in order to prevent automounts (DFS links) + * of expiring and then freeing up our cifs superblock pointer while + * we're doing failover. + */ + cifs_sb_active(sd.sb); + return sd.sb; + } + } + return ERR_PTR(-EINVAL); } static void __cifs_put_super(struct super_block *sb) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3b7915af1f62..0bece97547d4 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -301,7 +301,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* Auth */ ctx.domainauto = ses->domainAuto; ctx.domainname = ses->domainName; - ctx.server_hostname = ses->server->hostname; + + /* no hostname for extra channels */ + ctx.server_hostname = ""; + ctx.username = ses->user_name; ctx.password = ses->password; ctx.sectype = ses->sectype; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0e8c85249579..eaf975f1ad89 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -288,6 +288,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; goto failed; + } else if (rc) { + mutex_unlock(&ses->session_mutex); + goto out; } } else { mutex_unlock(&ses->session_mutex); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index d37e012386f3..42f892c5712e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -297,6 +297,7 @@ zero_out: /** * netfs_write_begin - Helper to prepare for writing + * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from * @pos: File position at which the write will begin @@ -326,12 +327,12 @@ zero_out: * * This is usable whether or not caching is enabled. */ -int netfs_write_begin(struct file *file, struct address_space *mapping, +int netfs_write_begin(struct netfs_inode *ctx, + struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **_folio, void **_fsdata) { struct netfs_io_request *rreq; - struct netfs_inode *ctx = netfs_inode(file_inode(file )); struct folio *folio; unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index c6afa605b63b..e17cdf53f6a7 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); - netfs_clear_subrequests(rreq, false); - if (rreq->netfs_priv) - rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_clear_subrequests(rreq, false); + if (rreq->netfs_ops->free_request) + rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); kfree(rreq); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index f172412447f5..9cb2d590c036 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -309,11 +309,12 @@ nfsd_file_put(struct nfsd_file *nf) if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { nfsd_file_flush(nf); nfsd_file_put_noref(nf); - } else { + } else if (nf->nf_file) { nfsd_file_put_noref(nf); - if (nf->nf_file) - nfsd_file_schedule_laundrette(); - } + nfsd_file_schedule_laundrette(); + } else + nfsd_file_put_noref(nf); + if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) nfsd_file_gc(); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 1cf3738ef1ea..992ee987f273 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -403,7 +403,6 @@ enum { extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); -extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 54dc2f9a2d56..2c7477354744 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -65,6 +65,9 @@ extern ssize_t cpu_show_tsx_async_abort(struct device *dev, extern ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_mmio_stale_data(struct device *dev, + struct device_attribute *attr, + char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/include/linux/crc-itu-t.h b/include/linux/crc-itu-t.h index a4367051e192..2f991a427ade 100644 --- a/include/linux/crc-itu-t.h +++ b/include/linux/crc-itu-t.h @@ -4,7 +4,7 @@ * * Implements the standard CRC ITU-T V.41: * Width 16 - * Poly 0x1021 (x^16 + x^12 + x^15 + 1) + * Poly 0x1021 (x^16 + x^12 + x^5 + 1) * Init 0 */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b34ff2cdbc4f..c29ab4c0cd5c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -227,6 +227,7 @@ struct page { * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. * @lru: Least Recently Used list; tracks how recently this folio was used. + * @mlock_count: Number of times this folio has been pinned by mlock(). * @mapping: The file this page belongs to, or refers to the anon_vma for * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, @@ -255,10 +256,14 @@ struct folio { unsigned long flags; union { struct list_head lru; + /* private: avoid cluttering the output */ struct { void *__filler; + /* public: */ unsigned int mlock_count; + /* private: */ }; + /* public: */ }; struct address_space *mapping; pgoff_t index; diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 6dbb4c9ce50d..1773e5df8e65 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -206,7 +206,9 @@ struct netfs_io_request { */ struct netfs_request_ops { int (*init_request)(struct netfs_io_request *rreq, struct file *file); + void (*free_request)(struct netfs_io_request *rreq); int (*begin_cache_operation)(struct netfs_io_request *rreq); + void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); @@ -214,7 +216,6 @@ struct netfs_request_ops { int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio *folio, void **_fsdata); void (*done)(struct netfs_io_request *rreq); - void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; /* @@ -277,7 +278,8 @@ struct netfs_cache_ops { struct readahead_control; extern void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); -extern int netfs_write_begin(struct file *, struct address_space *, +extern int netfs_write_begin(struct netfs_inode *, + struct file *, struct address_space *, loff_t, unsigned int, struct folio **, void **); @@ -302,19 +304,17 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) /** * netfs_inode_init - Initialise a netfslib inode context - * @inode: The inode with which the context is associated + * @ctx: The netfs inode to initialise * @ops: The netfs's operations list * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ -static inline void netfs_inode_init(struct inode *inode, +static inline void netfs_inode_init(struct netfs_inode *ctx, const struct netfs_request_ops *ops) { - struct netfs_inode *ctx = netfs_inode(inode); - ctx->ops = ops; - ctx->remote_i_size = i_size_read(inode); + ctx->remote_i_size = i_size_read(&ctx->inode); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -322,28 +322,25 @@ static inline void netfs_inode_init(struct inode *inode, /** * netfs_resize_file - Note that a file got resized - * @inode: The inode being resized + * @ctx: The netfs inode being resized * @new_i_size: The new file size * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct inode *inode, loff_t new_i_size) +static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size) { - struct netfs_inode *ctx = netfs_inode(inode); - ctx->remote_i_size = new_i_size; } /** * netfs_i_cookie - Get the cache cookie from the inode - * @inode: The inode to query + * @ctx: The netfs inode to query * * Get the caching cookie (if enabled) from the network filesystem's inode. */ -static inline struct fscache_cookie *netfs_i_cookie(struct inode *inode) +static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) - struct netfs_inode *ctx = netfs_inode(inode); return ctx->cache; #else return NULL; diff --git a/include/linux/random.h b/include/linux/random.h index fae0c84027fd..20e389a14e5c 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -13,7 +13,7 @@ struct notifier_block; void add_device_randomness(const void *buf, size_t len); -void add_bootloader_randomness(const void *buf, size_t len); +void __init add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; @@ -74,7 +74,6 @@ static inline unsigned long get_random_canary(void) int __init random_init(const char *command_line); bool rng_is_initialized(void); -bool rng_has_arch_random(void); int wait_for_random_bytes(void); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 4417f667c757..5860f32e3958 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -243,7 +243,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes); -extern void xdr_commit_encode(struct xdr_stream *xdr); +extern void __xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, @@ -307,6 +307,20 @@ xdr_reset_scratch_buffer(struct xdr_stream *xdr) } /** + * xdr_commit_encode - Ensure all data is written to xdr->buf + * @xdr: pointer to xdr_stream + * + * Handle encoding across page boundaries by giving the caller a + * temporary location to write to, then later copying the data into + * place. __xdr_commit_encode() does that copying. + */ +static inline void xdr_commit_encode(struct xdr_stream *xdr) +{ + if (unlikely(xdr->scratch.iov_len)) + __xdr_commit_encode(xdr); +} + +/** * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 4700a88a28f6..7b4a13d3bd91 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -178,7 +178,8 @@ struct vdpa_map_file { * for the device * @vdev: vdpa device * Returns virtqueue algin requirement - * @get_vq_group: Get the group id for a specific virtqueue + * @get_vq_group: Get the group id for a specific + * virtqueue (optional) * @vdev: vdpa device * @idx: virtqueue index * Returns u32: group id for this virtqueue @@ -243,7 +244,7 @@ struct vdpa_map_file { * Returns the iova range supported by * the device. * @set_group_asid: Set address space identifier for a - * virtqueue group + * virtqueue group (optional) * @vdev: vdpa device * @group: virtqueue group * @asid: address space id for this group diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b159c2789961..096d48aa3437 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -215,6 +215,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); +struct vmap_area *find_vmap_area(unsigned long addr); static inline bool is_vm_area_hugepages(const void *addr) { diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7fee9b6cfede..62e75dd40d9a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -406,7 +406,7 @@ alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) - * @args...: args for @fmt + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are @@ -445,7 +445,7 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); -extern void flush_workqueue(struct workqueue_struct *wq); +extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); @@ -563,15 +563,23 @@ static inline bool schedule_work(struct work_struct *work) return queue_work(system_wq, work); } +/* + * Detect attempt to flush system-wide workqueues at compile time when possible. + * + * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp + * for reasons and steps for converting system-wide workqueues into local workqueues. + */ +extern void __warn_flushing_systemwide_wq(void) + __compiletime_warning("Please avoid flushing system-wide workqueues."); + /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * - * Think twice before calling this function! It's very easy to get into - * trouble if you don't take great care. Either of the following situations - * will lead to deadlock: + * It's very easy to get into trouble if you don't take great care. + * Either of the following situations will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. @@ -586,11 +594,51 @@ static inline bool schedule_work(struct work_struct *work) * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. + * + * Please stop calling this function! A conversion to stop flushing system-wide + * workqueues is in progress. This function will be removed after all in-tree + * users stopped calling this function. */ -static inline void flush_scheduled_work(void) -{ - flush_workqueue(system_wq); -} +/* + * The background of commit 771c035372a036f8 ("deprecate the + * '__deprecated' attribute warnings entirely and for good") is that, + * since Linus builds all modules between every single pull he does, + * the standard kernel build needs to be _clean_ in order to be able to + * notice when new problems happen. Therefore, don't emit warning while + * there are in-tree users. + */ +#define flush_scheduled_work() \ +({ \ + if (0) \ + __warn_flushing_systemwide_wq(); \ + __flush_workqueue(system_wq); \ +}) + +/* + * Although there is no longer in-tree caller, for now just emit warning + * in order to give out-of-tree callers time to update. + */ +#define flush_workqueue(wq) \ +({ \ + struct workqueue_struct *_wq = (wq); \ + \ + if ((__builtin_constant_p(_wq == system_wq) && \ + _wq == system_wq) || \ + (__builtin_constant_p(_wq == system_highpri_wq) && \ + _wq == system_highpri_wq) || \ + (__builtin_constant_p(_wq == system_long_wq) && \ + _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_unbound_wq) && \ + _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_freezable_wq) && \ + _wq == system_freezable_wq) || \ + (__builtin_constant_p(_wq == system_power_efficient_wq) && \ + _wq == system_power_efficient_wq) || \ + (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ + _wq == system_freezable_power_efficient_wq)) \ + __warn_flushing_systemwide_wq(); \ + __flush_workqueue(_wq); \ +}) /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 72feab5ea8d4..c29e11b2c073 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1508,6 +1508,7 @@ void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); +void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 077cd730ce2f..85cd695e7fd1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -25,7 +25,6 @@ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; -struct inet_bind2_bucket; struct tcp_congestion_ops; /* @@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops { * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node - * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout @@ -85,7 +83,6 @@ struct inet_connection_sock { struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; - struct inet_bind2_bucket *icsk_bind2_hash; unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a0887b70967b..ebfa3df6f8dc 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -90,32 +90,11 @@ struct inet_bind_bucket { struct hlist_head owners; }; -struct inet_bind2_bucket { - possible_net_t ib_net; - int l3mdev; - unsigned short port; - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; -#endif - __be32 rcv_saddr; - }; - /* Node in the inet2_bind_hashbucket chain */ - struct hlist_node node; - /* List of sockets hashed to this bucket */ - struct hlist_head owners; -}; - static inline struct net *ib_net(struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } -static inline struct net *ib2_net(struct inet_bind2_bucket *ib) -{ - return read_pnet(&ib->ib_net); -} - #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) @@ -124,15 +103,6 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; -/* This is synchronized using the inet_bind_hashbucket's spinlock. - * Instead of having separate spinlocks, the inet_bind2_hashbucket can share - * the inet_bind_hashbucket's given that in every case where the bhash2 table - * is useful, a lookup in the bhash table also occurs. - */ -struct inet_bind2_hashbucket { - struct hlist_head chain; -}; - /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without @@ -164,12 +134,6 @@ struct inet_hashinfo { */ struct kmem_cache *bind_bucket_cachep; struct inet_bind_hashbucket *bhash; - /* The 2nd binding table hashed by port and address. - * This is used primarily for expediting the resolution of bind - * conflicts. - */ - struct kmem_cache *bind2_bucket_cachep; - struct inet_bind2_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ @@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); -static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb, - struct net *net, - const unsigned short port, - int l3mdev) -{ - return net_eq(ib_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev; -} - -struct inet_bind2_bucket * -inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, - struct inet_bind2_hashbucket *head, - const unsigned short port, int l3mdev, - const struct sock *sk); - -void inet_bind2_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind2_bucket *tb); - -struct inet_bind2_bucket * -inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, - const unsigned short port, int l3mdev, - struct sock *sk, - struct inet_bind2_hashbucket **head); - -bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, - struct net *net, - const unsigned short port, - int l3mdev, - const struct sock *sk); - static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { @@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, const unsigned short snum); + const unsigned short snum); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); diff --git a/include/net/sock.h b/include/net/sock.h index c585ef6565d9..72ca97ccb460 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -348,7 +348,6 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -538,7 +537,6 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 6154a2e72bce..262d52021c23 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -22,7 +22,7 @@ struct pool_workqueue; */ TRACE_EVENT(workqueue_queue_work, - TP_PROTO(unsigned int req_cpu, struct pool_workqueue *pwq, + TP_PROTO(int req_cpu, struct pool_workqueue *pwq, struct work_struct *work), TP_ARGS(req_cpu, pwq, work), @@ -31,8 +31,8 @@ TRACE_EVENT(workqueue_queue_work, __field( void *, work ) __field( void *, function) __string( workqueue, pwq->wq->name) - __field( unsigned int, req_cpu ) - __field( unsigned int, cpu ) + __field( int, req_cpu ) + __field( int, cpu ) ), TP_fast_assign( @@ -43,7 +43,7 @@ TRACE_EVENT(workqueue_queue_work, __entry->cpu = pwq->pool->cpu; ), - TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%u cpu=%u", + TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d", __entry->work, __entry->function, __get_str(workqueue), __entry->req_cpu, __entry->cpu) ); diff --git a/kernel/cfi.c b/kernel/cfi.c index 9594cfd1cf2c..08102d19ec15 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -281,6 +281,8 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr) static inline cfi_check_fn find_check_fn(unsigned long ptr) { cfi_check_fn fn = NULL; + unsigned long flags; + bool rcu_idle; if (is_kernel_text(ptr)) return __cfi_check; @@ -290,13 +292,21 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) * the shadow and __module_address use RCU, so we need to wake it * up if necessary. */ - RCU_NONIDLE({ - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); + rcu_idle = !rcu_is_watching(); + if (rcu_idle) { + local_irq_save(flags); + rcu_irq_enter(); + } + + if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) + fn = find_shadow_check_fn(ptr); + if (!fn) + fn = find_module_check_fn(ptr); - if (!fn) - fn = find_module_check_fn(ptr); - }); + if (rcu_idle) { + rcu_irq_exit(); + local_irq_restore(flags); + } return fn; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4056f2a3f9d5..1ea50f6be843 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2788,13 +2788,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } /** - * flush_workqueue - ensure that any scheduled work has run to completion. + * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ -void flush_workqueue(struct workqueue_struct *wq) +void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), @@ -2943,7 +2943,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL(flush_workqueue); +EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2971,7 +2971,7 @@ void drain_workqueue(struct workqueue_struct *wq) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: - flush_workqueue(wq); + __flush_workqueue(wq); mutex_lock(&wq->mutex); @@ -6111,3 +6111,11 @@ void __init workqueue_init(void) wq_online = true; wq_watchdog_init(); } + +/* + * Despite the naming, this is a no-op function which is here only for avoiding + * link error. Since compile-time warning may fail to catch, we will need to + * emit run-time warning from __flush_workqueue(). + */ +void __warn_flushing_systemwide_wq(void) { } +EXPORT_SYMBOL(__warn_flushing_systemwide_wq); diff --git a/lib/crc-itu-t.c b/lib/crc-itu-t.c index 1974b355c148..1d26a1647da5 100644 --- a/lib/crc-itu-t.c +++ b/lib/crc-itu-t.c @@ -7,7 +7,7 @@ #include <linux/module.h> #include <linux/crc-itu-t.h> -/** CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^15 + 1) */ +/* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */ const u16 crc_itu_t_table[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 6dd5330f7a99..0b64695ab632 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1434,7 +1434,7 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i, { unsigned nr, offset; pgoff_t index, count; - size_t size = maxsize, actual; + size_t size = maxsize; loff_t pos; if (!size || !maxpages) @@ -1461,13 +1461,7 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i, if (nr == 0) return 0; - actual = PAGE_SIZE * nr; - actual -= offset; - if (nr == count && size > 0) { - unsigned last_offset = (nr > 1) ? 0 : offset; - actual -= PAGE_SIZE - (last_offset + size); - } - return actual; + return min_t(size_t, nr * PAGE_SIZE - offset, maxsize); } /* must be done on non-empty ITER_IOVEC one */ @@ -1602,7 +1596,7 @@ static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i, struct page **p; unsigned nr, offset; pgoff_t index, count; - size_t size = maxsize, actual; + size_t size = maxsize; loff_t pos; if (!size) @@ -1631,13 +1625,7 @@ static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i, if (nr == 0) return 0; - actual = PAGE_SIZE * nr; - actual -= offset; - if (nr == count && size > 0) { - unsigned last_offset = (nr > 1) ? 0 : offset; - actual -= PAGE_SIZE - (last_offset + size); - } - return actual; + return min_t(size_t, nr * PAGE_SIZE - offset, maxsize); } ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, diff --git a/lib/vsprintf.c b/lib/vsprintf.c index fb77f7bfd126..3c1853a9d1c0 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -769,8 +769,7 @@ static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn); unsigned long flags; - if (!system_unbound_wq || - (!rng_is_initialized() && !rng_has_arch_random()) || + if (!system_unbound_wq || !rng_is_initialized() || !spin_trylock_irqsave(&filling, flags)) return -EAGAIN; diff --git a/lib/xarray.c b/lib/xarray.c index 54e646e8e6ee..ea9ce1f0b386 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -264,9 +264,10 @@ static void xa_node_free(struct xa_node *node) * xas_destroy() - Free any resources allocated during the XArray operation. * @xas: XArray operation state. * - * This function is now internal-only. + * Most users will not need to call this function; it is called for you + * by xas_nomem(). */ -static void xas_destroy(struct xa_state *xas) +void xas_destroy(struct xa_state *xas) { struct xa_node *next, *node = xas->xa_alloc; diff --git a/mm/filemap.c b/mm/filemap.c index 9daeaab36081..ac3775c1ce4c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2991,11 +2991,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; + unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ - if (vmf->vma->vm_flags & VM_HUGEPAGE) { + if (vm_flags & VM_HUGEPAGE) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; @@ -3003,7 +3004,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ - if (!(vmf->vma->vm_flags & VM_RAND_READ)) + if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); @@ -3012,12 +3013,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) #endif /* If we don't want any read-ahead, don't bother */ - if (vmf->vma->vm_flags & VM_RAND_READ) + if (vm_flags & VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; - if (vmf->vma->vm_flags & VM_SEQ_READ) { + if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a77c78a2b6b5..f7248002dad9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2672,8 +2672,7 @@ out_unlock: if (mapping) i_mmap_unlock_read(mapping); out: - /* Free any memory we didn't use */ - xas_nomem(&xas, 0); + xas_destroy(&xas); count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } diff --git a/mm/readahead.c b/mm/readahead.c index 415c39d764ea..57a015108254 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -164,12 +164,14 @@ static void read_pages(struct readahead_control *rac) while ((folio = readahead_folio(rac)) != NULL) { unsigned long nr = folio_nr_pages(folio); + folio_get(folio); rac->ra->size -= nr; if (rac->ra->async_size >= nr) { rac->ra->async_size -= nr; filemap_remove_folio(folio); } folio_unlock(folio); + folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) diff --git a/mm/usercopy.c b/mm/usercopy.c index baeacc735b83..4e1da708699b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -161,29 +161,27 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n, static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { + uintptr_t addr = (uintptr_t)ptr; + unsigned long offset; struct folio *folio; if (is_kmap_addr(ptr)) { - unsigned long page_end = (unsigned long)ptr | (PAGE_SIZE - 1); - - if ((unsigned long)ptr + n - 1 > page_end) - usercopy_abort("kmap", NULL, to_user, - offset_in_page(ptr), n); + offset = offset_in_page(ptr); + if (n > PAGE_SIZE - offset) + usercopy_abort("kmap", NULL, to_user, offset, n); return; } if (is_vmalloc_addr(ptr)) { - struct vm_struct *area = find_vm_area(ptr); - unsigned long offset; + struct vmap_area *area = find_vmap_area(addr); - if (!area) { + if (!area) usercopy_abort("vmalloc", "no area", to_user, 0, n); - return; - } - offset = ptr - area->addr; - if (offset + n > get_vm_area_size(area)) + if (n > area->va_end - addr) { + offset = addr - area->va_start; usercopy_abort("vmalloc", NULL, to_user, offset, n); + } return; } @@ -196,8 +194,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, /* Check slab allocator for flags and size. */ __check_heap_object(ptr, n, folio_slab(folio), to_user); } else if (folio_test_large(folio)) { - unsigned long offset = ptr - folio_address(folio); - if (offset + n > folio_size(folio)) + offset = ptr - folio_address(folio); + if (n > folio_size(folio) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 07db42455dd4..effd1ff6a4b4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1798,7 +1798,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) free_vmap_area_noflush(va); } -static struct vmap_area *find_vmap_area(unsigned long addr) +struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 95393bb2760b..4c7030ed8d33 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1661,9 +1661,12 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; - struct sk_buff *skb; + struct sk_buff *skb, *last; + struct sk_buff_head *sk_queue; int copied; int err = 0; + int off = 0; + long timeo; lock_sock(sk); /* @@ -1675,10 +1678,29 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, goto out; } - /* Now we can treat all alike */ - skb = skb_recv_datagram(sk, flags, &err); - if (skb == NULL) - goto out; + /* We need support for non-blocking reads. */ + sk_queue = &sk->sk_receive_queue; + skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, &err, &last); + /* If no packet is available, release_sock(sk) and try again. */ + if (!skb) { + if (err != -EAGAIN) + goto out; + release_sock(sk); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, &err, + &timeo, last)) { + skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, + &err, &last); + if (skb) + break; + + if (err != -EAGAIN) + goto done; + } + if (!skb) + goto done; + lock_sock(sk); + } if (!sk_to_ax25(sk)->pidincl) skb_pull(skb, 1); /* Remove PID */ @@ -1725,6 +1747,7 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, out: release_sock(sk); +done: return err; } diff --git a/net/core/dev.c b/net/core/dev.c index 08ce317fcec8..8e6f22961206 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -397,16 +397,18 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev) +static void unlist_netdevice(struct net_device *dev, bool lock) { ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); + if (lock) + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); + if (lock) + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -10043,11 +10045,11 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) { - dev->reg_state = NETREG_UNREGISTERED; + write_lock(&dev_base_lock); + dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + write_unlock(&dev_base_lock); + if (ret) goto err_uninit; - } - dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -10329,7 +10331,9 @@ void netdev_run_todo(void) continue; } + write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; + write_unlock(&dev_base_lock); linkwatch_forget_dev(dev); } @@ -10810,9 +10814,10 @@ void unregister_netdevice_many(struct list_head *head) list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - unlist_netdevice(dev); - + write_lock(&dev_base_lock); + unlist_netdevice(dev, false); dev->reg_state = NETREG_UNREGISTERING; + write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -10959,7 +10964,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev); + unlist_netdevice(dev, true); synchronize_net(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e319e242dddf..a3642569fe53 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -33,6 +33,7 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; +/* Caller holds RTNL or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { return dev->reg_state <= NETREG_REGISTERED; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e78458900f2..eb8e128e43e8 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1120,12 +1120,6 @@ static int __init dccp_init(void) SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; /* * Size and allocate the main established and bind bucket @@ -1156,7 +1150,7 @@ static int __init dccp_init(void) if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; + goto out_free_bind_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) @@ -1182,23 +1176,14 @@ static int __init dccp_init(void) goto out_free_dccp_locks; } - dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } rc = dccp_mib_init(); if (rc) - goto out_free_dccp_bhash2; + goto out_free_dccp_bhash; rc = dccp_ackvec_init(); if (rc) @@ -1222,38 +1207,30 @@ out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - ccid_cleanup_builtins(); dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); + free_pages((unsigned long)dccp_hashinfo.bhash, + get_order(dccp_hashinfo.bhash_size * + sizeof(struct inet_bind_hashbucket))); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c0b7e6c21360..53f5f956d948 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,32 +117,6 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -static bool use_bhash2_on_bind(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - int addr_type; - - if (sk->sk_family == AF_INET6) { - addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - return addr_type != IPV6_ADDR_ANY && - addr_type != IPV6_ADDR_MAPPED; - } -#endif - return sk->sk_rcv_saddr != htonl(INADDR_ANY); -} - -static u32 get_bhash2_nulladdr_hash(const struct sock *sk, struct net *net, - int port) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr nulladdr = {}; - - if (sk->sk_family == AF_INET6) - return ipv6_portaddr_hash(net, &nulladdr, port); -#endif - return ipv4_portaddr_hash(net, 0, port); -} - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -156,71 +130,16 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); -static bool bind_conflict_exist(const struct sock *sk, struct sock *sk2, - kuid_t sk_uid, bool relax, - bool reuseport_cb_ok, bool reuseport_ok) -{ - int bound_dev_if2; - - if (sk == sk2) - return false; - - bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); - - if (!sk->sk_bound_dev_if || !bound_dev_if2 || - sk->sk_bound_dev_if == bound_dev_if2) { - if (sk->sk_reuse && sk2->sk_reuse && - sk2->sk_state != TCP_LISTEN) { - if (!relax || (!reuseport_ok && sk->sk_reuseport && - sk2->sk_reuseport && reuseport_cb_ok && - (sk2->sk_state == TCP_TIME_WAIT || - uid_eq(sk_uid, sock_i_uid(sk2))))) - return true; - } else if (!reuseport_ok || !sk->sk_reuseport || - !sk2->sk_reuseport || !reuseport_cb_ok || - (sk2->sk_state != TCP_TIME_WAIT && - !uid_eq(sk_uid, sock_i_uid(sk2)))) { - return true; - } - } - return false; -} - -static bool check_bhash2_conflict(const struct sock *sk, - struct inet_bind2_bucket *tb2, kuid_t sk_uid, - bool relax, bool reuseport_cb_ok, - bool reuseport_ok) -{ - struct sock *sk2; - - sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - continue; - - if (bind_conflict_exist(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) - return true; - } - return false; -} - -/* This should be called only when the corresponding inet_bind_bucket spinlock - * is held - */ -static int inet_csk_bind_conflict(const struct sock *sk, int port, - struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, /* may be null */ +static int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax, bool reuseport_ok) { - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - kuid_t uid = sock_i_uid((struct sock *)sk); - struct sock_reuseport *reuseport_cb; - struct inet_bind2_hashbucket *head2; - bool reuseport_cb_ok; struct sock *sk2; - struct net *net; - int l3mdev; - u32 hash; + bool reuseport_cb_ok; + bool reuse = sk->sk_reuse; + bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; + kuid_t uid = sock_i_uid((struct sock *)sk); rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); @@ -231,42 +150,40 @@ static int inet_csk_bind_conflict(const struct sock *sk, int port, /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed - * in tb->owners and tb2->owners list belong - * to the same net + * in tb->owners list belong to the same net - the + * one this bucket belongs to. */ - if (!use_bhash2_on_bind(sk)) { - sk_for_each_bound(sk2, &tb->owners) - if (bind_conflict_exist(sk, sk2, uid, relax, - reuseport_cb_ok, reuseport_ok) && - inet_rcv_saddr_equal(sk, sk2, true)) - return true; + sk_for_each_bound(sk2, &tb->owners) { + int bound_dev_if2; - return false; + if (sk == sk2) + continue; + bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); + if ((!sk->sk_bound_dev_if || + !bound_dev_if2 || + sk->sk_bound_dev_if == bound_dev_if2)) { + if (reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN) { + if ((!relax || + (!reuseport_ok && + reuseport && sk2->sk_reuseport && + reuseport_cb_ok && + (sk2->sk_state == TCP_TIME_WAIT || + uid_eq(uid, sock_i_uid(sk2))))) && + inet_rcv_saddr_equal(sk, sk2, true)) + break; + } else if (!reuseport_ok || + !reuseport || !sk2->sk_reuseport || + !reuseport_cb_ok || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, sock_i_uid(sk2)))) { + if (inet_rcv_saddr_equal(sk, sk2, true)) + break; + } + } } - - if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) - return true; - - net = sock_net(sk); - - /* check there's no conflict with an existing IPV6_ADDR_ANY (if ipv6) or - * INADDR_ANY (if ipv4) socket. - */ - hash = get_bhash2_nulladdr_hash(sk, net, port); - head2 = &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; - - l3mdev = inet_sk_bound_l3mdev(sk); - inet_bind_bucket_for_each(tb2, &head2->chain) - if (check_bind2_bucket_match_nulladdr(tb2, net, port, l3mdev, sk)) - break; - - if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) - return true; - - return false; + return sk2 != NULL; } /* @@ -274,20 +191,16 @@ static int inet_csk_bind_conflict(const struct sock *sk, int port, * inet_bind_hashbucket lock held. */ static struct inet_bind_hashbucket * -inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, - struct inet_bind2_bucket **tb2_ret, - struct inet_bind2_hashbucket **head2_ret, int *port_ret) +inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret) { struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - struct inet_bind2_hashbucket *head2; + int port = 0; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); + bool relax = false; int i, low, high, attempt_half; - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; - bool relax = false; - int port = 0; int l3mdev; l3mdev = inet_sk_bound_l3mdev(sk); @@ -326,12 +239,10 @@ other_parity_scan: head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); - tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, - &head2); inet_bind_bucket_for_each(tb, &head->chain) - if (check_bind_bucket_match(tb, net, port, l3mdev)) { - if (!inet_csk_bind_conflict(sk, port, tb, tb2, - relax, false)) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { + if (!inet_csk_bind_conflict(sk, tb, relax, false)) goto success; goto next_port; } @@ -361,8 +272,6 @@ next_port: success: *port_ret = port; *tb_ret = tb; - *tb2_ret = tb2; - *head2_ret = head2; return head; } @@ -458,81 +367,54 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - bool bhash_created = false, bhash2_created = false; - struct inet_bind2_bucket *tb2 = NULL; - struct inet_bind2_hashbucket *head2; - struct inet_bind_bucket *tb = NULL; + int ret = 1, port = snum; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); - int ret = 1, port = snum; - bool found_port = false; + struct inet_bind_bucket *tb = NULL; int l3mdev; l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { - head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); + head = inet_csk_find_open_port(sk, &tb, &port); if (!head) return ret; - if (tb && tb2) - goto success; - found_port = true; - } else { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock_bh(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (check_bind_bucket_match(tb, net, port, l3mdev)) - break; - - tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, - &head2); - } - - if (!tb) { - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, - head, port, l3mdev); if (!tb) - goto fail_unlock; - bhash_created = true; - } - - if (!tb2) { - tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, - net, head2, port, l3mdev, sk); - if (!tb2) - goto fail_unlock; - bhash2_created = true; + goto tb_not_found; + goto success; } - - /* If we had to find an open port, we already checked for conflicts */ - if (!found_port && !hlist_empty(&tb->owners)) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) + goto tb_found; +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port, l3mdev); + if (!tb) + goto fail_unlock; +tb_found: + if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE) goto success; if ((tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) goto success; - if (inet_csk_bind_conflict(sk, port, tb, tb2, true, true)) + if (inet_csk_bind_conflict(sk, tb, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(tb, sk); if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, tb2, port); + inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); - WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: - if (ret) { - if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - if (bhash2_created) - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - tb2); - } spin_unlock_bh(&head->lock); return ret; } @@ -1079,7 +961,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; - newicsk->icsk_bind2_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 545f91b6cb5e..b9d995b5ce24 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -81,41 +81,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, return tb; } -struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, - struct net *net, - struct inet_bind2_hashbucket *head, - const unsigned short port, - int l3mdev, - const struct sock *sk) -{ - struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); - - if (tb) { - write_pnet(&tb->ib_net, net); - tb->l3mdev = l3mdev; - tb->port = port; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; - else -#endif - tb->rcv_saddr = sk->sk_rcv_saddr; - INIT_HLIST_HEAD(&tb->owners); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -} - -static bool bind2_bucket_addr_match(struct inet_bind2_bucket *tb2, struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - return ipv6_addr_equal(&tb2->v6_rcv_saddr, - &sk->sk_v6_rcv_saddr); -#endif - return tb2->rcv_saddr == sk->sk_rcv_saddr; -} - /* * Caller must hold hashbucket lock for this tb with local BH disabled */ @@ -127,25 +92,12 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket } } -/* Caller must hold the lock for the corresponding hashbucket in the bhash table - * with local BH disabled - */ -void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) -{ - if (hlist_empty(&tb->owners)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); - } -} - void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, const unsigned short snum) + const unsigned short snum) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; - sk_add_bind2_node(sk, &tb2->owners); - inet_csk(sk)->icsk_bind2_hash = tb2; } /* @@ -157,7 +109,6 @@ static void __inet_put_port(struct sock *sk) const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; spin_lock(&head->lock); @@ -166,13 +117,6 @@ static void __inet_put_port(struct sock *sk) inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - - if (inet_csk(sk)->icsk_bind2_hash) { - tb2 = inet_csk(sk)->icsk_bind2_hash; - __sk_del_bind2_node(sk); - inet_csk(sk)->icsk_bind2_hash = NULL; - inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - } spin_unlock(&head->lock); } @@ -189,19 +133,14 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; const int bhash = inet_bhashfn(sock_net(sk), port, - table->bhash_size); + table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; - struct inet_bind2_hashbucket *head_bhash2; - bool created_inet_bind_bucket = false; - struct net *net = sock_net(sk); - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; - tb2 = inet_csk(sk)->icsk_bind2_hash; - if (unlikely(!tb || !tb2)) { + if (unlikely(!tb)) { spin_unlock(&head->lock); return -ENOENT; } @@ -214,45 +153,25 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (check_bind_bucket_match(tb, net, port, l3mdev)) + if (net_eq(ib_net(tb), sock_net(sk)) && + tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - net, head, port, l3mdev); + sock_net(sk), head, port, + l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; } - created_inet_bind_bucket = true; } inet_csk_update_fastreuse(tb, child); - - goto bhash2_find; - } else if (!bind2_bucket_addr_match(tb2, child)) { - l3mdev = inet_sk_bound_l3mdev(sk); - -bhash2_find: - tb2 = inet_bind2_bucket_find(table, net, port, l3mdev, child, - &head_bhash2); - if (!tb2) { - tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, - net, head_bhash2, port, - l3mdev, child); - if (!tb2) - goto error; - } } - inet_bind_hash(child, tb, tb2, port); + inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; - -error: - if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); - spin_unlock(&head->lock); - return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); @@ -756,76 +675,6 @@ void inet_unhash(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_unhash); -static bool check_bind2_bucket_match(struct inet_bind2_bucket *tb, - struct net *net, unsigned short port, - int l3mdev, struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); - else -#endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; -} - -bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, - struct net *net, const unsigned short port, - int l3mdev, const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr nulladdr = {}; - - if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_equal(&tb->v6_rcv_saddr, &nulladdr); - else -#endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == 0; -} - -static struct inet_bind2_hashbucket * -inet_bhashfn_portaddr(struct inet_hashinfo *hinfo, const struct sock *sk, - const struct net *net, unsigned short port) -{ - u32 hash; - -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); - else -#endif - hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); - return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; -} - -/* This should only be called when the spinlock for the socket's corresponding - * bind_hashbucket is held - */ -struct inet_bind2_bucket * -inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, - const unsigned short port, int l3mdev, struct sock *sk, - struct inet_bind2_hashbucket **head) -{ - struct inet_bind2_bucket *bhash2 = NULL; - struct inet_bind2_hashbucket *h; - - h = inet_bhashfn_portaddr(hinfo, sk, net, port); - inet_bind_bucket_for_each(bhash2, &h->chain) { - if (check_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) - break; - } - - if (head) - *head = h; - - return bhash2; -} - /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this @@ -846,13 +695,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_timewait_sock *tw = NULL; - struct inet_bind2_hashbucket *head2; struct inet_bind_hashbucket *head; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; - bool tb_created = false; u32 remaining, offset; int ret, i, low, high; int l3mdev; @@ -909,7 +755,8 @@ other_parity_scan: * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (check_bind_bucket_match(tb, net, port, l3mdev)) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -927,7 +774,6 @@ other_parity_scan: spin_unlock_bh(&head->lock); return -ENOMEM; } - tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; @@ -943,17 +789,6 @@ next_port: return -EADDRNOTAVAIL; ok: - /* Find the corresponding tb2 bucket since we need to - * add the socket to the bhash2 table as well - */ - tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2); - if (!tb2) { - tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, - head2, port, l3mdev, sk); - if (!tb2) - goto error; - } - /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention @@ -963,7 +798,7 @@ ok: WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, tb2, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); @@ -975,12 +810,6 @@ ok: inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; - -error: - if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - spin_unlock_bh(&head->lock); - return -ENOMEM; } /* diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 1a43ca73f94d..3c6101def7d6 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -319,12 +319,16 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); + if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) + return 0; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); - if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk), - addr->sin_addr.s_addr, - chk_addr_ret)) + if (chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST || + (chk_addr_ret != RTN_LOCAL && + !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9984d23a7f3e..028513d3e2a2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4604,12 +4604,6 @@ void __init tcp_init(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); - tcp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("tcp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT, - NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -4632,9 +4626,8 @@ void __init tcp_init(void) if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = - alloc_large_system_hash("TCP bind bhash tables", - sizeof(struct inet_bind_hashbucket) + - sizeof(struct inet_bind2_hashbucket), + alloc_large_system_hash("TCP bind", + sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, @@ -4643,12 +4636,9 @@ void __init tcp_init(void) 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; - tcp_hashinfo.bhash2 = - (struct inet_bind2_hashbucket *)(tcp_hashinfo.bhash + tcp_hashinfo.bhash_size); for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); - INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index df194cc07035..f87a2d8f23a7 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -919,7 +919,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, EXPORT_SYMBOL_GPL(xdr_init_encode); /** - * xdr_commit_encode - Ensure all data is written to buffer + * __xdr_commit_encode - Ensure all data is written to buffer * @xdr: pointer to xdr_stream * * We handle encoding across page boundaries by giving the caller a @@ -931,26 +931,29 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * required at the end of encoding, or any other time when the xdr_buf * data might be read. */ -inline void xdr_commit_encode(struct xdr_stream *xdr) +void __xdr_commit_encode(struct xdr_stream *xdr) { - int shift = xdr->scratch.iov_len; + size_t shift = xdr->scratch.iov_len; void *page; - if (shift == 0) - return; page = page_address(*xdr->page_ptr); memcpy(xdr->scratch.iov_base, page, shift); memmove(page, page + shift, (void *)xdr->p - page); xdr_reset_scratch_buffer(xdr); } -EXPORT_SYMBOL_GPL(xdr_commit_encode); +EXPORT_SYMBOL_GPL(__xdr_commit_encode); -static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, - size_t nbytes) +/* + * The buffer space to be reserved crosses the boundary between + * xdr->buf->head and xdr->buf->pages, or between two pages + * in xdr->buf->pages. + */ +static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, + size_t nbytes) { - __be32 *p; int space_left; int frag1bytes, frag2bytes; + void *p; if (nbytes > PAGE_SIZE) goto out_overflow; /* Bigger buffers require special handling */ @@ -964,6 +967,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, xdr->buf->page_len += frag1bytes; xdr->page_ptr++; xdr->iov = NULL; + /* * If the last encode didn't end exactly on a page boundary, the * next one will straddle boundaries. Encode into the next @@ -972,14 +976,19 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, * space at the end of the previous buffer: */ xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes); - p = page_address(*xdr->page_ptr); + /* - * Note this is where the next encode will start after we've - * shifted this one back: + * xdr->p is where the next encode will start after + * xdr_commit_encode() has shifted this one back: */ - xdr->p = (void *)p + frag2bytes; + p = page_address(*xdr->page_ptr); + xdr->p = p + frag2bytes; space_left = xdr->buf->buflen - xdr->buf->len; - xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE); + if (space_left - nbytes >= PAGE_SIZE) + xdr->end = p + PAGE_SIZE; + else + xdr->end = p + space_left - frag1bytes; + xdr->buf->page_len += frag2bytes; xdr->buf->len += nbytes; return p; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 5f0155fdefc7..11cf7c646644 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -478,10 +478,10 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, unsigned int write_len; u64 offset; - seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; - if (!seg) + if (info->wi_seg_no >= info->wi_chunk->ch_segcount) goto out_overflow; + seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; write_len = min(remaining, seg->rs_length - info->wi_seg_off); if (!write_len) goto out_overflow; diff --git a/net/tipc/core.c b/net/tipc/core.c index 3f4542e0f065..434e70eabe08 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -109,10 +109,9 @@ static void __net_exit tipc_exit_net(struct net *net) struct tipc_net *tn = tipc_net(net); tipc_detach_loopback(net); + tipc_net_stop(net); /* Make sure the tipc_net_finalize_work() finished */ cancel_work_sync(&tn->work); - tipc_net_stop(net); - tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 1f01ac65c0cd..cac070aee791 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -251,8 +251,8 @@ $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE # To make this rule robust against "Argument list too long" error, # ensure to add $(obj)/ prefix by a shell command. -cmd_mod = echo $(call real-search, $*.o, .o, -objs -y -m) | \ - $(AWK) -v RS='( |\n)' '!x[$$0]++ { print("$(obj)/"$$0) }' > $@ +cmd_mod = printf '%s\n' $(call real-search, $*.o, .o, -objs -y -m) | \ + $(AWK) '!x[$$0]++ { print("$(obj)/"$$0) }' > $@ $(obj)/%.mod: FORCE $(call if_changed,mod) diff --git a/scripts/check-local-export b/scripts/check-local-export index da745e2743b7..6ccc2f467416 100755 --- a/scripts/check-local-export +++ b/scripts/check-local-export @@ -8,11 +8,31 @@ set -e +# catch errors from ${NM} +set -o pipefail + +# Run the last element of a pipeline in the current shell. +# Without this, the while-loop would be executed in a subshell, and +# the changes made to 'symbol_types' and 'export_symbols' would be lost. +shopt -s lastpipe + declare -A symbol_types declare -a export_symbols exit_code=0 +# If there is no symbol in the object, ${NM} (both GNU nm and llvm-nm) shows +# 'no symbols' diagnostic (but exits with 0). It is harmless and hidden by +# '2>/dev/null'. However, it suppresses real error messages as well. Add a +# hand-crafted error message here. +# +# TODO: +# Use --quiet instead of 2>/dev/null when we upgrade the minimum version of +# binutils to 2.37, llvm to 13.0.0. +# Then, the following line will be really simple: +# ${NM} --quiet ${1} | + +{ ${NM} ${1} 2>/dev/null || { echo "${0}: ${NM} failed" >&2; false; } } | while read value type name do # Skip the line if the number of fields is less than 3. @@ -37,21 +57,7 @@ do if [[ ${name} == __ksymtab_* ]]; then export_symbols+=(${name#__ksymtab_}) fi - - # If there is no symbol in the object, ${NM} (both GNU nm and llvm-nm) - # shows 'no symbols' diagnostic (but exits with 0). It is harmless and - # hidden by '2>/dev/null'. However, it suppresses real error messages - # as well. Add a hand-crafted error message here. - # - # Use --quiet instead of 2>/dev/null when we upgrade the minimum version - # of binutils to 2.37, llvm to 13.0.0. - # - # Then, the following line will be really simple: - # done < <(${NM} --quiet ${1}) -done < <(${NM} ${1} 2>/dev/null || { echo "${0}: ${NM} failed" >&2; false; } ) - -# Catch error in the process substitution -wait $! +done for name in "${export_symbols[@]}" do diff --git a/scripts/gdb/linux/config.py b/scripts/gdb/linux/config.py index 90e1565b1967..8843ab3cbadd 100644 --- a/scripts/gdb/linux/config.py +++ b/scripts/gdb/linux/config.py @@ -24,9 +24,9 @@ class LxConfigDump(gdb.Command): filename = arg try: - py_config_ptr = gdb.parse_and_eval("kernel_config_data + 8") - py_config_size = gdb.parse_and_eval( - "sizeof(kernel_config_data) - 1 - 8 * 2") + py_config_ptr = gdb.parse_and_eval("&kernel_config_data") + py_config_ptr_end = gdb.parse_and_eval("&kernel_config_data_end") + py_config_size = py_config_ptr_end - py_config_ptr except gdb.error as e: raise gdb.GdbError("Can't find config, enable CONFIG_IKCONFIG?") diff --git a/scripts/nsdeps b/scripts/nsdeps index 04c4b96e95ec..f1718cc0d700 100644 --- a/scripts/nsdeps +++ b/scripts/nsdeps @@ -34,9 +34,8 @@ generate_deps() { local mod=${1%.ko:} shift local namespaces="$*" - local mod_source_files="`cat $mod.mod | sed -n 1p \ - | sed -e 's/\.o/\.c/g' \ - | sed "s|[^ ]* *|${src_prefix}&|g"`" + local mod_source_files=$(sed "s|^\(.*\)\.o$|${src_prefix}\1.c|" $mod.mod) + for ns in $namespaces; do echo "Adding namespace $ns to module $mod.ko." generate_deps_for_ns $ns "$mod_source_files" diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..e17de69faa54 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -443,5 +443,6 @@ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 403e83b4adc8..d27e0581b777 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -116,6 +116,30 @@ * Not susceptible to * TSX Async Abort (TAA) vulnerabilities. */ +#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /* + * Not susceptible to SBDR and SSDP + * variants of Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_FBSDP_NO BIT(14) /* + * Not susceptible to FBSDP variant of + * Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_PSDP_NO BIT(15) /* + * Not susceptible to PSDP variant of + * Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_FB_CLEAR BIT(17) /* + * VERW clears CPU fill buffer + * even on MDS_NO CPUs. + */ +#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /* + * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS] + * bit available to control VERW + * behavior. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -133,6 +157,7 @@ #define MSR_IA32_MCU_OPT_CTRL 0x00000123 #define RNGDS_MITG_DIS BIT(0) /* SRBDS support */ #define RTM_ALLOW BIT(1) /* TSX development mode */ +#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */ #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 81470a99ed1c..22423c871ed6 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -37,11 +37,38 @@ ifeq ($(ARCH),riscv) UNAME_M := riscv endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c -LIBKVM_x86_64 = lib/x86_64/apic.c lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S -LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handlers.S lib/aarch64/spinlock.c lib/aarch64/gic.c lib/aarch64/gic_v3.c lib/aarch64/vgic.c -LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c -LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c +LIBKVM += lib/assert.c +LIBKVM += lib/elf.c +LIBKVM += lib/guest_modes.c +LIBKVM += lib/io.c +LIBKVM += lib/kvm_util.c +LIBKVM += lib/perf_test_util.c +LIBKVM += lib/rbtree.c +LIBKVM += lib/sparsebit.c +LIBKVM += lib/test_util.c + +LIBKVM_x86_64 += lib/x86_64/apic.c +LIBKVM_x86_64 += lib/x86_64/handlers.S +LIBKVM_x86_64 += lib/x86_64/perf_test_util.c +LIBKVM_x86_64 += lib/x86_64/processor.c +LIBKVM_x86_64 += lib/x86_64/svm.c +LIBKVM_x86_64 += lib/x86_64/ucall.c +LIBKVM_x86_64 += lib/x86_64/vmx.c + +LIBKVM_aarch64 += lib/aarch64/gic.c +LIBKVM_aarch64 += lib/aarch64/gic_v3.c +LIBKVM_aarch64 += lib/aarch64/handlers.S +LIBKVM_aarch64 += lib/aarch64/processor.c +LIBKVM_aarch64 += lib/aarch64/spinlock.c +LIBKVM_aarch64 += lib/aarch64/ucall.c +LIBKVM_aarch64 += lib/aarch64/vgic.c + +LIBKVM_s390x += lib/s390x/diag318_test_handler.c +LIBKVM_s390x += lib/s390x/processor.c +LIBKVM_s390x += lib/s390x/ucall.c + +LIBKVM_riscv += lib/riscv/processor.c +LIBKVM_riscv += lib/riscv/ucall.c TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test @@ -173,12 +200,13 @@ LDFLAGS += -pthread $(no-pie-option) $(pgste-option) # $(TEST_GEN_PROGS) starts with $(OUTPUT)/ include ../lib.mk -STATIC_LIBS := $(OUTPUT)/libkvm.a LIBKVM_C := $(filter %.c,$(LIBKVM)) LIBKVM_S := $(filter %.S,$(LIBKVM)) LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) -EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.* +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) + +EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.* x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c @@ -187,13 +215,8 @@ $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) -$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS) - $(AR) crs $@ $^ - x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) -all: $(STATIC_LIBS) -$(TEST_GEN_PROGS): $(STATIC_LIBS) +$(TEST_GEN_PROGS): $(LIBKVM_OBJS) cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. cscope: diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 7b47ae4f952e..d60a34cdfaee 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -336,8 +336,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-i iterations] [-p offset] [-g]" - "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" + printf("usage: %s [-h] [-i iterations] [-p offset] [-g] " + "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", @@ -351,6 +351,7 @@ static void help(char *name) printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); guest_modes_help(); + printf(" -n: Run the vCPUs in nested mode (L2)\n"); printf(" -b: specify the size of the memory region which should be\n" " dirtied by each vCPU. e.g. 10M or 3G.\n" " (default: 1G)\n"); @@ -387,7 +388,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "ghi:p:m:b:f:v:os:x:")) != -1) { + while ((opt = getopt(argc, argv, "ghi:p:m:nb:f:v:os:x:")) != -1) { switch (opt) { case 'g': dirty_log_manual_caps = 0; @@ -401,6 +402,9 @@ int main(int argc, char *argv[]) case 'm': guest_modes_cmdline(optarg); break; + case 'n': + perf_test_args.nested = true; + break; case 'b': guest_percpu_mem_size = parse_size(optarg); break; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index a86f953d8d36..d822cb670f1c 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -30,10 +30,15 @@ struct perf_test_vcpu_args { struct perf_test_args { struct kvm_vm *vm; + /* The starting address and size of the guest test region. */ uint64_t gpa; + uint64_t size; uint64_t guest_page_size; int wr_fract; + /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ + bool nested; + struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS]; }; @@ -49,5 +54,9 @@ void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract); void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)); void perf_test_join_vcpu_threads(int vcpus); +void perf_test_guest_code(uint32_t vcpu_id); + +uint64_t perf_test_nested_pages(int nr_vcpus); +void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus); #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index d0d51adec76e..6ce185449259 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -482,13 +482,23 @@ void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); void vm_xsave_req_perm(int bit); -enum x86_page_size { - X86_PAGE_SIZE_4K = 0, - X86_PAGE_SIZE_2M, - X86_PAGE_SIZE_1G, +enum pg_level { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_512G, + PG_LEVEL_NUM }; -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - enum x86_page_size page_size); + +#define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) +#define PG_LEVEL_SIZE(_level) (1ull << PG_LEVEL_SHIFT(_level)) + +#define PG_SIZE_4K PG_LEVEL_SIZE(PG_LEVEL_4K) +#define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) +#define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); /* * Basic CPU control in CR0 @@ -505,9 +515,6 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, #define X86_CR0_CD (1UL<<30) /* Cache Disable */ #define X86_CR0_PG (1UL<<31) /* Paging */ -/* VMX_EPT_VPID_CAP bits */ -#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) - #define XSTATE_XTILE_CFG_BIT 17 #define XSTATE_XTILE_DATA_BIT 18 diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 583ceb0d1457..cc3604f8f1d3 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -96,6 +96,9 @@ #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 +#define VMX_EPT_VPID_CAP_1G_PAGES 0x00020000 +#define VMX_EPT_VPID_CAP_AD_BITS 0x00200000 + #define EXIT_REASON_FAILED_VMENTRY 0x80000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 @@ -606,6 +609,7 @@ bool load_vmcs(struct vmx_pages *vmx); bool nested_vmx_supported(void); void nested_vmx_check_supported(void); +bool ept_1g_pages_supported(void); void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr); @@ -613,6 +617,8 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t memslot); +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 722df3a28791..f989ff91f022 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -40,7 +40,7 @@ static bool all_vcpu_threads_running; * Continuously write to the first 8 bytes of each page in the * specified region. */ -static void guest_code(uint32_t vcpu_id) +void perf_test_guest_code(uint32_t vcpu_id) { struct perf_test_args *pta = &perf_test_args; struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_id]; @@ -108,8 +108,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, { struct perf_test_args *pta = &perf_test_args; struct kvm_vm *vm; - uint64_t guest_num_pages; + uint64_t guest_num_pages, slot0_pages = DEFAULT_GUEST_PHY_PAGES; uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); + uint64_t region_end_gfn; int i; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); @@ -135,33 +136,53 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, slots); /* + * If using nested, allocate extra pages for the nested page tables and + * in-memory data structures. + */ + if (pta->nested) + slot0_pages += perf_test_nested_pages(vcpus); + + /* * Pass guest_num_pages to populate the page tables for test memory. * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. */ - vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, - guest_num_pages, 0, guest_code, NULL); + vm = vm_create_with_vcpus(mode, vcpus, slot0_pages, guest_num_pages, 0, + perf_test_guest_code, NULL); pta->vm = vm; + /* Put the test region at the top guest physical memory. */ + region_end_gfn = vm_get_max_gfn(vm) + 1; + +#ifdef __x86_64__ + /* + * When running vCPUs in L2, restrict the test region to 48 bits to + * avoid needing 5-level page tables to identity map L2. + */ + if (pta->nested) + region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size); +#endif /* * If there should be more memory in the guest test region than there * can be pages in the guest, it will definitely cause problems. */ - TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), + TEST_ASSERT(guest_num_pages < region_end_gfn, "Requested more guest memory than address space allows.\n" " guest pages: %" PRIx64 " max gfn: %" PRIx64 " vcpus: %d wss: %" PRIx64 "]\n", - guest_num_pages, vm_get_max_gfn(vm), vcpus, + guest_num_pages, region_end_gfn - 1, vcpus, vcpu_memory_bytes); - pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * pta->guest_page_size; + pta->gpa = (region_end_gfn - guest_num_pages) * pta->guest_page_size; pta->gpa = align_down(pta->gpa, backing_src_pagesz); #ifdef __s390x__ /* Align to 1M (segment size) */ pta->gpa = align_down(pta->gpa, 1 << 20); #endif - pr_info("guest physical test memory offset: 0x%lx\n", pta->gpa); + pta->size = guest_num_pages * pta->guest_page_size; + pr_info("guest physical test memory: [0x%lx, 0x%lx)\n", + pta->gpa, pta->gpa + pta->size); /* Add extra memory slots for testing */ for (i = 0; i < slots; i++) { @@ -178,6 +199,11 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, perf_test_setup_vcpus(vm, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access); + if (pta->nested) { + pr_info("Configuring vCPUs to run in L2 (nested).\n"); + perf_test_setup_nested(vm, vcpus); + } + ucall_init(vm, NULL); /* Export the shared variables to the guest. */ @@ -198,6 +224,17 @@ void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract) sync_global_to_guest(vm, perf_test_args); } +uint64_t __weak perf_test_nested_pages(int nr_vcpus) +{ + return 0; +} + +void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus) +{ + pr_info("%s() not support on this architecture, skipping.\n", __func__); + exit(KSFT_SKIP); +} + static void *vcpu_thread_main(void *data) { struct vcpu_thread *vcpu = data; diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c new file mode 100644 index 000000000000..e258524435a0 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86_64-specific extensions to perf_test_util.c. + * + * Copyright (C) 2022, Google, Inc. + */ +#include <stdio.h> +#include <stdlib.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "perf_test_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" +#include "vmx.h" + +void perf_test_l2_guest_code(uint64_t vcpu_id) +{ + perf_test_guest_code(vcpu_id); + vmcall(); +} + +extern char perf_test_l2_guest_entry[]; +__asm__( +"perf_test_l2_guest_entry:" +" mov (%rsp), %rdi;" +" call perf_test_l2_guest_code;" +" ud2;" +); + +static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + GUEST_ASSERT(ept_1g_pages_supported()); + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + prepare_vmcs(vmx, perf_test_l2_guest_entry, rsp); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +uint64_t perf_test_nested_pages(int nr_vcpus) +{ + /* + * 513 page tables is enough to identity-map 256 TiB of L2 with 1G + * pages and 4-level paging, plus a few pages per-vCPU for data + * structures such as the VMCS. + */ + return 513 + 10 * nr_vcpus; +} + +void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + uint64_t start, end; + + prepare_eptp(vmx, vm, 0); + + /* + * Identity map the first 4G and the test region with 1G pages so that + * KVM can shadow the EPT12 with the maximum huge page size supported + * by the backing source. + */ + nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + + start = align_down(perf_test_args.gpa, PG_SIZE_1G); + end = align_up(perf_test_args.gpa + perf_test_args.size, PG_SIZE_1G); + nested_identity_map_1g(vmx, vm, start, end - start); +} + +void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus) +{ + struct vmx_pages *vmx, *vmx0 = NULL; + struct kvm_regs regs; + vm_vaddr_t vmx_gva; + int vcpu_id; + + nested_vmx_check_supported(); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vmx = vcpu_alloc_vmx(vm, &vmx_gva); + + if (vcpu_id == 0) { + perf_test_setup_ept(vmx, vm); + vmx0 = vmx; + } else { + /* Share the same EPT table across all vCPUs. */ + vmx->eptp = vmx0->eptp; + vmx->eptp_hva = vmx0->eptp_hva; + vmx->eptp_gpa = vmx0->eptp_gpa; + } + + /* + * Override the vCPU to run perf_test_l1_guest_code() which will + * bounce it into L2 before calling perf_test_guest_code(). + */ + vcpu_regs_get(vm, vcpu_id, ®s); + regs.rip = (unsigned long) perf_test_l1_guest_code; + vcpu_regs_set(vm, vcpu_id, ®s); + vcpu_args_set(vm, vcpu_id, 2, vmx_gva, vcpu_id); + } +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 33ea5e9955d9..ead7011ee8f6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -158,7 +158,7 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, int level) { uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift); - int index = vaddr >> (vm->page_shift + level * 9) & 0x1ffu; + int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; return &page_table[index]; } @@ -167,14 +167,14 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, uint64_t paddr, - int level, - enum x86_page_size page_size) + int current_level, + int target_level) { - uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level); + uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level); if (!(*pte & PTE_PRESENT_MASK)) { *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; - if (level == page_size) + if (current_level == target_level) *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); else *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; @@ -184,20 +184,19 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, * a hugepage at this level, and that there isn't a hugepage at * this level. */ - TEST_ASSERT(level != page_size, + TEST_ASSERT(current_level != target_level, "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", - page_size, vaddr); + current_level, vaddr); TEST_ASSERT(!(*pte & PTE_LARGE_MASK), "Cannot create page table at level: %u, vaddr: 0x%lx\n", - level, vaddr); + current_level, vaddr); } return pte; } -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - enum x86_page_size page_size) +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { - const uint64_t pg_size = 1ull << ((page_size * 9) + 12); + const uint64_t pg_size = PG_LEVEL_SIZE(level); uint64_t *pml4e, *pdpe, *pde; uint64_t *pte; @@ -222,20 +221,20 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, * early if a hugepage was created. */ pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, - vaddr, paddr, 3, page_size); + vaddr, paddr, PG_LEVEL_512G, level); if (*pml4e & PTE_LARGE_MASK) return; - pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size); + pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level); if (*pdpe & PTE_LARGE_MASK) return; - pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size); + pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level); if (*pde & PTE_LARGE_MASK) return; /* Fill in page table entry. */ - pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0); + pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -243,7 +242,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { - __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K); + __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); } static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index d089d8b850b5..b77a01d0a271 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -198,6 +198,16 @@ bool load_vmcs(struct vmx_pages *vmx) return true; } +static bool ept_vpid_cap_supported(uint64_t mask) +{ + return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask; +} + +bool ept_1g_pages_supported(void) +{ + return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES); +} + /* * Initialize the control fields to the most basic settings possible. */ @@ -215,7 +225,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) struct eptPageTablePointer eptp = { .memory_type = VMX_BASIC_MEM_TYPE_WB, .page_walk_length = 3, /* + 1 */ - .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), + .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, }; @@ -392,80 +402,93 @@ void nested_vmx_check_supported(void) } } -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) +static void nested_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) +{ + if (!pte->readable) { + pte->writable = true; + pte->readable = true; + pte->executable = true; + pte->page_size = (current_level == target_level); + if (pte->page_size) + pte->address = paddr >> vm->page_shift; + else + pte->address = vm_alloc_page_table(vm) >> vm->page_shift; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. + */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, nested_paddr: 0x%lx\n", + current_level, nested_paddr); + TEST_ASSERT(!pte->page_size, + "Cannot create page table at level: %u, nested_paddr: 0x%lx\n", + current_level, nested_paddr); + } +} + + +void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) { - uint16_t index[4]; - struct eptPageTableEntry *pml4e; + const uint64_t page_size = PG_LEVEL_SIZE(target_level); + struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + uint16_t index; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - TEST_ASSERT((nested_paddr % vm->page_size) == 0, + TEST_ASSERT((nested_paddr >> 48) == 0, + "Nested physical address 0x%lx requires 5-level paging", + nested_paddr); + TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx vm->page_size: 0x%x", - nested_paddr, vm->page_size); + " nested_paddr: 0x%lx page_size: 0x%lx", + nested_paddr, page_size); TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, "Physical address beyond beyond maximum supported,\n" " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % vm->page_size) == 0, + TEST_ASSERT((paddr % page_size) == 0, "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); + " paddr: 0x%lx page_size: 0x%lx", + paddr, page_size); TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, "Physical address beyond beyond maximum supported,\n" " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - index[0] = (nested_paddr >> 12) & 0x1ffu; - index[1] = (nested_paddr >> 21) & 0x1ffu; - index[2] = (nested_paddr >> 30) & 0x1ffu; - index[3] = (nested_paddr >> 39) & 0x1ffu; - - /* Allocate page directory pointer table if not present. */ - pml4e = vmx->eptp_hva; - if (!pml4e[index[3]].readable) { - pml4e[index[3]].address = vm_alloc_page_table(vm) >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].readable = true; - pml4e[index[3]].executable = true; - } + for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { + index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + pte = &pt[index]; - /* Allocate page directory table if not present. */ - struct eptPageTableEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].readable) { - pdpe[index[2]].address = vm_alloc_page_table(vm) >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].readable = true; - pdpe[index[2]].executable = true; - } + nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - /* Allocate page table if not present. */ - struct eptPageTableEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].readable) { - pde[index[1]].address = vm_alloc_page_table(vm) >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].readable = true; - pde[index[1]].executable = true; - } + if (pte->page_size) + break; - /* Fill in page table entry. */ - struct eptPageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].readable = true; - pte[index[0]].executable = true; + pt = addr_gpa2hva(vm, pte->address * vm->page_size); + } /* * For now mark these as accessed and dirty because the only * testcase we have needs that. Can be reconsidered later. */ - pte[index[0]].accessed = true; - pte[index[0]].dirty = true; + pte->accessed = true; + pte->dirty = true; + +} + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) +{ + __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); } /* @@ -476,7 +499,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * nested_paddr - Nested guest physical address to map * paddr - VM Physical Address * size - The size of the range to map - * eptp_memslot - Memory region slot for new virtual translation tables + * level - The level at which to map the range * * Output Args: None * @@ -485,22 +508,29 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + int level) { - size_t page_size = vm->page_size; + size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - nested_pg_map(vmx, vm, nested_paddr, paddr); + __nested_pg_map(vmx, vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) +{ + __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + /* Prepare an identity extended page table that maps all the * physical pages in VM. */ @@ -525,6 +555,13 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, } } +/* Identity map a region with 1GiB Pages. */ +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size) +{ + __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); +} + void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 3875c4b23a04..15f046e19cb2 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -244,7 +244,7 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ /* Identity map memory in the guest using 1gb pages. */ for (i = 0; i < slot_size; i += size_1gb) - __virt_pg_map(vm, gpa + i, gpa + i, X86_PAGE_SIZE_1G); + __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); #else for (i = 0; i < slot_size; i += vm_get_page_size(vm)) virt_pg_map(vm, gpa + i, gpa + i); diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c index da2325fcad87..bdecd532f935 100644 --- a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c @@ -35,7 +35,7 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) run = vcpu_state(vm, VCPU_ID); /* Map 1gb page without a backing memlot. */ - __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, X86_PAGE_SIZE_1G); + __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, PG_LEVEL_1G); r = _vcpu_run(vm, VCPU_ID); diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index b984f8c8d523..a29f79618934 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -37,4 +37,3 @@ gro ioam6_parser toeplitz cmsg_sender -bind_bhash_test diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 464df13831f2..7ea54af55490 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -59,7 +59,6 @@ TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh -TEST_GEN_FILES += bind_bhash_test TEST_FILES := settings @@ -70,5 +69,4 @@ include bpf/Makefile $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -$(OUTPUT)/bind_bhash_test: LDLIBS += -lpthread $(OUTPUT)/tcp_inq: LDLIBS += -lpthread diff --git a/tools/testing/selftests/net/bind_bhash_test.c b/tools/testing/selftests/net/bind_bhash_test.c deleted file mode 100644 index 252e73754e76..000000000000 --- a/tools/testing/selftests/net/bind_bhash_test.c +++ /dev/null @@ -1,119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This times how long it takes to bind to a port when the port already - * has multiple sockets in its bhash table. - * - * In the setup(), we populate the port's bhash table with - * MAX_THREADS * MAX_CONNECTIONS number of entries. - */ - -#include <unistd.h> -#include <stdio.h> -#include <netdb.h> -#include <pthread.h> - -#define MAX_THREADS 600 -#define MAX_CONNECTIONS 40 - -static const char *bind_addr = "::1"; -static const char *port; - -static int fd_array[MAX_THREADS][MAX_CONNECTIONS]; - -static int bind_socket(int opt, const char *addr) -{ - struct addrinfo *res, hint = {}; - int sock_fd, reuse = 1, err; - - sock_fd = socket(AF_INET6, SOCK_STREAM, 0); - if (sock_fd < 0) { - perror("socket fd err"); - return -1; - } - - hint.ai_family = AF_INET6; - hint.ai_socktype = SOCK_STREAM; - - err = getaddrinfo(addr, port, &hint, &res); - if (err) { - perror("getaddrinfo failed"); - return -1; - } - - if (opt) { - err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse)); - if (err) { - perror("setsockopt failed"); - return -1; - } - } - - err = bind(sock_fd, res->ai_addr, res->ai_addrlen); - if (err) { - perror("failed to bind to port"); - return -1; - } - - return sock_fd; -} - -static void *setup(void *arg) -{ - int sock_fd, i; - int *array = (int *)arg; - - for (i = 0; i < MAX_CONNECTIONS; i++) { - sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr); - if (sock_fd < 0) - return NULL; - array[i] = sock_fd; - } - - return NULL; -} - -int main(int argc, const char *argv[]) -{ - int listener_fd, sock_fd, i, j; - pthread_t tid[MAX_THREADS]; - clock_t begin, end; - - if (argc != 2) { - printf("Usage: listener <port>\n"); - return -1; - } - - port = argv[1]; - - listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr); - if (listen(listener_fd, 100) < 0) { - perror("listen failed"); - return -1; - } - - /* Set up threads to populate the bhash table entry for the port */ - for (i = 0; i < MAX_THREADS; i++) - pthread_create(&tid[i], NULL, setup, fd_array[i]); - - for (i = 0; i < MAX_THREADS; i++) - pthread_join(tid[i], NULL); - - begin = clock(); - - /* Bind to the same port on a different address */ - sock_fd = bind_socket(0, "2001:0db8:0:f101::1"); - - end = clock(); - - printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC); - - /* clean up */ - close(sock_fd); - close(listener_fd); - for (i = 0; i < MAX_THREADS; i++) { - for (j = 0; i < MAX_THREADS; i++) - close(fd_array[i][j]); - } - - return 0; -} diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 54701c8b0cd7..75223b63e3c8 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -70,6 +70,10 @@ NSB_LO_IP6=2001:db8:2::2 NL_IP=172.17.1.1 NL_IP6=2001:db8:4::1 +# multicast and broadcast addresses +MCAST_IP=224.0.0.1 +BCAST_IP=255.255.255.255 + MD5_PW=abc123 MD5_WRONG_PW=abc1234 @@ -308,6 +312,9 @@ addr2str() 127.0.0.1) echo "loopback";; ::1) echo "IPv6 loopback";; + ${BCAST_IP}) echo "broadcast";; + ${MCAST_IP}) echo "multicast";; + ${NSA_IP}) echo "ns-A IP";; ${NSA_IP6}) echo "ns-A IPv6";; ${NSA_LO_IP}) echo "ns-A loopback IP";; @@ -1801,6 +1808,19 @@ ipv4_addr_bind_novrf() log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after device bind" # + # check that ICMP sockets cannot bind to broadcast and multicast addresses + # + a=${BCAST_IP} + log_start + run_cmd nettest -s -R -P icmp -l ${a} -b + log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address" + + a=${MCAST_IP} + log_start + run_cmd nettest -s -R -P icmp -f -l ${a} -b + log_test_addr ${a} $? 1 "ICMP socket bind to multicast address" + + # # tcp sockets # a=${NSA_IP} @@ -1858,6 +1878,19 @@ ipv4_addr_bind_vrf() log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after VRF bind" # + # check that ICMP sockets cannot bind to broadcast and multicast addresses + # + a=${BCAST_IP} + log_start + run_cmd nettest -s -R -P icmp -l ${a} -I ${VRF} -b + log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address after VRF bind" + + a=${MCAST_IP} + log_start + run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b + log_test_addr ${a} $? 1 "ICMP socket bind to multicast address after VRF bind" + + # # tcp sockets # for a in ${NSA_IP} ${VRF_IP} diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index bca07b93eeb0..7d1b80988d8a 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -64,8 +64,8 @@ QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else -QEMU_MACHINE := -cpu cortex-a53 -machine virt -CFLAGS += -march=armv8-a -mtune=cortex-a53 +QEMU_MACHINE := -cpu max -machine virt +CFLAGS += -march=armv8-a endif else ifeq ($(ARCH),aarch64_be) CHOST := aarch64_be-linux-musl @@ -76,8 +76,8 @@ QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else -QEMU_MACHINE := -cpu cortex-a53 -machine virt -CFLAGS += -march=armv8-a -mtune=cortex-a53 +QEMU_MACHINE := -cpu max -machine virt +CFLAGS += -march=armv8-a endif else ifeq ($(ARCH),arm) CHOST := arm-linux-musleabi @@ -88,8 +88,8 @@ QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else -QEMU_MACHINE := -cpu cortex-a15 -machine virt -CFLAGS += -march=armv7-a -mtune=cortex-a15 -mabi=aapcs-linux +QEMU_MACHINE := -cpu max -machine virt +CFLAGS += -march=armv7-a -mabi=aapcs-linux endif else ifeq ($(ARCH),armeb) CHOST := armeb-linux-musleabi @@ -100,8 +100,8 @@ QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else -QEMU_MACHINE := -cpu cortex-a15 -machine virt -CFLAGS += -march=armv7-a -mabi=aapcs-linux # We don't pass -mtune=cortex-a15 due to a compiler bug on big endian. +QEMU_MACHINE := -cpu max -machine virt +CFLAGS += -march=armv7-a -mabi=aapcs-linux LDFLAGS += -Wl,--be8 endif else ifeq ($(ARCH),x86_64) @@ -112,8 +112,7 @@ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine q35,accel=kvm else -QEMU_MACHINE := -cpu Skylake-Server -machine q35 -CFLAGS += -march=skylake-avx512 +QEMU_MACHINE := -cpu max -machine q35 endif else ifeq ($(ARCH),i686) CHOST := i686-linux-musl @@ -123,8 +122,7 @@ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH)) QEMU_MACHINE := -cpu host -machine q35,accel=kvm else -QEMU_MACHINE := -cpu coreduo -machine q35 -CFLAGS += -march=prescott +QEMU_MACHINE := -cpu max -machine q35 endif else ifeq ($(ARCH),mips64) CHOST := mips64-linux-musl @@ -182,7 +180,7 @@ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host,accel=kvm -machine pseries else -QEMU_MACHINE := -machine pseries +QEMU_MACHINE := -machine pseries -device spapr-rng,rng=rng -object rng-random,id=rng endif else ifeq ($(ARCH),powerpc64le) CHOST := powerpc64le-linux-musl @@ -192,7 +190,7 @@ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host,accel=kvm -machine pseries else -QEMU_MACHINE := -machine pseries +QEMU_MACHINE := -machine pseries -device spapr-rng,rng=rng -object rng-random,id=rng endif else ifeq ($(ARCH),powerpc) CHOST := powerpc-linux-musl @@ -247,7 +245,7 @@ QEMU_VPORT_RESULT := virtio-serial-ccw ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host,accel=kvm -machine s390-ccw-virtio -append $(KERNEL_CMDLINE) else -QEMU_MACHINE := -machine s390-ccw-virtio -append $(KERNEL_CMDLINE) +QEMU_MACHINE := -cpu max -machine s390-ccw-virtio -append $(KERNEL_CMDLINE) endif else $(error I only build: x86_64, i686, arm, armeb, aarch64, aarch64_be, mips, mipsel, mips64, mips64el, powerpc64, powerpc64le, powerpc, m68k, riscv64, riscv32, s390x) diff --git a/tools/testing/selftests/wireguard/qemu/init.c b/tools/testing/selftests/wireguard/qemu/init.c index 2a0f48fac925..c9e128436546 100644 --- a/tools/testing/selftests/wireguard/qemu/init.c +++ b/tools/testing/selftests/wireguard/qemu/init.c @@ -21,6 +21,7 @@ #include <sys/utsname.h> #include <sys/sendfile.h> #include <sys/sysmacros.h> +#include <sys/random.h> #include <linux/random.h> #include <linux/version.h> @@ -58,6 +59,8 @@ static void seed_rng(void) { int bits = 256, fd; + if (!getrandom(NULL, 0, GRND_NONBLOCK)) + return; pretty_message("[+] Fake seeding RNG..."); fd = open("/dev/random", O_WRONLY); if (fd < 0) diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index a9b5a520a1d2..bad88f4b0a03 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -31,6 +31,7 @@ CONFIG_TTY=y CONFIG_BINFMT_ELF=y CONFIG_BINFMT_SCRIPT=y CONFIG_VDSO=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_VIRTUALIZATION=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y @@ -65,6 +66,8 @@ CONFIG_PROC_FS=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y +CONFIG_RANDOM_TRUST_CPU=y +CONFIG_RANDOM_TRUST_BOOTLOADER=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_LOG_BUF_SHIFT=18 CONFIG_PRINTK_TIME=y diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 44c47670447a..a49df8988cd6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3328,9 +3328,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu) vcpu->stat.generic.blocking = 1; + preempt_disable(); kvm_arch_vcpu_blocking(vcpu); - prepare_to_rcuwait(wait); + preempt_enable(); + for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -3340,9 +3342,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu) waited = true; schedule(); } - finish_rcuwait(wait); + preempt_disable(); + finish_rcuwait(wait); kvm_arch_vcpu_unblocking(vcpu); + preempt_enable(); vcpu->stat.generic.blocking = 0; |