980 files changed, 22198 insertions, 11743 deletions
@@ -152,6 +152,7 @@ Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski.k@gmail.com>
 Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
 Leon Romanovsky <leon@kernel.org> <leon@leon.nu>
 Leon Romanovsky <leon@kernel.org> <leonro@mellanox.com>
+Leonardo Bras <leobras.c@gmail.com> <leonardo@linux.ibm.com>
 Leonid I Ananiev <leonid.i.ananiev@intel.com>
 Linas Vepstas <linas@austin.ibm.com>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
@@ -234,7 +235,9 @@ Ralf Baechle <ralf@linux-mips.org>
 Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
 Randy Dunlap <rdunlap@infradead.org> <rdunlap@xenotime.net>
 Rémi Denis-Courmont <rdenis@simphalempin.com>
-Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
+Ricardo Ribalda <ribalda@kernel.org> <ricardo.ribalda@gmail.com>
+Ricardo Ribalda <ribalda@kernel.org> <ricardo@ribalda.com>
+Ricardo Ribalda <ribalda@kernel.org> Ricardo Ribalda Delgado <ribalda@kernel.org>
 Ross Zwisler <zwisler@kernel.org> <ross.zwisler@linux.intel.com>
 Rudolf Marek <R.Marek@sh.cvut.cz>
 Rui Saraiva <rmps@joel.ist.utl.pt>
@@ -3104,14 +3104,16 @@ W: http://www.qsl.net/dl1bke/
 D: Generic Z8530 driver, AX.25 DAMA slave implementation
 D: Several AX.25 hacks

-N: Ricardo Ribalda Delgado
-E: ricardo.ribalda@gmail.com
+N: Ricardo Ribalda
+E: ribalda@kernel.org
 W: http://ribalda.com
 D: PLX USB338x driver
 D: PCA9634 driver
 D: Option GTM671WFS
 D: Fintek F81216A
 D: AD5761 iio driver
+D: TI DAC7612 driver
+D: Sony IMX214 driver
 D: Various kernel hacks
 S: Qtechnology A/S
 S: Valby Langgade 142
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index df8413cf1468..484fc04bcc25 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -54,7 +54,7 @@ Date:		October 2002
 Contact:	Linux Memory Management list <linux-mm@kvack.org>
 Description:
 		Provides information about the node's distribution and memory
-		utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt
+		utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.rst

 What:		/sys/devices/system/node/nodeX/numastat
 Date:		October 2002
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
index 274df44d8b1b..046978193368 100644
--- a/Documentation/ABI/testing/procfs-smaps_rollup
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -11,7 +11,7 @@ Description:
 		Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem
 		are not present in /proc/pid/smaps. These fields represent
 		the sum of the Pss field of each type (anon, file, shmem).
-		For more details, see Documentation/filesystems/proc.txt
+		For more details, see Documentation/filesystems/proc.rst
 		and the procfs man page.
		Typical output looks like this:
diff --git a/Documentation/Makefile b/Documentation/Makefile
index cc786d11a028..db1fc35ded50 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -98,7 +98,11 @@ else # HAVE_PDFLATEX

 pdfdocs: latexdocs
 	@$(srctree)/scripts/sphinx-pre-install --version-check
-	$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
+	$(foreach var,$(SPHINXDIRS), \
+	   $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit; \
+	   mkdir -p $(BUILDDIR)/$(var)/pdf; \
+	   mv $(subst .tex,.pdf,$(wildcard $(BUILDDIR)/$(var)/latex/*.tex)) $(BUILDDIR)/$(var)/pdf/; \
+	)

 endif # HAVE_PDFLATEX
diff --git a/Documentation/PCI/boot-interrupts.rst b/Documentation/PCI/boot-interrupts.rst
index d078ef3eb192..2ec70121bfca 100644
--- a/Documentation/PCI/boot-interrupts.rst
+++ b/Documentation/PCI/boot-interrupts.rst
@@ -32,12 +32,13 @@ interrupt goes unhandled over time, they are tracked by the Linux kernel as
 Spurious Interrupts. The IRQ will be disabled by the Linux kernel after it
 reaches a specific count with the error "nobody cared". This disabled IRQ
 now prevents valid usage by an existing interrupt which may happen to share
-the IRQ line.
+the IRQ line::

   irq 19: nobody cared (try booting with the "irqpoll" option)
   CPU: 0 PID: 2988 Comm: irq/34-nipalk Tainted: 4.14.87-rt49-02410-g4a640ec-dirty #1
   Hardware name: National Instruments NI PXIe-8880/NI PXIe-8880, BIOS 2.1.5f1 01/09/2020
   Call Trace:
+  <IRQ>
    ? dump_stack+0x46/0x5e
    ? __report_bad_irq+0x2e/0xb0
@@ -85,15 +86,18 @@ Mitigations
 The mitigations take the form of PCI quirks. The preference has been to
 first identify and make use of a means to disable the routing to the PCH.
 In such a case a quirk to disable boot interrupt generation can be
-added.[1]
+added. [1]_

-  Intel® 6300ESB I/O Controller Hub
+Intel® 6300ESB I/O Controller Hub
   Alternate Base Address Register:
   BIE: Boot Interrupt Enable
-	  0 = Boot interrupt is enabled.
-	  1 = Boot interrupt is disabled.

-  Intel® Sandy Bridge through Sky Lake based Xeon servers:
+	  ==  ===========================
+	  0   Boot interrupt is enabled.
+	  1   Boot interrupt is disabled.
+	  ==  ===========================
+
+Intel® Sandy Bridge through Sky Lake based Xeon servers:
   Coherent Interface Protocol Interrupt Control
   dis_intx_route2pch/dis_intx_route2ich/dis_intx_route2dmi2:
   When this bit is set. Local INTx messages received from the
@@ -109,12 +113,12 @@ line by default. Therefore, on chipsets where this INTx routing cannot be
 disabled, the Linux kernel will reroute the valid interrupt to its legacy
 interrupt. This redirection of the handler will prevent the occurrence of
 the spurious interrupt detection which would ordinarily disable the IRQ
-line due to excessive unhandled counts.[2]
+line due to excessive unhandled counts. [2]_

 The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS exists to enable (or
 disable) the redirection of the interrupt handler to the PCH interrupt
 line. The option can be overridden by either pci=ioapicreroute or
-pci=noioapicreroute.[3]
+pci=noioapicreroute. [3]_

 More Documentation
@@ -127,19 +131,19 @@ into the evolution of its handling with chipsets.
 Example of disabling of the boot interrupt
 ------------------------------------------

-Intel® 6300ESB I/O Controller Hub (Document # 300641-004US)
+  - Intel® 6300ESB I/O Controller Hub (Document # 300641-004US)
 	5.7.3 Boot Interrupt
 	https://www.intel.com/content/dam/doc/datasheet/6300esb-io-controller-hub-datasheet.pdf

-Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families
-Datasheet - Volume 2: Registers (Document # 330784-003)
+  - Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families
+    Datasheet - Volume 2: Registers (Document # 330784-003)
 	6.6.41 cipintrc Coherent Interface Protocol Interrupt Control
 	https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf

 Example of handler rerouting
 ----------------------------

-Intel® 6700PXH 64-bit PCI Hub (Document # 302628)
+  - Intel® 6700PXH 64-bit PCI Hub (Document # 302628)
 	2.15.2 PCI Express Legacy INTx Support and Boot Interrupt
 	https://www.intel.com/content/dam/doc/datasheet/6700pxh-64-bit-pci-hub-datasheet.pdf

@@ -150,6 +154,6 @@ Cheers,

 Sean V Kelley
 sean.v.kelley@linux.intel.com

-[1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/
-[2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/
-[3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/
+.. [1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/
+.. [2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/
+.. [3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index fd5e2cbc4935..75b8ca007a11 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1943,56 +1943,27 @@ invoked from a CPU-hotplug notifier.
 Scheduler and RCU
 ~~~~~~~~~~~~~~~~~

-RCU depends on the scheduler, and the scheduler uses RCU to protect some
-of its data structures. The preemptible-RCU ``rcu_read_unlock()``
-implementation must therefore be written carefully to avoid deadlocks
-involving the scheduler's runqueue and priority-inheritance locks. In
-particular, ``rcu_read_unlock()`` must tolerate an interrupt where the
-interrupt handler invokes both ``rcu_read_lock()`` and
-``rcu_read_unlock()``. This possibility requires ``rcu_read_unlock()``
-to use negative nesting levels to avoid destructive recursion via
-interrupt handler's use of RCU.
-
-This scheduler-RCU requirement came as a `complete
-surprise <https://lwn.net/Articles/453002/>`__.
-
-As noted above, RCU makes use of kthreads, and it is necessary to avoid
-excessive CPU-time accumulation by these kthreads. This requirement was
-no surprise, but RCU's violation of it when running context-switch-heavy
-workloads when built with ``CONFIG_NO_HZ_FULL=y`` `did come as a
-surprise
+RCU makes use of kthreads, and it is necessary to avoid excessive CPU-time
+accumulation by these kthreads. This requirement was no surprise, but
+RCU's violation of it when running context-switch-heavy workloads when
+built with ``CONFIG_NO_HZ_FULL=y`` `did come as a surprise
 [PDF] <http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf>`__.
 RCU has made good progress towards meeting this requirement, even for
 context-switch-heavy ``CONFIG_NO_HZ_FULL=y`` workloads, but there is
 room for further improvement.
-It is forbidden to hold any of scheduler's runqueue or
-priority-inheritance spinlocks across an ``rcu_read_unlock()`` unless
-interrupts have been disabled across the entire RCU read-side critical
-section, that is, up to and including the matching ``rcu_read_lock()``.
-Violating this restriction can result in deadlocks involving these
-scheduler spinlocks. There was hope that this restriction might be
-lifted when interrupt-disabled calls to ``rcu_read_unlock()`` started
-deferring the reporting of the resulting RCU-preempt quiescent state
-until the end of the corresponding interrupts-disabled region.
-Unfortunately, timely reporting of the corresponding quiescent state to
-expedited grace periods requires a call to ``raise_softirq()``, which
-can acquire these scheduler spinlocks. In addition, real-time systems
-using RCU priority boosting need this restriction to remain in effect
-because deferred quiescent-state reporting would also defer deboosting,
-which in turn would degrade real-time latencies.
-
-In theory, if a given RCU read-side critical section could be guaranteed
-to be less than one second in duration, holding a scheduler spinlock
-across that critical section's ``rcu_read_unlock()`` would require only
-that preemption be disabled across the entire RCU read-side critical
-section, not interrupts. Unfortunately, given the possibility of vCPU
-preemption, long-running interrupts, and so on, it is not possible in
-practice to guarantee that a given RCU read-side critical section will
-complete in less than one second. Therefore, as noted above, if
-scheduler spinlocks are held across a given call to
-``rcu_read_unlock()``, interrupts must be disabled across the entire RCU
-read-side critical section.
+There is no longer any prohibition against holding any of
+scheduler's runqueue or priority-inheritance spinlocks across an
+``rcu_read_unlock()``, even if interrupts and preemption were enabled
+somewhere within the corresponding RCU read-side critical section.
+Therefore, it is now perfectly legal to execute ``rcu_read_lock()``
+with preemption enabled, acquire one of the scheduler locks, and hold
+that lock across the matching ``rcu_read_unlock()``.
+
+Similarly, the RCU flavor consolidation has removed the need for negative
+nesting. The fact that interrupt-disabled regions of code act as RCU
+read-side critical sections implicitly avoids earlier issues that used
+to result in destructive recursion via interrupt handler's use of RCU.

 Tracing and RCU
 ~~~~~~~~~~~~~~~
diff --git a/Documentation/admin-guide/acpi/ssdt-overlays.rst b/Documentation/admin-guide/acpi/ssdt-overlays.rst
index da37455f96c9..5d7e25988085 100644
--- a/Documentation/admin-guide/acpi/ssdt-overlays.rst
+++ b/Documentation/admin-guide/acpi/ssdt-overlays.rst
@@ -63,7 +63,7 @@ which can then be compiled to AML binary format::

     ASL Input:     minnomax.asl - 30 lines, 614 bytes, 7 keywords
     AML Output:    minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes

-[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
+[1] https://www.elinux.org/Minnowboard:MinnowMax#Low_Speed_Expansion_.28Top.29

 The resulting AML code can then be loaded by the kernel using one of the
 methods below.
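The following minimal sketch, which is not part of the patch above, illustrates
the pattern that the reworked "Scheduler and RCU" text now permits: an RCU
read-side critical section entered with preemption enabled and exited while
holding a spinlock of the kind the scheduler uses. The lock below is a local
stand-in rather than an actual runqueue or priority-inheritance lock::

    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    /* Stand-in for a scheduler-style raw spinlock. */
    static DEFINE_RAW_SPINLOCK(demo_lock);

    static void demo_reader(void)
    {
            unsigned long flags;

            rcu_read_lock();                /* preemption may be enabled here */
            raw_spin_lock_irqsave(&demo_lock, flags);
            /* ... access RCU-protected data ... */
            rcu_read_unlock();              /* now legal while demo_lock is held */
            raw_spin_unlock_irqrestore(&demo_lock, flags);
    }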
diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst
index 44b8a4edd348..f7c80f4649fc 100644
--- a/Documentation/admin-guide/bug-hunting.rst
+++ b/Documentation/admin-guide/bug-hunting.rst
@@ -49,15 +49,19 @@ the issue, it may also contain the word **Oops**, as on this one::

 Despite being an **Oops** or some other sort of stack trace, the offended
 line is usually required to identify and handle the bug. Along this chapter,
-we'll refer to "Oops" for all kinds of stack traces that need to be analized.
+we'll refer to "Oops" for all kinds of stack traces that need to be analyzed.

-.. note::
+If the kernel is compiled with ``CONFIG_DEBUG_INFO``, you can enhance the
+quality of the stack trace by using file:`scripts/decode_stacktrace.sh`.
+
+Modules linked in
+-----------------
+
+Modules that are tainted or are being loaded or unloaded are marked with
+"(...)", where the taint flags are described in
+file:`Documentation/admin-guide/tainted-kernels.rst`, "being loaded" is
+annotated with "+", and "being unloaded" is annotated with "-".

-   ``ksymoops`` is useless on 2.6 or upper. Please use the Oops in its original
-   format (from ``dmesg``, etc). Ignore any references in this or other docs to
-   "decoding the Oops" or "running it through ksymoops".
-   If you post an Oops from 2.6+ that has been run through ``ksymoops``,
-   people will just tell you to repost it.

 Where is the Oops message is located?
 -------------------------------------
@@ -71,7 +75,7 @@ by running ``journalctl`` command.
 Sometimes ``klogd`` dies, in which case you can run ``dmesg > file`` to
 read the data from the kernel buffers and save it. Or you can
 ``cat /proc/kmsg > file``, however you have to break in to stop the transfer,
-``kmsg`` is a "never ending file".
+since ``kmsg`` is a "never ending file".

 If the machine has crashed so badly that you cannot enter commands or
 the disk is not available then you have three options:
@@ -81,9 +85,9 @@ the disk is not available then you have three options:
     planned for a crash. Alternatively, you can take a picture of
     the screen with a digital camera - not nice, but better than
     nothing. If the messages scroll off the top of the console, you
-    may find that booting with a higher resolution (eg, ``vga=791``)
+    may find that booting with a higher resolution (e.g., ``vga=791``)
     will allow you to read more of the text. (Caveat: This needs ``vesafb``,
-    so won't help for 'early' oopses)
+    so won't help for 'early' oopses.)

 (2) Boot with a serial console (see
     :ref:`Documentation/admin-guide/serial-console.rst <serial_console>`),
@@ -104,7 +108,7 @@ Kernel source file. There are two methods for doing that. Usually, using
 gdb
 ^^^

-The GNU debug (``gdb``) is the best way to figure out the exact file and line
+The GNU debugger (``gdb``) is the best way to figure out the exact file and line
 number of the OOPS from the ``vmlinux`` file.

 The usage of gdb works best on a kernel compiled with ``CONFIG_DEBUG_INFO``.
@@ -165,7 +169,7 @@ If you have a call trace, such as::

      [<ffffffff8802770b>] :jbd:journal_stop+0x1be/0x1ee
      ...

-this shows the problem likely in the :jbd: module. You can load that module
+this shows the problem likely is in the :jbd: module. You can load that module
 in gdb and list the relevant code::

   $ gdb fs/jbd/jbd.ko
@@ -199,8 +203,9 @@ in the kernel hacking menu of the menu configuration.) For example::

 You need to be at the top level of the kernel tree for this to pick up
 your C files.
-If you don't have access to the code you can also debug on some crash dumps
-e.g. crash dump output as shown by Dave Miller::
+If you don't have access to the source code you can still debug some crash
+dumps using the following method (example crash dump output as shown by
+Dave Miller)::

      EIP is at +0x14/0x4c0
      ...
@@ -230,6 +235,9 @@ e.g. crash dump output as shown by Dave Miller::
        mov        0x8(%ebp), %ebx         ! %ebx = skb->sk
        mov        0x13c(%ebx), %eax       ! %eax = inet_sk(sk)->opt

+file:`scripts/decodecode` can be used to automate most of this, depending
+on what CPU architecture is being debugged.
+
 Reporting the bug
 -----------------

@@ -241,7 +249,7 @@ used for the development of the affected code. This can be done by using
 the ``get_maintainer.pl`` script.

 For example, if you find a bug at the gspca's sonixj.c file, you can get
-their maintainers with::
+its maintainers with::

   $ ./scripts/get_maintainer.pl -f drivers/media/usb/gspca/sonixj.c
   Hans Verkuil <hverkuil@xs4all.nl> (odd fixer:GSPCA USB WEBCAM DRIVER,commit_signer:1/1=100%)
@@ -253,16 +261,17 @@ their maintainers with::

 Please notice that it will point to:

-- The last developers that touched on the source code. On the above example,
-  Tejun and Bhaktipriya (in this specific case, none really envolved on the
-  development of this file);
+- The last developers that touched the source code (if this is done inside
+  a git tree). On the above example, Tejun and Bhaktipriya (in this
+  specific case, none really involved in the development of this file);
 - The driver maintainer (Hans Verkuil);
 - The subsystem maintainer (Mauro Carvalho Chehab);
 - The driver and/or subsystem mailing list (linux-media@vger.kernel.org);
 - the Linux Kernel mailing list (linux-kernel@vger.kernel.org).

 Usually, the fastest way to have your bug fixed is to report it to mailing
-list used for the development of the code (linux-media ML) copying the driver maintainer (Hans).
+list used for the development of the code (linux-media ML) copying the
+driver maintainer (Hans).

 If you are totally stumped as to whom to send the report, and
 ``get_maintainer.pl`` didn't provide you anything useful, send it to
@@ -303,9 +312,9 @@ protection fault message can be simply cut out of the message files and
 forwarded to the kernel developers.

 Two types of address resolution are performed by ``klogd``. The first is
-static translation and the second is dynamic translation. Static
-translation uses the System.map file in much the same manner that
-ksymoops does. In order to do static translation the ``klogd`` daemon
+static translation and the second is dynamic translation.
+Static translation uses the System.map file.
+In order to do static translation the ``klogd`` daemon
 must be able to find a system map file at daemon initialization time.
 See the klogd man page for information on how ``klogd`` searches for
 map files.
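The workflow above can be rehearsed on a disposable module. The following
hypothetical module, which is not part of the patch above, triggers a
NULL-pointer Oops on load; built against a kernel with ``CONFIG_DEBUG_INFO``,
the resulting trace can then be fed to ``scripts/decode_stacktrace.sh``::

    #include <linux/module.h>
    #include <linux/init.h>

    static int __init oops_demo_init(void)
    {
            int *p = NULL;

            pr_info("oops_demo: about to dereference NULL\n");
            return *p;      /* generates the Oops to be decoded */
    }

    static void __exit oops_demo_exit(void)
    {
    }

    module_init(oops_demo_init);
    module_exit(oops_demo_exit);
    MODULE_LICENSE("GPL");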
diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst
index 2d01ce43d2a2..ebdecf864080 100644
--- a/Documentation/admin-guide/cpu-load.rst
+++ b/Documentation/admin-guide/cpu-load.rst
@@ -105,7 +105,7 @@ References
 ----------

 - http://lkml.org/lkml/2007/2/12/6
-- Documentation/filesystems/proc.txt (1.8)
+- Documentation/filesystems/proc.rst (1.8)

 Thanks
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index f83212fae4d5..3eeeb488d955 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -268,7 +268,7 @@ Guest mitigation mechanisms
    /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
    available at:

-   https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
+   https://www.kernel.org/doc/Documentation/core-api/irq/irq-affinity.rst

 .. _smt_control:
diff --git a/Documentation/admin-guide/init.rst b/Documentation/admin-guide/init.rst
index e89d97f31eaf..41f06a09152e 100644
--- a/Documentation/admin-guide/init.rst
+++ b/Documentation/admin-guide/init.rst
@@ -1,52 +1,48 @@
-Explaining the dreaded "No init found." boot hang message
+Explaining the "No working init found." boot hang message
 =========================================================
+:Authors: Andreas Mohr <andi at lisas period de>
+          Cristian Souza <cristianmsbr at gmail period com>

-OK, so you've got this pretty unintuitive message (currently located
-in init/main.c) and are wondering what the H*** went wrong.
-Some high-level reasons for failure (listed roughly in order of execution)
-to load the init binary are:
-
-A) Unable to mount root FS
-B) init binary doesn't exist on rootfs
-C) broken console device
-D) binary exists but dependencies not available
-E) binary cannot be loaded
-
-Detailed explanations:
-
-A) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE)
-   to get more detailed kernel messages.
-B) make sure you have the correct root FS type
-   (and ``root=`` kernel parameter points to the correct partition),
-   required drivers such as storage hardware (such as SCSI or USB!)
-   and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules,
-   to be pre-loaded by an initrd)
-C) Possibly a conflict in ``console= setup`` --> initial console unavailable.
-   E.g. some serial consoles are unreliable due to serial IRQ issues (e.g.
-   missing interrupt-based configuration).
+This document provides some high-level reasons for failure
+(listed roughly in order of execution) to load the init binary.
+
+1) **Unable to mount root FS**: Set "debug" kernel parameter (in bootloader
+   config file or CONFIG_CMDLINE) to get more detailed kernel messages.
+
+2) **init binary doesn't exist on rootfs**: Make sure you have the correct
+   root FS type (and ``root=`` kernel parameter points to the correct
+   partition), required drivers such as storage hardware (such as SCSI or
+   USB!) and filesystem (ext3, jffs2, etc.) are builtin (alternatively as
+   modules, to be pre-loaded by an initrd).
+
+3) **Broken console device**: Possibly a conflict in ``console= setup``
+   --> initial console unavailable. E.g. some serial consoles are unreliable
+   due to serial IRQ issues (e.g. missing interrupt-based configuration).
    Try using a different ``console= device`` or e.g. ``netconsole=``.
-D) e.g. required library dependencies of the init binary such as
-   ``/lib/ld-linux.so.2`` missing or broken. Use
-   ``readelf -d <INIT>|grep NEEDED`` to find out which libraries are required.
-E) make sure the binary's architecture matches your hardware.
-   E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware.
-   In case you tried loading a non-binary file here (shell script?),
-   you should make sure that the script specifies an interpreter in its shebang
-   header line (``#!/...``) that is fully working (including its library
-   dependencies). And before tackling scripts, better first test a simple
-   non-script binary such as ``/bin/sh`` and confirm its successful execution.
-   To find out more, add code ``to init/main.c`` to display kernel_execve()s
-   return values.
+
+4) **Binary exists but dependencies not available**: E.g. required library
+   dependencies of the init binary such as ``/lib/ld-linux.so.2`` missing or
+   broken. Use ``readelf -d <INIT>|grep NEEDED`` to find out which libraries
+   are required.
+
+5) **Binary cannot be loaded**: Make sure the binary's architecture matches
+   your hardware. E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM
+   hardware. In case you tried loading a non-binary file here (shell script?),
+   you should make sure that the script specifies an interpreter in its
+   shebang header line (``#!/...``) that is fully working (including its
+   library dependencies). And before tackling scripts, better first test a
+   simple non-script binary such as ``/bin/sh`` and confirm its successful
+   execution. To find out more, add code ``to init/main.c`` to display
+   kernel_execve()s return values.

 Please extend this explanation whenever you find new failure causes
 (after all loading the init binary is a CRITICAL and hard transition step
-which needs to be made as painless as possible), then submit patch to LKML.
+which needs to be made as painless as possible), then submit a patch to LKML.

 Further TODOs:

 - Implement the various ``run_init_process()`` invocations via a struct
   array which can then store the ``kernel_execve()`` result value and
   on failure log it all by iterating over **all** results (very important
   usability fix).
-- try to make the implementation itself more helpful in general,
-  e.g. by providing additional error messages at affected places.
+- Try to make the implementation itself more helpful in general, e.g. by
+  providing additional error messages at affected places.

-Andreas Mohr <andi at lisas period de>
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 007a6b86e0ee..e4ee8b2db604 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -393,6 +393,12 @@ KERNELOFFSET
 The kernel randomization offset. Used to compute the page offset. If
 KASLR is disabled, this value is zero.

+KERNELPACMASK
+-------------
+
+The mask to extract the Pointer Authentication Code from a kernel virtual
+address.
+
 arm
 ===
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7bc83f3d9bdf..4379c6ac3265 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1748,6 +1748,13 @@
 	initrd=		[BOOT] Specify the location of the initial ramdisk

+	initrdmem=	[KNL] Specify a physical address and size from which to
+			load the initrd. If an initrd is compiled in or
+			specified in the bootparams, it takes priority over this
+			setting.
+			Format: ss[KMG],nn[KMG]
+			Default is 0, 0
+
 	init_on_alloc=	[MM] Fill newly allocated pages and heap objects with
 			zeroes.
			Format: 0 | 1
@@ -3329,7 +3336,7 @@
			See Documentation/admin-guide/sysctl/vm.rst for details.

	ohci1394_dma=early	[HW] enable debugging via the ohci1394 driver.
-			See Documentation/debugging-via-ohci1394.txt for more
+			See Documentation/core-api/debugging-via-ohci1394.rst for more
			info.

	olpc_ec_timeout= [OLPC] ms delay when issuing EC commands
@@ -4210,12 +4217,24 @@
			Duration of CPU stall (s) to test RCU CPU stall
			warnings, zero to disable.

+	rcutorture.stall_cpu_block= [KNL]
+			Sleep while stalling if set. This will result
+			in warnings from preemptible RCU in addition
+			to any other stall-related activity.
+
	rcutorture.stall_cpu_holdoff= [KNL]
			Time to wait (s) after boot before inducing stall.

	rcutorture.stall_cpu_irqsoff= [KNL]
			Disable interrupts while stalling if set.

+	rcutorture.stall_gp_kthread= [KNL]
+			Duration (s) of forced sleep within RCU
+			grace-period kthread to test RCU CPU stall
+			warnings, zero to disable. If both stall_cpu
+			and stall_gp_kthread are specified, the
+			kthread is starved first, then the CPU.
+
	rcutorture.stat_interval= [KNL]
			Time (s) between statistics printk()s.
@@ -4286,6 +4305,13 @@
			only normal grace-period primitives. No effect
			on CONFIG_TINY_RCU kernels.

+	rcupdate.rcu_task_ipi_delay= [KNL]
+			Set time in jiffies during which RCU tasks will
+			avoid sending IPIs, starting with the beginning
+			of a given grace period. Setting a large
+			number avoids disturbing real-time workloads,
+			but lengthens grace periods.
+
	rcupdate.rcu_task_stall_timeout= [KNL]
			Set timeout in jiffies for RCU task stall warning
			messages. Disable with a value less than or equal
diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
index 21818aca4708..dc36aeb65d0a 100644
--- a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -10,7 +10,7 @@ them to a "housekeeping" CPU dedicated to such work.
 References
 ==========

-- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
+- Documentation/core-api/irq/irq-affinity.rst: Binding interrupts to sets of CPUs.

 - Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs.
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index c30176e67900..0bf49d7313ad 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -12,107 +12,107 @@ and more generally they allow userland to take control of various
 memory page faults, something otherwise only the kernel code could do.

 For example userfaults allows a proper and more optimal implementation
-of the PROT_NONE+SIGSEGV trick.
+of the ``PROT_NONE+SIGSEGV`` trick.

 Design
 ======

-Userfaults are delivered and resolved through the userfaultfd syscall.
+Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
-The userfaultfd (aside from registering and unregistering virtual
+The ``userfaultfd`` (aside from registering and unregistering virtual
 memory ranges) provides two primary functionalities:

-1) read/POLLIN protocol to notify a userland thread of the faults
+1) ``read/POLLIN`` protocol to notify a userland thread of the faults
    happening

-2) various UFFDIO_* ioctls that can manage the virtual memory regions
-   registered in the userfaultfd that allows userland to efficiently
+2) various ``UFFDIO_*`` ioctls that can manage the virtual memory regions
+   registered in the ``userfaultfd`` that allows userland to efficiently
    resolve the userfaults it receives via 1) or to manage the virtual
    memory in the background

 The real advantage of userfaults if compared to regular virtual memory
 management of mremap/mprotect is that the userfaults in all their
 operations never involve heavyweight structures like vmas (in fact the
-userfaultfd runtime load never takes the mmap_sem for writing).
+``userfaultfd`` runtime load never takes the mmap_sem for writing).

 Vmas are not suitable for page- (or hugepage) granular fault tracking
 when dealing with virtual address spaces that could span Terabytes. Too
 many vmas would be needed for that.

-The userfaultfd once opened by invoking the syscall, can also be
+The ``userfaultfd`` once opened by invoking the syscall, can also be
 passed using unix domain sockets to a manager process, so the same
 manager process could handle the userfaults of a multitude of
 different processes without them being aware about what is going on
-(well of course unless they later try to use the userfaultfd
+(well of course unless they later try to use the ``userfaultfd``
 themselves on the same region the manager is already tracking, which
-is a corner case that would currently return -EBUSY).
+is a corner case that would currently return ``-EBUSY``).

 API
 ===

-When first opened the userfaultfd must be enabled invoking the
-UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
-a later API version) which will specify the read/POLLIN protocol
-userland intends to speak on the UFFD and the uffdio_api.features
-userland requires. The UFFDIO_API ioctl if successful (i.e. if the
-requested uffdio_api.api is spoken also by the running kernel and the
+When first opened the ``userfaultfd`` must be enabled invoking the
+``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
+a later API version) which will specify the ``read/POLLIN`` protocol
+userland intends to speak on the ``UFFD`` and the ``uffdio_api.features``
+userland requires. The ``UFFDIO_API`` ioctl if successful (i.e. if the
+requested ``uffdio_api.api`` is spoken also by the running kernel and the
 requested features are going to be enabled) will return into
-uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
+``uffdio_api.features`` and ``uffdio_api.ioctls`` two 64bit bitmasks of
 respectively all the available features of the read(2) protocol and
 the generic ioctl available.

-The uffdio_api.features bitmask returned by the UFFDIO_API ioctl
-defines what memory types are supported by the userfaultfd and what
+The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl
+defines what memory types are supported by the ``userfaultfd`` and what
 events, except page fault notifications, may be generated.

-If the kernel supports registering userfaultfd ranges on hugetlbfs
-virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in
-uffdio_api.features.
-Similarly, UFFD_FEATURE_MISSING_SHMEM will be
-set if the kernel supports registering userfaultfd ranges on shared
-memory (covering all shmem APIs, i.e. tmpfs, IPCSHM, /dev/zero
-MAP_SHARED, memfd_create, etc).
+If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs
+virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in
+``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be
+set if the kernel supports registering ``userfaultfd`` ranges on shared
+memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``,
+``MAP_SHARED``, ``memfd_create``, etc).

-The userland application that wants to use userfaultfd with hugetlbfs
+The userland application that wants to use ``userfaultfd`` with hugetlbfs
 or shared memory need to set the corresponding flag in
-uffdio_api.features to enable those features.
+``uffdio_api.features`` to enable those features.

 If the userland desires to receive notifications for events other than
-page faults, it has to verify that uffdio_api.features has appropriate
-UFFD_FEATURE_EVENT_* bits set. These events are described in more
-detail below in "Non-cooperative userfaultfd" section.
-
-Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
-be invoked (if present in the returned uffdio_api.ioctls bitmask) to
-register a memory range in the userfaultfd by setting the
-uffdio_register structure accordingly. The uffdio_register.mode
+page faults, it has to verify that ``uffdio_api.features`` has appropriate
+``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more
+detail below in `Non-cooperative userfaultfd`_ section.
+
+Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should
+be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to
+register a memory range in the ``userfaultfd`` by setting the
+uffdio_register structure accordingly. The ``uffdio_register.mode``
 bitmask will specify to the kernel which kind of faults to track for
-the range (UFFDIO_REGISTER_MODE_MISSING would track missing
-pages). The UFFDIO_REGISTER ioctl will return the
-uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing
+pages). The ``UFFDIO_REGISTER`` ioctl will return the
+``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve
 userfaults on the range registered. Not all ioctls will necessarily be
 supported for all memory types depending on the underlying virtual
 memory backend (anonymous memory vs tmpfs vs real filebacked mappings).

-Userland can use the uffdio_register.ioctls to manage the virtual
+Userland can use the ``uffdio_register.ioctls`` to manage the virtual
 address space in the background (to add or potentially also remove
-memory from the userfaultfd registered range). This means a userfault
+memory from the ``userfaultfd`` registered range). This means a userfault
 could be triggering just before userland maps in the background the
 user-faulted page.

-The primary ioctl to resolve userfaults is UFFDIO_COPY. That
+The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That
 atomically copies a page into the userfault registered range and wakes
-up the blocked userfaults (unless uffdio_copy.mode &
-UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to
-UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
-half copied page since it'll keep userfaulting until the copy has
-finished.
+up the blocked userfaults
+(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set).
+Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in
+guaranteeing that nothing can see a half copied page since it'll
+keep userfaulting until the copy has finished.

 Notes:

-- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then
+- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then
   you must provide some kind of page in your thread after reading from
-  the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE.
+  the uffd. You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``.
   The normal behavior of the OS automatically providing a zero page on
   an annonymous mmaping is not in place.
@@ -122,13 +122,13 @@ Notes:

 - You get the address of the access that triggered the missing page
   event out of a struct uffd_msg that you read in the thread from the
-  uffd. You can supply as many pages as you want with UFFDIO_COPY or
-  UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then
+  uffd. You can supply as many pages as you want with ``UFFDIO_COPY`` or
+  ``UFFDIO_ZEROPAGE``. Keep in mind that unless you used DONTWAKE then
   the first of any of those IOCTLs wakes up the faulting thread.

-- Be sure to test for all errors including (pollfd[0].revents &
-  POLLERR). This can happen, e.g. when ranges supplied were
-  incorrect.
+- Be sure to test for all errors including
+  (``pollfd[0].revents & POLLERR``). This can happen, e.g. when ranges
+  supplied were incorrect.

 Write Protect Notifications
 ---------------------------
@@ -136,41 +136,42 @@ Write Protect Notifications
 This is equivalent to (but faster than) using mprotect and a SIGSEGV
 signal handler.

-Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP.
-Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT,
-struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP
+Firstly you need to register a range with ``UFFDIO_REGISTER_MODE_WP``.
+Instead of using mprotect(2) you use
+``ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect)``
+while ``mode = UFFDIO_WRITEPROTECT_MODE_WP``
 in the struct passed in. The range does not default to and does not
 have to be identical to the range you registered with. You can write
 protect as many ranges as you like (inside the registered range).
 Then, in the thread reading from uffd the struct will have
-msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send
-ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again
-while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set.
-This wakes up the thread which will continue to run with writes. This
+``msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP`` set. Now you send
+``ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect)``
+again while ``pagefault.mode`` does not have ``UFFDIO_WRITEPROTECT_MODE_WP``
+set. This wakes up the thread which will continue to run with writes. This
 allows you to do the bookkeeping about the write in the uffd reading
 thread before the ioctl.

-If you registered with both UFFDIO_REGISTER_MODE_MISSING and
-UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in
+If you registered with both ``UFFDIO_REGISTER_MODE_MISSING`` and
+``UFFDIO_REGISTER_MODE_WP`` then you need to think about the sequence in
 which you supply a page and undo write protect. Note that there is a
 difference between writes into a WP area and into a !WP area. The
-former will have UFFD_PAGEFAULT_FLAG_WP set, the latter
-UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but
-you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was
+former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter
+``UFFD_PAGEFAULT_FLAG_WRITE``. The latter did not fail on protection but
+you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was
 used.

 QEMU/KVM
 ========

-QEMU/KVM is using the userfaultfd syscall to implement postcopy live
+QEMU/KVM is using the ``userfaultfd`` syscall to implement postcopy live
 migration. Postcopy live migration is one form of memory
 externalization consisting of a virtual machine running with part or
 all of its memory residing on a different node in the cloud. The
-userfaultfd abstraction is generic enough that not a single line of
+``userfaultfd`` abstraction is generic enough that not a single line of
 KVM kernel code had to be modified in order to add postcopy live
 migration to QEMU.

-Guest async page faults, FOLL_NOWAIT and all other GUP features work
+Guest async page faults, ``FOLL_NOWAIT`` and all other ``GUP*`` features work
 just fine in combination with userfaults. Userfaults trigger async
 page faults in the guest scheduler so those guest processes that
 aren't waiting for userfaults (i.e. network bound) can keep running in
@@ -183,19 +184,19 @@ generating userfaults for readonly guest regions.

 The implementation of postcopy live migration currently uses one
 single bidirectional socket but in the future two different sockets
 will be used (to reduce the latency of the userfaults to the minimum
-possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+possible without having to decrease ``/proc/sys/net/ipv4/tcp_wmem``).

 The QEMU in the source node writes all pages that it knows are missing
 in the destination node, into the socket, and the migration thread of
-the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
-ioctls on the userfaultfd in order to map the received pages into the
-guest (UFFDIO_ZEROCOPY is used if the source page was a zero page).
+the QEMU running in the destination node runs ``UFFDIO_COPY|ZEROPAGE``
+ioctls on the ``userfaultfd`` in order to map the received pages into the
+guest (``UFFDIO_ZEROCOPY`` is used if the source page was a zero page).

 A different postcopy thread in the destination node listens with
-poll() to the userfaultfd in parallel. When a POLLIN event is
+poll() to the ``userfaultfd`` in parallel. When a ``POLLIN`` event is
 generated after a userfault triggers, the postcopy thread read() from
-the userfaultfd and receives the fault address (or -EAGAIN in case the
-userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run
+the ``userfaultfd`` and receives the fault address (or ``-EAGAIN`` in case the
+userfault was already resolved and woken by a ``UFFDIO_COPY|ZEROPAGE`` run
 by the parallel QEMU migration thread).

 After the QEMU postcopy thread (running in the destination node) gets
@@ -206,7 +207,7 @@ remaining missing pages from that new page offset. Soon after that
 (just the time to flush the tcp_wmem queue through the network) the
 migration thread in the QEMU running in the destination node will
 receive the page that triggered the userfault and it'll map it as
-usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+usual with the ``UFFDIO_COPY|ZEROPAGE`` (without actually knowing if it
 was spontaneously sent by the source or if it was an urgent page
 requested through a userfault).
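To make the register/read/copy sequence described in the API section concrete,
here is a condensed userland sketch; it is not part of the patch above, error
handling is omitted, and the blocking read() would normally live in a
dedicated fault-handling thread while another thread touches the area::

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
            static char src[65536];         /* data used to resolve faults */

            struct uffdio_api api = { .api = UFFD_API };
            ioctl(uffd, UFFDIO_API, &api);          /* negotiate the protocol */

            char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)area, .len = page },
                    .mode  = UFFDIO_REGISTER_MODE_MISSING,
            };
            ioctl(uffd, UFFDIO_REGISTER, &reg);     /* track missing pages */

            /* a fault on 'area' (from another thread) yields a message */
            struct uffd_msg msg;
            read(uffd, &msg, sizeof(msg));

            /* resolve the fault by atomically copying a page in */
            struct uffdio_copy copy = {
                    .dst = msg.arg.pagefault.address & ~((unsigned long)page - 1),
                    .src = (unsigned long)src,
                    .len = page,
            };
            ioctl(uffd, UFFDIO_COPY, &copy);
            return 0;
    }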
@@ -219,74 +220,74 @@ checked to find which missing pages to send in round robin and we seek
 over it when receiving incoming userfaults. After sending each page of
 course the bitmap is updated accordingly. It's also useful to avoid
 sending the same page twice (in case the userfault is read by the
-postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+postcopy thread just before ``UFFDIO_COPY|ZEROPAGE`` runs in the migration
 thread).

 Non-cooperative userfaultfd
 ===========================

-When the userfaultfd is monitored by an external manager, the manager
+When the ``userfaultfd`` is monitored by an external manager, the manager
 must be able to track changes in the process virtual memory
 layout. Userfaultfd can notify the manager about such changes using
 the same read(2) protocol as for the page fault notifications. The
 manager has to explicitly enable these events by setting appropriate
-bits in uffdio_api.features passed to UFFDIO_API ioctl:
+bits in ``uffdio_api.features`` passed to ``UFFDIO_API`` ioctl:

-UFFD_FEATURE_EVENT_FORK
-	enable userfaultfd hooks for fork(). When this feature is
-	enabled, the userfaultfd context of the parent process is
+``UFFD_FEATURE_EVENT_FORK``
+	enable ``userfaultfd`` hooks for fork(). When this feature is
+	enabled, the ``userfaultfd`` context of the parent process is
 	duplicated into the newly created process. The manager
-	receives UFFD_EVENT_FORK with file descriptor of the new
-	userfaultfd context in the uffd_msg.fork.
+	receives ``UFFD_EVENT_FORK`` with file descriptor of the new
+	``userfaultfd`` context in the ``uffd_msg.fork``.

-UFFD_FEATURE_EVENT_REMAP
+``UFFD_FEATURE_EVENT_REMAP``
 	enable notifications about mremap() calls. When the
 	non-cooperative process moves a virtual memory area to a
 	different location, the manager will receive
-	UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and
+	``UFFD_EVENT_REMAP``. The ``uffd_msg.remap`` will contain the old and
 	new addresses of the area and its original length.

-UFFD_FEATURE_EVENT_REMOVE
+``UFFD_FEATURE_EVENT_REMOVE``
 	enable notifications about madvise(MADV_REMOVE) and
-	madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will
-	be generated upon these calls to madvise. The uffd_msg.remove
+	madvise(MADV_DONTNEED) calls. The event ``UFFD_EVENT_REMOVE`` will
+	be generated upon these calls to madvise(). The ``uffd_msg.remove``
 	will contain start and end addresses of the removed area.

-UFFD_FEATURE_EVENT_UNMAP
+``UFFD_FEATURE_EVENT_UNMAP``
 	enable notifications about memory unmapping. The manager will
-	get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and
+	get ``UFFD_EVENT_UNMAP`` with ``uffd_msg.remove`` containing start and
 	end addresses of the unmapped area.

-Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP
+Although the ``UFFD_FEATURE_EVENT_REMOVE`` and ``UFFD_FEATURE_EVENT_UNMAP``
 are pretty similar, they quite differ in the action expected from the
-userfaultfd manager. In the former case, the virtual memory is
+``userfaultfd`` manager. In the former case, the virtual memory is
 removed, but the area is not, the area remains monitored by the
-userfaultfd, and if a page fault occurs in that area it will be
+``userfaultfd``, and if a page fault occurs in that area it will be
 delivered to the manager. The proper resolution for such page fault is
 to zeromap the faulting address. However, in the latter case, when an
 area is unmapped, either explicitly (with munmap() system call), or
 implicitly (e.g. during mremap()), the area is removed and in turn the
-userfaultfd context for such area disappears too and the manager will
+``userfaultfd`` context for such area disappears too and the manager will
 not get further userland page faults from the removed area. Still, the
 notification is required in order to prevent manager from using
-UFFDIO_COPY on the unmapped area.
+``UFFDIO_COPY`` on the unmapped area.

 Unlike userland page faults which have to be synchronous and require
 explicit or implicit wakeup, all the events are delivered
 asynchronously and the non-cooperative process resumes execution as
-soon as manager executes read(). The userfaultfd manager should
-carefully synchronize calls to UFFDIO_COPY with the events
-processing. To aid the synchronization, the UFFDIO_COPY ioctl will
-return -ENOSPC when the monitored process exits at the time of
-UFFDIO_COPY, and -ENOENT, when the non-cooperative process has changed
-its virtual memory layout simultaneously with outstanding UFFDIO_COPY
+soon as manager executes read(). The ``userfaultfd`` manager should
+carefully synchronize calls to ``UFFDIO_COPY`` with the events
+processing. To aid the synchronization, the ``UFFDIO_COPY`` ioctl will
+return ``-ENOSPC`` when the monitored process exits at the time of
+``UFFDIO_COPY``, and ``-ENOENT``, when the non-cooperative process has changed
+its virtual memory layout simultaneously with outstanding ``UFFDIO_COPY``
 operation.

 The current asynchronous model of the event delivery is optimal for
-single threaded non-cooperative userfaultfd manager implementations. A
+single threaded non-cooperative ``userfaultfd`` manager implementations. A
 synchronous event delivery model can be added later as a new
-userfaultfd feature to facilitate multithreading enhancements of the
-non cooperative manager, for example to allow UFFDIO_COPY ioctls to
+``userfaultfd`` feature to facilitate multithreading enhancements of the
+non cooperative manager, for example to allow ``UFFDIO_COPY`` ioctls to
 run in parallel to the event reception. Single threaded
 implementations should continue to use the current async event
 delivery model instead.
diff --git a/Documentation/admin-guide/nfs/nfsroot.rst b/Documentation/admin-guide/nfs/nfsroot.rst
index 82a4fda057f9..c6772075c80c 100644
--- a/Documentation/admin-guide/nfs/nfsroot.rst
+++ b/Documentation/admin-guide/nfs/nfsroot.rst
@@ -18,7 +18,7 @@ Mounting the root filesystem via NFS (nfsroot)
 In order to use a diskless system, such as an X-terminal or printer server for
 example, it is necessary for the root filesystem to be present on a non-disk
 device. This may be an initramfs (see
-Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see
+Documentation/filesystems/ramfs-rootfs-initramfs.rst), a ramdisk (see
 Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The
 following text describes on how to use NFS for the root filesystem. For the
 rest of this text 'client' means the diskless system, and 'server' means the NFS
diff --git a/Documentation/admin-guide/numastat.rst b/Documentation/admin-guide/numastat.rst
index aaf1667489f8..08ec2c2bdce3 100644
--- a/Documentation/admin-guide/numastat.rst
+++ b/Documentation/admin-guide/numastat.rst
@@ -6,6 +6,21 @@ Numa policy hit/miss statistics

 All units are pages. Hugepages have separate counters.

+The numa_hit, numa_miss and numa_foreign counters reflect how well processes
+are able to allocate memory from nodes they prefer.
+If they succeed, numa_hit
+is incremented on the preferred node, otherwise numa_foreign is incremented on
+the preferred node and numa_miss on the node where allocation succeeded.
+
+Usually preferred node is the one local to the CPU where the process executes,
+but restrictions such as mempolicies can change that, so there are also two
+counters based on CPU local node. local_node is similar to numa_hit and is
+incremented on allocation from a node by CPU on the same node. other_node is
+similar to numa_miss and is incremented on the node where allocation succeeds
+from a CPU from a different node. Note there is no counter analogical to
+numa_foreign.
+
+In more detail:
+
 =============== ============================================================
 numa_hit	A process wanted to allocate memory from this node,
 		and succeeded.
@@ -14,11 +29,13 @@ numa_miss	A process wanted to allocate memory from another node,
 		but ended up with memory from this node.

 numa_foreign	A process wanted to allocate on this node,
-		but ended up with memory from another one.
+		but ended up with memory from another node.

-local_node	A process ran on this node and got memory from it.
+local_node	A process ran on this node's CPU,
+		and got memory from this node.

-other_node	A process ran on this node and got memory from another node.
+other_node	A process ran on a different node's CPU
+		and got memory from this node.

 interleave_hit	Interleaving wanted to allocate from this node
 		and succeeded.
@@ -28,3 +45,11 @@ For easier reading you can use the numastat utility from the numactl package
 (http://oss.sgi.com/projects/libnuma/). Note that it only works
 well right now on machines with a small number of CPUs.

+Note that on systems with memoryless nodes (where a node has CPUs but no
+memory) the numa_hit, numa_miss and numa_foreign statistics can be skewed
+heavily. In the current kernel implementation, if a process prefers a
+memoryless node (i.e. because it is running on one of its local CPU), the
+implementation actually treats one of the nearest nodes with memory as the
+preferred node. As a result, such allocation will not increase the numa_foreign
+counter on the memoryless node, and will skew the numa_hit, numa_miss and
+numa_foreign statistics of the nearest node.
diff --git a/Documentation/admin-guide/perf-security.rst b/Documentation/admin-guide/perf-security.rst
index 72effa7c23b9..1307b5274a0f 100644
--- a/Documentation/admin-guide/perf-security.rst
+++ b/Documentation/admin-guide/perf-security.rst
@@ -1,6 +1,6 @@
 .. _perf_security:

-Perf Events and tool security
+Perf events and tool security
 =============================

 Overview
@@ -42,11 +42,11 @@ categories:
 Data that belong to the fourth category can potentially contain
 sensitive process data. If PMUs in some monitoring modes capture values
 of execution context registers or data from process memory then access
-to such monitoring capabilities requires to be ordered and secured
-properly. So, perf_events/Perf performance monitoring is the subject for
-security access control management [5]_ .
+to such monitoring modes requires to be ordered and secured properly.
+So, perf_events performance monitoring and observability operations are
+the subject for security access control management [5]_ .
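For orientation, the interface these scope and access rules apply to is the
perf_event_open(2) system call. The following condensed sketch, which is not
part of the patch above, counts CPU cycles for the calling thread; whether it
succeeds for an unprivileged user depends on the perf_event_paranoid setting
and on CAP_PERFMON as described in this document::

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;
            uint64_t count;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_HARDWARE;
            attr.size = sizeof(attr);
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.disabled = 1;
            attr.exclude_kernel = 1;   /* may be required without CAP_PERFMON */

            /* pid = 0, cpu = -1: measure this thread on any CPU */
            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0) {
                    perror("perf_event_open"); /* EACCES: see perf_event_paranoid */
                    return 1;
            }

            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            /* ... workload under measurement ... */
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

            read(fd, &count, sizeof(count));
            printf("cycles: %llu\n", (unsigned long long)count);
            close(fd);
            return 0;
    }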
-perf_events/Perf access control
+perf_events access control
 -------------------------------

 To perform security checks, the Linux implementation splits processes
@@ -66,11 +66,25 @@ into distinct units, known as capabilities [6]_ , which can be
 independently enabled and disabled on per-thread basis for processes and
 files of unprivileged users.

-Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
+Unprivileged processes with enabled CAP_PERFMON capability are treated
 as privileged processes with respect to perf_events performance
-monitoring and bypass *scope* permissions checks in the kernel.
-
-Unprivileged processes using perf_events system call API is also subject
+monitoring and observability operations, thus, bypass *scope* permissions
+checks in the kernel. CAP_PERFMON implements the principle of least
+privilege [13]_ (POSIX 1003.1e: 2.2.2.39) for performance monitoring and
+observability operations in the kernel and provides a secure approach to
+performance monitoring and observability in the system.
+
+For backward compatibility reasons the access to perf_events monitoring and
+observability operations is also open for CAP_SYS_ADMIN privileged
+processes but CAP_SYS_ADMIN usage for secure monitoring and observability
+use cases is discouraged with respect to the CAP_PERFMON capability.
+If system audit records [14]_ for a process using perf_events system call
+API contain denial records of acquiring both CAP_PERFMON and CAP_SYS_ADMIN
+capabilities then providing the process with CAP_PERFMON capability singly
+is recommended as the preferred secure approach to resolve double access
+denial logging related to usage of performance monitoring and observability.
+
+Unprivileged processes using perf_events system call are also subject
 for PTRACE_MODE_READ_REALCREDS ptrace access mode check [7]_ , whose
 outcome determines whether monitoring is permitted. So unprivileged
 processes provided with CAP_SYS_PTRACE capability are effectively
@@ -82,14 +96,14 @@ performance analysis of monitored processes or a system. For example,
 CAP_SYSLOG capability permits reading kernel space memory addresses from
 /proc/kallsyms file.

-perf_events/Perf privileged users
+Privileged Perf users groups
 ---------------------------------

 Mechanisms of capabilities, privileged capability-dumb files [6]_ and
-file system ACLs [10]_ can be used to create a dedicated group of
-perf_events/Perf privileged users who are permitted to execute
-performance monitoring without scope limits. The following steps can be
-taken to create such a group of privileged Perf users.
+file system ACLs [10]_ can be used to create dedicated groups of
+privileged Perf users who are permitted to execute performance monitoring
+and observability without scope limits. The following steps can be
+taken to create such groups of privileged Perf users.

 1. Create perf_users group of privileged Perf users, assign perf_users
    group to Perf tool executable and limit access to the executable for
@@ -108,30 +122,51 @@ taken to create such a group of privileged Perf users.

    -rwxr-x---  2 root perf_users  11M Oct 19 15:12 perf
 2. Assign the required capabilities to the Perf tool executable file and
-   enable members of perf_users group with performance monitoring
+   enable members of perf_users group with monitoring and observability
    privileges [6]_ :

   ::

-   # setcap "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
-   # setcap -v "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
+   # setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" perf
+   # setcap -v "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" perf
    perf: OK
    # getcap perf
-   perf = cap_sys_ptrace,cap_sys_admin,cap_syslog+ep
+   perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep
+
+If the libcap installed doesn't yet support "cap_perfmon", use "38" instead,
+i.e.:
+
+::
+
+  # setcap "38,cap_ipc_lock,cap_sys_ptrace,cap_syslog=ep" perf
+
+Note that you may need to have 'cap_ipc_lock' in the mix for tools such as
+'perf top', alternatively use 'perf top -m N', to reduce the memory that
+it uses for the perf ring buffer, see the memory allocation section below.
+
+Using a libcap without support for CAP_PERFMON will make cap_get_flag(caps, 38,
+CAP_EFFECTIVE, &val) fail, which will lead the default event to be 'cycles:u',
+so as a workaround explicitly ask for the 'cycles' event, i.e.:
+
+::
+
+  # perf top -e cycles
+
+To get kernel and user samples with a perf binary with just CAP_PERFMON.

 As a result, members of perf_users group are capable of conducting
-performance monitoring by using functionality of the configured Perf
-tool executable that, when executes, passes perf_events subsystem scope
-checks.
+performance monitoring and observability by using functionality of the
+configured Perf tool executable that, when executes, passes perf_events
+subsystem scope checks.

 This specific access control management is only available to superuser
 or root running processes with CAP_SETPCAP, CAP_SETFCAP [6]_
 capabilities.

-perf_events/Perf unprivileged users
+Unprivileged users
 -----------------------------------

-perf_events/Perf *scope* and *access* control for unprivileged processes
+perf_events *scope* and *access* control for unprivileged processes
 is governed by perf_event_paranoid [2]_ setting:

 -1:
@@ -166,7 +201,7 @@ is governed by perf_event_paranoid [2]_ setting:
    perf_event_mlock_kb locking limit is imposed but ignored for
    unprivileged processes with CAP_IPC_LOCK capability.

-perf_events/Perf resource control
+Resource control
 ---------------------------------

 Open file descriptors
@@ -227,4 +262,5 @@ Bibliography
 .. [10] `<http://man7.org/linux/man-pages/man5/acl.5.html>`_
 .. [11] `<http://man7.org/linux/man-pages/man2/getrlimit.2.html>`_
 .. [12] `<http://man7.org/linux/man-pages/man5/limits.conf.5.html>`_
-
+.. [13] `<https://sites.google.com/site/fullycapable>`_
+.. [14] `<http://man7.org/linux/man-pages/man8/auditd.8.html>`_
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
index 0310db624964..7b481b2a368e 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -156,11 +156,11 @@ the labels provided by the BIOS won't match the real ones.
 ECC memory
 ----------

-As mentioned on the previous section, ECC memory has extra bits to be
-used for error correction. So, on 64 bit systems, a memory module
-has 64 bits of *data width*, and 74 bits of *total width*. So, there are
-8 bits extra bits to be used for the error detection and correction
-mechanisms. Those extra bits are called *syndrome*\ [#f1]_\ [#f2]_.
+As mentioned in the previous section, ECC memory has extra bits to be
+used for error correction.
In the above example, a memory module has +64 bits of *data width*, and 72 bits of *total width*. The extra 8 +bits which are used for the error detection and correction mechanisms +are referred to as the *syndrome*\ [#f1]_\ [#f2]_. So, when the cpu requests the memory controller to write a word with *data width*, the memory controller calculates the *syndrome* in real time, @@ -212,7 +212,7 @@ EDAC - Error Detection And Correction purposes. When the subsystem was pushed upstream for the first time, on - Kernel 2.6.16, for the first time, it was renamed to ``EDAC``. + Kernel 2.6.16, it was renamed to ``EDAC``. Purpose ------- @@ -351,15 +351,17 @@ controllers. The following example will assume 2 channels: +------------+-----------+-----------+ | | ``ch0`` | ``ch1`` | +============+===========+===========+ - | ``csrow0`` | DIMM_A0 | DIMM_B0 | - | | rank0 | rank0 | - +------------+ - | - | + | |**DIMM_A0**|**DIMM_B0**| + +------------+-----------+-----------+ + | ``csrow0`` | rank0 | rank0 | + +------------+-----------+-----------+ | ``csrow1`` | rank1 | rank1 | +------------+-----------+-----------+ - | ``csrow2`` | DIMM_A1 | DIMM_B1 | - | | rank0 | rank0 | - +------------+ - | - | - | ``csrow3`` | rank1 | rank1 | + | |**DIMM_A1**|**DIMM_B1**| + +------------+-----------+-----------+ + | ``csrow2`` | rank0 | rank0 | + +------------+-----------+-----------+ + | ``csrow3`` | rank1 | rank1 | +------------+-----------+-----------+ In the above example, there are 4 physical slots on the motherboard diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 0d427fd10941..1ebf68d01141 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -102,6 +102,30 @@ See the ``type_of_loader`` and ``ext_loader_ver`` fields in :doc:`/x86/boot` for additional information. +bpf_stats_enabled +================= + +Controls whether the kernel should collect statistics on BPF programs +(total time spent running, number of times run...). Enabling +statistics causes a slight reduction in performance on each program +run. The statistics can be seen using ``bpftool``. + += =================================== +0 Don't collect statistics (default). +1 Collect statistics. += =================================== + + +cad_pid +======= + +This is the pid which will be signalled on reboot (notably, by +Ctrl-Alt-Delete). Writing a value to this file which doesn't +correspond to a running process will result in ``-ESRCH``. + +See also `ctrl-alt-del`_. + + cap_last_cap ============ @@ -241,6 +265,40 @@ domain names are in general different. For a detailed discussion see the ``hostname(1)`` man page. +firmware_config +=============== + +See :doc:`/driver-api/firmware/fallback-mechanisms`. + +The entries in this directory allow the firmware loader helper +fallback to be controlled: + +* ``force_sysfs_fallback``, when set to 1, forces the use of the + fallback; +* ``ignore_sysfs_fallback``, when set to 1, ignores any fallback. + + +ftrace_dump_on_oops +=================== + +Determines whether ``ftrace_dump()`` should be called on an oops (or +kernel panic). This will output the contents of the ftrace buffers to +the console. This is very useful for capturing traces that lead to +crashes and outputting them to a serial console. + += =================================================== +0 Disabled (default). +1 Dump buffers of all CPUs. +2 Dump the buffer of the CPU that triggered the oops. 
+= =================================================== + + +ftrace_enabled, stack_tracer_enabled +==================================== + +See :doc:`/trace/ftrace`. + + hardlockup_all_cpu_backtrace ============================ @@ -344,6 +402,25 @@ Controls whether the panic kmsg data should be reported to Hyper-V. = ========================================================= +ignore-unaligned-usertrap +========================= + +On architectures where unaligned accesses cause traps, and where this +feature is supported (``CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN``; +currently, ``arc`` and ``ia64``), controls whether all unaligned traps +are logged. + += ============================================================= +0 Log all unaligned accesses. +1 Only warn the first time a process traps. This is the default + setting. += ============================================================= + +See also `unaligned-trap`_ and `unaligned-dump-stack`_. On ``ia64``, +this allows system administrators to override the +``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. + + kexec_load_disabled =================== @@ -459,6 +536,15 @@ Notes: successful IPC object allocation. If an IPC object allocation syscall fails, it is undefined if the value remains unmodified or is reset to -1. + +ngroups_max +=========== + +Maximum number of supplementary groups, i.e. the maximum size which +``setgroups`` will accept. Exports ``NGROUPS_MAX`` from the kernel. + + + nmi_watchdog ============ @@ -721,7 +807,13 @@ perf_event_paranoid =================== Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 2. +users (without CAP_PERFMON). The default value is 2. + +For backward compatibility reasons, access to system performance +monitoring and observability remains open to CAP_SYS_ADMIN +privileged processes, but CAP_SYS_ADMIN usage for secure system +performance monitoring and observability operations is discouraged +in favor of CAP_PERFMON. === ================================================================== -1 Allow use of (almost) all events by all users. @@ -730,13 +822,13 @@ users (without CAP_SYS_ADMIN). The default value is 2. ``CAP_IPC_LOCK``. >=0 Disallow ftrace function tracepoint by users without - ``CAP_SYS_ADMIN``. + ``CAP_PERFMON``. - Disallow raw tracepoint access by users without ``CAP_SYS_ADMIN``. + Disallow raw tracepoint access by users without ``CAP_PERFMON``. ->=1 Disallow CPU event access by users without ``CAP_SYS_ADMIN``. +>=1 Disallow CPU event access by users without ``CAP_PERFMON``. ->=2 Disallow kernel profiling by users without ``CAP_SYS_ADMIN``. +>=2 Disallow kernel profiling by users without ``CAP_PERFMON``. === ================================================================== @@ -871,7 +963,7 @@ this sysctl interface anymore. pty === -See Documentation/filesystems/devpts.txt. +See Documentation/filesystems/devpts.rst. randomize_va_space @@ -1167,6 +1259,65 @@ If a value outside of this range is written to ``threads-max`` an ``EINVAL`` error occurs. +traceoff_on_warning +=================== + +When set, disables tracing (see :doc:`/trace/ftrace`) when a +``WARN()`` is hit.
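As an illustration of the perf_event_paranoid table above, the effect of the setting can be probed from userspace. The sketch below is illustrative only and not part of this patch; it opens a hardware cycles event for all tasks on CPU 0, which is the "CPU event access" case that is denied without ``CAP_PERFMON`` (or ``CAP_SYS_ADMIN``) once the setting is >=1::

    /* Probe whether CPU-wide performance monitoring is permitted. */
    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <errno.h>
    #include <stdio.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;

            /* pid == -1, cpu == 0: count cycles for every task on CPU 0. */
            int fd = syscall(SYS_perf_event_open, &attr, -1, 0, -1, 0);
            if (fd < 0)
                    printf("perf_event_open: %s\n", strerror(errno));
            else
                    printf("CPU-wide monitoring permitted\n");
            return 0;
    }

Run as an unprivileged user without ``CAP_PERFMON``, this prints "Permission denied" when perf_event_paranoid is 1 or higher.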
+ + +tracepoint_printk +================= + +When tracepoints are sent to printk() (enabled by the ``tp_printk`` +boot parameter), this entry provides runtime control:: + + echo 0 > /proc/sys/kernel/tracepoint_printk + +will stop tracepoints from being sent to printk(), and:: + + echo 1 > /proc/sys/kernel/tracepoint_printk + +will send them to printk() again. + +This only works if the kernel was booted with ``tp_printk`` enabled. + +See :doc:`/admin-guide/kernel-parameters` and +:doc:`/trace/boottime-trace`. + + +.. _unaligned-dump-stack: + +unaligned-dump-stack (ia64) +=========================== + +When logging unaligned accesses, controls whether the stack is +dumped. + += =================================================== +0 Do not dump the stack. This is the default setting. +1 Dump the stack. += =================================================== + +See also `ignore-unaligned-usertrap`_. + + +unaligned-trap +============== + +On architectures where unaligned accesses cause traps, and where this +feature is supported (``CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW``; currently, +``arc`` and ``parisc``), controls whether unaligned traps are caught +and emulated (instead of failing). + += ======================================================== +0 Do not emulate unaligned accesses. +1 Emulate unaligned accesses. This is the default setting. += ======================================================== + +See also `ignore-unaligned-usertrap`_. + + unknown_nmi_panic ================= @@ -1178,6 +1329,16 @@ NMI switch that most IA32 servers have fires unknown NMI up, for example. If a system hangs up, try pressing the NMI switch. +unprivileged_bpf_disabled +========================= + +Writing 1 to this entry will disable unprivileged calls to ``bpf()``; +once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` will return +``-EPERM``. + +Once set, this can't be cleared. + + watchdog ======== diff --git a/Documentation/arm64/amu.rst b/Documentation/arm64/amu.rst index 036783ee327f..452ec8b115c2 100644 --- a/Documentation/arm64/amu.rst +++ b/Documentation/arm64/amu.rst @@ -24,13 +24,13 @@ optional external memory-mapped interface. Version 1 of the Activity Monitors architecture implements a counter group of four fixed and architecturally defined 64-bit event counters. -- CPU cycle counter: increments at the frequency of the CPU. -- Constant counter: increments at the fixed frequency of the system - clock. -- Instructions retired: increments with every architecturally executed - instruction. -- Memory stall cycles: counts instruction dispatch stall cycles caused by - misses in the last level cache within the clock domain. + - CPU cycle counter: increments at the frequency of the CPU. + - Constant counter: increments at the fixed frequency of the system + clock. + - Instructions retired: increments with every architecturally executed + instruction. + - Memory stall cycles: counts instruction dispatch stall cycles caused by + misses in the last level cache within the clock domain. When in WFI or WFE these counters do not increment. @@ -59,11 +59,11 @@ counters, only the presence of the extension. Firmware (code running at higher exception levels, e.g. arm-tf) support is needed to: -- Enable access for lower exception levels (EL2 and EL1) to the AMU - registers. -- Enable the counters. If not enabled these will read as 0. -- Save/restore the counters before/after the CPU is being put/brought up - from the 'off' power state. + - Enable access for lower exception levels (EL2 and EL1) to the AMU + registers. 
+ - Enable the counters. If not enabled these will read as 0. + - Save/restore the counters before/after the CPU is being put/brought up + from the 'off' power state. When using kernels that have this feature enabled but boot with broken firmware the user may experience panics or lockups when accessing the @@ -81,10 +81,10 @@ are not trapped in EL2/EL3. The fixed counters of AMUv1 are accessible though the following system register definitions: -- SYS_AMEVCNTR0_CORE_EL0 -- SYS_AMEVCNTR0_CONST_EL0 -- SYS_AMEVCNTR0_INST_RET_EL0 -- SYS_AMEVCNTR0_MEM_STALL_EL0 + - SYS_AMEVCNTR0_CORE_EL0 + - SYS_AMEVCNTR0_CONST_EL0 + - SYS_AMEVCNTR0_INST_RET_EL0 + - SYS_AMEVCNTR0_MEM_STALL_EL0 Auxiliary platform specific counters can be accessed using SYS_AMEVCNTR1_EL0(n), where n is a value between 0 and 15. @@ -97,9 +97,9 @@ Userspace access Currently, access from userspace to the AMU registers is disabled due to: -- Security reasons: they might expose information about code executed in - secure mode. -- Purpose: AMU counters are intended for system management use. + - Security reasons: they might expose information about code executed in + secure mode. + - Purpose: AMU counters are intended for system management use. Also, the presence of the feature is not visible to userspace. @@ -110,8 +110,8 @@ Virtualization Currently, access from userspace (EL0) and kernelspace (EL1) on the KVM guest side is disabled due to: -- Security reasons: they might expose information about code executed - by other guests or the host. + - Security reasons: they might expose information about code executed + by other guests or the host. Any attempt to access the AMU registers will result in an UNDEFINED exception being injected into the guest. diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index a3f1a47b6f1c..7552dbc1cc54 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -173,7 +173,10 @@ Before jumping into the kernel, the following conditions must be met: - Caches, MMUs The MMU must be off. - Instruction cache may be on or off. + + The instruction cache may be on or off, and must not hold any stale + entries corresponding to the loaded kernel image. + The address range corresponding to the loaded kernel image must be cleaned to the PoC. In the presence of a system cache or other coherent masters with caches enabled, this will typically require @@ -238,6 +241,7 @@ Before jumping into the kernel, the following conditions must be met: - The DT or ACPI tables must describe a GICv2 interrupt controller. For CPUs with pointer authentication functionality: + - If EL3 is present: - SCR_EL3.APK (bit 16) must be initialised to 0b1 @@ -249,18 +253,22 @@ Before jumping into the kernel, the following conditions must be met: - HCR_EL2.API (bit 41) must be initialised to 0b1 For CPUs with Activity Monitors Unit v1 (AMUv1) extension present: + - If EL3 is present: - CPTR_EL3.TAM (bit 30) must be initialised to 0b0 - CPTR_EL2.TAM (bit 30) must be initialised to 0b0 - AMCNTENSET0_EL0 must be initialised to 0b1111 - AMCNTENSET1_EL0 must be initialised to a platform specific value - having 0b1 set for the corresponding bit for each of the auxiliary - counters present. + + - CPTR_EL3.TAM (bit 30) must be initialised to 0b0 + - CPTR_EL2.TAM (bit 30) must be initialised to 0b0 + - AMCNTENSET0_EL0 must be initialised to 0b1111 + - AMCNTENSET1_EL0 must be initialised to a platform specific value + having 0b1 set for the corresponding bit for each of the auxiliary + counters present. 
+ - If the kernel is entered at EL1: - AMCNTENSET0_EL0 must be initialised to 0b1111 - AMCNTENSET1_EL0 must be initialised to a platform specific value - having 0b1 set for the corresponding bit for each of the auxiliary - counters present. + + - AMCNTENSET0_EL0 must be initialised to 0b1111 + - AMCNTENSET1_EL0 must be initialised to a platform specific value + having 0b1 set for the corresponding bit for each of the auxiliary + counters present. The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must @@ -304,7 +312,8 @@ following manner: Documentation/devicetree/bindings/arm/psci.yaml. - Secondary CPU general-purpose register settings - x0 = 0 (reserved for future use) - x1 = 0 (reserved for future use) - x2 = 0 (reserved for future use) - x3 = 0 (reserved for future use) + + - x0 = 0 (reserved for future use) + - x1 = 0 (reserved for future use) + - x2 = 0 (reserved for future use) + - x3 = 0 (reserved for future use) diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index 41937a8091aa..314fa5bc2655 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -176,6 +176,8 @@ infrastructure: +------------------------------+---------+---------+ | SSBS | [7-4] | y | +------------------------------+---------+---------+ + | BT | [3-0] | y | + +------------------------------+---------+---------+ 4) MIDR_EL1 - Main ID Register diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 7dfb97dfe416..84a9fd2d41b4 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -236,6 +236,11 @@ HWCAP2_RNG Functionality implied by ID_AA64ISAR0_EL1.RNDR == 0b0001. +HWCAP2_BTI + + Functionality implied by ID_AA64PFR0_EL1.BT == 0b0001. + + 4. Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 2c08c628febd..936cf2a59ca4 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -64,6 +64,10 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #852523 | N/A | @@ -78,8 +82,6 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | +----------------+-----------------+-----------------+-----------------------------+ -| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 | -+----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 | @@ -88,8 +90,6 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 | +----------------+-----------------+-----------------+-----------------------------+ -| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 | -+----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1349291 | N/A | diff --git a/Documentation/conf.py b/Documentation/conf.py index 9ae8e9abf846..f6a1bc07c410 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -388,44 +388,6 @@ if major == 1 and minor < 6: # author, documentclass [howto, manual, or own class]). # Sorted in alphabetical order latex_documents = [ - ('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation', - 'The kernel development community', 'manual'), - ('core-api/index', 'core-api.tex', 'The kernel core API manual', - 'The kernel development community', 'manual'), - ('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual', - 'The kernel development community', 'manual'), - ('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel', - 'The kernel development community', 'manual'), - ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide', - 'The kernel development community', 'manual'), - ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual', - 'The kernel development community', 'manual'), - ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', - 'The kernel development community', 'manual'), - ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', - 'ext4 Community', 'manual'), - ('filesystems/ext4/index', 'ext4-data-structures.tex', - 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), - ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', - 'The kernel development community', 'manual'), - ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', - 'The kernel development community', 'manual'), - ('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel', - 'The kernel development community', 'manual'), - ('media/index', 'media.tex', 'Linux Media Subsystem Documentation', - 'The kernel development community', 'manual'), - ('networking/index', 'networking.tex', 'Linux Networking Documentation', - 'The kernel development community', 'manual'), - ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', - 'The kernel development community', 'manual'), - ('security/index', 'security.tex', 'The kernel security subsystem manual', - 'The kernel development community', 'manual'), - ('sh/index', 'sh.tex', 'SuperH architecture implementation manual', - 'The kernel development community', 'manual'), - ('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation', - 'The kernel development community', 'manual'), - ('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide', - 'The kernel development community', 'manual'), ] # Add all other index files from Documentation/ subdirectories diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/core-api/debugging-via-ohci1394.rst index 981ad4f89fd3..981ad4f89fd3 100644 --- a/Documentation/debugging-via-ohci1394.txt +++ b/Documentation/core-api/debugging-via-ohci1394.rst diff --git 
a/Documentation/DMA-API-HOWTO.txt b/Documentation/core-api/dma-api-howto.rst index 358d495456d1..358d495456d1 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/core-api/dma-api-howto.rst diff --git a/Documentation/DMA-API.txt b/Documentation/core-api/dma-api.rst index 2d8d2fed7317..2d8d2fed7317 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/core-api/dma-api.rst diff --git a/Documentation/DMA-attributes.txt b/Documentation/core-api/dma-attributes.rst index 29dcbe8826e8..29dcbe8826e8 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/core-api/dma-attributes.rst diff --git a/Documentation/DMA-ISA-LPC.txt b/Documentation/core-api/dma-isa-lpc.rst index b1ec7b16c21f..b1ec7b16c21f 100644 --- a/Documentation/DMA-ISA-LPC.txt +++ b/Documentation/core-api/dma-isa-lpc.rst diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 0897ad12c119..15ab86112627 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -18,6 +18,7 @@ it. kernel-api workqueue + printk-basics printk-formats symbol-namespaces @@ -30,10 +31,12 @@ Library functionality that is used throughout the kernel. :maxdepth: 1 kobject + kref assoc_array xarray idr circular-buffers + rbtree generic-radix-tree packing timekeeping @@ -50,6 +53,7 @@ How Linux keeps everything from happening at the same time. See atomic_ops refcount-vs-atomic + irq/index local_ops padata ../RCU/index @@ -78,6 +82,10 @@ more memory-management documentation in :doc:`/vm/index`. :maxdepth: 1 memory-allocation + dma-api + dma-api-howto + dma-attributes + dma-isa-lpc mm-api genalloc pin_user_pages @@ -92,6 +100,7 @@ Interfaces for kernel debugging debug-objects tracepoint + debugging-via-ohci1394 Everything else =============== diff --git a/Documentation/IRQ.txt b/Documentation/core-api/irq/concepts.rst index 4273806a606b..4273806a606b 100644 --- a/Documentation/IRQ.txt +++ b/Documentation/core-api/irq/concepts.rst diff --git a/Documentation/core-api/irq/index.rst b/Documentation/core-api/irq/index.rst new file mode 100644 index 000000000000..0d65d11e5420 --- /dev/null +++ b/Documentation/core-api/irq/index.rst @@ -0,0 +1,11 @@ +==== +IRQs +==== + +.. toctree:: + :maxdepth: 1 + + concepts + irq-affinity + irq-domain + irqflags-tracing diff --git a/Documentation/IRQ-affinity.txt b/Documentation/core-api/irq/irq-affinity.rst index 29da5000836a..29da5000836a 100644 --- a/Documentation/IRQ-affinity.txt +++ b/Documentation/core-api/irq/irq-affinity.rst diff --git a/Documentation/IRQ-domain.txt b/Documentation/core-api/irq/irq-domain.rst index 507775cce753..096db12f32d5 100644 --- a/Documentation/IRQ-domain.txt +++ b/Documentation/core-api/irq/irq-domain.rst @@ -263,7 +263,8 @@ needs to: Hierarchy irq_domain is in no way x86 specific, and is heavily used to support other architectures, such as ARM, ARM64 etc. -=== Debugging === +Debugging +========= Most of the internals of the IRQ subsystem are exposed in debugfs by turning CONFIG_GENERIC_IRQ_DEBUGFS on. diff --git a/Documentation/irqflags-tracing.txt b/Documentation/core-api/irq/irqflags-tracing.rst index bdd208259fb3..bdd208259fb3 100644 --- a/Documentation/irqflags-tracing.txt +++ b/Documentation/core-api/irq/irqflags-tracing.rst diff --git a/Documentation/core-api/kobject.rst b/Documentation/core-api/kobject.rst index 1f62d4d7d966..e93dc8cf52dd 100644 --- a/Documentation/core-api/kobject.rst +++ b/Documentation/core-api/kobject.rst @@ -80,11 +80,11 @@ what is the pointer to the containing structure? 
You must avoid tricks (such as assuming that the kobject is at the beginning of the structure) and, instead, use the container_of() macro, found in ``<linux/kernel.h>``:: - container_of(pointer, type, member) + container_of(ptr, type, member) where: - * ``pointer`` is the pointer to the embedded kobject, + * ``ptr`` is the pointer to the embedded kobject, * ``type`` is the type of the containing structure, and * ``member`` is the name of the structure field to which ``pointer`` points. @@ -140,7 +140,7 @@ the name of the kobject, call kobject_rename():: int kobject_rename(struct kobject *kobj, const char *new_name); -kobject_rename does not perform any locking or have a solid notion of +kobject_rename() does not perform any locking or have a solid notion of what names are valid so the caller must provide their own sanity checking and serialization. @@ -210,7 +210,7 @@ statically and will warn the developer of this improper usage. If all that you want to use a kobject for is to provide a reference counter for your structure, please use the struct kref instead; a kobject would be overkill. For more information on how to use struct kref, please see the -file Documentation/kref.txt in the Linux kernel source tree. +file Documentation/core-api/kref.rst in the Linux kernel source tree. Creating "simple" kobjects @@ -222,17 +222,17 @@ ksets, show and store functions, and other details. This is the one exception where a single kobject should be created. To create such an entry, use the function:: - struct kobject *kobject_create_and_add(char *name, struct kobject *parent); + struct kobject *kobject_create_and_add(const char *name, struct kobject *parent); This function will create a kobject and place it in sysfs in the location underneath the specified parent kobject. To create simple attributes associated with this kobject, use:: - int sysfs_create_file(struct kobject *kobj, struct attribute *attr); + int sysfs_create_file(struct kobject *kobj, const struct attribute *attr); or:: - int sysfs_create_group(struct kobject *kobj, struct attribute_group *grp); + int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); Both types of attributes used here, with a kobject that has been created with the kobject_create_and_add(), can be of type kobj_attribute, so no @@ -300,8 +300,10 @@ kobj_type:: void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; struct attribute **default_attrs; + const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); + void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; This structure is used to describe a particular type of kobject (or, more @@ -352,12 +354,12 @@ created and never declared statically or on the stack. To create a new kset use:: struct kset *kset_create_and_add(const char *name, - struct kset_uevent_ops *u, - struct kobject *parent); + const struct kset_uevent_ops *uevent_ops, + struct kobject *parent_kobj); When you are finished with the kset, call:: - void kset_unregister(struct kset *kset); + void kset_unregister(struct kset *k); to destroy it. This removes the kset from sysfs and decrements its reference count. When the reference count goes to zero, the kset will be released. 
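Pulling together kobject_create_and_add() and sysfs_create_file() from above, a minimal sketch of a "simple" kobject with one read-only attribute (the names are made up; compare samples/kobject/ in the kernel tree)::

    #include <linux/kobject.h>
    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/sysfs.h>

    static ssize_t example_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf)
    {
            return sprintf(buf, "hello\n");
    }

    static struct kobj_attribute example_attr = __ATTR_RO(example);

    static struct kobject *example_kobj;

    static int __init example_init(void)
    {
            int ret;

            /* Appears in sysfs as /sys/kernel/example/example. */
            example_kobj = kobject_create_and_add("example", kernel_kobj);
            if (!example_kobj)
                    return -ENOMEM;

            ret = sysfs_create_file(example_kobj, &example_attr.attr);
            if (ret)
                    kobject_put(example_kobj);
            return ret;
    }

    static void __exit example_exit(void)
    {
            kobject_put(example_kobj);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");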
@@ -371,9 +373,9 @@ If a kset wishes to control the uevent operations of the kobjects associated with it, it can use the struct kset_uevent_ops to handle it:: struct kset_uevent_ops { - int (*filter)(struct kset *kset, struct kobject *kobj); - const char *(*name)(struct kset *kset, struct kobject *kobj); - int (*uevent)(struct kset *kset, struct kobject *kobj, + int (* const filter)(struct kset *kset, struct kobject *kobj); + const char *(* const name)(struct kset *kset, struct kobject *kobj); + int (* const uevent)(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env); }; diff --git a/Documentation/kref.txt b/Documentation/core-api/kref.rst index c61eea6f1bf2..c61eea6f1bf2 100644 --- a/Documentation/kref.txt +++ b/Documentation/core-api/kref.rst diff --git a/Documentation/core-api/printk-basics.rst b/Documentation/core-api/printk-basics.rst new file mode 100644 index 000000000000..563a9ce5fe1d --- /dev/null +++ b/Documentation/core-api/printk-basics.rst @@ -0,0 +1,115 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +Message logging with printk +=========================== + +printk() is one of the most widely known functions in the Linux kernel. It's the +standard tool we have for printing messages and usually the most basic way of +tracing and debugging. If you're familiar with printf(3) you can tell printk() +is based on it, although it has some functional differences: + + - printk() messages can specify a log level. + + - the format string, while largely compatible with C99, doesn't follow the + exact same specification. It has some extensions and a few limitations + (no ``%n`` or floating point conversion specifiers). See :ref:`How to get + printk format specifiers right <printk-specifiers>`. + +All printk() messages are printed to the kernel log buffer, which is a ring +buffer exported to userspace through /dev/kmsg. The usual way to read it is +using ``dmesg``. + +printk() is typically used like this:: + + printk(KERN_INFO "Message: %s\n", arg); + +where ``KERN_INFO`` is the log level (note that it's concatenated to the format +string, the log level is not a separate argument). 
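For instance, the following two calls produce the same ``KERN_INFO`` message (a minimal sketch; the message text is made up, and pr_info() is one of the alias macros listed in the table below)::

    #include <linux/printk.h>

    static void example(void)
    {
            /* KERN_INFO is a string literal that the compiler
             * concatenates with the format string. */
            printk(KERN_INFO "threshold set to %d\n", 42);
            pr_info("threshold set to %d\n", 42);    /* equivalent */
    }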
The available log levels are: ++----------------+--------+-----------------------------------------------+ +| Name | String | Alias function | ++================+========+===============================================+ +| KERN_EMERG | "0" | pr_emerg() | ++----------------+--------+-----------------------------------------------+ +| KERN_ALERT | "1" | pr_alert() | ++----------------+--------+-----------------------------------------------+ +| KERN_CRIT | "2" | pr_crit() | ++----------------+--------+-----------------------------------------------+ +| KERN_ERR | "3" | pr_err() | ++----------------+--------+-----------------------------------------------+ +| KERN_WARNING | "4" | pr_warn() | ++----------------+--------+-----------------------------------------------+ +| KERN_NOTICE | "5" | pr_notice() | ++----------------+--------+-----------------------------------------------+ +| KERN_INFO | "6" | pr_info() | ++----------------+--------+-----------------------------------------------+ +| KERN_DEBUG | "7" | pr_debug() and pr_devel() if DEBUG is defined | ++----------------+--------+-----------------------------------------------+ +| KERN_DEFAULT | "" | | ++----------------+--------+-----------------------------------------------+ +| KERN_CONT | "c" | pr_cont() | ++----------------+--------+-----------------------------------------------+ + + +The log level specifies the importance of a message. The kernel decides whether +to show the message immediately (printing it to the current console) depending +on its log level and the current *console_loglevel* (a kernel variable). If the +message priority is higher (lower log level value) than the *console_loglevel* +the message will be printed to the console. + +If the log level is omitted, the message is printed with ``KERN_DEFAULT`` +level. + +You can check the current *console_loglevel* with:: + + $ cat /proc/sys/kernel/printk + 4 4 1 7 + +The result shows the *current*, *default*, *minimum* and *boot-time-default* log +levels. + +To change the current console_loglevel simply write the desired level to +``/proc/sys/kernel/printk``. For example, to print all messages to the console:: + + # echo 8 > /proc/sys/kernel/printk + +Another way, using ``dmesg``:: + + # dmesg -n 5 + +sets the console_loglevel to print KERN_WARNING (4) or more severe messages to +the console. See ``dmesg(1)`` for more information. + +As an alternative to printk() you can use the ``pr_*()`` aliases for +logging. This family of macros embeds the log level in the macro names. For +example:: + + pr_info("Info message no. %d\n", msg_num); + +prints a ``KERN_INFO`` message. + +Besides being more concise than the equivalent printk() calls, they can use a +common definition for the format string through the pr_fmt() macro. For +instance, defining this at the top of a source file (before any ``#include`` +directive):: + + #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +would prefix every pr_*() message in that file with the module and function name +that originated the message. + +For debugging purposes there are also two conditionally-compiled macros: +pr_debug() and pr_devel(), which are compiled-out unless ``DEBUG`` (or +also ``CONFIG_DYNAMIC_DEBUG`` in the case of pr_debug()) is defined. + + +Function reference +================== + +.. kernel-doc:: kernel/printk/printk.c + :functions: printk + +.. 
kernel-doc:: include/linux/printk.h + :functions: pr_emerg pr_alert pr_crit pr_err pr_warn pr_notice pr_info + pr_fmt pr_debug pr_devel pr_cont diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 5d8f1e84dd90..8c9aba262b1e 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -2,6 +2,8 @@ How to get printk format specifiers right ========================================= +.. _printk-specifiers: + :Author: Randy Dunlap <rdunlap@infradead.org> :Author: Andrew Murray <amurray@mpc-data.co.uk> diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst index 49d9833af871..ec575e72d0b2 100644 --- a/Documentation/core-api/protection-keys.rst +++ b/Documentation/core-api/protection-keys.rst @@ -5,8 +5,9 @@ Memory Protection Keys ====================== Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature -which is found on Intel's Skylake "Scalable Processor" Server CPUs. -It will be avalable in future non-server parts. +which is found on Intel's Skylake (and later) "Scalable Processor" +Server CPUs. It will be available in future non-server Intel parts +and future AMD processors. For anyone wishing to test or use this feature, it is available in Amazon's EC2 C5 instances and is known to work there using an Ubuntu diff --git a/Documentation/rbtree.txt b/Documentation/core-api/rbtree.rst index 523d54b60087..523d54b60087 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/core-api/rbtree.rst diff --git a/Documentation/doc-guide/maintainer-profile.rst b/Documentation/doc-guide/maintainer-profile.rst index 5afc0ddba40a..755d39f0d407 100644 --- a/Documentation/doc-guide/maintainer-profile.rst +++ b/Documentation/doc-guide/maintainer-profile.rst @@ -6,7 +6,7 @@ Documentation subsystem maintainer entry profile The documentation "subsystem" is the central coordinating point for the kernel's documentation and associated infrastructure. It covers the hierarchy under Documentation/ (with the exception of -Documentation/device-tree), various utilities under scripts/ and, at least +Documentation/devicetree), various utilities under scripts/ and, at least some of the time, LICENSES/. It's worth noting, though, that the boundaries of this subsystem are rather diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index c78db28519f7..63dec76d1d8d 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -11,7 +11,7 @@ course not limited to GPU use cases. The three main components of this are: (1) dma-buf, representing a sg_table and exposed to userspace as a file descriptor to allow passing between devices, (2) fence, which provides a mechanism to signal when -one device as finished access, and (3) reservation, which manages the +one device has finished access, and (3) reservation, which manages the shared or exclusive fence(s) associated with the buffer. 
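Before the exporter details below, a sketch of the exporter side: wrapping an allocation in a dma-buf. ``example_buffer`` and the ops table are hypothetical, and a real exporter must fill in the mandatory :c:type:`struct dma_buf_ops <dma_buf_ops>` callbacks::

    #include <linux/dma-buf.h>
    #include <linux/fcntl.h>

    struct example_buffer {
            size_t size;
            /* backing storage details would live here */
    };

    /* Must provide map_dma_buf/unmap_dma_buf/release etc. */
    static const struct dma_buf_ops example_dma_buf_ops;

    static struct dma_buf *example_export(struct example_buffer *buf)
    {
            DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

            exp_info.ops = &example_dma_buf_ops;
            exp_info.size = buf->size;
            exp_info.flags = O_RDWR;
            exp_info.priv = buf;

            /* The returned dma_buf can be handed to userspace as a
             * file descriptor with dma_buf_fd(). */
            return dma_buf_export(&exp_info);
    }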
Shared DMA Buffers @@ -31,7 +31,7 @@ The exporter - implements and manages operations in :c:type:`struct dma_buf_ops <dma_buf_ops>` for the buffer, - allows other users to share the buffer by using dma_buf sharing APIs, - - manages the details of buffer allocation, wrapped int a :c:type:`struct + - manages the details of buffer allocation, wrapped in a :c:type:`struct dma_buf <dma_buf>`, - decides about the actual backing storage where this allocation happens, - and takes care of any migration of scatterlist - for all (shared) users of diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst index 2b868d49d349..b9b022371e85 100644 --- a/Documentation/driver-api/driver-model/device.rst +++ b/Documentation/driver-api/driver-model/device.rst @@ -50,10 +50,10 @@ Attributes Attributes of devices can be exported by a device driver through sysfs. -Please see Documentation/filesystems/sysfs.txt for more information +Please see Documentation/filesystems/sysfs.rst for more information on how sysfs works. -As explained in Documentation/kobject.txt, device attributes must be +As explained in Documentation/core-api/kobject.rst, device attributes must be created before the KOBJ_ADD uevent is generated. The only way to realize that is by defining an attribute group. diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst index d4d1e9b40e0c..e98d0ab4a9b6 100644 --- a/Documentation/driver-api/driver-model/overview.rst +++ b/Documentation/driver-api/driver-model/overview.rst @@ -121,4 +121,4 @@ device-specific data or tunable interfaces. More information about the sysfs directory layout can be found in the other documents in this directory and in the file -Documentation/filesystems/sysfs.txt. +Documentation/filesystems/sysfs.rst. diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index d4e78cb3ef4d..20c431c8e7be 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -39,6 +39,7 @@ available subsections can be seen below. spi i2c ipmb + ipmi i3c/index interconnect devfreq diff --git a/Documentation/IPMI.txt b/Documentation/driver-api/ipmi.rst index 5ef1047e2e66..5ef1047e2e66 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/driver-api/ipmi.rst diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index 08f855cbb4e6..79c0fd39f2af 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -278,8 +278,8 @@ by a region device with a dynamically assigned id (REGION0 - REGION5). be contiguous in DPA-space. This bus is provided by the kernel under the device - /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and - the nfit_test.ko module is loaded. This not only test LIBNVDIMM but the + /sys/devices/platform/nfit_test.0 when the nfit_test.ko module from + tools/testing/nvdimm is loaded. This tests not only LIBNVDIMM but the acpi_nfit.ko driver as well.
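Following the sysfs attribute rules mentioned above for struct device, a minimal sketch of a read-only device attribute declared through an attribute group, so that it exists before the KOBJ_ADD uevent is emitted (all names here are made up)::

    #include <linux/device.h>
    #include <linux/sysfs.h>

    static ssize_t model_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
    {
            return scnprintf(buf, PAGE_SIZE, "example-model\n");
    }
    static DEVICE_ATTR_RO(model);

    static struct attribute *example_attrs[] = {
            &dev_attr_model.attr,
            NULL,
    };
    ATTRIBUTE_GROUPS(example);

    /* example_groups can then be assigned to the driver's .dev_groups
     * (or the class's .dev_groups) before the device is registered. */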
diff --git a/Documentation/driver-api/thermal/cpu-idle-cooling.rst b/Documentation/driver-api/thermal/cpu-idle-cooling.rst index a1c3edecae00..b9f34ceb2a38 100644 --- a/Documentation/driver-api/thermal/cpu-idle-cooling.rst +++ b/Documentation/driver-api/thermal/cpu-idle-cooling.rst @@ -1,3 +1,6 @@ +================ +CPU Idle Cooling +================ Situation: ---------- diff --git a/Documentation/driver-api/thermal/index.rst b/Documentation/driver-api/thermal/index.rst index 5ba61d19c6ae..4cb0b9b6bfb8 100644 --- a/Documentation/driver-api/thermal/index.rst +++ b/Documentation/driver-api/thermal/index.rst @@ -8,6 +8,7 @@ Thermal :maxdepth: 1 cpu-cooling-api + cpu-idle-cooling sysfs-api power_allocator diff --git a/Documentation/fb/efifb.rst b/Documentation/fb/efifb.rst index 04840331a00e..6badff64756f 100644 --- a/Documentation/fb/efifb.rst +++ b/Documentation/fb/efifb.rst @@ -2,8 +2,10 @@ What is efifb? ============== -This is a generic EFI platform driver for Intel based Apple computers. -efifb is only for EFI booted Intel Macs. +This is a generic EFI platform driver for systems with UEFI firmware. The +system must be booted via the EFI stub for this to be usable. efifb supports +both firmware with Graphics Output Protocol (GOP) displays and older +systems with only Universal Graphics Adapter (UGA) displays. Supported Hardware ================== @@ -12,11 +14,14 @@ Supported Hardware - Macbook - Macbook Pro 15"/17" - MacMini +- ARM/ARM64/X86 systems with UEFI firmware How to use it? ============== -efifb does not have any kind of autodetection of your machine. +For UGA displays, efifb does not have any kind of autodetection of your +machine. + You have to add the following kernel parameters in your elilo.conf:: Macbook : @@ -28,6 +33,9 @@ You have to add the following kernel parameters in your elilo.conf:: Macbook Pro 17", iMac 20" : video=efifb:i20 +For GOP displays, efifb can autodetect the display's resolution and framebuffer +address, so these should work out of the box without any special parameters. + Accepted options: ======= =========================================================== @@ -36,4 +44,28 @@ nowc Don't map the framebuffer write combined. This can be used when large amounts of console data are written. ======= =========================================================== +Options for GOP displays: + +mode=n + The EFI stub will set the mode of the display to mode number n if + possible. + +<xres>x<yres>[-(rgb|bgr|<bpp>)] + The EFI stub will search for a display mode that matches the specified + horizontal and vertical resolution, and optionally bit depth, and set + the mode of the display to it if one is found. The bit depth can be either + "rgb" or "bgr" to match specifically those pixel formats, or a number + for a mode with matching bits per pixel. + +auto + The EFI stub will choose the mode with the highest resolution (product + of horizontal and vertical resolution). If there are multiple modes + with the highest resolution, it will choose one with the highest color + depth. + +list + The EFI stub will list out all the display modes that are available. A + specific mode can then be chosen using one of the above options for the + next boot.
+ Edgar Hucek <gimli@dark-green.com> diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt index 9ae6e8d0d10d..9ed964f65224 100644 --- a/Documentation/features/core/eBPF-JIT/arch-support.txt +++ b/Documentation/features/core/eBPF-JIT/arch-support.txt @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | ok | diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index 304dcd461795..6ff38548923e 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -22,9 +22,9 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | - | riscv: | TODO | - | s390: | TODO | + | powerpc: | ok | + | riscv: | ok | + | s390: | ok | | sh: | TODO | | sparc: | TODO | | um: | TODO | diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 6fb2b0671994..210256f6a4cf 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 32b297295fff..97cd7aa74905 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -11,7 +11,7 @@ | arm: | TODO | | arm64: | TODO | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt index e68239b5d2f0..8b316c6e03d4 100644 --- a/Documentation/features/debug/kprobes/arch-support.txt +++ b/Documentation/features/debug/kprobes/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | ok | @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | ok | + | riscv: | TODO | | s390: | ok | | sh: | ok | | sparc: | ok | diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt index f17131b328e5..b805aada395e 100644 --- a/Documentation/features/debug/kretprobes/arch-support.txt +++ b/Documentation/features/debug/kretprobes/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | ok | diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index 32bbdfc64c32..12410f606edc 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/Documentation/features/debug/uprobes/arch-support.txt 
b/Documentation/features/debug/uprobes/arch-support.txt index 1c577d0cfc7f..be8acbb95b54 100644 --- a/Documentation/features/debug/uprobes/arch-support.txt +++ b/Documentation/features/debug/uprobes/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt index eb28b5c97ca6..895c3b0f6492 100644 --- a/Documentation/features/io/dma-contiguous/arch-support.txt +++ b/Documentation/features/io/dma-contiguous/arch-support.txt @@ -16,7 +16,7 @@ | hexagon: | TODO | | ia64: | TODO | | m68k: | TODO | - | microblaze: | TODO | + | microblaze: | ok | | mips: | ok | | nds32: | TODO | | nios2: | TODO | diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt index 941fd5b1094d..98cb9d85c55d 100644 --- a/Documentation/features/locking/lockdep/arch-support.txt +++ b/Documentation/features/locking/lockdep/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | ok | | ia64: | TODO | diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt index d8278bf62b85..518f352fc727 100644 --- a/Documentation/features/perf/kprobes-event/arch-support.txt +++ b/Documentation/features/perf/kprobes-event/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | ok | | ia64: | TODO | @@ -21,7 +21,7 @@ | nds32: | ok | | nios2: | TODO | | openrisc: | TODO | - | parisc: | TODO | + | parisc: | ok | | powerpc: | ok | | riscv: | TODO | | s390: | ok | diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index 687d049d9cee..c22cd6f8aa5e 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 90996e3d18a8..527fe4d0b074 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -11,7 +11,7 @@ | arm: | ok | | arm64: | ok | | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index 4fe6c3c3be5c..c7b837f735b1 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | TODO | + | 
riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index 019131c5acce..8525f1981f19 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -22,7 +22,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | riscv: | TODO | | s390: | TODO | | sh: | TODO | diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 3d492a34c8ee..2e017387e228 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -17,7 +17,7 @@ | ia64: | TODO | | m68k: | TODO | | microblaze: | TODO | - | mips: | TODO | + | mips: | ok | | nds32: | TODO | | nios2: | TODO | | openrisc: | TODO | diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 671fef39a802..2995279ddc24 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -192,4 +192,4 @@ For more information on the Plan 9 Operating System check out http://plan9.bell-labs.com/plan9 For information on Plan 9 from User Space (Plan 9 applications and libraries -ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9 +ported to Linux/BSD/OSX/etc) check out https://9fans.github.io/plan9port/ diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.rst index 7d9f82607562..430f0b40796b 100644 --- a/Documentation/filesystems/automount-support.txt +++ b/Documentation/filesystems/automount-support.rst @@ -1,3 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Automount Support +================= + + Support is available for filesystems that wish to do automounting support (such as kAFS which can be found in fs/afs/ and NFS in fs/nfs/). This facility includes allowing in-kernel mounts to be @@ -5,13 +12,12 @@ performed and mountpoint degradation to be requested. The latter can also be requested by userspace. -====================== -IN-KERNEL AUTOMOUNTING +In-Kernel Automounting ====================== See section "Mount Traps" of Documentation/filesystems/autofs.rst -Then from userspace, you can just do something like: +Then from userspace, you can just do something like:: [root@andromeda root]# mount -t afs \#root.afs. /afs [root@andromeda root]# ls /afs @@ -21,7 +27,7 @@ Then from userspace, you can just do something like: [root@andromeda root]# ls /afs/cambridge/afsdoc/ ChangeLog html LICENSE pdf RELNOTES-1.2.2 -And then if you look in the mountpoint catalogue, you'll see something like: +And then if you look in the mountpoint catalogue, you'll see something like:: [root@andromeda root]# cat /proc/mounts ... @@ -30,8 +36,7 @@ And then if you look in the mountpoint catalogue, you'll see something like: #afsdoc. /afs/cambridge.redhat.com/afsdoc afs rw 0 0 -=========================== -AUTOMATIC MOUNTPOINT EXPIRY +Automatic Mountpoint Expiry =========================== Automatic expiration of mountpoints is easy, provided you've mounted the @@ -43,7 +48,8 @@ To do expiration, you need to follow these steps: hung. 
(2) When a new mountpoint is created in the ->d_automount method, add - the mnt to the list using mnt_set_expiry() + the mnt to the list using mnt_set_expiry():: + mnt_set_expiry(newmnt, &afs_vfsmounts); (3) When you want mountpoints to be expired, call mark_mounts_for_expiry() @@ -70,8 +76,7 @@ and the copies of those that are on an expiration list will be added to the same expiration list. -======================= -USERSPACE DRIVEN EXPIRY +Userspace Driven Expiry ======================= As an alternative, it is possible for userspace to request expiry of any diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.rst index c418280c915f..19fbf6b9aa36 100644 --- a/Documentation/filesystems/caching/backend-api.txt +++ b/Documentation/filesystems/caching/backend-api.rst @@ -1,6 +1,8 @@ - ========================== - FS-CACHE CACHE BACKEND API - ========================== +.. SPDX-License-Identifier: GPL-2.0 + +========================== +FS-Cache Cache backend API +========================== The FS-Cache system provides an API by which actual caches can be supplied to FS-Cache for it to then serve out to network filesystems and other interested @@ -9,15 +11,14 @@ parties. This API is declared in <linux/fscache-cache.h>. -==================================== -INITIALISING AND REGISTERING A CACHE +Initialising and Registering a Cache ==================================== To start off, a cache definition must be initialised and registered for each cache the backend wants to make available. For instance, CacheFS does this in the fill_super() operation on mounting. -The cache definition (struct fscache_cache) should be initialised by calling: +The cache definition (struct fscache_cache) should be initialised by calling:: void fscache_init_cache(struct fscache_cache *cache, struct fscache_cache_ops *ops, @@ -26,17 +27,17 @@ The cache definition (struct fscache_cache) should be initialised by calling: Where: - (*) "cache" is a pointer to the cache definition; + * "cache" is a pointer to the cache definition; - (*) "ops" is a pointer to the table of operations that the backend supports on + * "ops" is a pointer to the table of operations that the backend supports on this cache; and - (*) "idfmt" is a format and printf-style arguments for constructing a label + * "idfmt" is a format and printf-style arguments for constructing a label for the cache. The cache should then be registered with FS-Cache by passing a pointer to the -previously initialised cache definition to: +previously initialised cache definition to:: int fscache_add_cache(struct fscache_cache *cache, struct fscache_object *fsdef, @@ -44,12 +45,12 @@ previously initialised cache definition to: Two extra arguments should also be supplied: - (*) "fsdef" which should point to the object representation for the FS-Cache + * "fsdef" which should point to the object representation for the FS-Cache master index in this cache. Netfs primary index entries will be created here. FS-Cache keeps the caller's reference to the index object if successful and will release it upon withdrawal of the cache. - (*) "tagname" which, if given, should be a text string naming this cache. If + * "tagname" which, if given, should be a text string naming this cache. If this is NULL, the identifier will be used instead. For CacheFS, the identifier is set to name the underlying block device and the tag can be supplied by mount. 
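Putting the two calls above together, a backend's mount path might register its cache like this (a sketch only; the ops table is a placeholder and the fsdef object is assumed to have been allocated already)::

    #include <linux/fscache-cache.h>

    static struct fscache_cache_ops mycache_ops;    /* filled in elsewhere */
    static struct fscache_cache mycache;

    static int mycache_register(struct fscache_object *fsdef,
                                const char *bdev_name)
    {
            fscache_init_cache(&mycache, &mycache_ops, "mycache-%s", bdev_name);

            /* Returns 0, -ENOMEM, or -EEXIST if the tag is already used. */
            return fscache_add_cache(&mycache, fsdef, bdev_name);
    }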
@@ -58,20 +59,18 @@ This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag is already in use. 0 will be returned on success. -===================== -UNREGISTERING A CACHE +Unregistering a Cache ===================== A cache can be withdrawn from the system by calling this function with a -pointer to the cache definition: +pointer to the cache definition:: void fscache_withdraw_cache(struct fscache_cache *cache); In CacheFS's case, this is called by put_super(). -======== -SECURITY +Security ======== The cache methods are executed one of two contexts: @@ -89,8 +88,7 @@ be masqueraded for the duration of the cache driver's access to the cache. This is left to the cache to handle; FS-Cache makes no effort in this regard. -=================================== -CONTROL AND STATISTICS PRESENTATION +Control and Statistics Presentation =================================== The cache may present data to the outside world through FS-Cache's interfaces @@ -101,11 +99,10 @@ is enabled. This is accessible through the kobject struct fscache_cache::kobj and is for use by the cache as it sees fit. -======================== -RELEVANT DATA STRUCTURES +Relevant Data Structures ======================== - (*) Index/Data file FS-Cache representation cookie: + * Index/Data file FS-Cache representation cookie:: struct fscache_cookie { struct fscache_object_def *def; @@ -121,7 +118,7 @@ RELEVANT DATA STRUCTURES cache operations. - (*) In-cache object representation: + * In-cache object representation:: struct fscache_object { int debug_id; @@ -150,7 +147,7 @@ RELEVANT DATA STRUCTURES initialised by calling fscache_object_init(object). - (*) FS-Cache operation record: + * FS-Cache operation record:: struct fscache_operation { atomic_t usage; @@ -173,7 +170,7 @@ RELEVANT DATA STRUCTURES an operation needs more processing time, it should be enqueued again. - (*) FS-Cache retrieval operation record: + * FS-Cache retrieval operation record:: struct fscache_retrieval { struct fscache_operation op; @@ -198,7 +195,7 @@ RELEVANT DATA STRUCTURES it sees fit. - (*) FS-Cache storage operation record: + * FS-Cache storage operation record:: struct fscache_storage { struct fscache_operation op; @@ -212,16 +209,17 @@ RELEVANT DATA STRUCTURES storage. -================ -CACHE OPERATIONS +Cache Operations ================ The cache backend provides FS-Cache with a table of operations that can be performed on the denizens of the cache. These are held in a structure of type: - struct fscache_cache_ops + :: + + struct fscache_cache_ops - (*) Name of cache provider [mandatory]: + * Name of cache provider [mandatory]:: const char *name @@ -229,7 +227,7 @@ performed on the denizens of the cache. These are held in a structure of type: the backend. - (*) Allocate a new object [mandatory]: + * Allocate a new object [mandatory]:: struct fscache_object *(*alloc_object)(struct fscache_cache *cache, struct fscache_cookie *cookie) @@ -244,7 +242,7 @@ performed on the denizens of the cache. These are held in a structure of type: form once lookup is complete or aborted. - (*) Look up and create object [mandatory]: + * Look up and create object [mandatory]:: void (*lookup_object)(struct fscache_object *object) @@ -263,7 +261,7 @@ performed on the denizens of the cache. These are held in a structure of type: to abort the lookup of that object. 
- (*) Release lookup data [mandatory]: + * Release lookup data [mandatory]:: void (*lookup_complete)(struct fscache_object *object) @@ -271,7 +269,7 @@ performed on the denizens of the cache. These are held in a structure of type: using to perform a lookup. - (*) Increment object refcount [mandatory]: + * Increment object refcount [mandatory]:: struct fscache_object *(*grab_object)(struct fscache_object *object) @@ -280,7 +278,7 @@ performed on the denizens of the cache. These are held in a structure of type: It should return the object pointer if successful. - (*) Lock/Unlock object [mandatory]: + * Lock/Unlock object [mandatory]:: void (*lock_object)(struct fscache_object *object) void (*unlock_object)(struct fscache_object *object) @@ -289,7 +287,7 @@ performed on the denizens of the cache. These are held in a structure of type: to schedule with the lock held, so a spinlock isn't sufficient. - (*) Pin/Unpin object [optional]: + * Pin/Unpin object [optional]:: int (*pin_object)(struct fscache_object *object) void (*unpin_object)(struct fscache_object *object) @@ -299,7 +297,7 @@ performed on the denizens of the cache. These are held in a structure of type: enough space in the cache to permit this. - (*) Check coherency state of an object [mandatory]: + * Check coherency state of an object [mandatory]:: int (*check_consistency)(struct fscache_object *object) @@ -308,7 +306,7 @@ performed on the denizens of the cache. These are held in a structure of type: if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS may also be returned. - (*) Update object [mandatory]: + * Update object [mandatory]:: int (*update_object)(struct fscache_object *object) @@ -317,7 +315,7 @@ performed on the denizens of the cache. These are held in a structure of type: obtained by calling object->cookie->def->get_aux()/get_attr(). - (*) Invalidate data object [mandatory]: + * Invalidate data object [mandatory]:: int (*invalidate_object)(struct fscache_operation *op) @@ -329,7 +327,7 @@ performed on the denizens of the cache. These are held in a structure of type: fscache_op_complete() must be called on op before returning. - (*) Discard object [mandatory]: + * Discard object [mandatory]:: void (*drop_object)(struct fscache_object *object) @@ -341,7 +339,7 @@ performed on the denizens of the cache. These are held in a structure of type: caller. The caller will invoke the put_object() method as appropriate. - (*) Release object reference [mandatory]: + * Release object reference [mandatory]:: void (*put_object)(struct fscache_object *object) @@ -349,7 +347,7 @@ performed on the denizens of the cache. These are held in a structure of type: be freed when all the references to it are released. - (*) Synchronise a cache [mandatory]: + * Synchronise a cache [mandatory]:: void (*sync)(struct fscache_cache *cache) @@ -357,7 +355,7 @@ performed on the denizens of the cache. These are held in a structure of type: device. - (*) Dissociate a cache [mandatory]: + * Dissociate a cache [mandatory]:: void (*dissociate_pages)(struct fscache_cache *cache) @@ -365,7 +363,7 @@ performed on the denizens of the cache. These are held in a structure of type: cache withdrawal. - (*) Notification that the attributes on a netfs file changed [mandatory]: + * Notification that the attributes on a netfs file changed [mandatory]:: int (*attr_changed)(struct fscache_object *object); @@ -386,7 +384,7 @@ performed on the denizens of the cache. These are held in a structure of type: execution of this operation. 
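As an illustration of the refcounting pair above (grab_object/put_object), a hedged sketch assuming a hypothetical ``struct mycache_object`` that embeds the ``struct fscache_object`` and carries its own usage count::

	/* Hedged sketch: backend-side refcounting.  struct mycache_object
	 * and its "usage" counter are hypothetical private details. */
	static struct fscache_object *mycache_grab_object(struct fscache_object *object)
	{
		struct mycache_object *obj =
			container_of(object, struct mycache_object, fscache);

		atomic_inc(&obj->usage);
		return &obj->fscache;		/* non-NULL indicates success */
	}

	static void mycache_put_object(struct fscache_object *object)
	{
		struct mycache_object *obj =
			container_of(object, struct mycache_object, fscache);

		if (atomic_dec_and_test(&obj->usage))
			kfree(obj);		/* last reference gone */
	}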
- (*) Reserve cache space for an object's data [optional]: + * Reserve cache space for an object's data [optional]:: int (*reserve_space)(struct fscache_object *object, loff_t size); @@ -404,7 +402,7 @@ performed on the denizens of the cache. These are held in a structure of type: size if larger than that already. - (*) Request page be read from cache [mandatory]: + * Request page be read from cache [mandatory]:: int (*read_or_alloc_page)(struct fscache_retrieval *op, struct page *page, @@ -446,7 +444,7 @@ performed on the denizens of the cache. These are held in a structure of type: with. This will complete the operation when all pages are dealt with. - (*) Request pages be read from cache [mandatory]: + * Request pages be read from cache [mandatory]:: int (*read_or_alloc_pages)(struct fscache_retrieval *op, struct list_head *pages, @@ -457,7 +455,7 @@ performed on the denizens of the cache. These are held in a structure of type: of pages instead of one page. Any pages on which a read operation is started must be added to the page cache for the specified mapping and also to the LRU. Such pages must also be removed from the pages list and - *nr_pages decremented per page. + ``*nr_pages`` decremented per page. If there was an error such as -ENOMEM, then that should be returned; else if one or more pages couldn't be read or allocated, then -ENOBUFS should @@ -466,7 +464,7 @@ performed on the denizens of the cache. These are held in a structure of type: returned. - (*) Request page be allocated in the cache [mandatory]: + * Request page be allocated in the cache [mandatory]:: int (*allocate_page)(struct fscache_retrieval *op, struct page *page, @@ -482,7 +480,7 @@ performed on the denizens of the cache. These are held in a structure of type: allocated, then the netfs page should be marked and 0 returned. - (*) Request pages be allocated in the cache [mandatory]: + * Request pages be allocated in the cache [mandatory]:: int (*allocate_pages)(struct fscache_retrieval *op, struct list_head *pages, @@ -493,7 +491,7 @@ performed on the denizens of the cache. These are held in a structure of type: nr_pages should be treated as for the read_or_alloc_pages() method. - (*) Request page be written to cache [mandatory]: + * Request page be written to cache [mandatory]:: int (*write_page)(struct fscache_storage *op, struct page *page); @@ -514,7 +512,7 @@ performed on the denizens of the cache. These are held in a structure of type: appropriately. - (*) Discard retained per-page metadata [mandatory]: + * Discard retained per-page metadata [mandatory]:: void (*uncache_page)(struct fscache_object *object, struct page *page) @@ -523,13 +521,12 @@ performed on the denizens of the cache. These are held in a structure of type: maintains for this page. -================== -FS-CACHE UTILITIES +FS-Cache Utilities ================== FS-Cache provides some utilities that a cache backend may make use of: - (*) Note occurrence of an I/O error in a cache: + * Note occurrence of an I/O error in a cache:: void fscache_io_error(struct fscache_cache *cache) @@ -541,7 +538,7 @@ FS-Cache provides some utilities that a cache backend may make use of: This does not actually withdraw the cache. That must be done separately. 
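A hedged skeleton of the single-page read operation described above, showing its three outcomes (-ENOBUFS, -ENODATA, and dispatched I/O); the ``mycache_*`` predicates are hypothetical helpers and error handling is elided::

	/* Hedged sketch: the three outcomes of read_or_alloc_page().
	 * mycache_has_space(), mycache_page_backed() and
	 * mycache_start_read() are hypothetical. */
	static int mycache_read_or_alloc_page(struct fscache_retrieval *op,
					      struct page *page, gfp_t gfp)
	{
		if (!mycache_has_space(op->op.object))
			return -ENOBUFS;	/* cache can't honour the request */

		if (!mycache_page_backed(op->op.object, page)) {
			/* Space can be allocated, but there's no data yet:
			 * the netfs must read the page from the server. */
			fscache_retrieval_complete(op, 1);
			return -ENODATA;
		}

		/* Start asynchronous I/O; the completion path must invoke
		 * fscache_end_io() on the page when it finishes. */
		mycache_start_read(op, page, gfp);
		return 0;
	}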
- (*) Invoke the retrieval I/O completion function: + * Invoke the retrieval I/O completion function:: void fscache_end_io(struct fscache_retrieval *op, struct page *page, int error); @@ -550,8 +547,8 @@ FS-Cache provides some utilities that a cache backend may make use of: error value should be 0 if successful and an error otherwise. - (*) Record that one or more pages being retrieved or allocated have been dealt - with: + * Record that one or more pages being retrieved or allocated have been dealt + with:: void fscache_retrieval_complete(struct fscache_retrieval *op, int n_pages); @@ -562,7 +559,7 @@ FS-Cache provides some utilities that a cache backend may make use of: completed. - (*) Record operation completion: + * Record operation completion:: void fscache_op_complete(struct fscache_operation *op); @@ -571,7 +568,7 @@ FS-Cache provides some utilities that a cache backend may make use of: one or more pending operations to start running. - (*) Set highest store limit: + * Set highest store limit:: void fscache_set_store_limit(struct fscache_object *object, loff_t i_size); @@ -581,7 +578,7 @@ FS-Cache provides some utilities that a cache backend may make use of: rejected by fscache_read_alloc_page() and co with -ENOBUFS. - (*) Mark pages as being cached: + * Mark pages as being cached:: void fscache_mark_pages_cached(struct fscache_retrieval *op, struct pagevec *pagevec); @@ -590,7 +587,7 @@ FS-Cache provides some utilities that a cache backend may make use of: the netfs must call fscache_uncache_page() to unmark the pages. - (*) Perform coherency check on an object: + * Perform coherency check on an object:: enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, @@ -603,29 +600,26 @@ FS-Cache provides some utilities that a cache backend may make use of: One of three values will be returned: - (*) FSCACHE_CHECKAUX_OKAY - + FSCACHE_CHECKAUX_OKAY The coherency data indicates the object is valid as is. - (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - + FSCACHE_CHECKAUX_NEEDS_UPDATE The coherency data needs updating, but otherwise the object is valid. - (*) FSCACHE_CHECKAUX_OBSOLETE - + FSCACHE_CHECKAUX_OBSOLETE The coherency data indicates that the object is obsolete and should be discarded. - (*) Initialise a freshly allocated object: + * Initialise a freshly allocated object:: void fscache_object_init(struct fscache_object *object); This initialises all the fields in an object representation. - (*) Indicate the destruction of an object: + * Indicate the destruction of an object:: void fscache_object_destroyed(struct fscache_cache *cache); @@ -635,7 +629,7 @@ FS-Cache provides some utilities that a cache backend may make use of: all the objects. - (*) Indicate negative lookup on an object: + * Indicate negative lookup on an object:: void fscache_object_lookup_negative(struct fscache_object *object); @@ -650,7 +644,7 @@ FS-Cache provides some utilities that a cache backend may make use of: significant - all subsequent calls are ignored. - (*) Indicate an object has been obtained: + * Indicate an object has been obtained:: void fscache_obtained_object(struct fscache_object *object); @@ -667,7 +661,7 @@ FS-Cache provides some utilities that a cache backend may make use of: (2) that writes may now proceed against this object. 
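Several of these helpers typically meet in the backend's lookup path. The following hedged sketch (all ``mycache_*`` helpers are hypothetical) shows the negative-lookup and success notifications in context; the error path uses the helper described next::

	/* Hedged sketch: a lookup_object() implementation wired up to the
	 * notification helpers above. */
	static void mycache_lookup_object(struct fscache_object *object)
	{
		if (!mycache_find_on_disk(object)) {
			/* Nothing stored yet: unblock the netfs so reads go
			 * straight to the server, then create the object. */
			fscache_object_lookup_negative(object);
			if (mycache_create_on_disk(object) < 0) {
				fscache_object_lookup_error(object);
				return;
			}
		}

		/* Cap stores at the size the netfs declared. */
		fscache_set_store_limit(object, mycache_object_size(object));

		/* Lookup/creation done: reads and writes may now proceed. */
		fscache_obtained_object(object);
	}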
- (*) Indicate that object lookup failed: + * Indicate that object lookup failed:: void fscache_object_lookup_error(struct fscache_object *object); @@ -676,7 +670,7 @@ FS-Cache provides some utilities that a cache backend may make use of: as possible. - (*) Indicate that a stale object was found and discarded: + * Indicate that a stale object was found and discarded:: void fscache_object_retrying_stale(struct fscache_object *object); @@ -685,7 +679,7 @@ FS-Cache provides some utilities that a cache backend may make use of: discarded from the cache and the lookup will be performed again. - (*) Indicate that the caching backend killed an object: + * Indicate that the caching backend killed an object:: void fscache_object_mark_killed(struct fscache_object *object, enum fscache_why_object_killed why); @@ -693,13 +687,20 @@ FS-Cache provides some utilities that a cache backend may make use of: This is called to indicate that the cache backend preemptively killed an object. The why parameter should be set to indicate the reason: - FSCACHE_OBJECT_IS_STALE - the object was stale and needs discarding. - FSCACHE_OBJECT_NO_SPACE - there was insufficient cache space - FSCACHE_OBJECT_WAS_RETIRED - the object was retired when relinquished. - FSCACHE_OBJECT_WAS_CULLED - the object was culled to make space. + FSCACHE_OBJECT_IS_STALE + - the object was stale and needs discarding. + + FSCACHE_OBJECT_NO_SPACE + - there was insufficient cache space + + FSCACHE_OBJECT_WAS_RETIRED + - the object was retired when relinquished. + + FSCACHE_OBJECT_WAS_CULLED + - the object was culled to make space. - (*) Get and release references on a retrieval record: + * Get and release references on a retrieval record:: void fscache_get_retrieval(struct fscache_retrieval *op); void fscache_put_retrieval(struct fscache_retrieval *op); @@ -708,7 +709,7 @@ FS-Cache provides some utilities that a cache backend may make use of: asynchronous data retrieval and block allocation. - (*) Enqueue a retrieval record for processing. + * Enqueue a retrieval record for processing:: void fscache_enqueue_retrieval(struct fscache_retrieval *op); @@ -718,7 +719,7 @@ FS-Cache provides some utilities that a cache backend may make use of: within the callback function. - (*) List of object state names: + * List of object state names:: const char *fscache_object_states[]; diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.rst index 28aefcbb1442..65d3db476765 100644 --- a/Documentation/filesystems/caching/cachefiles.txt +++ b/Documentation/filesystems/caching/cachefiles.rst @@ -1,8 +1,10 @@ - =============================================== - CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM - =============================================== +.. SPDX-License-Identifier: GPL-2.0 -Contents: +=============================================== +CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM +=============================================== + +.. Contents: (*) Overview. @@ -27,8 +29,8 @@ Contents: (*) Debugging. -======== -OVERVIEW + +Overview ======== CacheFiles is a caching backend that's meant to use as a cache a directory on @@ -58,8 +60,8 @@ spare space and automatically contract when the set of data requires more space. -============ -REQUIREMENTS + +Requirements ============ The use of CacheFiles and its daemon requires the following features to be @@ -79,84 +81,70 @@ It is strongly recommended that the "dir_index" option is enabled on Ext3 filesystems being used as a cache. 
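As a hedged example of satisfying these filesystem requirements on an ext3/ext4 backing store (the device path and mount point are illustrative only)::

	# Illustrative paths; adjust for the actual cache partition.
	tune2fs -O dir_index /dev/sda6
	mount -o user_xattr /dev/sda6 /var/cache/fscache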
-=============
-CONFIGURATION
+Configuration
=============

The cache is configured by a script in /etc/cachefilesd.conf. These commands
set up the cache ready for use. The following script commands are available:

- (*) brun <N>%
- (*) bcull <N>%
- (*) bstop <N>%
- (*) frun <N>%
- (*) fcull <N>%
- (*) fstop <N>%
-
+ brun <N>%, bcull <N>%, bstop <N>%, frun <N>%, fcull <N>%, fstop <N>%
	Configure the culling limits. Optional. See the section on culling.
	The defaults are 7% (run), 5% (cull) and 1% (stop) respectively.

	The commands beginning with a 'b' are file space (block) limits, those
	beginning with an 'f' are file count limits.

- (*) dir <path>
-
+ dir <path>
	Specify the directory containing the root of the cache. Mandatory.

- (*) tag <name>
-
+ tag <name>
	Specify a tag to FS-Cache to use in distinguishing multiple caches.
	Optional. The default is "CacheFiles".

- (*) debug <mask>
-
+ debug <mask>
	Specify a numeric bitmask to control debugging in the kernel module.
	Optional. The default is zero (all off). The following values can be
	OR'd into the mask to collect various information:

+	== =================================================
	1  Turn on trace of function entry (_enter() macros)
	2  Turn on trace of function exit (_leave() macros)
	4  Turn on trace of internal debug points (_debug())
+	== =================================================

-	This mask can also be set through sysfs, eg:
+	This mask can also be set through sysfs, eg::

		echo 5 >/sys/module/cachefiles/parameters/debug

-==================
-STARTING THE CACHE
+Starting the Cache
==================

The cache is started by running the daemon. The daemon opens the cache device,
configures the cache and tells it to begin caching. At that point the cache
binds to fscache and the cache becomes live.

-The daemon is run as follows:
+The daemon is run as follows::

	/sbin/cachefilesd [-d]* [-s] [-n] [-f <configfile>]

The flags are:

- (*) -d
-
+ ``-d``
	Increase the debugging level. This can be specified multiple times and
	is cumulative with itself.

- (*) -s
-
+ ``-s``
	Send messages to stderr instead of syslog.

- (*) -n
-
+ ``-n``
	Don't daemonise and go into background.

- (*) -f <configfile>
-
+ ``-f <configfile>``
	Use an alternative configuration file rather than the default one.

-===============
-THINGS TO AVOID
+Things to Avoid
===============

Do not mount other things within the cache as this will cause problems. The
@@ -179,8 +167,7 @@ Do not chmod files in the cache. The module creates things with minimal
permissions to prevent random users being able to access them directly.

-=============
-CACHE CULLING
+Cache Culling
=============

The cache may need culling occasionally to make space. This involves
@@ -192,27 +179,21 @@ Cache culling is done on the basis of the percentage of blocks and the
percentage of files available in the underlying filesystem. There are six
"limits":

- (*) brun
- (*) frun
-
+ brun, frun
	If the amount of free space and the number of available files in the
	cache rises above both these limits, then culling is turned off.

- (*) bcull
- (*) fcull
-
+ bcull, fcull
	If the amount of available space or the number of available files in
	the cache falls below either of these limits, then culling is started.

- (*) bstop
- (*) fstop
-
+ bstop, fstop
	If the amount of available space or the number of available files in
	the cache falls below either of these limits, then no further
	allocation of disk space or files is permitted until culling has raised
	things above these limits again.
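Drawing the configuration commands above together, a hedged example of what an /etc/cachefilesd.conf might look like; all values are illustrative and respect the ordering constraint stated next::

	# Illustrative configuration only; tune the limits to the filesystem.
	dir /var/cache/fscache
	tag mycache
	brun  10%
	bcull  7%
	bstop  3%
	frun  10%
	fcull  7%
	fstop  3%
	debug 0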
-These must be configured thusly: +These must be configured thusly:: 0 <= bstop < bcull < brun < 100 0 <= fstop < fcull < frun < 100 @@ -226,16 +207,14 @@ started as soon as space is made in the table. Objects will be skipped if their atimes have changed or if the kernel module says it is still using them. -=============== -CACHE STRUCTURE +Cache Structure =============== The CacheFiles module will create two directories in the directory it was given: - (*) cache/ - - (*) graveyard/ + * cache/ + * graveyard/ The active cache objects all reside in the first directory. The CacheFiles kernel module moves any retired or culled objects that it can't simply unlink @@ -261,10 +240,10 @@ If an object has children, then it will be represented as a directory. Immediately in the representative directory are a collection of directories named for hash values of the child object keys with an '@' prepended. Into this directory, if possible, will be placed the representations of the child -objects: +objects:: - INDEX INDEX INDEX DATA FILES - ========= ========== ================================= ================ + /INDEX /INDEX /INDEX /DATA FILES + /=========/==========/=================================/================ cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry @@ -275,7 +254,7 @@ If the key is so long that it exceeds NAME_MAX with the decorations added on to it, then it will be cut into pieces, the first few of which will be used to make a nest of directories, and the last one of which will be the objects inside the last directory. The names of the intermediate directories will have -'+' prepended: +'+' prepended:: J1223/@23/+xy...z/+kl...m/Epqr @@ -288,11 +267,13 @@ To handle this, CacheFiles will use a suitably printable filename directly and "base-64" encode ones that aren't directly suitable. The two versions of object filenames indicate the encoding: + =============== =============== =============== OBJECT TYPE PRINTABLE ENCODED =============== =============== =============== Index "I..." "J..." Data "D..." "E..." Special "S..." "T..." + =============== =============== =============== Intermediate directories are always "@" or "+" as appropriate. @@ -307,8 +288,7 @@ Note that CacheFiles will erase from the cache any file it doesn't recognise or any file of an incorrect type (such as a FIFO file or a device file). -========================== -SECURITY MODEL AND SELINUX +Security Model and SELinux ========================== CacheFiles is implemented to deal properly with the LSM security features of @@ -331,26 +311,26 @@ When the CacheFiles module is asked to bind to its cache, it: (1) Finds the security label attached to the root cache directory and uses that as the security label with which it will create files. By default, - this is: + this is:: cachefiles_var_t (2) Finds the security label of the process which issued the bind request - (presumed to be the cachefilesd daemon), which by default will be: + (presumed to be the cachefilesd daemon), which by default will be:: cachefilesd_t and asks LSM to supply a security ID as which it should act given the - daemon's label. By default, this will be: + daemon's label. By default, this will be:: cachefiles_kernel_t SELinux transitions the daemon's security ID to the module's security ID - based on a rule of this form in the policy. 
+ based on a rule of this form in the policy:: type_transition <daemon's-ID> kernel_t : process <module's-ID>; - For instance: + For instance:: type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t; @@ -370,7 +350,7 @@ There are policy source files available in: http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2 -and later versions. In that tarball, see the files: +and later versions. In that tarball, see the files:: cachefilesd.te cachefilesd.fc @@ -379,7 +359,7 @@ and later versions. In that tarball, see the files: They are built and installed directly by the RPM. If a non-RPM based system is being used, then copy the above files to their own -directory and run: +directory and run:: make -f /usr/share/selinux/devel/Makefile semodule -i cachefilesd.pp @@ -394,7 +374,7 @@ an auxiliary policy must be installed to label the alternate location of the cache. For instructions on how to add an auxiliary policy to enable the cache to be -located elsewhere when SELinux is in enforcing mode, please see: +located elsewhere when SELinux is in enforcing mode, please see:: /usr/share/doc/cachefilesd-*/move-cache.txt @@ -402,8 +382,7 @@ When the cachefilesd rpm is installed; alternatively, the document can be found in the sources. -================== -A NOTE ON SECURITY +A Note on Security ================== CacheFiles makes use of the split security in the task_struct. It allocates @@ -445,17 +424,18 @@ for CacheFiles to run in a context of a specific security label, or to create files and directories with another security label. -======================= -STATISTICAL INFORMATION +Statistical Information ======================= -If FS-Cache is compiled with the following option enabled: +If FS-Cache is compiled with the following option enabled:: CONFIG_CACHEFILES_HISTOGRAM=y then it will gather certain statistics and display them through a proc file. - (*) /proc/fs/cachefiles/histogram + /proc/fs/cachefiles/histogram + + :: cat /proc/fs/cachefiles/histogram JIFS SECS LOOKUPS MKDIRS CREATES @@ -465,36 +445,39 @@ then it will gather certain statistics and display them through a proc file. between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The columns are as follows: + ======= ======================================================= COLUMN TIME MEASUREMENT ======= ======================================================= LOOKUPS Length of time to perform a lookup on the backing fs MKDIRS Length of time to perform a mkdir on the backing fs CREATES Length of time to perform a create on the backing fs + ======= ======================================================= Each row shows the number of events that took a particular range of times. Each step is 1 jiffy in size. The JIFS column indicates the particular jiffy range covered, and the SECS field the equivalent number of seconds. 
-=========
-DEBUGGING
+Debugging
=========

If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime
-debugging enabled by adjusting the value in:
+debugging enabled by adjusting the value in::

	/sys/module/cachefiles/parameters/debug

This is a bitmask of debugging streams to enable:

+	======= ======= =============================== =======================
	BIT	VALUE	STREAM				POINT
	======= ======= =============================== =======================
	0	1	General				Function entry trace
	1	2					Function exit trace
	2	4					General
+	======= ======= =============================== =======================

The appropriate set of values should be OR'd together and the result written to
-the control file. For example:
+the control file. For example::

	echo $((1|2|4)) >/sys/module/cachefiles/parameters/debug

diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst
new file mode 100644
index 000000000000..70de86922b6a
--- /dev/null
+++ b/Documentation/filesystems/caching/fscache.rst
@@ -0,0 +1,565 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+General Filesystem Caching
+==========================
+
+Overview
+========
+
+This facility is a general purpose cache for network filesystems, though it
+could be used for caching other things such as ISO9660 filesystems too.
+
+FS-Cache mediates between cache backends (such as CacheFS) and network
+filesystems::
+
+	+---------+
+	|         |                        +--------------+
+	|   NFS   |--+                     |              |
+	|         |  |                 +-->|   CacheFS    |
+	+---------+  |   +----------+  |   |  /dev/hda5   |
+	             |   |          |  |   +--------------+
+	+---------+  +-->|          |  |
+	|         |      |          |--+
+	|   AFS   |----->| FS-Cache |
+	|         |      |          |--+
+	+---------+  +-->|          |  |
+	             |   |          |  |   +--------------+
+	+---------+  |   +----------+  |   |              |
+	|         |  |                 +-->|  CacheFiles  |
+	|  ISOFS  |--+                     |  /var/cache  |
+	|         |                        +--------------+
+	+---------+
+
+Or to look at it another way, FS-Cache is a module that provides a caching
+facility to a network filesystem such that the cache is transparent to the
+user::
+
+	+---------+
+	|         |
+	| Server  |
+	|         |
+	+---------+
+	     |                  NETWORK
+	~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+	     |
+	     |           +----------+
+	     V           |          |
+	+---------+      |          |
+	|         |      |          |
+	|   NFS   |----->| FS-Cache |
+	|         |      |          |--+
+	+---------+      |          |  |   +--------------+   +--------------+
+	     |           |          |  |   |              |   |              |
+	     V           +----------+  +-->|  CacheFiles  |-->|     Ext3     |
+	+---------+                        |  /var/cache  |   |  /dev/sda6   |
+	|         |                        +--------------+   +--------------+
+	|   VFS   |                               ^                  ^
+	|         |                               |                  |
+	+---------+                        +--------------+          |
+	     |        KERNEL SPACE         |              |          |
+	~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~
+	     |        USER SPACE           |              |
+	     V                             |              |
+	+---------+                        +--------------+
+	|         |                        |              |
+	| Process |                        | cachefilesd  |
+	|         |                        |              |
+	+---------+                        +--------------+
+
+
+FS-Cache does not follow the idea of completely loading every netfs file
+opened in its entirety into a cache before permitting it to be accessed and
+then serving the pages out of that cache rather than the netfs inode because:
+
+ (1) It must be practical to operate without a cache.
+
+ (2) The size of any accessible file must not be limited to the size of the
+     cache.
+
+ (3) The combined size of all opened files (this includes mapped libraries)
+     must not be limited to the size of the cache.
+
+ (4) The user should not be forced to download an entire file just to do a
+     one-off access of a small portion of it (such as might be done with the
+     "file" program).
+
+It instead serves the cache out in PAGE_SIZE chunks as and when requested by
+the netfs('s) using it.
+
+
+FS-Cache provides the following facilities:
+
+ (1) More than one cache can be used at once. Caches can be selected
+     explicitly by use of tags.
+
+ (2) Caches can be added / removed at any time.
+
+ (3) The netfs is provided with an interface that allows either party to
+     withdraw caching facilities from a file (required for (2)).
+
+ (4) The interface to the netfs returns as few errors as possible, preferring
+     rather to let the netfs remain oblivious.
+
+ (5) Cookies are used to represent indices, files and other objects to the
+     netfs. The simplest cookie is just a NULL pointer - indicating nothing
+     cached there.
+
+ (6) The netfs is allowed to propose - dynamically - any index hierarchy it
+     desires, though it must be aware that the index search function is
+     recursive, stack space is limited, and indices can only be children of
+     indices.
+
+ (7) Data I/O is done direct to and from the netfs's pages. The netfs
+     indicates that page A is at index B of the data-file represented by cookie
+     C, and that it should be read or written. The cache backend may or may
+     not start I/O on that page, but if it does, a netfs callback will be
+     invoked to indicate completion. The I/O may be either synchronous or
+     asynchronous.
+
+ (8) Cookies can be "retired" upon release. At this point FS-Cache will mark
+     them as obsolete and the index hierarchy rooted at that point will get
+     recycled.
+
+ (9) The netfs provides a "match" function for index searches. In addition to
+     saying whether a match was made or not, this can also specify that an
+     entry should be updated or deleted.
+
+(10) As much as possible is done asynchronously.
+
+
+FS-Cache maintains a virtual indexing tree in which all indices, files, objects
+and pages are kept. Bits of this tree may actually reside in one or more
+caches::
+
+	                    FSDEF
+	                      |
+	                 +------------------------------------+
+	                 |                                    |
+	                NFS                                  AFS
+	                 |                                    |
+	    +--------------------------+                +-----------+
+	    |                          |                |           |
+	 homedir                     mirror          afs.org   redhat.com
+	    |                          |                            |
+	    +------------+           +---------------+        +----------+
+	    |            |           |               |        |          |
+	  00001        00002       00007           00125    vol00001  vol00002
+	    |            |           |               |                    |
+	+---+---+     +-----+      +---+      +------+------+       +-----+----+
+	|   |   |     |     |      |   |      |      |      |       |     |    |
+	PG0 PG1 PG2   PG0 XATTR   PG0 PG1   DIRENT DIRENT DIRENT   R/W   R/O  Bak
+	          |                  |
+	         PG0             +-------+
+	                         |       |
+	                       00001   00003
+	                                 |
+	                             +---+---+
+	                             |   |   |
+	                            PG0 PG1 PG2
+
+In the example above, you can see two netfs's being backed: NFS and AFS. These
+have different index hierarchies:
+
+ * The NFS primary index contains per-server indices. Each server index is
+   indexed by NFS file handles to get data file objects. Each data file
+   object can have an array of pages, but may also have further child
+   objects, such as extended attributes and directory entries. Extended
+   attribute objects themselves have page-array contents.
+
+ * The AFS primary index contains per-cell indices. Each cell index contains
+   per-logical-volume indices. Each volume index contains up to three
+   indices for the read-write, read-only and backup mirrors of those volumes.
+   Each of these contains vnode data file objects, each of which contains an
+   array of pages.
+
+The very top index is the FS-Cache master index in which individual netfs's
+have entries.
+
+Any index object may reside in more than one cache, provided it only has index
+children. Any index with non-index object children will be assumed to only
+reside in one cache.
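Purely as a hedged illustration of how a netfs names the levels of such a hierarchy, cookie definitions for the NFS example might be shaped as below; the names are hypothetical and the authoritative structure is in the netfs API document referenced next::

	/* Hedged sketch: per-level cookie definitions; all names are
	 * hypothetical.  Indices may only parent other cookies; data-file
	 * cookies are the ones that carry pages. */
	static const struct fscache_cookie_def mynfs_server_index_def = {
		.name	= "MYNFS.server",
		.type	= FSCACHE_COOKIE_TYPE_INDEX,
	};

	static const struct fscache_cookie_def mynfs_file_def = {
		.name		= "MYNFS.fh",
		.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
		.check_aux	= mynfs_check_aux,	/* coherency check */
	};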
+ + +The netfs API to FS-Cache can be found in: + + Documentation/filesystems/caching/netfs-api.rst + +The cache backend API to FS-Cache can be found in: + + Documentation/filesystems/caching/backend-api.rst + +A description of the internal representations and object state machine can be +found in: + + Documentation/filesystems/caching/object.rst + + +Statistical Information +======================= + +If FS-Cache is compiled with the following options enabled:: + + CONFIG_FSCACHE_STATS=y + CONFIG_FSCACHE_HISTOGRAM=y + +then it will gather certain statistics and display them through a number of +proc files. + +/proc/fs/fscache/stats +---------------------- + + This shows counts of a number of events that can happen in FS-Cache: + ++--------------+-------+-------------------------------------------------------+ +|CLASS |EVENT |MEANING | ++==============+=======+=======================================================+ +|Cookies |idx=N |Number of index cookies allocated | ++ +-------+-------------------------------------------------------+ +| |dat=N |Number of data storage cookies allocated | ++ +-------+-------------------------------------------------------+ +| |spc=N |Number of special cookies allocated | ++--------------+-------+-------------------------------------------------------+ +|Objects |alc=N |Number of objects allocated | ++ +-------+-------------------------------------------------------+ +| |nal=N |Number of object allocation failures | ++ +-------+-------------------------------------------------------+ +| |avl=N |Number of objects that reached the available state | ++ +-------+-------------------------------------------------------+ +| |ded=N |Number of objects that reached the dead state | ++--------------+-------+-------------------------------------------------------+ +|ChkAux |non=N |Number of objects that didn't have a coherency check | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of objects that passed a coherency check | ++ +-------+-------------------------------------------------------+ +| |upd=N |Number of objects that needed a coherency data update | ++ +-------+-------------------------------------------------------+ +| |obs=N |Number of objects that were declared obsolete | ++--------------+-------+-------------------------------------------------------+ +|Pages |mrk=N |Number of pages marked as being cached | +| |unc=N |Number of uncache page requests seen | ++--------------+-------+-------------------------------------------------------+ +|Acquire |n=N |Number of acquire cookie requests seen | ++ +-------+-------------------------------------------------------+ +| |nul=N |Number of acq reqs given a NULL parent | ++ +-------+-------------------------------------------------------+ +| |noc=N |Number of acq reqs rejected due to no cache available | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of acq reqs succeeded | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of acq reqs rejected due to error | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of acq reqs failed on ENOMEM | ++--------------+-------+-------------------------------------------------------+ +|Lookups |n=N |Number of lookup calls made on cache backends | ++ +-------+-------------------------------------------------------+ +| |neg=N |Number of negative lookups made | ++ +-------+-------------------------------------------------------+ +| 
|pos=N |Number of positive lookups made | ++ +-------+-------------------------------------------------------+ +| |crt=N |Number of objects created by lookup | ++ +-------+-------------------------------------------------------+ +| |tmo=N |Number of lookups timed out and requeued | ++--------------+-------+-------------------------------------------------------+ +|Updates |n=N |Number of update cookie requests seen | ++ +-------+-------------------------------------------------------+ +| |nul=N |Number of upd reqs given a NULL parent | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of upd reqs granted CPU time | ++--------------+-------+-------------------------------------------------------+ +|Relinqs |n=N |Number of relinquish cookie requests seen | ++ +-------+-------------------------------------------------------+ +| |nul=N |Number of rlq reqs given a NULL parent | ++ +-------+-------------------------------------------------------+ +| |wcr=N |Number of rlq reqs waited on completion of creation | ++--------------+-------+-------------------------------------------------------+ +|AttrChg |n=N |Number of attribute changed requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of attr changed requests queued | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of attr changed rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of attr changed failed -ENOMEM | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of attr changed ops given CPU time | ++--------------+-------+-------------------------------------------------------+ +|Allocs |n=N |Number of allocation requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of successful alloc reqs | ++ +-------+-------------------------------------------------------+ +| |wt=N |Number of alloc reqs that waited on lookup completion | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of alloc reqs rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |int=N |Number of alloc reqs aborted -ERESTARTSYS | ++ +-------+-------------------------------------------------------+ +| |ops=N |Number of alloc reqs submitted | ++ +-------+-------------------------------------------------------+ +| |owt=N |Number of alloc reqs waited for CPU time | ++ +-------+-------------------------------------------------------+ +| |abt=N |Number of alloc reqs aborted due to object death | ++--------------+-------+-------------------------------------------------------+ +|Retrvls |n=N |Number of retrieval (read) requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of successful retr reqs | ++ +-------+-------------------------------------------------------+ +| |wt=N |Number of retr reqs that waited on lookup completion | ++ +-------+-------------------------------------------------------+ +| |nod=N |Number of retr reqs returned -ENODATA | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of retr reqs rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |int=N |Number of retr reqs aborted -ERESTARTSYS | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of retr reqs failed -ENOMEM | ++ 
+-------+-------------------------------------------------------+ +| |ops=N |Number of retr reqs submitted | ++ +-------+-------------------------------------------------------+ +| |owt=N |Number of retr reqs waited for CPU time | ++ +-------+-------------------------------------------------------+ +| |abt=N |Number of retr reqs aborted due to object death | ++--------------+-------+-------------------------------------------------------+ +|Stores |n=N |Number of storage (write) requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of successful store reqs | ++ +-------+-------------------------------------------------------+ +| |agn=N |Number of store reqs on a page already pending storage | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of store reqs rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of store reqs failed -ENOMEM | ++ +-------+-------------------------------------------------------+ +| |ops=N |Number of store reqs submitted | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of store reqs granted CPU time | ++ +-------+-------------------------------------------------------+ +| |pgs=N |Number of pages given store req processing time | ++ +-------+-------------------------------------------------------+ +| |rxd=N |Number of store reqs deleted from tracking tree | ++ +-------+-------------------------------------------------------+ +| |olm=N |Number of store reqs over store limit | ++--------------+-------+-------------------------------------------------------+ +|VmScan |nos=N |Number of release reqs against pages with no | +| | |pending store | ++ +-------+-------------------------------------------------------+ +| |gon=N |Number of release reqs against pages stored by | +| | |time lock granted | ++ +-------+-------------------------------------------------------+ +| |bsy=N |Number of release reqs ignored due to in-progress store| ++ +-------+-------------------------------------------------------+ +| |can=N |Number of page stores cancelled due to release req | ++--------------+-------+-------------------------------------------------------+ +|Ops |pend=N |Number of times async ops added to pending queues | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of times async ops given CPU time | ++ +-------+-------------------------------------------------------+ +| |enq=N |Number of times async ops queued for processing | ++ +-------+-------------------------------------------------------+ +| |can=N |Number of async ops cancelled | ++ +-------+-------------------------------------------------------+ +| |rej=N |Number of async ops rejected due to object | +| | |lookup/create failure | ++ +-------+-------------------------------------------------------+ +| |ini=N |Number of async ops initialised | ++ +-------+-------------------------------------------------------+ +| |dfr=N |Number of async ops queued for deferred release | ++ +-------+-------------------------------------------------------+ +| |rel=N |Number of async ops released | +| | |(should equal ini=N when idle) | ++ +-------+-------------------------------------------------------+ +| |gc=N |Number of deferred-release async ops garbage collected | ++--------------+-------+-------------------------------------------------------+ +|CacheOp |alo=N |Number of in-progress alloc_object() cache ops | ++ 
+-------+-------------------------------------------------------+ +| |luo=N |Number of in-progress lookup_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |luc=N |Number of in-progress lookup_complete() cache ops | ++ +-------+-------------------------------------------------------+ +| |gro=N |Number of in-progress grab_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |upo=N |Number of in-progress update_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |dro=N |Number of in-progress drop_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |pto=N |Number of in-progress put_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |syn=N |Number of in-progress sync_cache() cache ops | ++ +-------+-------------------------------------------------------+ +| |atc=N |Number of in-progress attr_changed() cache ops | ++ +-------+-------------------------------------------------------+ +| |rap=N |Number of in-progress read_or_alloc_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |ras=N |Number of in-progress read_or_alloc_pages() cache ops | ++ +-------+-------------------------------------------------------+ +| |alp=N |Number of in-progress allocate_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |als=N |Number of in-progress allocate_pages() cache ops | ++ +-------+-------------------------------------------------------+ +| |wrp=N |Number of in-progress write_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |ucp=N |Number of in-progress uncache_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |dsp=N |Number of in-progress dissociate_pages() cache ops | ++--------------+-------+-------------------------------------------------------+ +|CacheEv |nsp=N |Number of object lookups/creations rejected due to | +| | |lack of space | ++ +-------+-------------------------------------------------------+ +| |stl=N |Number of stale objects deleted | ++ +-------+-------------------------------------------------------+ +| |rtr=N |Number of objects retired when relinquished | ++ +-------+-------------------------------------------------------+ +| |cul=N |Number of objects culled | ++--------------+-------+-------------------------------------------------------+ + + + +/proc/fs/fscache/histogram +-------------------------- + + :: + + cat /proc/fs/fscache/histogram + JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS + ===== ===== ========= ========= ========= ========= ========= + + This shows the breakdown of the number of times each amount of time + between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. 
The
+	columns are as follows:
+
+	=========	=======================================================
+	COLUMN		TIME MEASUREMENT
+	=========	=======================================================
+	OBJ INST	Length of time to instantiate an object
+	OP RUNS		Length of time a call to process an operation took
+	OBJ RUNS	Length of time a call to process an object event took
+	RETRV DLY	Time between requesting a read and lookup completing
+	RETRIEVLS	Time between beginning and end of a retrieval
+	=========	=======================================================
+
+	Each row shows the number of events that took a particular range of times.
+	Each step is 1 jiffy in size. The JIFS column indicates the particular
+	jiffy range covered, and the SECS field the equivalent number of seconds.
+
+
+
+Object List
+===========
+
+If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
+list of all the objects currently allocated and allow them to be viewed
+through::
+
+	/proc/fs/fscache/objects
+
+This will look something like::
+
+	[root@andromeda ~]# head /proc/fs/fscache/objects
+	OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA       OBJECT_KEY, AUX_DATA
+	======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
+	   17e4b        2 ACTV     0   0   0   0  0     0 7b  4 0 0 | NFS.fh           DT  0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+	   1693a        2 ACTV     0   0   0   0  0     0 7b  4 0 0 | NFS.fh           DT  0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+
+where the first set of columns before the '|' describe the object:
+
+	======= ===============================================================
+	COLUMN	DESCRIPTION
+	======= ===============================================================
+	OBJECT	Object debugging ID (appears as OBJ%x in some debug messages)
+	PARENT	Debugging ID of parent object
+	STAT	Object state
+	CHLDN	Number of child objects of this object
+	OPS	Number of outstanding operations on this object
+	OOP	Number of outstanding child object management operations
+	IPR
+	EX	Number of outstanding exclusive operations
+	READS	Number of outstanding read operations
+	EM	Object's event mask
+	EV	Events raised on this object
+	F	Object flags
+	S	Object work item busy state mask (1:pending 2:running)
+	======= ===============================================================
+
+and the second set of columns describe the object's cookie, if present:
+
+	================ ======================================================
+	COLUMN		 DESCRIPTION
+	================ ======================================================
+	NETFS_COOKIE_DEF Name of netfs cookie definition
+	TY		 Cookie type (IX - index, DT - data, hex - special)
+	FL		 Cookie flags
+	NETFS_DATA	 Netfs private data stored in the cookie
+	OBJECT_KEY	 Object key	 } 1 column, with separating comma
+	AUX_DATA	 Object aux data } presence may be configured
+	================ ======================================================
+
+The data shown may be filtered by attaching a key to an appropriate keyring
+before viewing the file.
Something like::
+
+	keyctl add user fscache:objlist <restrictions> @s
+
+where <restrictions> are a selection of the following letters:
+
+	== =========================================================
+	K  Show hexdump of object key (don't show if not given)
+	A  Show hexdump of object aux data (don't show if not given)
+	== =========================================================
+
+and the following paired letters:
+
+	== =========================================================
+	C  Show objects that have a cookie
+	c  Show objects that don't have a cookie
+	B  Show objects that are busy
+	b  Show objects that aren't busy
+	W  Show objects that have pending writes
+	w  Show objects that don't have pending writes
+	R  Show objects that have outstanding reads
+	r  Show objects that don't have outstanding reads
+	S  Show objects that have work queued
+	s  Show objects that don't have work queued
+	== =========================================================
+
+If neither side of a letter pair is given, then both are implied. For example::
+
+	keyctl add user fscache:objlist KB @s
+
+shows objects that are busy, and lists their object keys, but does not dump
+their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is
+not implied.
+
+By default all objects and all fields will be shown.
+
+
+Debugging
+=========
+
+If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime
+debugging enabled by adjusting the value in::
+
+	/sys/module/fscache/parameters/debug
+
+This is a bitmask of debugging streams to enable:
+
+	======= ======= =============================== =======================
+	BIT	VALUE	STREAM				POINT
+	======= ======= =============================== =======================
+	0	1	Cache management		Function entry trace
+	1	2					Function exit trace
+	2	4					General
+	3	8	Cookie management		Function entry trace
+	4	16					Function exit trace
+	5	32					General
+	6	64	Page handling			Function entry trace
+	7	128					Function exit trace
+	8	256					General
+	9	512	Operation management		Function entry trace
+	10	1024					Function exit trace
+	11	2048					General
+	======= ======= =============================== =======================
+
+The appropriate set of values should be OR'd together and the result written to
+the control file. For example::
+
+	echo $((1|8|64|512)) >/sys/module/fscache/parameters/debug
+
+will turn on all function entry debugging.
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
deleted file mode 100644
index 50f0a5757f48..000000000000
--- a/Documentation/filesystems/caching/fscache.txt
+++ /dev/null
@@ -1,448 +0,0 @@
-	==========================
-	General Filesystem Caching
-	==========================
-
-========
-OVERVIEW
-========
-
-This facility is a general purpose cache for network filesystems, though it
-could be used for caching other things such as ISO9660 filesystems too.
- -FS-Cache mediates between cache backends (such as CacheFS) and network -filesystems: - - +---------+ - | | +--------------+ - | NFS |--+ | | - | | | +-->| CacheFS | - +---------+ | +----------+ | | /dev/hda5 | - | | | | +--------------+ - +---------+ +-->| | | - | | | |--+ - | AFS |----->| FS-Cache | - | | | |--+ - +---------+ +-->| | | - | | | | +--------------+ - +---------+ | +----------+ | | | - | | | +-->| CacheFiles | - | ISOFS |--+ | /var/cache | - | | +--------------+ - +---------+ - -Or to look at it another way, FS-Cache is a module that provides a caching -facility to a network filesystem such that the cache is transparent to the -user: - - +---------+ - | | - | Server | - | | - +---------+ - | NETWORK - ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - | - | +----------+ - V | | - +---------+ | | - | | | | - | NFS |----->| FS-Cache | - | | | |--+ - +---------+ | | | +--------------+ +--------------+ - | | | | | | | | - V +----------+ +-->| CacheFiles |-->| Ext3 | - +---------+ | /var/cache | | /dev/sda6 | - | | +--------------+ +--------------+ - | VFS | ^ ^ - | | | | - +---------+ +--------------+ | - | KERNEL SPACE | | - ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~ - | USER SPACE | | - V | | - +---------+ +--------------+ - | | | | - | Process | | cachefilesd | - | | | | - +---------+ +--------------+ - - -FS-Cache does not follow the idea of completely loading every netfs file -opened in its entirety into a cache before permitting it to be accessed and -then serving the pages out of that cache rather than the netfs inode because: - - (1) It must be practical to operate without a cache. - - (2) The size of any accessible file must not be limited to the size of the - cache. - - (3) The combined size of all opened files (this includes mapped libraries) - must not be limited to the size of the cache. - - (4) The user should not be forced to download an entire file just to do a - one-off access of a small portion of it (such as might be done with the - "file" program). - -It instead serves the cache out in PAGE_SIZE chunks as and when requested by -the netfs('s) using it. - - -FS-Cache provides the following facilities: - - (1) More than one cache can be used at once. Caches can be selected - explicitly by use of tags. - - (2) Caches can be added / removed at any time. - - (3) The netfs is provided with an interface that allows either party to - withdraw caching facilities from a file (required for (2)). - - (4) The interface to the netfs returns as few errors as possible, preferring - rather to let the netfs remain oblivious. - - (5) Cookies are used to represent indices, files and other objects to the - netfs. The simplest cookie is just a NULL pointer - indicating nothing - cached there. - - (6) The netfs is allowed to propose - dynamically - any index hierarchy it - desires, though it must be aware that the index search function is - recursive, stack space is limited, and indices can only be children of - indices. - - (7) Data I/O is done direct to and from the netfs's pages. The netfs - indicates that page A is at index B of the data-file represented by cookie - C, and that it should be read or written. The cache backend may or may - not start I/O on that page, but if it does, a netfs callback will be - invoked to indicate completion. The I/O may be either synchronous or - asynchronous. - - (8) Cookies can be "retired" upon release. 
At this point FS-Cache will mark - them as obsolete and the index hierarchy rooted at that point will get - recycled. - - (9) The netfs provides a "match" function for index searches. In addition to - saying whether a match was made or not, this can also specify that an - entry should be updated or deleted. - -(10) As much as possible is done asynchronously. - - -FS-Cache maintains a virtual indexing tree in which all indices, files, objects -and pages are kept. Bits of this tree may actually reside in one or more -caches. - - FSDEF - | - +------------------------------------+ - | | - NFS AFS - | | - +--------------------------+ +-----------+ - | | | | - homedir mirror afs.org redhat.com - | | | - +------------+ +---------------+ +----------+ - | | | | | | - 00001 00002 00007 00125 vol00001 vol00002 - | | | | | - +---+---+ +-----+ +---+ +------+------+ +-----+----+ - | | | | | | | | | | | | | -PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak - | | - PG0 +-------+ - | | - 00001 00003 - | - +---+---+ - | | | - PG0 PG1 PG2 - -In the example above, you can see two netfs's being backed: NFS and AFS. These -have different index hierarchies: - - (*) The NFS primary index contains per-server indices. Each server index is - indexed by NFS file handles to get data file objects. Each data file - objects can have an array of pages, but may also have further child - objects, such as extended attributes and directory entries. Extended - attribute objects themselves have page-array contents. - - (*) The AFS primary index contains per-cell indices. Each cell index contains - per-logical-volume indices. Each of volume index contains up to three - indices for the read-write, read-only and backup mirrors of those volumes. - Each of these contains vnode data file objects, each of which contains an - array of pages. - -The very top index is the FS-Cache master index in which individual netfs's -have entries. - -Any index object may reside in more than one cache, provided it only has index -children. Any index with non-index object children will be assumed to only -reside in one cache. - - -The netfs API to FS-Cache can be found in: - - Documentation/filesystems/caching/netfs-api.txt - -The cache backend API to FS-Cache can be found in: - - Documentation/filesystems/caching/backend-api.txt - -A description of the internal representations and object state machine can be -found in: - - Documentation/filesystems/caching/object.txt - - -======================= -STATISTICAL INFORMATION -======================= - -If FS-Cache is compiled with the following options enabled: - - CONFIG_FSCACHE_STATS=y - CONFIG_FSCACHE_HISTOGRAM=y - -then it will gather certain statistics and display them through a number of -proc files. 
- - (*) /proc/fs/fscache/stats - - This shows counts of a number of events that can happen in FS-Cache: - - CLASS EVENT MEANING - ======= ======= ======================================================= - Cookies idx=N Number of index cookies allocated - dat=N Number of data storage cookies allocated - spc=N Number of special cookies allocated - Objects alc=N Number of objects allocated - nal=N Number of object allocation failures - avl=N Number of objects that reached the available state - ded=N Number of objects that reached the dead state - ChkAux non=N Number of objects that didn't have a coherency check - ok=N Number of objects that passed a coherency check - upd=N Number of objects that needed a coherency data update - obs=N Number of objects that were declared obsolete - Pages mrk=N Number of pages marked as being cached - unc=N Number of uncache page requests seen - Acquire n=N Number of acquire cookie requests seen - nul=N Number of acq reqs given a NULL parent - noc=N Number of acq reqs rejected due to no cache available - ok=N Number of acq reqs succeeded - nbf=N Number of acq reqs rejected due to error - oom=N Number of acq reqs failed on ENOMEM - Lookups n=N Number of lookup calls made on cache backends - neg=N Number of negative lookups made - pos=N Number of positive lookups made - crt=N Number of objects created by lookup - tmo=N Number of lookups timed out and requeued - Updates n=N Number of update cookie requests seen - nul=N Number of upd reqs given a NULL parent - run=N Number of upd reqs granted CPU time - Relinqs n=N Number of relinquish cookie requests seen - nul=N Number of rlq reqs given a NULL parent - wcr=N Number of rlq reqs waited on completion of creation - AttrChg n=N Number of attribute changed requests seen - ok=N Number of attr changed requests queued - nbf=N Number of attr changed rejected -ENOBUFS - oom=N Number of attr changed failed -ENOMEM - run=N Number of attr changed ops given CPU time - Allocs n=N Number of allocation requests seen - ok=N Number of successful alloc reqs - wt=N Number of alloc reqs that waited on lookup completion - nbf=N Number of alloc reqs rejected -ENOBUFS - int=N Number of alloc reqs aborted -ERESTARTSYS - ops=N Number of alloc reqs submitted - owt=N Number of alloc reqs waited for CPU time - abt=N Number of alloc reqs aborted due to object death - Retrvls n=N Number of retrieval (read) requests seen - ok=N Number of successful retr reqs - wt=N Number of retr reqs that waited on lookup completion - nod=N Number of retr reqs returned -ENODATA - nbf=N Number of retr reqs rejected -ENOBUFS - int=N Number of retr reqs aborted -ERESTARTSYS - oom=N Number of retr reqs failed -ENOMEM - ops=N Number of retr reqs submitted - owt=N Number of retr reqs waited for CPU time - abt=N Number of retr reqs aborted due to object death - Stores n=N Number of storage (write) requests seen - ok=N Number of successful store reqs - agn=N Number of store reqs on a page already pending storage - nbf=N Number of store reqs rejected -ENOBUFS - oom=N Number of store reqs failed -ENOMEM - ops=N Number of store reqs submitted - run=N Number of store reqs granted CPU time - pgs=N Number of pages given store req processing time - rxd=N Number of store reqs deleted from tracking tree - olm=N Number of store reqs over store limit - VmScan nos=N Number of release reqs against pages with no pending store - gon=N Number of release reqs against pages stored by time lock granted - bsy=N Number of release reqs ignored due to in-progress store - can=N Number of page 
stores cancelled due to release req - Ops pend=N Number of times async ops added to pending queues - run=N Number of times async ops given CPU time - enq=N Number of times async ops queued for processing - can=N Number of async ops cancelled - rej=N Number of async ops rejected due to object lookup/create failure - ini=N Number of async ops initialised - dfr=N Number of async ops queued for deferred release - rel=N Number of async ops released (should equal ini=N when idle) - gc=N Number of deferred-release async ops garbage collected - CacheOp alo=N Number of in-progress alloc_object() cache ops - luo=N Number of in-progress lookup_object() cache ops - luc=N Number of in-progress lookup_complete() cache ops - gro=N Number of in-progress grab_object() cache ops - upo=N Number of in-progress update_object() cache ops - dro=N Number of in-progress drop_object() cache ops - pto=N Number of in-progress put_object() cache ops - syn=N Number of in-progress sync_cache() cache ops - atc=N Number of in-progress attr_changed() cache ops - rap=N Number of in-progress read_or_alloc_page() cache ops - ras=N Number of in-progress read_or_alloc_pages() cache ops - alp=N Number of in-progress allocate_page() cache ops - als=N Number of in-progress allocate_pages() cache ops - wrp=N Number of in-progress write_page() cache ops - ucp=N Number of in-progress uncache_page() cache ops - dsp=N Number of in-progress dissociate_pages() cache ops - CacheEv nsp=N Number of object lookups/creations rejected due to lack of space - stl=N Number of stale objects deleted - rtr=N Number of objects retired when relinquished - cul=N Number of objects culled - - - (*) /proc/fs/fscache/histogram - - cat /proc/fs/fscache/histogram - JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS - ===== ===== ========= ========= ========= ========= ========= - - This shows the breakdown of the number of times each amount of time - between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The - columns are as follows: - - COLUMN TIME MEASUREMENT - ======= ======================================================= - OBJ INST Length of time to instantiate an object - OP RUNS Length of time a call to process an operation took - OBJ RUNS Length of time a call to process an object event took - RETRV DLY Time between an requesting a read and lookup completing - RETRIEVLS Time between beginning and end of a retrieval - - Each row shows the number of events that took a particular range of times. - Each step is 1 jiffy in size. The JIFS column indicates the particular - jiffy range covered, and the SECS field the equivalent number of seconds. 
- - -=========== -OBJECT LIST -=========== - -If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a -list of all the objects currently allocated and allow them to be viewed -through: - - /proc/fs/fscache/objects - -This will look something like: - - [root@andromeda ~]# head /proc/fs/fscache/objects - OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA - ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ - 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a - 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a - -where the first set of columns before the '|' describe the object: - - COLUMN DESCRIPTION - ======= =============================================================== - OBJECT Object debugging ID (appears as OBJ%x in some debug messages) - PARENT Debugging ID of parent object - STAT Object state - CHLDN Number of child objects of this object - OPS Number of outstanding operations on this object - OOP Number of outstanding child object management operations - IPR - EX Number of outstanding exclusive operations - READS Number of outstanding read operations - EM Object's event mask - EV Events raised on this object - F Object flags - S Object work item busy state mask (1:pending 2:running) - -and the second set of columns describe the object's cookie, if present: - - COLUMN DESCRIPTION - =============== ======================================================= - NETFS_COOKIE_DEF Name of netfs cookie definition - TY Cookie type (IX - index, DT - data, hex - special) - FL Cookie flags - NETFS_DATA Netfs private data stored in the cookie - OBJECT_KEY Object key } 1 column, with separating comma - AUX_DATA Object aux data } presence may be configured - -The data shown may be filtered by attaching the a key to an appropriate keyring -before viewing the file. Something like: - - keyctl add user fscache:objlist <restrictions> @s - -where <restrictions> are a selection of the following letters: - - K Show hexdump of object key (don't show if not given) - A Show hexdump of object aux data (don't show if not given) - -and the following paired letters: - - C Show objects that have a cookie - c Show objects that don't have a cookie - B Show objects that are busy - b Show objects that aren't busy - W Show objects that have pending writes - w Show objects that don't have pending writes - R Show objects that have outstanding reads - r Show objects that don't have outstanding reads - S Show objects that have work queued - s Show objects that don't have work queued - -If neither side of a letter pair is given, then both are implied. For example: - - keyctl add user fscache:objlist KB @s - -shows objects that are busy, and lists their object keys, but does not dump -their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is -not implied. - -By default all objects and all fields will be shown. 
- - -========= -DEBUGGING -========= - -If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime -debugging enabled by adjusting the value in: - - /sys/module/fscache/parameters/debug - -This is a bitmask of debugging streams to enable: - - BIT VALUE STREAM POINT - ======= ======= =============================== ======================= - 0 1 Cache management Function entry trace - 1 2 Function exit trace - 2 4 General - 3 8 Cookie management Function entry trace - 4 16 Function exit trace - 5 32 General - 6 64 Page handling Function entry trace - 7 128 Function exit trace - 8 256 General - 9 512 Operation management Function entry trace - 10 1024 Function exit trace - 11 2048 General - -The appropriate set of values should be OR'd together and the result written to -the control file. For example: - - echo $((1|8|64)) >/sys/module/fscache/parameters/debug - -will turn on all function entry debugging. diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst new file mode 100644 index 000000000000..033da7ac7c6e --- /dev/null +++ b/Documentation/filesystems/caching/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Filesystem Caching +================== + +.. toctree:: + :maxdepth: 2 + + fscache + object + backend-api + cachefiles + netfs-api + operations diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.rst index ba968e8f5704..d9f14b8610ba 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.rst @@ -1,6 +1,8 @@ - =============================== - FS-CACHE NETWORK FILESYSTEM API - =============================== +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +FS-Cache Network Filesystem API +=============================== There's an API by which a network filesystem can make use of the FS-Cache facilities. This is based around a number of principles: @@ -19,7 +21,7 @@ facilities. This is based around a number of principles: This API is declared in <linux/fscache.h>. -This document contains the following sections: +.. This document contains the following sections: (1) Network filesystem definition (2) Index definition @@ -41,12 +43,11 @@ This document contains the following sections: (18) FS-Cache specific page flags. -============================= -NETWORK FILESYSTEM DEFINITION +Network Filesystem Definition ============================= FS-Cache needs a description of the network filesystem. This is specified -using a record of the following structure: +using a record of the following structure:: struct fscache_netfs { uint32_t version; @@ -71,7 +72,7 @@ The fields are: another parameter passed into the registration function. For example, kAFS (linux/fs/afs/) uses the following definitions to describe -itself: +itself:: struct fscache_netfs afs_cache_netfs = { .version = 0, @@ -79,8 +80,7 @@ itself: }; -================ -INDEX DEFINITION +Index Definition ================ Indices are used for two purposes: @@ -114,11 +114,10 @@ There are some limits on indices: function is recursive. Too many layers will run the kernel out of stack. 
-================= -OBJECT DEFINITION +Object Definition ================= -To define an object, a structure of the following type should be filled out: +To define an object, a structure of the following type should be filled out:: struct fscache_cookie_def { @@ -149,16 +148,13 @@ This has the following fields: This is one of the following values: - (*) FSCACHE_COOKIE_TYPE_INDEX - + FSCACHE_COOKIE_TYPE_INDEX This defines an index, which is a special FS-Cache type. - (*) FSCACHE_COOKIE_TYPE_DATAFILE - + FSCACHE_COOKIE_TYPE_DATAFILE This defines an ordinary data file. - (*) Any other value between 2 and 255 - + Any other value between 2 and 255 This defines an extraordinary object such as an XATTR. (2) The name of the object type (NUL terminated unless all 16 chars are used) @@ -192,9 +188,14 @@ This has the following fields: If present, the function should return one of the following values: - (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is - (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update - (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted + FSCACHE_CHECKAUX_OKAY + - the entry is okay as is + + FSCACHE_CHECKAUX_NEEDS_UPDATE + - the entry requires update + + FSCACHE_CHECKAUX_OBSOLETE + - the entry should be deleted This function can also be used to extract data from the auxiliary data in the cache and copy it into the netfs's structures. @@ -236,32 +237,30 @@ This has the following fields: This function is not required for indices as they're not permitted data. -=================================== -NETWORK FILESYSTEM (UN)REGISTRATION +Network Filesystem (Un)registration =================================== The first step is to declare the network filesystem to the cache. This also involves specifying the layout of the primary index (for AFS, this would be the "cell" level). -The registration function is: +The registration function is:: int fscache_register_netfs(struct fscache_netfs *netfs); It just takes a pointer to the netfs definition. It returns 0 or an error as appropriate. -For kAFS, registration is done as follows: +For kAFS, registration is done as follows:: ret = fscache_register_netfs(&afs_cache_netfs); -The last step is, of course, unregistration: +The last step is, of course, unregistration:: void fscache_unregister_netfs(struct fscache_netfs *netfs); -================ -CACHE TAG LOOKUP +Cache Tag Lookup ================ FS-Cache permits the use of more than one cache. To permit particular index @@ -270,7 +269,7 @@ representation tags. This step is optional; it can be left entirely up to FS-Cache as to which cache should be used. The problem with doing that is that FS-Cache will always pick the first cache that was registered. -To get the representation for a named tag: +To get the representation for a named tag:: struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); @@ -278,7 +277,7 @@ This takes a text string as the name and returns a representation of a tag. It will never return an error. It may return a dummy tag, however, if it runs out of memory; this will inhibit caching with this tag. -Any representation so obtained must be released by passing it to this function: +Any representation so obtained must be released by passing it to this function:: void fscache_release_cache_tag(struct fscache_cache_tag *tag); @@ -286,13 +285,12 @@ The tag will be retrieved by FS-Cache when it calls the object definition operation select_cache(). 
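Before moving on to registration, it may help to see the definitions above tied
together. The following is a minimal, illustrative sketch of a data-file cookie
definition for a hypothetical netfs; the ``myfs_*`` names and the
``data_version`` coherency datum are invented for the example, and the exact
check_aux prototype should be taken from the <linux/fscache.h> of the kernel in
use::

	#include <linux/fscache.h>
	#include <linux/string.h>

	/* Hypothetical per-inode state carrying a server version number */
	struct myfs_vnode {
		u64 data_version;
	};

	static enum fscache_checkaux myfs_check_aux(void *cookie_netfs_data,
						    const void *data,
						    uint16_t datalen,
						    loff_t object_size)
	{
		struct myfs_vnode *vnode = cookie_netfs_data;
		u64 stored;

		if (datalen != sizeof(stored))
			return FSCACHE_CHECKAUX_OBSOLETE;

		memcpy(&stored, data, sizeof(stored));

		/* Discard the cached copy if the server version moved on */
		return stored == vnode->data_version ?
			FSCACHE_CHECKAUX_OKAY : FSCACHE_CHECKAUX_OBSOLETE;
	}

	static const struct fscache_cookie_def myfs_file_cache_def = {
		.name      = "myfs.file",
		.type      = FSCACHE_COOKIE_TYPE_DATAFILE,
		.check_aux = myfs_check_aux,
	};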
-==================
-INDEX REGISTRATION
+Index Registration
 ==================

 The third step is to inform FS-Cache about part of an index hierarchy that can
 be used to locate files. This is done by requesting a cookie for each index in
-the path to the file:
+the path to the file::

	struct fscache_cookie *
	fscache_acquire_cookie(struct fscache_cookie *parent,
@@ -339,7 +337,7 @@ must be enabled to do anything with it. A disabled cookie can be enabled by
calling fscache_enable_cookie() (see below).

 For example, with AFS, a cell would be added to the primary index. This index
-entry would have a dependent inode containing volume mappings within this cell:
+entry would have a dependent inode containing volume mappings within this cell::

	cell->cache =
		fscache_acquire_cookie(afs_cache_netfs.primary_index,
@@ -349,7 +347,7 @@ entry would have a dependent inode containing volume mappings within this cell:
				       cell, 0, true);

 And then a particular volume could be added to that index by ID, creating
-another index for vnodes (AFS inode equivalents):
+another index for vnodes (AFS inode equivalents)::

	volume->cache =
		fscache_acquire_cookie(volume->cell->cache,
@@ -359,13 +357,12 @@ another index for vnodes (AFS inode equivalents):
				       volume, 0, true);

-======================
-DATA FILE REGISTRATION
+Data File Registration
 ======================

 The fourth step is to request a data file be created in the cache. This is
 identical to index cookie acquisition. The only difference is that the type in
-the object definition should be something other than index type.
+the object definition should be something other than index type::

	vnode->cache =
		fscache_acquire_cookie(volume->cache,
@@ -375,15 +372,14 @@ the object definition should be something other than index type.
				       vnode, vnode->status.size, true);

-=================================
-MISCELLANEOUS OBJECT REGISTRATION
+Miscellaneous Object Registration
 =================================

 An optional step is to request an object of miscellaneous type be created in
 the cache. This is almost identical to index cookie acquisition. The only
 difference is that the type in the object definition should be something other
 than index type. While the parent object could be an index, it's more likely
-it would be some other type of object such as a data file.
+it would be some other type of object such as a data file::

	xattr->cache =
		fscache_acquire_cookie(vnode->cache,
@@ -396,13 +392,12 @@ Miscellaneous objects might be used to store extended attributes or directory
 entries for example.

-==========================
-SETTING THE DATA FILE SIZE
+Setting the Data File Size
 ==========================

 The fifth step is to set the physical attributes of the file, such as its size.
 This doesn't automatically reserve any space in the cache, but permits the
-cache to adjust its metadata for data tracking appropriately:
+cache to adjust its metadata for data tracking appropriately::

	int fscache_attr_changed(struct fscache_cookie *cookie);

@@ -417,8 +412,7 @@ some point in the future, and as such, it may happen after the function returns
 to the caller. The attribute adjustment excludes read and write operations.

-=====================
-PAGE ALLOC/READ/WRITE
+Page Alloc/Read/Write
 =====================

 And the sixth step is to store and retrieve pages in the cache.
There are @@ -441,7 +435,7 @@ PAGE READ Firstly, the netfs should ask FS-Cache to examine the caches and read the contents cached for a particular page of a particular file if present, or else -allocate space to store the contents if not: +allocate space to store the contents if not:: typedef void (*fscache_rw_complete_t)(struct page *page, @@ -474,14 +468,14 @@ Else if there's a copy of the page resident in the cache: (4) When the read is complete, end_io_func() will be invoked with: - (*) The netfs data supplied when the cookie was created. + * The netfs data supplied when the cookie was created. - (*) The page descriptor. + * The page descriptor. - (*) The context argument passed to the above function. This will be + * The context argument passed to the above function. This will be maintained with the get_context/put_context functions mentioned above. - (*) An argument that's 0 on success or negative for an error code. + * An argument that's 0 on success or negative for an error code. If an error occurs, it should be assumed that the page contains no usable data. fscache_readpages_cancel() may need to be called. @@ -504,11 +498,11 @@ This function may also return -ENOMEM or -EINTR, in which case it won't have read any data from the cache. -PAGE ALLOCATE +Page Allocate ------------- Alternatively, if there's not expected to be any data in the cache for a page -because the file has been extended, a block can simply be allocated instead: +because the file has been extended, a block can simply be allocated instead:: int fscache_alloc_page(struct fscache_cookie *cookie, struct page *page, @@ -523,12 +517,12 @@ The mark_pages_cached() cookie operation will be called on the page if successful. -PAGE WRITE +Page Write ---------- Secondly, if the netfs changes the contents of the page (either due to an initial download or if a user performs a write), then the page should be -written back to the cache: +written back to the cache:: int fscache_write_page(struct fscache_cookie *cookie, struct page *page, @@ -566,11 +560,11 @@ place if unforeseen circumstances arose (such as a disk error). Writing takes place asynchronously. -MULTIPLE PAGE READ +Multiple Page Read ------------------ A facility is provided to read several pages at once, as requested by the -readpages() address space operation: +readpages() address space operation:: int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, struct address_space *mapping, @@ -598,7 +592,7 @@ This works in a similar way to fscache_read_or_alloc_page(), except: be returned. Otherwise, if all pages had reads dispatched, then 0 will be returned, the - list will be empty and *nr_pages will be 0. + list will be empty and ``*nr_pages`` will be 0. (4) end_io_func will be called once for each page being read as the reads complete. It will be called in process context if error != 0, but it may @@ -609,13 +603,13 @@ some of the pages being read and some being allocated. Those pages will have been marked appropriately and will need uncaching. -CANCELLATION OF UNREAD PAGES +Cancellation of Unread Pages ---------------------------- If one or more pages are passed to fscache_read_or_alloc_pages() but not then read from the cache and also not read from the underlying filesystem then those pages will need to have any marks and reservations removed. This can be -done by calling: +done by calling:: void fscache_readpages_cancel(struct fscache_cookie *cookie, struct list_head *pages); @@ -625,11 +619,10 @@ fscache_read_or_alloc_pages(). 
Every page in the pages list will be examined and any that have PG_fscache
set will be uncached.

-==============
-PAGE UNCACHING
+Page Uncaching
 ==============

-To uncache a page, this function should be called:
+To uncache a page, this function should be called::

	void fscache_uncache_page(struct fscache_cookie *cookie,
				  struct page *page);
@@ -644,12 +637,12 @@ data file must be retired (see the relinquish cookie function below).

 Furthermore, note that this does not cancel the asynchronous read or write
 operation started by the read/alloc and write functions, so the page
-invalidation functions must use:
+invalidation functions must use::

	bool fscache_check_page_write(struct fscache_cookie *cookie,
				      struct page *page);

-to see if a page is being written to the cache, and:
+to see if a page is being written to the cache, and::

	void fscache_wait_on_page_write(struct fscache_cookie *cookie,
					struct page *page);
@@ -660,7 +653,7 @@ to wait for it to finish if it is.
 When releasepage() is being implemented, a special FS-Cache function exists to
 manage the heuristics of coping with vmscan trying to eject pages, which may
 conflict with the cache trying to write pages to the cache (which may itself
-need to allocate memory):
+need to allocate memory)::

	bool fscache_maybe_release_page(struct fscache_cookie *cookie,
					struct page *page,
@@ -676,12 +669,12 @@ storage request to complete, or it may attempt to cancel the storage request -
 in which case the page will not be stored in the cache this time.

-BULK INODE PAGE UNCACHE
+Bulk Inode Page Uncache
 -----------------------

 A convenience routine is provided to perform an uncache on all the pages
 attached to an inode. This assumes that the pages on the inode correspond on a
-1:1 basis with the pages in the cache.
+1:1 basis with the pages in the cache::

	void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
					     struct inode *inode);
@@ -692,12 +685,11 @@ written to the cache and for the cache to finish with the page generally. No
 error is returned.

-===============================
-INDEX AND DATA FILE CONSISTENCY
+Index and Data File Consistency
 ===============================

 To find out whether auxiliary data for an object is up to date within the
-cache, the following function can be called:
+cache, the following function can be called::

	int fscache_check_consistency(struct fscache_cookie *cookie,
				      const void *aux_data);

@@ -708,7 +700,7 @@ data buffer first. It returns 0 if it is and -ESTALE if it isn't; it may also
 return -ENOMEM and -ERESTARTSYS.

 To request an update of the index data for an index or other object, the
-following function should be called:
+following function should be called::

	void fscache_update_cookie(struct fscache_cookie *cookie,
				   const void *aux_data);

@@ -721,8 +713,7 @@ Note that partial updates may happen automatically at other times, such as when
 data blocks are added to a data file object.

-=================
-COOKIE ENABLEMENT
+Cookie Enablement
 =================

 Cookies exist in one of two states: enabled and disabled. If a cookie is
@@ -731,7 +722,7 @@ invalidate its state; allocate, read or write backing pages - though it is
 still possible to uncache pages and relinquish the cookie.

 The initial enablement state is set by fscache_acquire_cookie(), but the cookie
-can be enabled or disabled later.
To disable a cookie, call::

	void fscache_disable_cookie(struct fscache_cookie *cookie,
				    const void *aux_data,
@@ -746,7 +737,7 @@ All possible failures are handled internally. The caller should consider
 calling fscache_uncache_all_inode_pages() afterwards to make sure all page
 markings are cleared up.

-Cookies can be enabled or reenabled with:
+Cookies can be enabled or reenabled with::

	void fscache_enable_cookie(struct fscache_cookie *cookie,
				   const void *aux_data,
@@ -771,13 +762,12 @@ In both cases, the cookie's auxiliary data buffer is updated from aux_data if
 that is non-NULL inside the enablement lock before proceeding.

-===============================
-MISCELLANEOUS COOKIE OPERATIONS
+Miscellaneous Cookie Operations
 ===============================

 There are a number of operations that can be used to control cookies:

- (*) Cookie pinning:
+ * Cookie pinning::

	int fscache_pin_cookie(struct fscache_cookie *cookie);
	void fscache_unpin_cookie(struct fscache_cookie *cookie);
@@ -790,7 +780,7 @@ There are a number of operations that can be used to control cookies:
     -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
     -EIO if there's any other problem.

- (*) Data space reservation:
+ * Data space reservation::

	int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size);

@@ -809,11 +799,10 @@ There are a number of operations that can be used to control cookies:
     make space if it's not in use.

-=====================
-COOKIE UNREGISTRATION
+Cookie Unregistration
 =====================

-To get rid of a cookie, this function should be called.
+To get rid of a cookie, this function should be called::

	void fscache_relinquish_cookie(struct fscache_cookie *cookie,
				       const void *aux_data,
@@ -835,16 +824,14 @@ the cookies for "child" indices, objects and pages have been relinquished
 first.

-==================
-INDEX INVALIDATION
+Index Invalidation
 ==================

 There is no direct way to invalidate an index subtree. To do this, the caller
 should relinquish and retire the cookie they have, and then acquire a new one.

-======================
-DATA FILE INVALIDATION
+Data File Invalidation
 ======================

 Sometimes it will be necessary to invalidate an object that contains data.
@@ -853,7 +840,7 @@ change - at which point the netfs has to throw away all the state it had for
 an inode and reload from the server.

 To indicate that a cache object should be invalidated, the following function
-can be called:
+can be called::

	void fscache_invalidate(struct fscache_cookie *cookie);

@@ -868,13 +855,12 @@ auxiliary data update operation as it is very likely these will have changed.

 Using the following function, the netfs can wait for the invalidation operation
 to have reached a point at which it can start submitting ordinary operations
-once again:
+once again::

	void fscache_wait_on_invalidate(struct fscache_cookie *cookie);

-===========================
-FS-CACHE SPECIFIC PAGE FLAG
+FS-Cache Specific Page Flag
 ===========================

 FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is
@@ -898,7 +884,7 @@ was given under certain circumstances. This bit does not overlap with
 such as PG_private. This means that FS-Cache can be used with a filesystem
 that uses the block buffering code.
-There are a number of operations defined on this flag: +There are a number of operations defined on this flag:: int PageFsCache(struct page *page); void SetPageFsCache(struct page *page) diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.rst index 100ff41127e4..ce0e043ccd33 100644 --- a/Documentation/filesystems/caching/object.txt +++ b/Documentation/filesystems/caching/object.rst @@ -1,10 +1,12 @@ - ==================================================== - IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT - ==================================================== +.. SPDX-License-Identifier: GPL-2.0 + +==================================================== +In-Kernel Cache Object Representation and Management +==================================================== By: David Howells <dhowells@redhat.com> -Contents: +.. Contents: (*) Representation @@ -18,8 +20,7 @@ Contents: (*) The set of events. -============== -REPRESENTATION +Representation ============== FS-Cache maintains an in-kernel representation of each object that a netfs is @@ -38,7 +39,7 @@ or even by no objects (it may not be cached). Furthermore, both cookies and objects are hierarchical. The two hierarchies correspond, but the cookies tree is a superset of the union of the object trees -of multiple caches: +of multiple caches:: NETFS INDEX TREE : CACHE 1 : CACHE 2 : : @@ -89,8 +90,7 @@ pointers to the cookies. The cookies themselves and any objects attached to those cookies are hidden from it. -=============================== -OBJECT MANAGEMENT STATE MACHINE +Object Management State Machine =============================== Within FS-Cache, each active object is managed by its own individual state @@ -124,7 +124,7 @@ is not masked, the object will be queued for processing (by calling fscache_enqueue_object()). -PROVISION OF CPU TIME +Provision of CPU Time --------------------- The work to be done by the various states was given CPU time by the threads of @@ -141,7 +141,7 @@ because: workqueues don't necessarily have the right numbers of threads. -LOCKING SIMPLIFICATION +Locking Simplification ---------------------- Because only one worker thread may be operating on any particular object's @@ -151,8 +151,7 @@ from the cache backend's representation (fscache_object) - which may be requested from either end. -================= -THE SET OF STATES +The Set of States ================= The object state machine has a set of states that it can be in. There are @@ -275,19 +274,17 @@ memory and potentially deletes stuff from disk: this state. -THE SET OF EVENTS +The Set of Events ----------------- There are a number of events that can be raised to an object state machine: - (*) FSCACHE_OBJECT_EV_UPDATE - + FSCACHE_OBJECT_EV_UPDATE The netfs requested that an object be updated. The state machine will ask the cache backend to update the object, and the cache backend will ask the netfs for details of the change through its cookie definition ops. - (*) FSCACHE_OBJECT_EV_CLEARED - + FSCACHE_OBJECT_EV_CLEARED This is signalled in two circumstances: (a) when an object's last child object is dropped and @@ -296,20 +293,16 @@ There are a number of events that can be raised to an object state machine: This is used to proceed from the dying state. - (*) FSCACHE_OBJECT_EV_ERROR - + FSCACHE_OBJECT_EV_ERROR This is signalled when an I/O error occurs during the processing of some object. 
- (*) FSCACHE_OBJECT_EV_RELEASE - (*) FSCACHE_OBJECT_EV_RETIRE - + FSCACHE_OBJECT_EV_RELEASE, FSCACHE_OBJECT_EV_RETIRE These are signalled when the netfs relinquishes a cookie it was using. The event selected depends on whether the netfs asks for the backing object to be retired (deleted) or retained. - (*) FSCACHE_OBJECT_EV_WITHDRAW - + FSCACHE_OBJECT_EV_WITHDRAW This is signalled when the cache backend wants to withdraw an object. This means that the object will have to be detached from the netfs's cookie. diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.rst index d8976c434718..f7ddcc028939 100644 --- a/Documentation/filesystems/caching/operations.txt +++ b/Documentation/filesystems/caching/operations.rst @@ -1,10 +1,12 @@ - ================================ - ASYNCHRONOUS OPERATIONS HANDLING - ================================ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +Asynchronous Operations Handling +================================ By: David Howells <dhowells@redhat.com> -Contents: +.. Contents: (*) Overview. @@ -17,8 +19,7 @@ Contents: (*) Asynchronous callback. -======== -OVERVIEW +Overview ======== FS-Cache has an asynchronous operations handling facility that it uses for its @@ -33,11 +34,10 @@ backend for completion. To make use of this facility, <linux/fscache-cache.h> should be #included. -=============================== -OPERATION RECORD INITIALISATION +Operation Record Initialisation =============================== -An operation is recorded in an fscache_operation struct: +An operation is recorded in an fscache_operation struct:: struct fscache_operation { union { @@ -50,7 +50,7 @@ An operation is recorded in an fscache_operation struct: }; Someone wanting to issue an operation should allocate something with this -struct embedded in it. They should initialise it by calling: +struct embedded in it. They should initialise it by calling:: void fscache_operation_init(struct fscache_operation *op, fscache_operation_release_t release); @@ -67,8 +67,7 @@ FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the operation and waited for afterwards. -========== -PARAMETERS +Parameters ========== There are a number of parameters that can be set in the operation record's flag @@ -87,7 +86,7 @@ operations: If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags before submitting the operation, and the operating thread must wait for it - to be cleared before proceeding: + to be cleared before proceeding:: wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_UNINTERRUPTIBLE); @@ -101,7 +100,7 @@ operations: page to a netfs page after the backing fs has read the page in. If this option is used, op->fast_work and op->processor must be - initialised before submitting the operation: + initialised before submitting the operation:: INIT_WORK(&op->fast_work, do_some_work); @@ -114,7 +113,7 @@ operations: pages that have just been fetched from a remote server. If this option is used, op->slow_work and op->processor must be - initialised before submitting the operation: + initialised before submitting the operation:: fscache_operation_init_slow(op, processor) @@ -132,8 +131,7 @@ Furthermore, operations may be one of two types: operations running at the same time. -========= -PROCEDURE +Procedure ========= Operations are used through the following procedure: @@ -143,7 +141,7 @@ Operations are used through the following procedure: generic op embedded within. 
(2) The submitting thread must then submit the operation for processing using
-    one of the following two functions:
+    one of the following two functions::

	int fscache_submit_op(struct fscache_object *object,
			      struct fscache_operation *op);
@@ -164,7 +162,7 @@ Operations are used through the following procedure:
     operation of conflicting exclusivity is in progress on the object.

     If the operation is asynchronous, the manager will retain a reference to
-    it, so the caller should put their reference to it by passing it to:
+    it, so the caller should put their reference to it by passing it to::

	void fscache_put_operation(struct fscache_operation *op);

@@ -179,12 +177,12 @@ Operations are used through the following procedure:
 (4) The operation holds an effective lock upon the object, preventing other
     exclusive ops conflicting until it is released. The operation can be
     enqueued for further immediate asynchronous processing by adjusting the
-    CPU time provisioning option if necessary, eg:
+    CPU time provisioning option if necessary, eg::

	op->flags &= ~FSCACHE_OP_TYPE;
	op->flags |= FSCACHE_OP_FAST;

-    and calling:
+    and calling::

	void fscache_enqueue_operation(struct fscache_operation *op)

@@ -192,13 +190,12 @@ pools.

-=====================
-ASYNCHRONOUS CALLBACK
+Asynchronous Callback
 =====================

 When used in asynchronous mode, the worker thread pool will invoke the
 processor method with a pointer to the operation. This should then get at the
-container struct by using container_of():
+container struct by using container_of()::

	static void fscache_write_op(struct fscache_operation *_op)
	{
diff --git a/Documentation/filesystems/cifs/cifsroot.txt b/Documentation/filesystems/cifs/cifsroot.rst
index 947b7ec6ce9e..4930bb443134 100644
--- a/Documentation/filesystems/cifs/cifsroot.txt
+++ b/Documentation/filesystems/cifs/cifsroot.rst
@@ -1,7 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================================
 Mounting root file system via SMB (cifs.ko)
 ===========================================

 Written 2019 by Paulo Alcantara <palcantara@suse.de>
+
 Written 2019 by Aurelien Aptel <aaptel@suse.com>

 The CONFIG_CIFS_ROOT option enables experimental root file system
@@ -32,7 +36,7 @@ Server configuration
 ====================

 To enable SMB1+UNIX extensions you will need to set these global
-settings in Samba smb.conf:
+settings in Samba smb.conf::

     [global]
     server min protocol = NT1
@@ -41,12 +45,16 @@ Kernel command line
 ===================

-root=/dev/cifs
+::

+   root=/dev/cifs

 This is just a virtual device that basically tells the kernel to mount
 the root file system via the SMB protocol.

-cifsroot=//<server-ip>/<share>[,options]
+::

+   cifsroot=//<server-ip>/<share>[,options]

 Enables the kernel to mount the root file system via SMB from the
 <server-ip> and <share> specified in this option.
@@ -65,33 +73,33 @@ options
 Examples
 ========

-Export root file system as a Samba share in smb.conf file.
+Export root file system as a Samba share in smb.conf file::

-...
-[linux]
-	path = /path/to/rootfs
-	read only = no
-	guest ok = yes
-	force user = root
-	force group = root
-	browseable = yes
-	writeable = yes
-	admin users = root
-	public = yes
-	create mask = 0777
-	directory mask = 0777
-...
+   ...
+   [linux]
+           path = /path/to/rootfs
+           read only = no
+           guest ok = yes
+           force user = root
+           force group = root
+           browseable = yes
+           writeable = yes
+           admin users = root
+           public = yes
+           create mask = 0777
+           directory mask = 0777
+   ...

-Restart smb service.
+Restart smb service::

-# systemctl restart smb
+   # systemctl restart smb

 Test it under QEMU on a kernel built with CONFIG_CIFS_ROOT and
-CONFIG_IP_PNP options enabled.
+CONFIG_IP_PNP options enabled::

-# qemu-system-x86_64 -enable-kvm -cpu host -m 1024 \
-     -kernel /path/to/linux/arch/x86/boot/bzImage -nographic \
-     -append "root=/dev/cifs rw ip=dhcp cifsroot=//10.0.2.2/linux,username=foo,password=bar console=ttyS0 3"
+   # qemu-system-x86_64 -enable-kvm -cpu host -m 1024 \
+        -kernel /path/to/linux/arch/x86/boot/bzImage -nographic \
+        -append "root=/dev/cifs rw ip=dhcp cifsroot=//10.0.2.2/linux,username=foo,password=bar console=ttyS0 3"

 1: https://wiki.samba.org/index.php/UNIX_Extensions
diff --git a/Documentation/filesystems/coda.rst b/Documentation/filesystems/coda.rst
new file mode 100644
index 000000000000..84c860c89887
--- /dev/null
+++ b/Documentation/filesystems/coda.rst
@@ -0,0 +1,1670 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+Coda Kernel-Venus Interface
+===========================
+
+.. Note::
+
+   This is one of the technical documents describing a component of
+   Coda -- this document describes the client kernel-Venus interface.
+
+For more information:
+
+  http://www.coda.cs.cmu.edu
+
+For user level software needed to run Coda:
+
+  ftp://ftp.coda.cs.cmu.edu
+
+To run Coda you need to get a user level cache manager for the client,
+named Venus, as well as tools to manipulate ACLs, to log in, etc. The
+client needs to have the Coda filesystem selected in the kernel
+configuration.
+
+The server needs a user level server and at present does not depend on
+kernel support.
+
+  The Venus kernel interface
+
+  Peter J. Braam
+
+  v1.0, Nov 9, 1997
+
+  This document describes the communication between Venus and kernel
+  level filesystem code needed for the operation of the Coda file
+  system. This document version is meant to describe the current interface
+  (version 1.0) as well as improvements we envisage.
+
+.. Table of Contents
+
+  1. Introduction
+
+  2. Servicing Coda filesystem calls
+
+  3. The message layer
+
+   3.1 Implementation details
+
+  4. The interface at the call level
+
+   4.1 Data structures shared by the kernel and Venus
+   4.2 The pioctl interface
+   4.3 root
+   4.4 lookup
+   4.5 getattr
+   4.6 setattr
+   4.7 access
+   4.8 create
+   4.9 mkdir
+   4.10 link
+   4.11 symlink
+   4.12 remove
+   4.13 rmdir
+   4.14 readlink
+   4.15 open
+   4.16 close
+   4.17 ioctl
+   4.18 rename
+   4.19 readdir
+   4.20 vget
+   4.21 fsync
+   4.22 inactive
+   4.23 rdwr
+   4.24 odymount
+   4.25 ody_lookup
+   4.26 ody_expand
+   4.27 prefetch
+   4.28 signal
+
+  5. The minicache and downcalls
+
+   5.1 INVALIDATE
+   5.2 FLUSH
+   5.3 PURGEUSER
+   5.4 ZAPFILE
+   5.5 ZAPDIR
+   5.6 ZAPVNODE
+   5.7 PURGEFID
+   5.8 REPLACE
+
+  6. Initialization and cleanup
+
+   6.1 Requirements
+
+1. Introduction
+===============
+
+  A key component in the Coda Distributed File System is the cache
+  manager, Venus.
+
+  When processes on a Coda enabled system access files in the Coda
+  filesystem, requests are directed at the filesystem layer in the
+  operating system. The operating system will communicate with Venus to
+  service the request for the process.
Venus manages a persistent
+  client cache and makes remote procedure calls to Coda file servers and
+  related servers (such as authentication servers) to service these
+  requests it receives from the operating system. When Venus has
+  serviced a request it replies to the operating system with appropriate
+  return codes, and other data related to the request. Optionally the
+  kernel support for Coda may maintain a minicache of recently processed
+  requests to limit the number of interactions with Venus. Venus
+  possesses the facility to inform the kernel when elements from its
+  minicache are no longer valid.
+
+  This document describes precisely this communication between the
+  kernel and Venus. The definitions of so called upcalls and downcalls
+  will be given with the format of the data they handle. We shall also
+  describe the semantic invariants resulting from the calls.
+
+  Historically Coda was implemented in a BSD file system in Mach 2.6.
+  The interface between the kernel and Venus is very similar to the BSD
+  VFS interface. Similar functionality is provided, and the format of
+  the parameters and returned data is very similar to the BSD VFS. This
+  leads to an almost natural environment for implementing a kernel-level
+  filesystem driver for Coda in a BSD system. However, other operating
+  systems such as Linux and Windows 95 and NT have virtual filesystem
+  with different interfaces.
+
+  To implement Coda on these systems some reverse engineering of the
+  Venus/Kernel protocol is necessary. Also it came to light that other
+  systems could profit significantly from certain small optimizations
+  and modifications to the protocol. To facilitate this work as well as
+  to make future ports easier, communication between Venus and the
+  kernel should be documented in great detail. This is the aim of this
+  document.
+
+2. Servicing Coda filesystem calls
+===================================
+
+  The service of a request for a Coda file system service originates in
+  a process P which accesses a Coda file. It makes a system call which
+  traps to the OS kernel. Examples of such calls trapping to the kernel
+  are ``read``, ``write``, ``open``, ``close``, ``create``, ``mkdir``,
+  ``rmdir``, ``chmod`` in a Unix context. Similar calls exist in the Win32
+  environment, and are named ``CreateFile``.
+
+  Generally the operating system handles the request in a virtual
+  filesystem (VFS) layer, which is named I/O Manager in NT and IFS
+  manager in Windows 95. The VFS is responsible for partial processing
+  of the request and for locating the specific filesystem(s) which will
+  service parts of the request. Usually the information in the path
+  assists in locating the correct FS drivers. Sometimes after extensive
+  pre-processing, the VFS starts invoking exported routines in the FS
+  driver. This is the point where the FS specific processing of the
+  request starts, and here the Coda specific kernel code comes into
+  play.
+
+  The FS layer for Coda must expose and implement several interfaces.
+  First and foremost the VFS must be able to make all necessary calls to
+  the Coda FS layer, so the Coda FS driver must expose the VFS interface
+  as applicable in the operating system. These differ very significantly
+  among operating systems, but share features such as facilities to
+  read/write and create and remove objects. The Coda FS layer services
+  such VFS requests by invoking one or more well defined services
+  offered by the cache manager Venus.
When the replies from Venus have + come back to the FS driver, servicing of the VFS call continues and + finishes with a reply to the kernel's VFS. Finally the VFS layer + returns to the process. + + As a result of this design a basic interface exposed by the FS driver + must allow Venus to manage message traffic. In particular Venus must + be able to retrieve and place messages and to be notified of the + arrival of a new message. The notification must be through a mechanism + which does not block Venus since Venus must attend to other tasks even + when no messages are waiting or being processed. + + **Interfaces of the Coda FS Driver** + + Furthermore the FS layer provides for a special path of communication + between a user process and Venus, called the pioctl interface. The + pioctl interface is used for Coda specific services, such as + requesting detailed information about the persistent cache managed by + Venus. Here the involvement of the kernel is minimal. It identifies + the calling process and passes the information on to Venus. When + Venus replies the response is passed back to the caller in unmodified + form. + + Finally Venus allows the kernel FS driver to cache the results from + certain services. This is done to avoid excessive context switches + and results in an efficient system. However, Venus may acquire + information, for example from the network which implies that cached + information must be flushed or replaced. Venus then makes a downcall + to the Coda FS layer to request flushes or updates in the cache. The + kernel FS driver handles such requests synchronously. + + Among these interfaces the VFS interface and the facility to place, + receive and be notified of messages are platform specific. We will + not go into the calls exported to the VFS layer but we will state the + requirements of the message exchange mechanism. + + +3. The message layer +===================== + + At the lowest level the communication between Venus and the FS driver + proceeds through messages. The synchronization between processes + requesting Coda file service and Venus relies on blocking and waking + up processes. The Coda FS driver processes VFS- and pioctl-requests + on behalf of a process P, creates messages for Venus, awaits replies + and finally returns to the caller. The implementation of the exchange + of messages is platform specific, but the semantics have (so far) + appeared to be generally applicable. Data buffers are created by the + FS Driver in kernel memory on behalf of P and copied to user memory in + Venus. + + The FS Driver while servicing P makes upcalls to Venus. Such an + upcall is dispatched to Venus by creating a message structure. The + structure contains the identification of P, the message sequence + number, the size of the request and a pointer to the data in kernel + memory for the request. Since the data buffer is re-used to hold the + reply from Venus, there is a field for the size of the reply. A flags + field is used in the message to precisely record the status of the + message. Additional platform dependent structures involve pointers to + determine the position of the message on queues and pointers to + synchronization objects. In the upcall routine the message structure + is filled in, flags are set to 0, and it is placed on the *pending* + queue. The routine calling upcall is responsible for allocating the + data buffer; its structure will be described in the next section. 
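To make the bookkeeping just described concrete, the message record used by
the Linux Coda driver looks approximately like the sketch below (see the
driver's coda_psdev.h header; field names and types may differ slightly
between kernel versions)::

	struct upc_req {
		struct list_head   uc_chain;    /* position on the pending or
						 * processing queue */
		caddr_t            uc_data;     /* request/reply data buffer */
		u_short            uc_flags;    /* 0, then READ, then WRITTEN */
		u_short            uc_inSize;   /* size of the request */
		u_short            uc_outSize;  /* size of the reply (the
						 * buffer is re-used) */
		u_short            uc_opcode;   /* copied from the request */
		int                uc_unique;   /* message sequence number */
		wait_queue_head_t  uc_sleep;    /* where the caller P blocks */
	};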
+
+  A facility must exist to notify Venus that the message has been
+  created, and is implemented using available synchronization objects in
+  the OS. This notification is done in the upcall context of the process
+  P. When the message is on the pending queue, process P cannot proceed
+  in upcall. The (kernel mode) processing of P in the filesystem
+  request routine must be suspended until Venus has replied. Therefore
+  the calling thread in P is blocked in upcall. A pointer in the
+  message structure will locate the synchronization object on which P is
+  sleeping.
+
+  Venus detects the notification that a message has arrived, and the FS
+  driver allows Venus to retrieve the message with a getmsg_from_kernel
+  call. This action finishes in the kernel by putting the message on the
+  queue of processing messages and setting flags to READ. Venus is
+  passed the contents of the data buffer. The getmsg_from_kernel call
+  now returns and Venus processes the request.
+
+  At some later point the FS driver receives a message from Venus,
+  namely when Venus calls sendmsg_to_kernel. At this moment the Coda FS
+  driver looks at the contents of the message and decides if:
+
+
+  * the message is a reply for a suspended thread P. If so it removes
+    the message from the processing queue and marks the message as
+    WRITTEN. Finally, the FS driver unblocks P (still in the kernel
+    mode context of Venus) and the sendmsg_to_kernel call returns to
+    Venus. The process P will be scheduled at some point and continues
+    processing its upcall with the data buffer replaced with the reply
+    from Venus.
+
+  * The message is a ``downcall``. A downcall is a request from Venus to
+    the FS Driver. The FS driver processes the request immediately
+    (usually a cache eviction or replacement) and when it finishes
+    sendmsg_to_kernel returns.
+
+  Now P awakes and continues processing upcall. There are some
+  subtleties to take account of. First P will determine if it was woken
+  up in upcall by a signal from some other source (for example an
+  attempt to terminate P) or as is normally the case by Venus in its
+  sendmsg_to_kernel call. In the normal case, the upcall routine will
+  deallocate the message structure and return. The FS routine can proceed
+  with its processing.
+
+
+  **Sleeping and IPC arrangements**
+
+  In case P is woken up by a signal and not by Venus, it will first look
+  at the flags field. If the message is not yet READ, the process P can
+  handle its signal without notifying Venus. If Venus has READ, and
+  the request should not be processed, P can send Venus a signal message
+  to indicate that it should disregard the previous message. Such
+  signals are put in the queue at the head, and read first by Venus. If
+  the message is already marked as WRITTEN it is too late to stop the
+  processing. The VFS routine will now continue. (-- If a VFS request
+  involves more than one upcall, this can lead to complicated state, an
+  extra field "handle_signals" could be added in the message structure
+  to indicate points of no return have been passed.--)
+
+
+
+3.1. Implementation details
+----------------------------
+
+  The Unix implementation of this mechanism is through a character
+  device associated with Coda. Venus
+  retrieves messages by doing a read on the device, replies are sent
+  with a write and notification is through the select system call on the
+  file descriptor for the device. The process P is kept waiting on an
+  interruptible wait queue object.
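As an illustration of this arrangement, a user-level cache manager built on
such a character device could be structured as below. This is only a sketch:
the device name, buffer size and error handling are illustrative and do not
reproduce the real Venus sources::

	#include <fcntl.h>
	#include <sys/select.h>
	#include <unistd.h>

	#define MSG_BUF_SIZE 8192          /* illustrative message bound */

	static void venus_message_loop(const char *devname)
	{
		char buf[MSG_BUF_SIZE];
		fd_set rfds;
		ssize_t n;
		int fd;

		fd = open(devname, O_RDWR);  /* e.g. the Coda psdev */
		if (fd < 0)
			return;

		for (;;) {
			FD_ZERO(&rfds);
			FD_SET(fd, &rfds);

			/* notification: select() wakes when an upcall is pending */
			if (select(fd + 1, &rfds, NULL, NULL, NULL) <= 0)
				continue;

			/* retrieve the message (getmsg_from_kernel) */
			n = read(fd, buf, sizeof(buf));
			if (n <= 0)
				continue;

			/* ... service the request, building the reply in buf ... */

			/* return the reply, or issue a downcall (sendmsg_to_kernel) */
			write(fd, buf, n);
		}
	}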
+
+  In Windows NT and the DPMI Windows 95 implementation a DeviceIoControl
+  call is used. The DeviceIoControl call is designed to copy buffers
+  from user memory to kernel memory with OPCODES. The sendmsg_to_kernel
+  is issued as a synchronous call, while the getmsg_from_kernel call is
+  asynchronous. Windows EventObjects are used for notification of
+  message arrival. The process P is kept waiting on a KernelEvent
+  object in NT and a semaphore in Windows 95.
+
+
+4. The interface at the call level
+===================================
+
+
+  This section describes the upcalls a Coda FS driver can make to Venus.
+  Each of these upcalls makes use of two structures: inputArgs and
+  outputArgs. In pseudo BNF form the structures take the following
+  form::
+
+
+	struct inputArgs {
+	    u_long opcode;
+	    u_long unique;     /* Keep multiple outstanding msgs distinct */
+	    u_short pid;                 /* Common to all */
+	    u_short pgid;                /* Common to all */
+	    struct CodaCred cred;        /* Common to all */
+
+	    <union "in" of call dependent parts of inputArgs>
+	};
+
+	struct outputArgs {
+	    u_long opcode;
+	    u_long unique;       /* Keep multiple outstanding msgs distinct */
+	    u_long result;
+
+	    <union "out" of call dependent parts of outputArgs>
+	};
+
+
+
+  Before going on let us elucidate the role of the various fields. The
+  inputArgs start with the opcode which defines the type of service
+  requested from Venus. There are approximately 30 upcalls at present
+  which we will discuss. The unique field labels the inputArgs with a
+  unique number which identifies the message. A process and
+  process group id are passed. Finally the credentials of the caller
+  are included.
+
+  Before delving into the specific calls we need to discuss a variety of
+  data structures shared by the kernel and Venus.
+
+
+
+
+4.1. Data structures shared by the kernel and Venus
+----------------------------------------------------
+
+
+  The CodaCred structure defines a variety of user and group ids as
+  they are set for the calling process. The vuid_t and vgid_t are 32 bit
+  unsigned integers. It also defines group membership in an array. On
+  Unix the CodaCred has proven sufficient to implement good security
+  semantics for Coda but the structure may have to undergo modification
+  for the Windows environment when these mature::
+
+	struct CodaCred {
+	    vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, effective, set, fs uid */
+	    vgid_t cr_gid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */
+	    vgid_t cr_groups[NGROUPS];        /* Group membership for caller */
+	};
+
+
+  .. Note::
+
+     It is questionable if we need CodaCreds in Venus. Finally Venus
+     doesn't know about groups, although it does create files with the
+     default uid/gid. Perhaps the list of group membership is superfluous.
+
+
+  The next item is the fundamental identifier used to identify Coda
+  files, the ViceFid. A fid of a file uniquely defines a file or
+  directory in the Coda filesystem within a cell [1]_::
+
+	typedef struct ViceFid {
+	    VolumeId Volume;
+	    VnodeId Vnode;
+	    Unique_t Unique;
+	} ViceFid;
+
+  .. [1] A cell is a group of Coda servers acting under the aegis of a single
+	 system control machine or SCM. See the Coda Administration manual
+	 for a detailed description of the role of the SCM.
+
+  The constituent fields VolumeId, VnodeId and Unique_t are
+  unsigned 32 bit integers. We envisage that a further field will need
+  to be prefixed to identify the Coda cell; this will probably take the
+  form of an IPv6 size IP address naming the Coda cell through DNS.
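Since the ViceFid is the key used throughout the interface, a small helper
clarifies its semantics: two fids denote the same object exactly when all
three fields are equal. The function below is our own illustration, not part
of the interface::

	static inline int viceFid_equal(const ViceFid *a, const ViceFid *b)
	{
		/* VolumeId, VnodeId and Unique_t are plain 32 bit integers,
		 * so field-wise comparison is sufficient */
		return a->Volume == b->Volume &&
		       a->Vnode  == b->Vnode  &&
		       a->Unique == b->Unique;
	}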
+
+  The next important structure shared between Venus and the kernel is
+  the attributes of the file. The following structure is used to
+  exchange information. It has room for future extensions such as
+  support for device files (currently not present in Coda)::
+
+
+	struct coda_timespec {
+		int64_t         tv_sec;         /* seconds */
+		long            tv_nsec;        /* nanoseconds */
+	};
+
+	struct coda_vattr {
+		enum coda_vtype va_type;        /* vnode type (for create) */
+		u_short         va_mode;        /* files access mode and type */
+		short           va_nlink;       /* number of references to file */
+		vuid_t          va_uid;         /* owner user id */
+		vgid_t          va_gid;         /* owner group id */
+		long            va_fsid;        /* file system id (dev for now) */
+		long            va_fileid;      /* file id */
+		u_quad_t        va_size;        /* file size in bytes */
+		long            va_blocksize;   /* blocksize preferred for i/o */
+		struct coda_timespec va_atime;  /* time of last access */
+		struct coda_timespec va_mtime;  /* time of last modification */
+		struct coda_timespec va_ctime;  /* time file changed */
+		u_long          va_gen;         /* generation number of file */
+		u_long          va_flags;       /* flags defined for file */
+		dev_t           va_rdev;        /* device special file represents */
+		u_quad_t        va_bytes;       /* bytes of disk space held by file */
+		u_quad_t        va_filerev;     /* file modification number */
+		u_int           va_vaflags;     /* operations flags, see below */
+		long            va_spare;       /* remain quad aligned */
+	};
+
+
+4.2. The pioctl interface
+--------------------------
+
+
+  Coda specific requests can be made by applications through the pioctl
+  interface. The pioctl is implemented as an ordinary ioctl on a
+  fictitious file /coda/.CONTROL. The pioctl call opens this file, gets
+  a file handle and makes the ioctl call. Finally it closes the file.
+
+  The kernel involvement in this is limited to providing the facility to
+  open and close and pass the ioctl message and to verify that a path in
+  the pioctl data buffers is a file in a Coda filesystem.
+
+  The kernel is handed a data packet of the form::
+
+	struct {
+	    const char *path;
+	    struct ViceIoctl vidata;
+	    int follow;
+	} data;
+
+
+
+  where::
+
+
+	struct ViceIoctl {
+		caddr_t in, out;        /* Data to be transferred in, or out */
+		short in_size;          /* Size of input buffer <= 2K */
+		short out_size;         /* Maximum size of output buffer, <= 2K */
+	};
+
+
+
+  The path must be a Coda file, otherwise the ioctl upcall will not be
+  made.
+
+  .. Note:: The data structures and code are a mess. We need to clean this up.
+
+
+**We now proceed to document the individual calls**:
+
+
+4.3. root
+----------
+
+
+  Arguments
+     in
+
+	empty
+
+     out::
+
+		struct cfs_root_out {
+		    ViceFid VFid;
+		} cfs_root;
+
+
+
+  Description
+    This call is made to Venus during the initialization of
+    the Coda filesystem. If the result is zero, the cfs_root structure
+    contains the ViceFid of the root of the Coda filesystem. If a non-zero
+    result is generated, its value is a platform dependent error code
+    indicating the difficulty Venus encountered in locating the root of
+    the Coda filesystem.
+
+4.4. lookup
+------------
+
+
+  Summary
+    Find the ViceFid and type of an object in a directory if it exists.
+
+  Arguments
+     in::
+
+		struct cfs_lookup_in {
+		    ViceFid     VFid;
+		    char        *name;          /* Place holder for data. */
+		} cfs_lookup;
+
+
+
+     out::
+
+		struct cfs_lookup_out {
+		    ViceFid VFid;
+		    int vtype;
+		} cfs_lookup;
+
+
+
+  Description
+    This call is made to determine the ViceFid and filetype of
+    a directory entry. The directory entry requested carries the name ``name``
+    and Venus will search the directory identified by cfs_lookup_in.VFid.
+
+
+4.5. getattr
+-------------
+
+
+  Summary
+    Get the attributes of a file.
+
+  Arguments
+    in::
+
+        struct cfs_getattr_in {
+            ViceFid VFid;
+            struct coda_vattr attr; /* XXXXX */
+        } cfs_getattr;
+
+
+
+    out::
+
+        struct cfs_getattr_out {
+            struct coda_vattr attr;
+        } cfs_getattr;
+
+
+
+  Description
+    This call returns the attributes of the file identified by fid.
+
+  Errors
+    Errors can occur if the object with fid does not exist, is
+    inaccessible or if the caller does not have permission to fetch
+    attributes.
+
+  .. Note::
+
+     Many kernel FS drivers (Linux, NT and Windows 95) need to acquire
+     the attributes as well as the Fid for the instantiation of an internal
+     "inode" or "FileHandle". A significant improvement in performance on
+     such systems could be made by combining the lookup and getattr calls
+     both at the Venus/kernel interaction level and at the RPC level.
+
+     The vattr structure included in the input arguments is superfluous and
+     should be removed.
+
+
+4.6. setattr
+-------------
+
+
+  Summary
+    Set the attributes of a file.
+
+  Arguments
+    in::
+
+        struct cfs_setattr_in {
+            ViceFid VFid;
+            struct coda_vattr attr;
+        } cfs_setattr;
+
+
+
+
+    out
+
+      empty
+
+  Description
+    The structure attr is filled with attributes to be changed
+    in BSD style. Attributes not to be changed are set to -1, apart from
+    vtype which is set to VNON. Others are set to the values to be
+    assigned. The only attributes which the FS driver may request to
+    change are the mode, owner, groupid, atime, mtime and ctime. The
+    return value indicates success or failure.
+
+  Errors
+    A variety of errors can occur. The object may not exist, may
+    be inaccessible, or permission may not be granted by Venus.
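+
+  The "everything else is -1" convention above is easy to get wrong, so
+  a short sketch may help. It prepares a setattr request that changes
+  only the file mode, using the structures defined above;
+  coda_vattr_unset is an illustrative helper, not part of the
+  interface::
+
+        #include <string.h>
+        #include <sys/types.h>
+
+        /* Mark every attribute "do not change", BSD style: every byte
+           0xff makes all integer fields -1, and va_type is set to
+           VNON. */
+        static void coda_vattr_unset(struct coda_vattr *va)
+        {
+            memset(va, 0xff, sizeof(*va));
+            va->va_type = VNON;
+        }
+
+        /* Example: a chmod-style request touching only va_mode. */
+        void fill_chmod(struct cfs_setattr_in *in, const ViceFid *fid,
+                        u_short mode)
+        {
+            in->VFid = *fid;
+            coda_vattr_unset(&in->attr);
+            in->attr.va_mode = mode;
+        }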
+
+
+4.7. access
+------------
+
+
+  Arguments
+    in::
+
+        struct cfs_access_in {
+            ViceFid VFid;
+            int flags;
+        } cfs_access;
+
+
+
+    out
+
+      empty
+
+  Description
+    Verify if access to the object identified by VFid for
+    operations described by flags is permitted. The result indicates if
+    access will be granted. It is important to remember that Coda uses
+    ACLs to enforce protection and that ultimately the servers, not the
+    clients, enforce the security of the system. The result of this call
+    will depend on whether a token is held by the user.
+
+  Errors
+    The object may not exist, or the ACL describing the protection
+    may not be accessible.
+
+
+4.8. create
+------------
+
+
+  Summary
+    Invoked to create a file.
+
+  Arguments
+    in::
+
+        struct cfs_create_in {
+            ViceFid VFid;
+            struct coda_vattr attr;
+            int excl;
+            int mode;
+            char *name;         /* Place holder for data. */
+        } cfs_create;
+
+
+
+
+    out::
+
+        struct cfs_create_out {
+            ViceFid VFid;
+            struct coda_vattr attr;
+        } cfs_create;
+
+
+
+  Description
+    This upcall is invoked to request creation of a file.
+    The file will be created in the directory identified by VFid, its
+    name will be name, and the mode will be mode. If excl is set an
+    error will be returned if the file already exists. If the size field
+    in attr is set to zero the file will be truncated. The uid and gid of
+    the file are set by converting the CodaCred to a uid using a macro
+    CRTOUID (this macro is platform dependent). Upon success the VFid
+    and attributes of the file are returned. The Coda FS Driver will
+    normally instantiate a vnode, inode or file handle at kernel level
+    for the new object.
+
+
+  Errors
+    A variety of errors can occur. Permissions may be insufficient.
+    If the object exists and is not a file the error EISDIR is returned
+    under Unix.
+
+  .. Note::
+
+     The packing of parameters is very inefficient and appears to
+     indicate confusion between the system call creat and the VFS operation
+     create. The VFS operation create is only called to create new objects.
+     This create call differs from the Unix one in that it is not invoked
+     to return a file descriptor. The truncate and exclusive options,
+     together with the mode, could simply be part of the mode as it is
+     under Unix. There should be no flags argument; this is used in
+     open(2) to return a file descriptor for READ or WRITE mode.
+
+     The attributes of the directory should be returned too, since its
+     size and mtime change.
+
+
+4.9. mkdir
+-----------
+
+
+  Summary
+    Create a new directory.
+
+  Arguments
+    in::
+
+        struct cfs_mkdir_in {
+            ViceFid VFid;
+            struct coda_vattr attr;
+            char *name;         /* Place holder for data. */
+        } cfs_mkdir;
+
+
+
+    out::
+
+        struct cfs_mkdir_out {
+            ViceFid VFid;
+            struct coda_vattr attr;
+        } cfs_mkdir;
+
+
+
+
+  Description
+    This call is similar to create but creates a directory.
+    Only the mode field in the input parameters is used for creation.
+    Upon successful creation, the attr returned contains the attributes of
+    the new directory.
+
+  Errors
+    As for create.
+
+  .. Note::
+
+     The input parameter should be changed to mode instead of
+     attributes.
+
+     The attributes of the parent should be returned since its size and
+     mtime change.
+
+
+4.10. link
+-----------
+
+
+  Summary
+    Create a link to an existing file.
+
+  Arguments
+    in::
+
+        struct cfs_link_in {
+            ViceFid sourceFid;  /* cnode to link *to* */
+            ViceFid destFid;    /* Directory in which to place link */
+            char *tname;        /* Place holder for data. */
+        } cfs_link;
+
+
+
+    out
+
+      empty
+
+  Description
+    This call creates a link to the sourceFid in the directory
+    identified by destFid with name tname. The source must reside in the
+    target's parent, i.e. the source must have parent destFid; Coda
+    does not support cross directory hard links. Only the return value is
+    relevant. It indicates success or the type of failure.
+
+  Errors
+    The usual errors can occur.
+
+
+4.11. symlink
+--------------
+
+
+  Summary
+    Create a symbolic link.
+
+  Arguments
+    in::
+
+        struct cfs_symlink_in {
+            ViceFid VFid;       /* Directory to put symlink in */
+            char *srcname;
+            struct coda_vattr attr;
+            char *tname;
+        } cfs_symlink;
+
+
+
+    out
+
+      none
+
+  Description
+    Create a symbolic link. The link is to be placed in the
+    directory identified by VFid and named tname. It should point to the
+    pathname srcname. The attributes of the newly created object are to
+    be set to attr.
+
+
+  .. Note::
+
+     The attributes of the target directory should be returned since
+     its size changes.
+
+
+4.12. remove
+-------------
+
+
+  Summary
+    Remove a file.
+
+  Arguments
+    in::
+
+        struct cfs_remove_in {
+            ViceFid VFid;
+            char *name;         /* Place holder for data. */
+        } cfs_remove;
+
+
+
+    out
+
+      none
+
+  Description
+    Remove the file named cfs_remove_in.name in the directory
+    identified by VFid.
+
+
+  .. Note::
+
+     The attributes of the directory should be returned since its
+     mtime and size may change.
+
+
+4.13. rmdir
+------------
+
+
+  Summary
+    Remove a directory.
+
+  Arguments
+    in::
+
+        struct cfs_rmdir_in {
+            ViceFid VFid;
+            char *name;         /* Place holder for data. */
+        } cfs_rmdir;
+
+
+
+    out
+
+      none
+
+  Description
+    Remove the directory named name from the directory
+    identified by VFid.
+
+  .. Note:: The attributes of the parent directory should be returned since
+            its mtime and size may change.
+
+
+4.14. readlink
+---------------
+
+
+  Summary
+    Read the value of a symbolic link.
+
+  Arguments
+    in::
+
+        struct cfs_readlink_in {
+            ViceFid VFid;
+        } cfs_readlink;
+
+
+
+    out::
+
+        struct cfs_readlink_out {
+            int count;
+            caddr_t data;       /* Place holder for data. */
+        } cfs_readlink;
+
+
+
+  Description
+    This routine reads the contents of the symbolic link
+    identified by VFid into the buffer data. The buffer data must be able
+    to hold any name up to CFS_MAXNAMLEN (PATH or NAM??).
+
+  Errors
+    No unusual errors.
+
+
+4.15. open
+-----------
+
+
+  Summary
+    Open a file.
+
+  Arguments
+    in::
+
+        struct cfs_open_in {
+            ViceFid VFid;
+            int flags;
+        } cfs_open;
+
+
+
+    out::
+
+        struct cfs_open_out {
+            dev_t dev;
+            ino_t inode;
+        } cfs_open;
+
+
+
+  Description
+    This request asks Venus to place the file identified by
+    VFid in its cache and to note that the calling process wishes to open
+    it with flags as in open(2). The return value to the kernel differs
+    for Unix and Windows systems. For Unix systems the Coda FS Driver is
+    informed of the device and inode number of the container file in the
+    fields dev and inode. For Windows the path of the container file is
+    returned to the kernel.
+
+
+  .. Note::
+
+     Currently the cfs_open_out structure is not properly adapted to
+     deal with the Windows case. It might be best to implement two
+     upcalls, one to open aiming at a container file name, the other at a
+     container file inode.
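+
+  To make the Unix reply concrete: on the Venus side the container
+  file's device and inode number can be obtained with an ordinary
+  stat(2) on the container file in Venus' cache. A userspace sketch,
+  with the surrounding reply plumbing omitted and container_path an
+  assumed name::
+
+        #include <sys/stat.h>
+
+        /* Venus side: report dev/inode of the container file. */
+        int fill_open_reply(struct cfs_open_out *out,
+                            const char *container_path)
+        {
+            struct stat st;
+
+            if (stat(container_path, &st) < 0)
+                return -1;      /* caller translates errno into the reply */
+            out->dev = st.st_dev;
+            out->inode = st.st_ino;
+            return 0;
+        }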
+
+
+4.16. close
+------------
+
+
+  Summary
+    Close a file, update it on the servers.
+
+  Arguments
+    in::
+
+        struct cfs_close_in {
+            ViceFid VFid;
+            int flags;
+        } cfs_close;
+
+
+
+    out
+
+      none
+
+  Description
+    Close the file identified by VFid.
+
+  .. Note::
+
+     The flags argument is bogus and not used. However, Venus' code
+     has room to deal with an execp input field; this field should
+     probably be used to inform Venus that the file was closed but is
+     still memory mapped for execution. There are comments about fetching
+     versus not fetching the data in Venus vproc_vfscalls. This seems
+     silly. If a file is being closed, the data in the container file is
+     to be the new data. Here again the execp flag might be in play to
+     create confusion: currently Venus might think a file can be flushed
+     from the cache when it is still memory mapped. This needs to be
+     understood.
+
+
+4.17. ioctl
+------------
+
+
+  Summary
+    Do an ioctl on a file. This includes the pioctl interface.
+
+  Arguments
+    in::
+
+        struct cfs_ioctl_in {
+            ViceFid VFid;
+            int cmd;
+            int len;
+            int rwflag;
+            char *data;         /* Place holder for data. */
+        } cfs_ioctl;
+
+
+
+    out::
+
+
+        struct cfs_ioctl_out {
+            int len;
+            caddr_t data;       /* Place holder for data. */
+        } cfs_ioctl;
+
+
+
+  Description
+    Do an ioctl operation on a file. The command, len and
+    data arguments are filled as usual. flags is not used by Venus.
+
+  .. Note::
+
+     Another bogus parameter. flags is not used. What is the
+     business about PREFETCHING in the Venus code?
+
+
+
+4.18. rename
+-------------
+
+
+  Summary
+    Rename a fid.
+
+  Arguments
+    in::
+
+        struct cfs_rename_in {
+            ViceFid sourceFid;
+            char *srcname;
+            ViceFid destFid;
+            char *destname;
+        } cfs_rename;
+
+
+
+    out
+
+      none
+
+  Description
+    Rename the object with name srcname in directory
+    sourceFid to destname in destFid. It is important that the names
+    srcname and destname are 0 terminated strings. Strings in Unix
+    kernels are not always null terminated.
+
+
+4.19. readdir
+--------------
+
+
+  Summary
+    Read directory entries.
+
+  Arguments
+    in::
+
+        struct cfs_readdir_in {
+            ViceFid VFid;
+            int count;
+            int offset;
+        } cfs_readdir;
+
+
+
+
+    out::
+
+        struct cfs_readdir_out {
+            int size;
+            caddr_t data;       /* Place holder for data. */
+        } cfs_readdir;
+
+
+
+  Description
+    Read directory entries from VFid starting at offset and
+    read at most count bytes. Returns the data in data and the size
+    in size.
+
+
+  .. Note::
+
+     This call is not used. Readdir operations exploit container
+     files. We will re-evaluate this during the directory revamp which is
+     about to take place.
+
+
+4.20. vget
+-----------
+
+
+  Summary
+    Instructs Venus to do an FSDB->Get.
+
+  Arguments
+    in::
+
+        struct cfs_vget_in {
+            ViceFid VFid;
+        } cfs_vget;
+
+
+
+    out::
+
+        struct cfs_vget_out {
+            ViceFid VFid;
+            int vtype;
+        } cfs_vget;
+
+
+
+  Description
+    This upcall asks Venus to do a get operation on an fsobj
+    labelled by VFid.
+
+  .. Note::
+
+     This operation is not used. However, it is extremely useful
+     since it can be used to deal with read/write memory mapped files.
+     These can be "pinned" in the Venus cache using vget and released with
+     inactive.
+
+
+4.21. fsync
+------------
+
+
+  Summary
+    Tell Venus to update the RVM attributes of a file.
+
+  Arguments
+    in::
+
+        struct cfs_fsync_in {
+            ViceFid VFid;
+        } cfs_fsync;
+
+
+
+    out
+
+      none
+
+  Description
+    Ask Venus to update RVM attributes of object VFid. This
+    should be called as part of kernel level fsync type calls. The
+    result indicates if the syncing was successful.
+
+  .. Note:: Linux does not implement this call. It should.
+
+
+4.22. inactive
+---------------
+
+
+  Summary
+    Tell Venus a vnode is no longer in use.
+
+  Arguments
+    in::
+
+        struct cfs_inactive_in {
+            ViceFid VFid;
+        } cfs_inactive;
+
+
+
+    out
+
+      none
+
+  Description
+    This operation returns EOPNOTSUPP.
+
+  .. Note:: This should perhaps be removed.
+
+
+4.23. rdwr
+-----------
+
+
+  Summary
+    Read or write from a file.
+
+  Arguments
+    in::
+
+        struct cfs_rdwr_in {
+            ViceFid VFid;
+            int rwflag;
+            int count;
+            int offset;
+            int ioflag;
+            caddr_t data;       /* Place holder for data. */
+        } cfs_rdwr;
+
+
+
+
+    out::
+
+        struct cfs_rdwr_out {
+            int rwflag;
+            int count;
+            caddr_t data;       /* Place holder for data. */
+        } cfs_rdwr;
+
+
+
+  Description
+    This upcall asks Venus to read or write from a file.
+
+
+  .. Note::
+
+     It should be removed since it is against the Coda philosophy that
+     read/write operations never reach Venus. I have been told the
+     operation does not work. It is not currently used.
+
+
+4.24. odymount
+---------------
+
+
+  Summary
+    Allows mounting multiple Coda "filesystems" on one Unix mount point.
+
+  Arguments
+    in::
+
+        struct ody_mount_in {
+            char *name;         /* Place holder for data. */
+        } ody_mount;
+
+
+
+    out::
+
+        struct ody_mount_out {
+            ViceFid VFid;
+        } ody_mount;
+
+
+
+  Description
+    Asks Venus to return the rootfid of a Coda system named
+    name. The fid is returned in VFid.
+
+  .. Note::
+
+     This call was used by David for dynamic sets. It should be
+     removed since it causes a jungle of pointers in the VFS mounting area.
+     It is not used by Coda proper. Call is not implemented by Venus.
+
+
+4.25. ody_lookup
+-----------------
+
+
+  Summary
+    Looks up something.
+
+  Arguments
+    in
+
+      irrelevant
+
+
+    out
+
+      irrelevant
+
+
+  .. Note:: Gut it. Call is not implemented by Venus.
+
+
+4.26. ody_expand
+-----------------
+
+
+  Summary
+    Expands something in a dynamic set.
+
+  Arguments
+    in
+
+      irrelevant
+
+    out
+
+      irrelevant
+
+  .. Note:: Gut it. Call is not implemented by Venus.
+
+
+4.27. prefetch
+---------------
+
+
+  Summary
+    Prefetch a dynamic set.
+
+  Arguments
+
+    in
+
+      Not documented.
+
+    out
+
+      Not documented.
+
+  Description
+    Venus worker.cc has support for this call, although it is
+    noted that it doesn't work. Not surprising, since the kernel does not
+    have support for it. (ODY_PREFETCH is not a defined operation).
+
+
+  .. Note:: Gut it. It isn't working and isn't used by Coda.
+
+
+
+4.28. signal
+-------------
+
+
+  Summary
+    Send Venus a signal about an upcall.
+
+  Arguments
+    in
+
+      none
+
+    out
+
+      not applicable.
+
+  Description
+    This is an out-of-band upcall to Venus to inform Venus
+    that the calling process received a signal after Venus read the
+    message from the input queue. Venus is supposed to clean up the
+    operation.
+
+  Errors
+    No reply is given.
+
+  .. Note::
+
+     We need to better understand what Venus needs to clean up and if
+     it is doing this correctly. Also we need to correctly handle
+     situations where a single system call leads to multiple upcalls.
+     It would be important to know what state changes in Venus take
+     place after an upcall for which the kernel is responsible for
+     notifying Venus to clean up (e.g. open definitely is such a state
+     change, but many others may not be).
+
+
+5. The minicache and downcalls
+===============================
+
+
+  The Coda FS Driver can cache results of lookup and access upcalls, to
+  limit the frequency of upcalls. Upcalls carry a price since a process
+  context switch needs to take place. The counterpart of caching the
+  information is that Venus will notify the FS Driver that cached
+  entries must be flushed or renamed.
+
+  The kernel code generally has to maintain a structure which links the
+  internal file handles (called vnodes in BSD, inodes in Linux and
+  FileHandles in Windows) with the ViceFid's which Venus maintains. The
+  reason is that frequent translations back and forth are needed in
+  order to make upcalls and use the results of upcalls. Such linking
+  objects are called cnodes.
+
+  The current minicache implementations have cache entries which record
+  the following (see the sketch after this list):
+
+  1. the name of the file
+
+  2. the cnode of the directory containing the object
+
+  3. a list of CodaCred's for which the lookup is permitted.
+
+  4. the cnode of the object
+
+  The lookup call in the Coda FS Driver may request the cnode of the
+  desired object from the cache, by passing its name, directory and the
+  CodaCred's of the caller. The cache will return the cnode or indicate
+  that it cannot be found. The Coda FS Driver must be careful to
+  invalidate cache entries when it modifies or removes objects.
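+
+  A sketch of such a cache entry is given below; the field and bound
+  names are illustrative, since each implementation declares its own::
+
+        struct mc_entry {
+            char name[CFS_MAXNAMLEN];   /* 1. name of the file */
+            struct cnode *dir;          /* 2. cnode of the containing directory */
+            struct CodaCred creds[MC_MAXCREDS]; /* 3. creds allowed to look up */
+            int ncreds;
+            struct cnode *obj;          /* 4. cnode of the object itself */
+        };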
+
+  When Venus obtains information that indicates that cache entries are
+  no longer valid, it will make a downcall to the kernel. Downcalls are
+  intercepted by the Coda FS Driver and lead to cache invalidations of
+  the kind described below. The Coda FS Driver does not return an error
+  unless the downcall data could not be read into kernel memory.
+
+
+5.1. INVALIDATE
+----------------
+
+
+  No information is available on this call.
+
+
+5.2. FLUSH
+-----------
+
+
+
+  Arguments
+    None
+
+  Summary
+    Flush the name cache entirely.
+
+  Description
+    Venus issues this call upon startup and when it dies. This
+    is to prevent stale cache information from being held. Some operating
+    systems allow the kernel name cache to be switched off dynamically.
+    When this is done, this downcall is made.
+
+
+5.3. PURGEUSER
+---------------
+
+
+  Arguments
+    ::
+
+        struct cfs_purgeuser_out {/* CFS_PURGEUSER is a venus->kernel call */
+            struct CodaCred cred;
+        } cfs_purgeuser;
+
+
+
+  Description
+    Remove all entries in the cache carrying the Cred. This
+    call is issued when tokens for a user expire or are flushed.
+
+
+5.4. ZAPFILE
+-------------
+
+
+  Arguments
+    ::
+
+        struct cfs_zapfile_out {  /* CFS_ZAPFILE is a venus->kernel call */
+            ViceFid CodaFid;
+        } cfs_zapfile;
+
+
+
+  Description
+    Remove all entries which have the (dir vnode, name) pair.
+    This is issued as a result of an invalidation of cached attributes of
+    a vnode.
+
+  .. Note::
+
+     Call is not named correctly in NetBSD and Mach. The minicache
+     zapfile routine takes different arguments. Linux does not implement
+     the invalidation of attributes correctly.
+
+
+
+5.5. ZAPDIR
+------------
+
+
+  Arguments
+    ::
+
+        struct cfs_zapdir_out {   /* CFS_ZAPDIR is a venus->kernel call */
+            ViceFid CodaFid;
+        } cfs_zapdir;
+
+
+
+  Description
+    Remove all entries in the cache lying in a directory
+    CodaFid, and all children of this directory. This call is issued when
+    Venus receives a callback on the directory.
+
+
+5.6. ZAPVNODE
+--------------
+
+
+
+  Arguments
+    ::
+
+        struct cfs_zapvnode_out { /* CFS_ZAPVNODE is a venus->kernel call */
+            struct CodaCred cred;
+            ViceFid VFid;
+        } cfs_zapvnode;
+
+
+
+  Description
+    Remove all entries in the cache carrying the cred and VFid
+    as in the arguments. This downcall is probably never issued.
+
+
+5.7. PURGEFID
+--------------
+
+
+  Arguments
+    ::
+
+        struct cfs_purgefid_out { /* CFS_PURGEFID is a venus->kernel call */
+            ViceFid CodaFid;
+        } cfs_purgefid;
+
+
+
+  Description
+    Flush the attributes for the file. If it is a dir (odd
+    vnode), purge its children from the namecache and remove the file
+    from the namecache.
+
+
+
+5.8. REPLACE
+-------------
+
+
+  Summary
+    Replace the Fid's for a collection of names.
+
+  Arguments
+    ::
+
+        struct cfs_replace_out { /* cfs_replace is a venus->kernel call */
+            ViceFid NewFid;
+            ViceFid OldFid;
+        } cfs_replace;
+
+
+
+  Description
+    This routine replaces a ViceFid in the name cache with
+    another. It is added to allow Venus, during reintegration, to
+    replace with global fids the temporary fids that were allocated
+    locally while disconnected, even when the reference counts on those
+    fids are not zero.
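+
+  Taken together, the downcalls map naturally onto a single dispatch
+  routine in the FS driver. A sketch follows; the opcode constants are
+  assumed to be named as in the struct comments above, and the argument
+  union and the mc_* invalidation helpers are illustrative::
+
+        union downcall_args {
+            struct cfs_purgeuser_out purgeuser;
+            struct cfs_zapfile_out zapfile;
+            struct cfs_zapdir_out zapdir;
+            struct cfs_zapvnode_out zapvnode;
+            struct cfs_purgefid_out purgefid;
+            struct cfs_replace_out replace;
+        };
+
+        int coda_downcall(u_long opcode, union downcall_args *out)
+        {
+            switch (opcode) {
+            case CFS_FLUSH:     /* 5.2: drop the entire name cache */
+                mc_flush();
+                return 0;
+            case CFS_PURGEUSER: /* 5.3: drop entries carrying the cred */
+                mc_purge_user(&out->purgeuser.cred);
+                return 0;
+            case CFS_ZAPFILE:   /* 5.4: drop (dir vnode, name) entries */
+                mc_zap_file(&out->zapfile.CodaFid);
+                return 0;
+            case CFS_ZAPDIR:    /* 5.5: drop the directory and its children */
+                mc_zap_dir(&out->zapdir.CodaFid);
+                return 0;
+            case CFS_ZAPVNODE:  /* 5.6: drop entries with this cred and fid */
+                mc_zap_vnode(&out->zapvnode.cred, &out->zapvnode.VFid);
+                return 0;
+            case CFS_PURGEFID:  /* 5.7: flush attributes, purge if a dir */
+                mc_purge_fid(&out->purgefid.CodaFid);
+                return 0;
+            case CFS_REPLACE:   /* 5.8: swap OldFid for NewFid */
+                mc_replace(&out->replace.OldFid, &out->replace.NewFid);
+                return 0;
+            default:
+                return -1;      /* unknown downcall */
+            }
+        }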
+
+
+6. Initialization and cleanup
+==============================
+
+
+  This section gives brief hints as to desirable features for the Coda
+  FS Driver at startup and upon shutdown or Venus failures. Before
+  entering the discussion it is useful to repeat that the Coda FS Driver
+  maintains the following data:
+
+
+  1. message queues
+
+  2. cnodes
+
+  3. name cache entries
+
+  The name cache entries are entirely private to the driver, so they
+  can easily be manipulated. The message queues will generally have
+  clear points of initialization and destruction. The cnodes are
+  much more delicate. User processes hold reference counts in Coda
+  filesystems and it can be difficult to clean up the cnodes.
+
+  The driver can expect requests through:
+
+  1. the message subsystem
+
+  2. the VFS layer
+
+  3. pioctl interface
+
+  Currently the pioctl passes through the VFS for Coda so we can
+  treat these similarly.
+
+
+6.1. Requirements
+------------------
+
+
+  The following requirements should be accommodated (see also the
+  sketch after the closing note):
+
+  1. The message queues should have open and close routines. On Unix
+     the opening and closing of the character device are such routines.
+
+     - Before opening, no messages can be placed.
+
+     - Opening will remove any old messages still pending.
+
+     - Close will notify any sleeping processes that their upcall cannot
+       be completed.
+
+     - Close will free all memory allocated by the message queues.
+
+
+  2. At open the namecache shall be initialized to empty state.
+
+  3. Before the message queues are open, all VFS operations will fail.
+     Fortunately this can be achieved by making sure that mounting the
+     Coda filesystem cannot succeed before opening.
+
+  4. After closing of the queues, no VFS operations can succeed. Here
+     one needs to be careful, since a few operations (lookup,
+     read/write, readdir) can proceed without upcalls. These must be
+     explicitly blocked.
+
+  5. Upon closing the namecache shall be flushed and disabled.
+
+  6. All memory held by cnodes can be freed without relying on upcalls.
+
+  7. Unmounting the file system can be done without relying on upcalls.
+
+  8. Mounting the Coda filesystem should fail gracefully if Venus cannot
+     get the rootfid or the attributes of the rootfid. The latter is
+     best implemented by Venus fetching these objects before attempting
+     to mount.
+
+  .. Note::
+
+     NetBSD in particular, but also Linux, have not implemented the
+     above requirements fully. For smooth operation this needs to be
+     corrected.
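+
+  Requirements 1 and 3 amount to a simple guard: the mount entry point
+  must fail until Venus has opened the message device, and closing the
+  device must unblock every sleeping upcall. A sketch under those
+  assumptions; all helper names are illustrative, not an actual
+  implementation::
+
+        static int venus_alive; /* set while the message device is open */
+
+        /* Defined elsewhere in the driver; shown here as prototypes. */
+        void drop_stale_messages(void);
+        void wake_all_waiters(void);
+        void free_message_queues(void);
+        void flush_and_disable_namecache(void);
+
+        int coda_dev_open(void)
+        {
+            drop_stale_messages();      /* opening removes old messages */
+            venus_alive = 1;
+            return 0;
+        }
+
+        void coda_dev_release(void)
+        {
+            venus_alive = 0;
+            wake_all_waiters();         /* sleeping upcalls now fail */
+            free_message_queues();
+            flush_and_disable_namecache();
+        }
+
+        int coda_mount(void)
+        {
+            if (!venus_alive)
+                return -1;      /* cannot mount before the device is open */
+            /* ... ask Venus for the root fid and its attributes ... */
+            return 0;
+        }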
- ______________________________________________________________________ - - Table of Contents - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1. Introduction - - 2. Servicing Coda filesystem calls - - 3. The message layer - - 3.1 Implementation details - - 4. The interface at the call level - - 4.1 Data structures shared by the kernel and Venus - 4.2 The pioctl interface - 4.3 root - 4.4 lookup - 4.5 getattr - 4.6 setattr - 4.7 access - 4.8 create - 4.9 mkdir - 4.10 link - 4.11 symlink - 4.12 remove - 4.13 rmdir - 4.14 readlink - 4.15 open - 4.16 close - 4.17 ioctl - 4.18 rename - 4.19 readdir - 4.20 vget - 4.21 fsync - 4.22 inactive - 4.23 rdwr - 4.24 odymount - 4.25 ody_lookup - 4.26 ody_expand - 4.27 prefetch - 4.28 signal - - 5. The minicache and downcalls - - 5.1 INVALIDATE - 5.2 FLUSH - 5.3 PURGEUSER - 5.4 ZAPFILE - 5.5 ZAPDIR - 5.6 ZAPVNODE - 5.7 PURGEFID - 5.8 REPLACE - - 6. Initialization and cleanup - - 6.1 Requirements - - - ______________________________________________________________________ - 0wpage - - 11.. IInnttrroodduuccttiioonn - - - - A key component in the Coda Distributed File System is the cache - manager, _V_e_n_u_s. - - - When processes on a Coda enabled system access files in the Coda - filesystem, requests are directed at the filesystem layer in the - operating system. The operating system will communicate with Venus to - service the request for the process. Venus manages a persistent - client cache and makes remote procedure calls to Coda file servers and - related servers (such as authentication servers) to service these - requests it receives from the operating system. When Venus has - serviced a request it replies to the operating system with appropriate - return codes, and other data related to the request. Optionally the - kernel support for Coda may maintain a minicache of recently processed - requests to limit the number of interactions with Venus. Venus - possesses the facility to inform the kernel when elements from its - minicache are no longer valid. - - This document describes precisely this communication between the - kernel and Venus. The definitions of so called upcalls and downcalls - will be given with the format of the data they handle. We shall also - describe the semantic invariants resulting from the calls. - - Historically Coda was implemented in a BSD file system in Mach 2.6. - The interface between the kernel and Venus is very similar to the BSD - VFS interface. Similar functionality is provided, and the format of - the parameters and returned data is very similar to the BSD VFS. This - leads to an almost natural environment for implementing a kernel-level - filesystem driver for Coda in a BSD system. However, other operating - systems such as Linux and Windows 95 and NT have virtual filesystem - with different interfaces. - - To implement Coda on these systems some reverse engineering of the - Venus/Kernel protocol is necessary. Also it came to light that other - systems could profit significantly from certain small optimizations - and modifications to the protocol. To facilitate this work as well as - to make future ports easier, communication between Venus and the - kernel should be documented in great detail. This is the aim of this - document. - - 0wpage - - 22.. SSeerrvviicciinngg CCooddaa ffiilleessyysstteemm ccaallllss - - The service of a request for a Coda file system service originates in - a process PP which accessing a Coda file. 
It makes a system call which - traps to the OS kernel. Examples of such calls trapping to the kernel - are _r_e_a_d_, _w_r_i_t_e_, _o_p_e_n_, _c_l_o_s_e_, _c_r_e_a_t_e_, _m_k_d_i_r_, _r_m_d_i_r_, _c_h_m_o_d in a Unix - context. Similar calls exist in the Win32 environment, and are named - _C_r_e_a_t_e_F_i_l_e_, . - - Generally the operating system handles the request in a virtual - filesystem (VFS) layer, which is named I/O Manager in NT and IFS - manager in Windows 95. The VFS is responsible for partial processing - of the request and for locating the specific filesystem(s) which will - service parts of the request. Usually the information in the path - assists in locating the correct FS drivers. Sometimes after extensive - pre-processing, the VFS starts invoking exported routines in the FS - driver. This is the point where the FS specific processing of the - request starts, and here the Coda specific kernel code comes into - play. - - The FS layer for Coda must expose and implement several interfaces. - First and foremost the VFS must be able to make all necessary calls to - the Coda FS layer, so the Coda FS driver must expose the VFS interface - as applicable in the operating system. These differ very significantly - among operating systems, but share features such as facilities to - read/write and create and remove objects. The Coda FS layer services - such VFS requests by invoking one or more well defined services - offered by the cache manager Venus. When the replies from Venus have - come back to the FS driver, servicing of the VFS call continues and - finishes with a reply to the kernel's VFS. Finally the VFS layer - returns to the process. - - As a result of this design a basic interface exposed by the FS driver - must allow Venus to manage message traffic. In particular Venus must - be able to retrieve and place messages and to be notified of the - arrival of a new message. The notification must be through a mechanism - which does not block Venus since Venus must attend to other tasks even - when no messages are waiting or being processed. - - - - - - - Interfaces of the Coda FS Driver - - Furthermore the FS layer provides for a special path of communication - between a user process and Venus, called the pioctl interface. The - pioctl interface is used for Coda specific services, such as - requesting detailed information about the persistent cache managed by - Venus. Here the involvement of the kernel is minimal. It identifies - the calling process and passes the information on to Venus. When - Venus replies the response is passed back to the caller in unmodified - form. - - Finally Venus allows the kernel FS driver to cache the results from - certain services. This is done to avoid excessive context switches - and results in an efficient system. However, Venus may acquire - information, for example from the network which implies that cached - information must be flushed or replaced. Venus then makes a downcall - to the Coda FS layer to request flushes or updates in the cache. The - kernel FS driver handles such requests synchronously. - - Among these interfaces the VFS interface and the facility to place, - receive and be notified of messages are platform specific. We will - not go into the calls exported to the VFS layer but we will state the - requirements of the message exchange mechanism. - - 0wpage - - 33.. TThhee mmeessssaaggee llaayyeerr - - - - At the lowest level the communication between Venus and the FS driver - proceeds through messages. 
The synchronization between processes - requesting Coda file service and Venus relies on blocking and waking - up processes. The Coda FS driver processes VFS- and pioctl-requests - on behalf of a process P, creates messages for Venus, awaits replies - and finally returns to the caller. The implementation of the exchange - of messages is platform specific, but the semantics have (so far) - appeared to be generally applicable. Data buffers are created by the - FS Driver in kernel memory on behalf of P and copied to user memory in - Venus. - - The FS Driver while servicing P makes upcalls to Venus. Such an - upcall is dispatched to Venus by creating a message structure. The - structure contains the identification of P, the message sequence - number, the size of the request and a pointer to the data in kernel - memory for the request. Since the data buffer is re-used to hold the - reply from Venus, there is a field for the size of the reply. A flags - field is used in the message to precisely record the status of the - message. Additional platform dependent structures involve pointers to - determine the position of the message on queues and pointers to - synchronization objects. In the upcall routine the message structure - is filled in, flags are set to 0, and it is placed on the _p_e_n_d_i_n_g - queue. The routine calling upcall is responsible for allocating the - data buffer; its structure will be described in the next section. - - A facility must exist to notify Venus that the message has been - created, and implemented using available synchronization objects in - the OS. This notification is done in the upcall context of the process - P. When the message is on the pending queue, process P cannot proceed - in upcall. The (kernel mode) processing of P in the filesystem - request routine must be suspended until Venus has replied. Therefore - the calling thread in P is blocked in upcall. A pointer in the - message structure will locate the synchronization object on which P is - sleeping. - - Venus detects the notification that a message has arrived, and the FS - driver allow Venus to retrieve the message with a getmsg_from_kernel - call. This action finishes in the kernel by putting the message on the - queue of processing messages and setting flags to READ. Venus is - passed the contents of the data buffer. The getmsg_from_kernel call - now returns and Venus processes the request. - - At some later point the FS driver receives a message from Venus, - namely when Venus calls sendmsg_to_kernel. At this moment the Coda FS - driver looks at the contents of the message and decides if: - - - +o the message is a reply for a suspended thread P. If so it removes - the message from the processing queue and marks the message as - WRITTEN. Finally, the FS driver unblocks P (still in the kernel - mode context of Venus) and the sendmsg_to_kernel call returns to - Venus. The process P will be scheduled at some point and continues - processing its upcall with the data buffer replaced with the reply - from Venus. - - +o The message is a _d_o_w_n_c_a_l_l. A downcall is a request from Venus to - the FS Driver. The FS driver processes the request immediately - (usually a cache eviction or replacement) and when it finishes - sendmsg_to_kernel returns. - - Now P awakes and continues processing upcall. There are some - subtleties to take account of. 
First P will determine if it was woken - up in upcall by a signal from some other source (for example an - attempt to terminate P) or as is normally the case by Venus in its - sendmsg_to_kernel call. In the normal case, the upcall routine will - deallocate the message structure and return. The FS routine can proceed - with its processing. - - - - - - - - Sleeping and IPC arrangements - - In case P is woken up by a signal and not by Venus, it will first look - at the flags field. If the message is not yet READ, the process P can - handle its signal without notifying Venus. If Venus has READ, and - the request should not be processed, P can send Venus a signal message - to indicate that it should disregard the previous message. Such - signals are put in the queue at the head, and read first by Venus. If - the message is already marked as WRITTEN it is too late to stop the - processing. The VFS routine will now continue. (-- If a VFS request - involves more than one upcall, this can lead to complicated state, an - extra field "handle_signals" could be added in the message structure - to indicate points of no return have been passed.--) - - - - 33..11.. IImmpplleemmeennttaattiioonn ddeettaaiillss - - The Unix implementation of this mechanism has been through the - implementation of a character device associated with Coda. Venus - retrieves messages by doing a read on the device, replies are sent - with a write and notification is through the select system call on the - file descriptor for the device. The process P is kept waiting on an - interruptible wait queue object. - - In Windows NT and the DPMI Windows 95 implementation a DeviceIoControl - call is used. The DeviceIoControl call is designed to copy buffers - from user memory to kernel memory with OPCODES. The sendmsg_to_kernel - is issued as a synchronous call, while the getmsg_from_kernel call is - asynchronous. Windows EventObjects are used for notification of - message arrival. The process P is kept waiting on a KernelEvent - object in NT and a semaphore in Windows 95. - - 0wpage - - 44.. TThhee iinntteerrffaaccee aatt tthhee ccaallll lleevveell - - - This section describes the upcalls a Coda FS driver can make to Venus. - Each of these upcalls make use of two structures: inputArgs and - outputArgs. In pseudo BNF form the structures take the following - form: - - - struct inputArgs { - u_long opcode; - u_long unique; /* Keep multiple outstanding msgs distinct */ - u_short pid; /* Common to all */ - u_short pgid; /* Common to all */ - struct CodaCred cred; /* Common to all */ - - <union "in" of call dependent parts of inputArgs> - }; - - struct outputArgs { - u_long opcode; - u_long unique; /* Keep multiple outstanding msgs distinct */ - u_long result; - - <union "out" of call dependent parts of inputArgs> - }; - - - - Before going on let us elucidate the role of the various fields. The - inputArgs start with the opcode which defines the type of service - requested from Venus. There are approximately 30 upcalls at present - which we will discuss. The unique field labels the inputArg with a - unique number which will identify the message uniquely. A process and - process group id are passed. Finally the credentials of the caller - are included. - - Before delving into the specific calls we need to discuss a variety of - data structures shared by the kernel and Venus. - - - - - 44..11.. 
DDaattaa ssttrruuccttuurreess sshhaarreedd bbyy tthhee kkeerrnneell aanndd VVeennuuss - - - The CodaCred structure defines a variety of user and group ids as - they are set for the calling process. The vuid_t and vgid_t are 32 bit - unsigned integers. It also defines group membership in an array. On - Unix the CodaCred has proven sufficient to implement good security - semantics for Coda but the structure may have to undergo modification - for the Windows environment when these mature. - - struct CodaCred { - vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, effective, set, fs uid */ - vgid_t cr_gid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */ - vgid_t cr_groups[NGROUPS]; /* Group membership for caller */ - }; - - - - NNOOTTEE It is questionable if we need CodaCreds in Venus. Finally Venus - doesn't know about groups, although it does create files with the - default uid/gid. Perhaps the list of group membership is superfluous. - - - The next item is the fundamental identifier used to identify Coda - files, the ViceFid. A fid of a file uniquely defines a file or - directory in the Coda filesystem within a _c_e_l_l. (-- A _c_e_l_l is a - group of Coda servers acting under the aegis of a single system - control machine or SCM. See the Coda Administration manual for a - detailed description of the role of the SCM.--) - - - typedef struct ViceFid { - VolumeId Volume; - VnodeId Vnode; - Unique_t Unique; - } ViceFid; - - - - Each of the constituent fields: VolumeId, VnodeId and Unique_t are - unsigned 32 bit integers. We envisage that a further field will need - to be prefixed to identify the Coda cell; this will probably take the - form of a Ipv6 size IP address naming the Coda cell through DNS. - - The next important structure shared between Venus and the kernel is - the attributes of the file. The following structure is used to - exchange information. It has room for future extensions such as - support for device files (currently not present in Coda). - - - - - - - - - - - - - - - - - struct coda_timespec { - int64_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ - }; - - struct coda_vattr { - enum coda_vtype va_type; /* vnode type (for create) */ - u_short va_mode; /* files access mode and type */ - short va_nlink; /* number of references to file */ - vuid_t va_uid; /* owner user id */ - vgid_t va_gid; /* owner group id */ - long va_fsid; /* file system id (dev for now) */ - long va_fileid; /* file id */ - u_quad_t va_size; /* file size in bytes */ - long va_blocksize; /* blocksize preferred for i/o */ - struct coda_timespec va_atime; /* time of last access */ - struct coda_timespec va_mtime; /* time of last modification */ - struct coda_timespec va_ctime; /* time file changed */ - u_long va_gen; /* generation number of file */ - u_long va_flags; /* flags defined for file */ - dev_t va_rdev; /* device special file represents */ - u_quad_t va_bytes; /* bytes of disk space held by file */ - u_quad_t va_filerev; /* file modification number */ - u_int va_vaflags; /* operations flags, see below */ - long va_spare; /* remain quad aligned */ - }; - - - - - 44..22.. TThhee ppiiooccttll iinntteerrffaaccee - - - Coda specific requests can be made by application through the pioctl - interface. The pioctl is implemented as an ordinary ioctl on a - fictitious file /coda/.CONTROL. The pioctl call opens this file, gets - a file handle and makes the ioctl call. Finally it closes the file. 
- - The kernel involvement in this is limited to providing the facility to - open and close and pass the ioctl message _a_n_d to verify that a path in - the pioctl data buffers is a file in a Coda filesystem. - - The kernel is handed a data packet of the form: - - struct { - const char *path; - struct ViceIoctl vidata; - int follow; - } data; - - - - where - - - struct ViceIoctl { - caddr_t in, out; /* Data to be transferred in, or out */ - short in_size; /* Size of input buffer <= 2K */ - short out_size; /* Maximum size of output buffer, <= 2K */ - }; - - - - The path must be a Coda file, otherwise the ioctl upcall will not be - made. - - NNOOTTEE The data structures and code are a mess. We need to clean this - up. - - We now proceed to document the individual calls: - - 0wpage - - 44..33.. rroooott - - - AArrgguummeennttss - - iinn empty - - oouutt - - struct cfs_root_out { - ViceFid VFid; - } cfs_root; - - - - DDeessccrriippttiioonn This call is made to Venus during the initialization of - the Coda filesystem. If the result is zero, the cfs_root structure - contains the ViceFid of the root of the Coda filesystem. If a non-zero - result is generated, its value is a platform dependent error code - indicating the difficulty Venus encountered in locating the root of - the Coda filesystem. - - 0wpage - - 44..44.. llooookkuupp - - - SSuummmmaarryy Find the ViceFid and type of an object in a directory if it - exists. - - AArrgguummeennttss - - iinn - - struct cfs_lookup_in { - ViceFid VFid; - char *name; /* Place holder for data. */ - } cfs_lookup; - - - - oouutt - - struct cfs_lookup_out { - ViceFid VFid; - int vtype; - } cfs_lookup; - - - - DDeessccrriippttiioonn This call is made to determine the ViceFid and filetype of - a directory entry. The directory entry requested carries name name - and Venus will search the directory identified by cfs_lookup_in.VFid. - The result may indicate that the name does not exist, or that - difficulty was encountered in finding it (e.g. due to disconnection). - If the result is zero, the field cfs_lookup_out.VFid contains the - targets ViceFid and cfs_lookup_out.vtype the coda_vtype giving the - type of object the name designates. - - The name of the object is an 8 bit character string of maximum length - CFS_MAXNAMLEN, currently set to 256 (including a 0 terminator.) - - It is extremely important to realize that Venus bitwise ors the field - cfs_lookup.vtype with CFS_NOCACHE to indicate that the object should - not be put in the kernel name cache. - - NNOOTTEE The type of the vtype is currently wrong. It should be - coda_vtype. Linux does not take note of CFS_NOCACHE. It should. - - 0wpage - - 44..55.. ggeettaattttrr - - - SSuummmmaarryy Get the attributes of a file. - - AArrgguummeennttss - - iinn - - struct cfs_getattr_in { - ViceFid VFid; - struct coda_vattr attr; /* XXXXX */ - } cfs_getattr; - - - - oouutt - - struct cfs_getattr_out { - struct coda_vattr attr; - } cfs_getattr; - - - - DDeessccrriippttiioonn This call returns the attributes of the file identified by - fid. - - EErrrroorrss Errors can occur if the object with fid does not exist, is - unaccessible or if the caller does not have permission to fetch - attributes. - - NNoottee Many kernel FS drivers (Linux, NT and Windows 95) need to acquire - the attributes as well as the Fid for the instantiation of an internal - "inode" or "FileHandle". 
A significant improvement in performance on - such systems could be made by combining the _l_o_o_k_u_p and _g_e_t_a_t_t_r calls - both at the Venus/kernel interaction level and at the RPC level. - - The vattr structure included in the input arguments is superfluous and - should be removed. - - 0wpage - - 44..66.. sseettaattttrr - - - SSuummmmaarryy Set the attributes of a file. - - AArrgguummeennttss - - iinn - - struct cfs_setattr_in { - ViceFid VFid; - struct coda_vattr attr; - } cfs_setattr; - - - - - oouutt - empty - - DDeessccrriippttiioonn The structure attr is filled with attributes to be changed - in BSD style. Attributes not to be changed are set to -1, apart from - vtype which is set to VNON. Other are set to the value to be assigned. - The only attributes which the FS driver may request to change are the - mode, owner, groupid, atime, mtime and ctime. The return value - indicates success or failure. - - EErrrroorrss A variety of errors can occur. The object may not exist, may - be inaccessible, or permission may not be granted by Venus. - - 0wpage - - 44..77.. aacccceessss - - - SSuummmmaarryy - - AArrgguummeennttss - - iinn - - struct cfs_access_in { - ViceFid VFid; - int flags; - } cfs_access; - - - - oouutt - empty - - DDeessccrriippttiioonn Verify if access to the object identified by VFid for - operations described by flags is permitted. The result indicates if - access will be granted. It is important to remember that Coda uses - ACLs to enforce protection and that ultimately the servers, not the - clients enforce the security of the system. The result of this call - will depend on whether a _t_o_k_e_n is held by the user. - - EErrrroorrss The object may not exist, or the ACL describing the protection - may not be accessible. - - 0wpage - - 44..88.. ccrreeaattee - - - SSuummmmaarryy Invoked to create a file - - AArrgguummeennttss - - iinn - - struct cfs_create_in { - ViceFid VFid; - struct coda_vattr attr; - int excl; - int mode; - char *name; /* Place holder for data. */ - } cfs_create; - - - - - oouutt - - struct cfs_create_out { - ViceFid VFid; - struct coda_vattr attr; - } cfs_create; - - - - DDeessccrriippttiioonn This upcall is invoked to request creation of a file. - The file will be created in the directory identified by VFid, its name - will be name, and the mode will be mode. If excl is set an error will - be returned if the file already exists. If the size field in attr is - set to zero the file will be truncated. The uid and gid of the file - are set by converting the CodaCred to a uid using a macro CRTOUID - (this macro is platform dependent). Upon success the VFid and - attributes of the file are returned. The Coda FS Driver will normally - instantiate a vnode, inode or file handle at kernel level for the new - object. - - - EErrrroorrss A variety of errors can occur. Permissions may be insufficient. - If the object exists and is not a file the error EISDIR is returned - under Unix. - - NNOOTTEE The packing of parameters is very inefficient and appears to - indicate confusion between the system call creat and the VFS operation - create. The VFS operation create is only called to create new objects. - This create call differs from the Unix one in that it is not invoked - to return a file descriptor. The truncate and exclusive options, - together with the mode, could simply be part of the mode as it is - under Unix. There should be no flags argument; this is used in open - (2) to return a file descriptor for READ or WRITE mode. 
- - The attributes of the directory should be returned too, since the size - and mtime changed. - - 0wpage - - 44..99.. mmkkddiirr - - - SSuummmmaarryy Create a new directory. - - AArrgguummeennttss - - iinn - - struct cfs_mkdir_in { - ViceFid VFid; - struct coda_vattr attr; - char *name; /* Place holder for data. */ - } cfs_mkdir; - - - - oouutt - - struct cfs_mkdir_out { - ViceFid VFid; - struct coda_vattr attr; - } cfs_mkdir; - - - - - DDeessccrriippttiioonn This call is similar to create but creates a directory. - Only the mode field in the input parameters is used for creation. - Upon successful creation, the attr returned contains the attributes of - the new directory. - - EErrrroorrss As for create. - - NNOOTTEE The input parameter should be changed to mode instead of - attributes. - - The attributes of the parent should be returned since the size and - mtime changes. - - 0wpage - - 44..1100.. lliinnkk - - - SSuummmmaarryy Create a link to an existing file. - - AArrgguummeennttss - - iinn - - struct cfs_link_in { - ViceFid sourceFid; /* cnode to link *to* */ - ViceFid destFid; /* Directory in which to place link */ - char *tname; /* Place holder for data. */ - } cfs_link; - - - - oouutt - empty - - DDeessccrriippttiioonn This call creates a link to the sourceFid in the directory - identified by destFid with name tname. The source must reside in the - target's parent, i.e. the source must be have parent destFid, i.e. Coda - does not support cross directory hard links. Only the return value is - relevant. It indicates success or the type of failure. - - EErrrroorrss The usual errors can occur.0wpage - - 44..1111.. ssyymmlliinnkk - - - SSuummmmaarryy create a symbolic link - - AArrgguummeennttss - - iinn - - struct cfs_symlink_in { - ViceFid VFid; /* Directory to put symlink in */ - char *srcname; - struct coda_vattr attr; - char *tname; - } cfs_symlink; - - - - oouutt - none - - DDeessccrriippttiioonn Create a symbolic link. The link is to be placed in the - directory identified by VFid and named tname. It should point to the - pathname srcname. The attributes of the newly created object are to - be set to attr. - - EErrrroorrss - - NNOOTTEE The attributes of the target directory should be returned since - its size changed. - - 0wpage - - 44..1122.. rreemmoovvee - - - SSuummmmaarryy Remove a file - - AArrgguummeennttss - - iinn - - struct cfs_remove_in { - ViceFid VFid; - char *name; /* Place holder for data. */ - } cfs_remove; - - - - oouutt - none - - DDeessccrriippttiioonn Remove file named cfs_remove_in.name in directory - identified by VFid. - - EErrrroorrss - - NNOOTTEE The attributes of the directory should be returned since its - mtime and size may change. - - 0wpage - - 44..1133.. rrmmddiirr - - - SSuummmmaarryy Remove a directory - - AArrgguummeennttss - - iinn - - struct cfs_rmdir_in { - ViceFid VFid; - char *name; /* Place holder for data. */ - } cfs_rmdir; - - - - oouutt - none - - DDeessccrriippttiioonn Remove the directory with name name from the directory - identified by VFid. - - EErrrroorrss - - NNOOTTEE The attributes of the parent directory should be returned since - its mtime and size may change. - - 0wpage - - 44..1144.. rreeaaddlliinnkk - - - SSuummmmaarryy Read the value of a symbolic link. - - AArrgguummeennttss - - iinn - - struct cfs_readlink_in { - ViceFid VFid; - } cfs_readlink; - - - - oouutt - - struct cfs_readlink_out { - int count; - caddr_t data; /* Place holder for data. 
*/ - } cfs_readlink; - - - - DDeessccrriippttiioonn This routine reads the contents of symbolic link - identified by VFid into the buffer data. The buffer data must be able - to hold any name up to CFS_MAXNAMLEN (PATH or NAM??). - - EErrrroorrss No unusual errors. - - 0wpage - - 44..1155.. ooppeenn - - - SSuummmmaarryy Open a file. - - AArrgguummeennttss - - iinn - - struct cfs_open_in { - ViceFid VFid; - int flags; - } cfs_open; - - - - oouutt - - struct cfs_open_out { - dev_t dev; - ino_t inode; - } cfs_open; - - - - DDeessccrriippttiioonn This request asks Venus to place the file identified by - VFid in its cache and to note that the calling process wishes to open - it with flags as in open(2). The return value to the kernel differs - for Unix and Windows systems. For Unix systems the Coda FS Driver is - informed of the device and inode number of the container file in the - fields dev and inode. For Windows the path of the container file is - returned to the kernel. - EErrrroorrss - - NNOOTTEE Currently the cfs_open_out structure is not properly adapted to - deal with the Windows case. It might be best to implement two - upcalls, one to open aiming at a container file name, the other at a - container file inode. - - 0wpage - - 44..1166.. cclloossee - - - SSuummmmaarryy Close a file, update it on the servers. - - AArrgguummeennttss - - iinn - - struct cfs_close_in { - ViceFid VFid; - int flags; - } cfs_close; - - - - oouutt - none - - DDeessccrriippttiioonn Close the file identified by VFid. - - EErrrroorrss - - NNOOTTEE The flags argument is bogus and not used. However, Venus' code - has room to deal with an execp input field, probably this field should - be used to inform Venus that the file was closed but is still memory - mapped for execution. There are comments about fetching versus not - fetching the data in Venus vproc_vfscalls. This seems silly. If a - file is being closed, the data in the container file is to be the new - data. Here again the execp flag might be in play to create confusion: - currently Venus might think a file can be flushed from the cache when - it is still memory mapped. This needs to be understood. - - 0wpage - - 44..1177.. iiooccttll - - - SSuummmmaarryy Do an ioctl on a file. This includes the pioctl interface. - - AArrgguummeennttss - - iinn - - struct cfs_ioctl_in { - ViceFid VFid; - int cmd; - int len; - int rwflag; - char *data; /* Place holder for data. */ - } cfs_ioctl; - - - - oouutt - - - struct cfs_ioctl_out { - int len; - caddr_t data; /* Place holder for data. */ - } cfs_ioctl; - - - - DDeessccrriippttiioonn Do an ioctl operation on a file. The command, len and - data arguments are filled as usual. flags is not used by Venus. - - EErrrroorrss - - NNOOTTEE Another bogus parameter. flags is not used. What is the - business about PREFETCHING in the Venus code? - - - 0wpage - - 44..1188.. rreennaammee - - - SSuummmmaarryy Rename a fid. - - AArrgguummeennttss - - iinn - - struct cfs_rename_in { - ViceFid sourceFid; - char *srcname; - ViceFid destFid; - char *destname; - } cfs_rename; - - - - oouutt - none - - DDeessccrriippttiioonn Rename the object with name srcname in directory - sourceFid to destname in destFid. It is important that the names - srcname and destname are 0 terminated strings. Strings in Unix - kernels are not always null terminated. - - EErrrroorrss - - 0wpage - - 44..1199.. rreeaaddddiirr - - - SSuummmmaarryy Read directory entries. 
- - AArrgguummeennttss - - iinn - - struct cfs_readdir_in { - ViceFid VFid; - int count; - int offset; - } cfs_readdir; - - - - - oouutt - - struct cfs_readdir_out { - int size; - caddr_t data; /* Place holder for data. */ - } cfs_readdir; - - - - DDeessccrriippttiioonn Read directory entries from VFid starting at offset and - read at most count bytes. Returns the data in data and returns - the size in size. - - EErrrroorrss - - NNOOTTEE This call is not used. Readdir operations exploit container - files. We will re-evaluate this during the directory revamp which is - about to take place. - - 0wpage - - 44..2200.. vvggeett - - - SSuummmmaarryy instructs Venus to do an FSDB->Get. - - AArrgguummeennttss - - iinn - - struct cfs_vget_in { - ViceFid VFid; - } cfs_vget; - - - - oouutt - - struct cfs_vget_out { - ViceFid VFid; - int vtype; - } cfs_vget; - - - - DDeessccrriippttiioonn This upcall asks Venus to do a get operation on an fsobj - labelled by VFid. - - EErrrroorrss - - NNOOTTEE This operation is not used. However, it is extremely useful - since it can be used to deal with read/write memory mapped files. - These can be "pinned" in the Venus cache using vget and released with - inactive. - - 0wpage - - 44..2211.. ffssyynncc - - - SSuummmmaarryy Tell Venus to update the RVM attributes of a file. - - AArrgguummeennttss - - iinn - - struct cfs_fsync_in { - ViceFid VFid; - } cfs_fsync; - - - - oouutt - none - - DDeessccrriippttiioonn Ask Venus to update RVM attributes of object VFid. This - should be called as part of kernel level fsync type calls. The - result indicates if the syncing was successful. - - EErrrroorrss - - NNOOTTEE Linux does not implement this call. It should. - - 0wpage - - 44..2222.. iinnaaccttiivvee - - - SSuummmmaarryy Tell Venus a vnode is no longer in use. - - AArrgguummeennttss - - iinn - - struct cfs_inactive_in { - ViceFid VFid; - } cfs_inactive; - - - - oouutt - none - - DDeessccrriippttiioonn This operation returns EOPNOTSUPP. - - EErrrroorrss - - NNOOTTEE This should perhaps be removed. - - 0wpage - - 44..2233.. rrddwwrr - - - SSuummmmaarryy Read or write from a file - - AArrgguummeennttss - - iinn - - struct cfs_rdwr_in { - ViceFid VFid; - int rwflag; - int count; - int offset; - int ioflag; - caddr_t data; /* Place holder for data. */ - } cfs_rdwr; - - - - - oouutt - - struct cfs_rdwr_out { - int rwflag; - int count; - caddr_t data; /* Place holder for data. */ - } cfs_rdwr; - - - - DDeessccrriippttiioonn This upcall asks Venus to read or write from a file. - - EErrrroorrss - - NNOOTTEE It should be removed since it is against the Coda philosophy that - read/write operations never reach Venus. I have been told the - operation does not work. It is not currently used. - - - 0wpage - - 44..2244.. ooddyymmoouunntt - - - SSuummmmaarryy Allows mounting multiple Coda "filesystems" on one Unix mount - point. - - AArrgguummeennttss - - iinn - - struct ody_mount_in { - char *name; /* Place holder for data. */ - } ody_mount; - - - - oouutt - - struct ody_mount_out { - ViceFid VFid; - } ody_mount; - - - - DDeessccrriippttiioonn Asks Venus to return the rootfid of a Coda system named - name. The fid is returned in VFid. - - EErrrroorrss - - NNOOTTEE This call was used by David for dynamic sets. It should be - removed since it causes a jungle of pointers in the VFS mounting area. - It is not used by Coda proper. Call is not implemented by Venus. - - 0wpage - - 44..2255.. ooddyy__llooookkuupp - - - SSuummmmaarryy Looks up something. 
-  Arguments
-
-     in   irrelevant
-
-     out
-        irrelevant
-
-  Description
-
-  Errors
-
-  NOTE  Gut it.  Call is not implemented by Venus.
-
-  4.26.  ody_expand
-
-  Summary  expands something in a dynamic set.
-
-  Arguments
-
-     in   irrelevant
-
-     out
-        irrelevant
-
-  Description
-
-  Errors
-
-  NOTE  Gut it.  Call is not implemented by Venus.
-
-  4.27.  prefetch
-
-  Summary  Prefetch a dynamic set.
-
-  Arguments
-
-     in   Not documented.
-
-     out
-        Not documented.
-
-  Description  Venus worker.cc has support for this call, although it is
-  noted that it doesn't work.  Not surprising, since the kernel does not
-  have support for it.  (ODY_PREFETCH is not a defined operation).
-
-  Errors
-
-  NOTE  Gut it.  It isn't working and isn't used by Coda.
-
-  4.28.  signal
-
-  Summary  Send Venus a signal about an upcall.
-
-  Arguments
-
-     in   none
-
-     out
-        not applicable.
-
-  Description  This is an out-of-band upcall to Venus to inform Venus
-  that the calling process received a signal after Venus read the
-  message from the input queue.  Venus is supposed to clean up the
-  operation.
-
-  Errors  No reply is given.
-
-  NOTE  We need to better understand what Venus needs to clean up and if
-  it is doing this correctly.  Also we need to handle multiple upcall
-  per system call situations correctly.  It would be important to know
-  what state changes in Venus take place after an upcall for which the
-  kernel is responsible for notifying Venus to clean up (e.g. open
-  definitely is such a state change, but many others are maybe not).
-
-  5.  The minicache and downcalls
-
-  The Coda FS Driver can cache results of lookup and access upcalls, to
-  limit the frequency of upcalls.  Upcalls carry a price since a process
-  context switch needs to take place.  The counterpart of caching the
-  information is that Venus will notify the FS Driver that cached
-  entries must be flushed or renamed.
-
-  The kernel code generally has to maintain a structure which links the
-  internal file handles (called vnodes in BSD, inodes in Linux and
-  FileHandles in Windows) with the ViceFid's which Venus maintains.  The
-  reason is that frequent translations back and forth are needed in
-  order to make upcalls and use the results of upcalls.  Such linking
-  objects are called cnodes.
-
-  The current minicache implementations have cache entries which record
-  the following:
-
-  1. the name of the file
-
-  2. the cnode of the directory containing the object
-
-  3. a list of CodaCred's for which the lookup is permitted.
-
-  4. the cnode of the object
-
-  The lookup call in the Coda FS Driver may request the cnode of the
-  desired object from the cache, by passing its name, directory and the
-  CodaCred's of the caller.  The cache will return the cnode or indicate
-  that it cannot be found.  The Coda FS Driver must be careful to
-  invalidate cache entries when it modifies or removes objects.
-
-  When Venus obtains information that indicates that cache entries are
-  no longer valid, it will make a downcall to the kernel.  Downcalls are
-  intercepted by the Coda FS Driver and lead to cache invalidations of
-  the kind described below.  The Coda FS Driver does not return an error
-  unless the downcall data could not be read into kernel memory.
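A minimal sketch of a cache entry holding the four items above, together with the lookup interface just described; the type and function names here are invented for illustration, not those of any particular implementation:

    struct cfs_cache_entry {
            char             name[CFS_MAXNAMLEN]; /* 1. name of the file */
            struct cnode    *dir;   /* 2. cnode of the containing directory */
            struct CodaCred *creds; /* 3. CodaCred's permitted to look up   */
            struct cnode    *obj;   /* 4. cnode of the object itself        */
    };

    /* Return the cached cnode, or NULL so the caller falls back to an
     * upcall to Venus. */
    struct cnode *cfs_cache_lookup(struct cnode *dir, const char *name,
                                   struct CodaCred *cred);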
-  5.1.  INVALIDATE
-
-  No information is available on this call.
-
-  5.2.  FLUSH
-
-  Arguments  None
-
-  Summary  Flush the name cache entirely.
-
-  Description  Venus issues this call upon startup and when it dies.  This
-  is to prevent stale cache information being held.  Some operating
-  systems allow the kernel name cache to be switched off dynamically.
-  When this is done, this downcall is made.
-
-  5.3.  PURGEUSER
-
-  Arguments
-
-        struct cfs_purgeuser_out { /* CFS_PURGEUSER is a venus->kernel call */
-            struct CodaCred cred;
-        } cfs_purgeuser;
-
-  Description  Remove all entries in the cache carrying the Cred.  This
-  call is issued when tokens for a user expire or are flushed.
-
-  5.4.  ZAPFILE
-
-  Arguments
-
-        struct cfs_zapfile_out { /* CFS_ZAPFILE is a venus->kernel call */
-            ViceFid CodaFid;
-        } cfs_zapfile;
-
-  Description  Remove all entries which have the (dir vnode, name) pair.
-  This is issued as a result of an invalidation of cached attributes of
-  a vnode.
-
-  NOTE  Call is not named correctly in NetBSD and Mach.  The minicache
-  zapfile routine takes different arguments.  Linux does not implement
-  the invalidation of attributes correctly.
-
-  5.5.  ZAPDIR
-
-  Arguments
-
-        struct cfs_zapdir_out { /* CFS_ZAPDIR is a venus->kernel call */
-            ViceFid CodaFid;
-        } cfs_zapdir;
-
-  Description  Remove all entries in the cache lying in a directory
-  CodaFid, and all children of this directory.  This call is issued when
-  Venus receives a callback on the directory.
-
-  5.6.  ZAPVNODE
-
-  Arguments
-
-        struct cfs_zapvnode_out { /* CFS_ZAPVNODE is a venus->kernel call */
-            struct CodaCred cred;
-            ViceFid VFid;
-        } cfs_zapvnode;
-
-  Description  Remove all entries in the cache carrying the cred and VFid
-  as in the arguments.  This downcall is probably never issued.
-
-  5.7.  PURGEFID
-
-  Summary
-
-  Arguments
-
-        struct cfs_purgefid_out { /* CFS_PURGEFID is a venus->kernel call */
-            ViceFid CodaFid;
-        } cfs_purgefid;
-
-  Description  Flush the attribute for the file.  If it is a dir (odd
-  vnode), purge its children from the namecache and remove the file from the
-  namecache.
-
-  5.8.  REPLACE
-
-  Summary  Replace the Fid's for a collection of names.
-
-  Arguments
-
-        struct cfs_replace_out { /* cfs_replace is a venus->kernel call */
-            ViceFid NewFid;
-            ViceFid OldFid;
-        } cfs_replace;
-
-  Description  This routine replaces a ViceFid in the name cache with
-  another.  It is added to allow Venus during reintegration to replace
-  locally allocated temp fids while disconnected with global fids even
-  when the reference counts on those fids are not zero.
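Taken together, downcall handling in a driver amounts to a dispatch over these messages. A minimal sketch, assuming a hypothetical union cfs_downcall_args of the *_out structures above and invented cache_* handler names:

    int cfs_downcall(int opcode, union cfs_downcall_args *out)
    {
            switch (opcode) {
            case CFS_FLUSH:     return cache_flush();
            case CFS_PURGEUSER: return cache_purge_cred(&out->cfs_purgeuser.cred);
            case CFS_ZAPFILE:   return cache_zap_file(&out->cfs_zapfile.CodaFid);
            case CFS_ZAPDIR:    return cache_zap_dir(&out->cfs_zapdir.CodaFid);
            case CFS_ZAPVNODE:  return cache_zap_vnode(&out->cfs_zapvnode.cred,
                                                       &out->cfs_zapvnode.VFid);
            case CFS_PURGEFID:  return cache_purge_fid(&out->cfs_purgefid.CodaFid);
            case CFS_REPLACE:   return cache_replace(&out->cfs_replace.OldFid,
                                                     &out->cfs_replace.NewFid);
            default:            return -EINVAL;  /* unknown downcall opcode */
            }
    }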
-  6.  Initialization and cleanup
-
-  This section gives brief hints as to desirable features for the Coda
-  FS Driver at startup and upon shutdown or Venus failures.  Before
-  entering the discussion it is useful to repeat that the Coda FS Driver
-  maintains the following data:
-
-  1. message queues
-
-  2. cnodes
-
-  3. name cache entries
-
-  The name cache entries are entirely private to the driver, so they
-  can easily be manipulated.  The message queues will generally have
-  clear points of initialization and destruction.  The cnodes are
-  much more delicate.  User processes hold reference counts in Coda
-  filesystems and it can be difficult to clean up the cnodes.
-
-  It can expect requests through:
-
-  1. the message subsystem
-
-  2. the VFS layer
-
-  3. pioctl interface
-
-  Currently the pioctl passes through the VFS for Coda so we can
-  treat these similarly.
-
-  6.1.  Requirements
-
-  The following requirements should be accommodated:
-
-  1. The message queues should have open and close routines.  On Unix
-     the opening of the character devices are such routines.
-
-     o  Before opening, no messages can be placed.
-
-     o  Opening will remove any old messages still pending.
-
-     o  Close will notify any sleeping processes that their upcall cannot
-        be completed.
-
-     o  Close will free all memory allocated by the message queues.
-
-  2. At open the namecache shall be initialized to empty state.
-
-  3. Before the message queues are open, all VFS operations will fail.
-     Fortunately this can be achieved by making sure that mounting the
-     Coda filesystem cannot succeed before opening.
-
-  4. After closing of the queues, no VFS operations can succeed.  Here
-     one needs to be careful, since a few operations (lookup,
-     read/write, readdir) can proceed without upcalls.  These must be
-     explicitly blocked.
-
-  5. Upon closing the namecache shall be flushed and disabled.
-
-  6. All memory held by cnodes can be freed without relying on upcalls.
-
-  7. Unmounting the file system can be done without relying on upcalls.
-
-  8. Mounting the Coda filesystem should fail gracefully if Venus cannot
-     get the rootfid or the attributes of the rootfid.  The latter is
-     best implemented by Venus fetching these objects before attempting
-     to mount.
-
-  NOTE  NetBSD in particular but also Linux have not implemented the
-  above requirements fully.  For smooth operation this needs to be
-  corrected.
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs.rst index 16e606c11f40..f8941954c667 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs.rst @@ -1,5 +1,6 @@ - -configfs - Userspace-driven kernel object configuration. +======================================================= +Configfs - Userspace-driven Kernel Object Configuration +======================================================= Joel Becker <joel.becker@oracle.com> @@ -9,7 +10,8 @@ Copyright (c) 2005 Oracle Corporation, Joel Becker <joel.becker@oracle.com> -[What is configfs?] +What is configfs? +================= configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based view of @@ -35,10 +37,11 @@ kernel modules backing the items must respond to this. Both sysfs and configfs can and should exist together on the same system. One is not a replacement for the other. -[Using configfs] +Using configfs +============== configfs can be compiled as a module or into the kernel. You can access -it by doing +it by doing:: mount -t configfs none /config @@ -56,28 +59,29 @@ values. Don't mix more than one attribute in one attribute file. There are two types of configfs attributes: * Normal attributes, which similar to sysfs attributes, are small ASCII text -files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably -only one value per file should be used, and the same caveats from sysfs apply. -Configfs expects write(2) to store the entire buffer at once.
When writing to -normal configfs attributes, userspace processes should first read the entire -file, modify the portions they wish to change, and then write the entire -buffer back. + files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably + only one value per file should be used, and the same caveats from sysfs apply. + Configfs expects write(2) to store the entire buffer at once. When writing to + normal configfs attributes, userspace processes should first read the entire + file, modify the portions they wish to change, and then write the entire + buffer back. * Binary attributes, which are somewhat similar to sysfs binary attributes, -but with a few slight changes to semantics. The PAGE_SIZE limitation does not -apply, but the whole binary item must fit in single kernel vmalloc'ed buffer. -The write(2) calls from user space are buffered, and the attributes' -write_bin_attribute method will be invoked on the final close, therefore it is -imperative for user-space to check the return code of close(2) in order to -verify that the operation finished successfully. -To avoid a malicious user OOMing the kernel, there's a per-binary attribute -maximum buffer value. + but with a few slight changes to semantics. The PAGE_SIZE limitation does not + apply, but the whole binary item must fit in a single kernel vmalloc'ed buffer. + The write(2) calls from user space are buffered, and the attribute's + write_bin_attribute method will be invoked on the final close, therefore it is + imperative for user-space to check the return code of close(2) in order to + verify that the operation finished successfully. + To avoid a malicious user OOMing the kernel, there's a per-binary attribute + maximum buffer value. When an item needs to be destroyed, remove it with rmdir(2). An item cannot be destroyed if any other item has a link to it (via symlink(2)). Links can be removed via unlink(2). -[Configuring FakeNBD: an Example] +Configuring FakeNBD: an Example +=============================== Imagine there's a Network Block Device (NBD) driver that allows you to access remote block devices. Call it FakeNBD. FakeNBD uses configfs @@ -86,14 +90,14 @@ sysadmins use to configure FakeNBD, but somehow that program has to tell the driver about it. Here's where configfs comes in. When the FakeNBD driver is loaded, it registers itself with configfs. -readdir(3) sees this just fine: +readdir(3) sees this just fine:: # ls /config fakenbd A fakenbd connection can be created with mkdir(2). The name is arbitrary, but likely the tool will make some use of the name. Perhaps -it is a uuid or a disk name: +it is a uuid or a disk name:: # mkdir /config/fakenbd/disk1 # ls /config/fakenbd/disk1 @@ -102,7 +106,7 @@ it is a uuid or a disk name: The target attribute contains the IP address of the server FakeNBD will connect to. The device attribute is the device on the server. Predictably, the rw attribute determines whether the connection is -read-only or read-write. +read-only or read-write:: # echo 10.0.0.1 > /config/fakenbd/disk1/target # echo /dev/sda1 > /config/fakenbd/disk1/device @@ -111,7 +115,8 @@ read-only or read-write. That's it. That's all there is. Now the device is configured, via the shell no less. -[Coding With configfs] +Coding With configfs +==================== Every object in configfs is a config_item. A config_item reflects an object in the subsystem. It has attributes that match values on that @@ -130,7 +135,10 @@ appears as a directory at the top of the configfs filesystem.
A subsystem is also a config_group, and can do everything a config_group can. -[struct config_item] +struct config_item +================== + +:: struct config_item { char *ci_name; @@ -168,7 +176,10 @@ By itself, a config_item cannot do much more than appear in configfs. Usually a subsystem wants the item to display and/or store attributes, among other things. For that, it needs a type. -[struct config_item_type] +struct config_item_type +======================= + +:: struct configfs_item_operations { void (*release)(struct config_item *); @@ -192,7 +203,10 @@ allocated dynamically will need to provide the ct_item_ops->release() method. This method is called when the config_item's reference count reaches zero. -[struct configfs_attribute] +struct configfs_attribute +========================= + +:: struct configfs_attribute { char *ca_name; @@ -214,7 +228,10 @@ be called whenever userspace asks for a read(2) on the attribute. If an attribute is writable and provides a ->store method, that method will be be called whenever userspace asks for a write(2) on the attribute. -[struct configfs_bin_attribute] +struct configfs_bin_attribute +============================= + +:: struct configfs_bin_attribute { struct configfs_attribute cb_attr; @@ -240,11 +257,12 @@ will happen for write(2). The reads/writes are bufferred so only a single read/write will occur; the attributes' need not concern itself with it. -[struct config_group] +struct config_group +=================== A config_item cannot live in a vacuum. The only way one can be created is via mkdir(2) on a config_group. This will trigger creation of a -child item. +child item:: struct config_group { struct config_item cg_item; @@ -264,7 +282,7 @@ The config_group structure contains a config_item. Properly configuring that item means that a group can behave as an item in its own right. However, it can do more: it can create child items or groups. This is accomplished via the group operations specified on the group's -config_item_type. +config_item_type:: struct configfs_group_operations { struct config_item *(*make_item)(struct config_group *group, @@ -279,7 +297,8 @@ config_item_type. }; A group creates child items by providing the -ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. The subsystem allocates a new +ct_group_ops->make_item() method. If provided, this method is called from +mkdir(2) in the group's directory. The subsystem allocates a new config_item (or more likely, its container structure), initializes it, and returns it to configfs. Configfs will then populate the filesystem tree to reflect the new item. @@ -296,13 +315,14 @@ upon item allocation. If a subsystem has no work to do, it may omit the ct_group_ops->drop_item() method, and configfs will call config_item_put() on the item on behalf of the subsystem. -IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2) -is called, configfs WILL remove the item from the filesystem tree -(assuming that it has no children to keep it busy). The subsystem is -responsible for responding to this. If the subsystem has references to -the item in other threads, the memory is safe. It may take some time -for the item to actually disappear from the subsystem's usage. But it -is gone from configfs. +Important: + drop_item() is void, and as such cannot fail. When rmdir(2) + is called, configfs WILL remove the item from the filesystem tree + (assuming that it has no children to keep it busy). 
The subsystem is + responsible for responding to this. If the subsystem has references to + the item in other threads, the memory is safe. It may take some time + for the item to actually disappear from the subsystem's usage. But it + is gone from configfs. When drop_item() is called, the item's linkage has already been torn down. It no longer has a reference on its parent and has no place in @@ -319,10 +339,11 @@ is implemented in the configfs rmdir(2) code. ->drop_item() will not be called, as the item has not been dropped. rmdir(2) will fail, as the directory is not empty. -[struct configfs_subsystem] +struct configfs_subsystem +========================= A subsystem must register itself, usually at module_init time. This -tells configfs to make the subsystem appear in the file tree. +tells configfs to make the subsystem appear in the file tree:: struct configfs_subsystem { struct config_group su_group; @@ -332,17 +353,19 @@ tells configfs to make the subsystem appear in the file tree. int configfs_register_subsystem(struct configfs_subsystem *subsys); void configfs_unregister_subsystem(struct configfs_subsystem *subsys); - A subsystem consists of a toplevel config_group and a mutex. +A subsystem consists of a toplevel config_group and a mutex. The group is where child config_items are created. For a subsystem, this group is usually defined statically. Before calling configfs_register_subsystem(), the subsystem must have initialized the group via the usual group _init() functions, and it must also have initialized the mutex. - When the register call returns, the subsystem is live, and it + +When the register call returns, the subsystem is live, and it will be visible via configfs. At that point, mkdir(2) can be called and the subsystem must be ready for it. -[An Example] +An Example +========== The best example of these basic concepts is the simple_children subsystem/group and the simple_child item in @@ -350,7 +373,8 @@ samples/configfs/configfs_sample.c. It shows a trivial object displaying and storing an attribute, and a simple group creating and destroying these children. -[Hierarchy Navigation and the Subsystem Mutex] +Hierarchy Navigation and the Subsystem Mutex +============================================ There is an extra bonus that configfs provides. The config_groups and config_items are arranged in a hierarchy due to the fact that they @@ -375,7 +399,8 @@ be in its parent's cg_children list for the same duration. This allows a subsystem to trust ci_parent and cg_children while they hold the mutex. -[Item Aggregation Via symlink(2)] +Item Aggregation Via symlink(2) +=============================== configfs provides a simple group via the group->item parent/child relationship. Often, however, a larger environment requires aggregation @@ -403,7 +428,8 @@ A config_item cannot be removed while it links to any other item, nor can it be removed while an item links to it. Dangling symlinks are not allowed in configfs. -[Automatically Created Subgroups] +Automatically Created Subgroups +=============================== A new config_group may want to have two types of child config_items. While this could be codified by magic names in ->make_item(), it is much @@ -433,7 +459,8 @@ As a consequence of this, default groups cannot be removed directly via rmdir(2). They also are not considered when rmdir(2) on the parent group is checking for children. -[Dependent Subsystems] +Dependent Subsystems +==================== Sometimes other drivers depend on particular configfs items. 
For example, ocfs2 mounts depend on a heartbeat region item. If that @@ -460,9 +487,11 @@ succeeds, then heartbeat knows the region is safe to give to ocfs2. If it fails, it was being torn down anyway, and heartbeat can gracefully pass up an error. -[Committable Items] +Committable Items +================= -NOTE: Committable items are currently unimplemented. +Note: + Committable items are currently unimplemented. Some config_items cannot have a valid initial state. That is, no default values can be specified for the item's attributes such that the @@ -504,5 +533,3 @@ As rmdir(2) does not work in the "live" directory, an item must be shutdown, or "uncommitted". Again, this is done via rename(2), this time from the "live" directory back to the "pending" one. The subsystem is notified by the ct_group_ops->uncommit_object() method. - - diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 679729442fd2..735f3859b19f 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -74,7 +74,7 @@ are zeroed out and converted to written extents before being returned to avoid exposure of uninitialized data through mmap. These filesystems may be used for inspiration: -- ext2: see Documentation/filesystems/ext2.txt +- ext2: see Documentation/filesystems/ext2.rst - ext4: see Documentation/filesystems/ext4/ - xfs: see Documentation/admin-guide/xfs.rst diff --git a/Documentation/filesystems/debugfs.rst b/Documentation/filesystems/debugfs.rst index 6c032db235a5..1da7a4b7383d 100644 --- a/Documentation/filesystems/debugfs.rst +++ b/Documentation/filesystems/debugfs.rst @@ -166,16 +166,17 @@ file:: }; struct debugfs_regset32 { - struct debugfs_reg32 *regs; + const struct debugfs_reg32 *regs; int nregs; void __iomem *base; + struct device *dev; /* Optional device for Runtime PM */ }; debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset); - void debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs, + void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix); The "base" argument may be 0, but you may want to build the reg32 array diff --git a/Documentation/filesystems/devpts.rst b/Documentation/filesystems/devpts.rst new file mode 100644 index 000000000000..a03248ddfb4c --- /dev/null +++ b/Documentation/filesystems/devpts.rst @@ -0,0 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +The Devpts Filesystem +===================== + +Each mount of the devpts filesystem is now distinct such that ptys +and their indices allocated in one mount are independent from ptys +and their indices in all other mounts. + +All mounts of the devpts filesystem now create a ``/dev/pts/ptmx`` node +with permissions ``0000``. + +To retain backwards compatibility, a ptmx device node (aka any node +created with ``mknod name c 5 2``) when opened will look for an instance +of devpts under the name ``pts`` in the same directory as the ptmx device +node. + +As an option, instead of placing a ``/dev/ptmx`` device node at ``/dev/ptmx``, +it is possible to place a symlink to ``/dev/pts/ptmx`` at ``/dev/ptmx`` or +to bind mount ``/dev/pts/ptmx`` to ``/dev/ptmx``. If you opt for using +the devpts filesystem in this manner, devpts should be mounted with +``ptmxmode=0666``, or ``chmod 0666 /dev/pts/ptmx`` should be called.
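A minimal sketch of how a program obtains a pty pair through this node, using only the standard POSIX calls (error handling omitted)::

	#define _XOPEN_SOURCE 600
	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>

	int main(void)
	{
		int master = posix_openpt(O_RDWR | O_NOCTTY); /* opens /dev/ptmx */

		grantpt(master);                 /* fix slave ownership and mode */
		unlockpt(master);                /* allow the slave to be opened */
		printf("%s\n", ptsname(master)); /* e.g. /dev/pts/3 in this mount */
		return 0;
	}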
+ +Total count of pty pairs in all instances is limited by sysctls:: + + kernel.pty.max = 4096 - global limit + kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace + kernel.pty.nr - current count of ptys + +Per-instance limit could be set by adding mount option ``max=<count>``. + +This feature was added in kernel 3.4 together with +``sysctl kernel.pty.reserve``. + +In kernels older than 3.4 sysctl ``kernel.pty.max`` works as per-instance limit. diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt deleted file mode 100644 index 9f94fe276dea..000000000000 --- a/Documentation/filesystems/devpts.txt +++ /dev/null @@ -1,26 +0,0 @@ -Each mount of the devpts filesystem is now distinct such that ptys -and their indicies allocated in one mount are independent from ptys -and their indicies in all other mounts. - -All mounts of the devpts filesystem now create a /dev/pts/ptmx node -with permissions 0000. - -To retain backwards compatibility the a ptmx device node (aka any node -created with "mknod name c 5 2") when opened will look for an instance -of devpts under the name "pts" in the same directory as the ptmx device -node. - -As an option instead of placing a /dev/ptmx device node at /dev/ptmx -it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or -to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using -the devpts filesystem in this manner devpts should be mounted with -the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called. - -Total count of pty pairs in all instances is limited by sysctls: -kernel.pty.max = 4096 - global limit -kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace -kernel.pty.nr - current count of ptys - -Per-instance limit could be set by adding mount option "max=<count>". -This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve. -In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit. diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.rst index 15156883d321..a28a1f9ef79c 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.rst @@ -1,5 +1,8 @@ - Linux Directory Notification - ============================ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Linux Directory Notification +============================ Stephen Rothwell <sfr@canb.auug.org.au> @@ -12,6 +15,7 @@ being delivered using signals. The application decides which "events" it wants to be notified about. The currently defined events are: + ========= ===================================================== DN_ACCESS A file in the directory was accessed (read) DN_MODIFY A file in the directory was modified (write,truncate) DN_CREATE A file was created in the directory @@ -19,6 +23,7 @@ The currently defined events are: DN_RENAME A file in the directory was renamed DN_ATTRIB A file in the directory had its attributes changed (chmod,chown) + ========= ===================================================== Usually, the application must reregister after each notification, but if DN_MULTISHOT is or'ed with the event mask, then the registration will @@ -36,7 +41,7 @@ especially important if DN_MULTISHOT is specified. Note that SIGRTMIN is often blocked, so it is better to use (at least) SIGRTMIN + 1. 
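Expressed as a minimal program, the registration described above looks roughly like this (compare the in-tree example referenced below, tools/testing/selftests/filesystems/dnotify_test.c)::

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void handler(int sig)
	{
		/* notification arrives as this signal */
	}

	int main(void)
	{
		int fd = open(".", O_RDONLY);

		signal(SIGRTMIN + 1, handler);     /* SIGRTMIN itself is often blocked */
		fcntl(fd, F_SETSIG, SIGRTMIN + 1);
		fcntl(fd, F_NOTIFY, DN_MODIFY | DN_CREATE | DN_MULTISHOT);
		for (;;) {
			pause();
			printf("directory . modified\n");
		}
	}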
Implementation expectations (features and bugs :-)) ---------------------------- +--------------------------------------------------- The notification should work for any local access to files even if the actual file system is on a remote server. This implies that remote @@ -67,4 +72,4 @@ See tools/testing/selftests/filesystems/dnotify_test.c for an example. NOTE ---- Beginning with Linux 2.6.13, dnotify has been replaced by inotify. -See Documentation/filesystems/inotify.txt for more information on it. +See Documentation/filesystems/inotify.rst for more information on it. diff --git a/Documentation/filesystems/efivarfs.rst b/Documentation/filesystems/efivarfs.rst index 90ac65683e7e..0551985821b8 100644 --- a/Documentation/filesystems/efivarfs.rst +++ b/Documentation/filesystems/efivarfs.rst @@ -24,3 +24,20 @@ files that are not well-known standardized variables are created as immutable files. This doesn't prevent removal - "chattr -i" will work - but it does prevent this kind of failure from being accomplished accidentally. + +.. warning:: + When the content of a UEFI variable in /sys/firmware/efi/efivars is + displayed, for example using "hexdump", note that the first + 4 bytes of the output represent the UEFI variable attributes, + in little-endian format. + + In practice, the output of each efivar is composed of: + + +-----------------------------------+ + |4_bytes_of_attributes + efivar_data| + +-----------------------------------+ + +*See also:* + +- Documentation/admin-guide/acpi/ssdt-overlays.rst +- Documentation/ABI/stable/sysfs-firmware-efi-vars diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.rst index ac87e6fda842..2a572e7edc08 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ============ Fiemap Ioctl ============ @@ -10,9 +12,9 @@ returns a list of extents. Request Basics -------------- -A fiemap request is encoded within struct fiemap: +A fiemap request is encoded within struct fiemap:: -struct fiemap { + struct fiemap { __u64 fm_start; /* logical offset (inclusive) at * which to start mapping (in) */ __u64 fm_length; /* logical length of mapping which @@ -23,7 +25,7 @@ struct fiemap { __u32 fm_extent_count; /* size of fm_extents array (in) */ __u32 fm_reserved; struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ -}; + }; fm_start, and fm_length specify the logical range within the file @@ -51,12 +53,12 @@ nothing to prevent the file from changing between calls to FIEMAP. The following flags can be set in fm_flags: -* FIEMAP_FLAG_SYNC -If this flag is set, the kernel will sync the file before mapping extents. +FIEMAP_FLAG_SYNC + If this flag is set, the kernel will sync the file before mapping extents. -* FIEMAP_FLAG_XATTR -If this flag is set, the extents returned will describe the inodes -extended attribute lookup tree, instead of its data tree. +FIEMAP_FLAG_XATTR + If this flag is set, the extents returned will describe the inodes + extended attribute lookup tree, instead of its data tree. Extent Mapping -------------- ... @@ -75,18 +77,18 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST flag set (see the next section on extent flags). Each extent is described by a single fiemap_extent structure as -returned in fm_extents.
- -struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent */ - __u64 fe_length; /* length in bytes for the extent */ - __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ - __u32 fe_reserved[3]; -}; +returned in fm_extents:: + + struct fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent */ + __u64 fe_length; /* length in bytes for the extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_reserved[3]; + }; All offsets and lengths are in bytes and mirror those on disk. It is valid for an extents logical offset to start before the request or its logical @@ -114,26 +116,27 @@ worry about all present and future flags which might imply unaligned data. Note that the opposite is not true - it would be valid for FIEMAP_EXTENT_NOT_ALIGNED to appear alone. -* FIEMAP_EXTENT_LAST -This is generally the last extent in the file. A mapping attempt past -this extent may return nothing. Some implementations set this flag to -indicate this extent is the last one in the range queried by the user -(via fiemap->fm_length). +FIEMAP_EXTENT_LAST + This is generally the last extent in the file. A mapping attempt past + this extent may return nothing. Some implementations set this flag to + indicate this extent is the last one in the range queried by the user + (via fiemap->fm_length). + +FIEMAP_EXTENT_UNKNOWN + The location of this extent is currently unknown. This may indicate + the data is stored on an inaccessible volume or that no storage has + been allocated for the file yet. -* FIEMAP_EXTENT_UNKNOWN -The location of this extent is currently unknown. This may indicate -the data is stored on an inaccessible volume or that no storage has -been allocated for the file yet. +FIEMAP_EXTENT_DELALLOC + This will also set FIEMAP_EXTENT_UNKNOWN. -* FIEMAP_EXTENT_DELALLOC - - This will also set FIEMAP_EXTENT_UNKNOWN. -Delayed allocation - while there is data for this extent, its -physical location has not been allocated yet. + Delayed allocation - while there is data for this extent, its + physical location has not been allocated yet. -* FIEMAP_EXTENT_ENCODED -This extent does not consist of plain filesystem blocks but is -encoded (e.g. encrypted or compressed). Reading the data in this -extent via I/O to the block device will have undefined results. +FIEMAP_EXTENT_ENCODED + This extent does not consist of plain filesystem blocks but is + encoded (e.g. encrypted or compressed). Reading the data in this + extent via I/O to the block device will have undefined results. Note that it is *always* undefined to try to update the data in-place by writing to the indicated location without the @@ -145,32 +148,32 @@ unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is clear; user applications must not try reading or writing to the filesystem via the block device under any other circumstances. -* FIEMAP_EXTENT_DATA_ENCRYPTED - - This will also set FIEMAP_EXTENT_ENCODED -The data in this extent has been encrypted by the file system. +FIEMAP_EXTENT_DATA_ENCRYPTED + This will also set FIEMAP_EXTENT_ENCODED + The data in this extent has been encrypted by the file system. -* FIEMAP_EXTENT_NOT_ALIGNED -Extent offsets and length are not guaranteed to be block aligned. 
+FIEMAP_EXTENT_NOT_ALIGNED + Extent offsets and length are not guaranteed to be block aligned. -* FIEMAP_EXTENT_DATA_INLINE +FIEMAP_EXTENT_DATA_INLINE This will also set FIEMAP_EXTENT_NOT_ALIGNED -Data is located within a meta data block. + Data is located within a meta data block. -* FIEMAP_EXTENT_DATA_TAIL +FIEMAP_EXTENT_DATA_TAIL This will also set FIEMAP_EXTENT_NOT_ALIGNED -Data is packed into a block with data from other files. + Data is packed into a block with data from other files. -* FIEMAP_EXTENT_UNWRITTEN -Unwritten extent - the extent is allocated but its data has not been -initialized. This indicates the extent's data will be all zero if read -through the filesystem but the contents are undefined if read directly from -the device. +FIEMAP_EXTENT_UNWRITTEN + Unwritten extent - the extent is allocated but its data has not been + initialized. This indicates the extent's data will be all zero if read + through the filesystem but the contents are undefined if read directly from + the device. -* FIEMAP_EXTENT_MERGED -This will be set when a file does not support extents, i.e., it uses a block -based addressing scheme. Since returning an extent for each block back to -userspace would be highly inefficient, the kernel will try to merge most -adjacent blocks into 'extents'. +FIEMAP_EXTENT_MERGED + This will be set when a file does not support extents, i.e., it uses a block + based addressing scheme. Since returning an extent for each block back to + userspace would be highly inefficient, the kernel will try to merge most + adjacent blocks into 'extents'. VFS -> File System Implementation @@ -179,23 +182,23 @@ VFS -> File System Implementation File systems wishing to support fiemap must implement a ->fiemap callback on their inode_operations structure. The fs ->fiemap call is responsible for defining its set of supported fiemap flags, and calling a helper function on -each discovered extent: +each discovered extent:: -struct inode_operations { + struct inode_operations { ... int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); ->fiemap is passed struct fiemap_extent_info which describes the -fiemap request: +fiemap request:: -struct fiemap_extent_info { + struct fiemap_extent_info { unsigned int fi_flags; /* Flags as passed from user */ unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_max; /* Size of fiemap_extent array */ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ -}; + }; It is intended that the file system should not need to access any of this structure directly. Filesystem handlers should be tolerant to signals and return @@ -203,9 +206,9 @@ EINTR once fatal signal received. Flag checking should be done at the beginning of the ->fiemap callback via the -fiemap_check_flags() helper: +fiemap_check_flags() helper:: -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); The struct fieinfo should be passed in as received from ioctl_fiemap(). The set of fiemap flags which the fs understands should be passed via fs_flags. If @@ -216,10 +219,10 @@ ioctl_fiemap(). 
For each extent in the request range, the file system should call -the helper function, fiemap_fill_next_extent(): +the helper function, fiemap_fill_next_extent():: -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags, u32 dev); + int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags, u32 dev); fiemap_fill_next_extent() will use the passed values to populate the next free extent in the fm_extents array. 'General' extent flags will diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.rst index 46dfc6b038c3..cbf8e57376bf 100644 --- a/Documentation/filesystems/files.txt +++ b/Documentation/filesystems/files.rst @@ -1,5 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== File management in the Linux kernel ------------------------------------ +=================================== This document describes how locking for files (struct file) and file descriptor table (struct files) works. @@ -34,7 +37,7 @@ appear atomic. Here are the locking rules for the fdtable structure - 1. All references to the fdtable must be done through - the files_fdtable() macro : + the files_fdtable() macro:: struct fdtable *fdt; @@ -61,7 +64,8 @@ the fdtable structure - 4. To look up the file structure given an fd, a reader must use either fcheck() or fcheck_files() APIs. These take care of barrier requirements due to lock-free lookup. - An example : + + An example:: struct file *file; @@ -77,7 +81,7 @@ the fdtable structure - of the fd (fget()/fget_light()) are lock-free, it is possible that look-up may race with the last put() operation on the file structure. This is avoided using atomic_long_inc_not_zero() - on ->f_count : + on ->f_count:: rcu_read_lock(); file = fcheck_files(files, fd); @@ -106,7 +110,8 @@ the fdtable structure - holding files->file_lock. If ->file_lock is dropped, then another thread expand the files thereby creating a new fdtable and making the earlier fdtable pointer stale. - For example : + + For example:: spin_lock(&files->file_lock); fd = locate_fd(files, file, start); diff --git a/Documentation/filesystems/fuse-io.txt b/Documentation/filesystems/fuse-io.rst index 07b8f73f100f..255a368fe534 100644 --- a/Documentation/filesystems/fuse-io.txt +++ b/Documentation/filesystems/fuse-io.rst @@ -1,3 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +Fuse I/O Modes +============== + Fuse supports the following I/O modes: - direct-io diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e7b46dac7079..17795341e0a3 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -24,6 +24,22 @@ algorithms work. splice locking directory-locking + devpts + dnotify + fiemap + files + locks + mandatory-locking + mount_api + quota + seq_file + sharedsubtree + sysfs-pci + sysfs-tagging + + automount-support + + caching/index porting @@ -57,7 +73,10 @@ Documentation for filesystem implementations. befs bfs btrfs + cifs/cifsroot ceph + coda + configfs cramfs debugfs dlmfs @@ -73,6 +92,7 @@ Documentation for filesystem implementations. hfsplus hpfs fuse + fuse-io inotify isofs nilfs2 @@ -88,6 +108,7 @@ Documentation for filesystem implementations. ramfs-rootfs-initramfs relay romfs + spufs/index squashfs sysfs sysv-fs @@ -97,4 +118,6 @@ Documentation for filesystem implementations. 
udf virtiofs vfat + xfs-delayed-logging-design + xfs-self-describing-metadata zonefs diff --git a/Documentation/filesystems/locks.txt b/Documentation/filesystems/locks.rst index 5368690f412e..c5ae858b1aac 100644 --- a/Documentation/filesystems/locks.txt +++ b/Documentation/filesystems/locks.rst @@ -1,4 +1,8 @@ - File Locking Release Notes +.. SPDX-License-Identifier: GPL-2.0 + +========================== +File Locking Release Notes +========================== Andy Walker <andy@lysaker.kvaerner.no> @@ -6,7 +10,7 @@ 1. What's New? --------------- +============== 1.1 Broken Flock Emulation -------------------------- @@ -25,7 +29,7 @@ anyway (see the file "Documentation/process/changes.rst".) --------------------------- 1.2.1 Typical Problems - Sendmail ---------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Because sendmail was unable to use the old flock() emulation, many sendmail installations use fcntl() instead of flock(). This is true of Slackware 3.0 for example. This gave rise to some other subtle problems if sendmail was @@ -37,7 +41,7 @@ to lock solid with deadlocked processes. 1.2.2 The Solution ------------------- +^^^^^^^^^^^^^^^^^^ The solution I have chosen, after much experimentation and discussion, is to make flock() and fcntl() locks oblivious to each other. Both can exists, and neither will have any effect on the other. @@ -54,7 +58,7 @@ fcntl(), with all the problems that implies. --------------------------------------- Mandatory locking, as described in -'Documentation/filesystems/mandatory-locking.txt' was prior to this release a +'Documentation/filesystems/mandatory-locking.rst' was prior to this release a general configuration option that was valid for all mounted filesystems. This had a number of inherent dangers, not the least of which was the ability to freeze an NFS server by asking it to read a file for which a mandatory lock diff --git a/Documentation/filesystems/mandatory-locking.txt b/Documentation/filesystems/mandatory-locking.rst index a251ca33164a..9ce73544a8f0 100644 --- a/Documentation/filesystems/mandatory-locking.txt +++ b/Documentation/filesystems/mandatory-locking.rst @@ -1,8 +1,13 @@ - Mandatory File Locking For The Linux Operating System +.. SPDX-License-Identifier: GPL-2.0 + +===================================================== +Mandatory File Locking For The Linux Operating System +===================================================== Andy Walker <andy@lysaker.kvaerner.no> 15 April 1996 + (Updated September 2007) 0. Why you should avoid mandatory locking @@ -53,15 +58,17 @@ possible on existing user code. The scheme is based on marking individual files as candidates for mandatory locking, and using the existing fcntl()/lockf() interface for applying locks just as if they were normal, advisory locks. -Note 1: In saying "file" in the paragraphs above I am actually not telling -the whole truth. System V locking is based on fcntl(). The granularity of -fcntl() is such that it allows the locking of byte ranges in files, in addition -to entire files, so the mandatory locking rules also have byte level -granularity. +.. Note:: + + 1. In saying "file" in the paragraphs above I am actually not telling + the whole truth. System V locking is based on fcntl(). The granularity of + fcntl() is such that it allows the locking of byte ranges in files, in + addition to entire files, so the mandatory locking rules also have byte + level granularity. 
-Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite -borrowing the fcntl() locking scheme from System V. The mandatory locking -scheme is defined by the System V Interface Definition (SVID) Version 3. + 2. POSIX.1 does not specify any scheme for mandatory locking, despite + borrowing the fcntl() locking scheme from System V. The mandatory locking + scheme is defined by the System V Interface Definition (SVID) Version 3. 2. Marking a file for mandatory locking --------------------------------------- diff --git a/Documentation/filesystems/mount_api.txt b/Documentation/filesystems/mount_api.rst index 87c14bbb2b35..dea22d64f060 100644 --- a/Documentation/filesystems/mount_api.txt +++ b/Documentation/filesystems/mount_api.rst @@ -1,8 +1,10 @@ - ==================== - FILESYSTEM MOUNT API - ==================== +.. SPDX-License-Identifier: GPL-2.0 -CONTENTS +==================== +Filesystem Mount API +==================== + +.. CONTENTS (1) Overview. @@ -21,8 +23,7 @@ CONTENTS (8) Parameter helper functions. -======== -OVERVIEW +Overview ======== The creation of new mounts is now to be done in a multistep process: @@ -43,7 +44,7 @@ The creation of new mounts is now to be done in a multistep process: (7) Destroy the context. -To support this, the file_system_type struct gains two new fields: +To support this, the file_system_type struct gains two new fields:: int (*init_fs_context)(struct fs_context *fc); const struct fs_parameter_description *parameters; @@ -57,12 +58,11 @@ Note that security initialisation is done *after* the filesystem is called so that the namespaces may be adjusted first. -====================== -THE FILESYSTEM CONTEXT +The Filesystem context ====================== The creation and reconfiguration of a superblock is governed by a filesystem -context. This is represented by the fs_context structure: +context. This is represented by the fs_context structure:: struct fs_context { const struct fs_context_operations *ops; @@ -86,78 +86,106 @@ context. This is represented by the fs_context structure: The fs_context fields are as follows: - (*) const struct fs_context_operations *ops + * :: + + const struct fs_context_operations *ops These are operations that can be done on a filesystem context (see below). This must be set by the ->init_fs_context() file_system_type operation. - (*) struct file_system_type *fs_type + * :: + + struct file_system_type *fs_type A pointer to the file_system_type of the filesystem that is being constructed or reconfigured. This retains a reference on the type owner. - (*) void *fs_private + * :: + + void *fs_private A pointer to the file system's private data. This is where the filesystem will need to store any options it parses. - (*) struct dentry *root + * :: + + struct dentry *root A pointer to the root of the mountable tree (and indirectly, the superblock thereof). This is filled in by the ->get_tree() op. If this is set, an active reference on root->d_sb must also be held. - (*) struct user_namespace *user_ns - (*) struct net *net_ns + * :: + + struct user_namespace *user_ns + struct net *net_ns There are a subset of the namespaces in use by the invoking process. They retain references on each namespace. The subscribed namespaces may be replaced by the filesystem to reflect other sources, such as the parent mount superblock on an automount. - (*) const struct cred *cred + * :: + + const struct cred *cred The mounter's credentials. This retains a reference on the credentials.
- (*) char *source + * :: + + char *source This specifies the source. It may be a block device (e.g. /dev/sda1) or something more exotic, such as the "host:/path" that NFS desires. - (*) char *subtype + * :: + + char *subtype This is a string to be added to the type displayed in /proc/mounts to qualify it (used by FUSE). This is available for the filesystem to set if desired. - (*) void *security + * :: + + void *security A place for the LSMs to hang their security data for the superblock. The relevant security operations are described below. - (*) void *s_fs_info + * :: + + void *s_fs_info The proposed s_fs_info for a new superblock, set in the superblock by sget_fc(). This can be used to distinguish superblocks. - (*) unsigned int sb_flags - (*) unsigned int sb_flags_mask + * :: + + unsigned int sb_flags + unsigned int sb_flags_mask Which bits SB_* flags are to be set/cleared in super_block::s_flags. - (*) unsigned int s_iflags + * :: + + unsigned int s_iflags These will be bitwise-OR'd with s->s_iflags when a superblock is created. - (*) enum fs_context_purpose + * :: + + enum fs_context_purpose This indicates the purpose for which the context is intended. The available values are: - FS_CONTEXT_FOR_MOUNT, -- New superblock for explicit mount - FS_CONTEXT_FOR_SUBMOUNT -- New automatic submount of extant mount - FS_CONTEXT_FOR_RECONFIGURE -- Change an existing mount + ========================== ====================================== + FS_CONTEXT_FOR_MOUNT, New superblock for explicit mount + FS_CONTEXT_FOR_SUBMOUNT New automatic submount of extant mount + FS_CONTEXT_FOR_RECONFIGURE Change an existing mount + ========================== ====================================== The mount context is created by calling vfs_new_fs_context() or vfs_dup_fs_context() and is destroyed with put_fs_context(). Note that the @@ -176,11 +204,10 @@ mount context. For instance, NFS might pin the appropriate protocol version module. -================================= -THE FILESYSTEM CONTEXT OPERATIONS +The Filesystem Context Operations ================================= The filesystem context points to a table of operations:: struct fs_context_operations { void (*free)(struct fs_context *fc); @@ -195,24 +222,32 @@ The filesystem context points to a table of operations: These operations are invoked by the various stages of the mount procedure to manage the filesystem context. They are as follows: - (*) void (*free)(struct fs_context *fc); + * :: + + void (*free)(struct fs_context *fc); Called to clean up the filesystem-specific part of the filesystem context when the context is destroyed. It should be aware that parts of the context may have been removed and NULL'd out by ->get_tree(). - (*) int (*dup)(struct fs_context *fc, struct fs_context *src_fc); + * :: + + int (*dup)(struct fs_context *fc, struct fs_context *src_fc); Called when a filesystem context has been duplicated to duplicate the filesystem-private data. An error may be returned to indicate failure to do this. - [!] Note that even if this fails, put_fs_context() will be called + .. Warning:: + + Note that even if this fails, put_fs_context() will be called immediately thereafter, so ->dup() *must* make the filesystem-private data safe for ->free(). - (*) int (*parse_param)(struct fs_context *fc, - struct struct fs_parameter *param); + * :: + + int (*parse_param)(struct fs_context *fc, + struct fs_parameter *param); Called when a parameter is being added to the filesystem context.
param points to the key name and maybe a value object. VFS-specific options @@ -224,7 +259,9 @@ manage the filesystem context. They are as follows: If successful, 0 should be returned or a negative error code otherwise. - (*) int (*parse_monolithic)(struct fs_context *fc, void *data); + * :: + + int (*parse_monolithic)(struct fs_context *fc, void *data); Called when the mount(2) system call is invoked to pass the entire data page in one go. If this is expected to be just a list of "key[=val]" @@ -236,7 +273,9 @@ manage the filesystem context. They are as follows: finds it's the standard key-val list then it may pass it off to generic_parse_monolithic(). - (*) int (*get_tree)(struct fs_context *fc); + * :: + + int (*get_tree)(struct fs_context *fc); Called to get or create the mountable root and superblock, using the information stored in the filesystem context (reconfiguration goes via a @@ -249,7 +288,9 @@ manage the filesystem context. They are as follows: The phase on a userspace-driven context will be set to only allow this to be called once on any particular context. - (*) int (*reconfigure)(struct fs_context *fc); + * :: + + int (*reconfigure)(struct fs_context *fc); Called to effect reconfiguration of a superblock using information stored in the filesystem context. It may detach any resources it desires from @@ -259,19 +300,20 @@ manage the filesystem context. They are as follows: On success it should return 0. In the case of an error, it should return a negative error code. - [NOTE] reconfigure is intended as a replacement for remount_fs. + .. Note:: reconfigure is intended as a replacement for remount_fs. -=========================== -FILESYSTEM CONTEXT SECURITY +Filesystem context Security =========================== The filesystem context contains a security pointer that the LSMs can use for building up a security context for the superblock to be mounted. There are a number of operations used by the new mount code for this purpose: - (*) int security_fs_context_alloc(struct fs_context *fc, - struct dentry *reference); + * :: + + int security_fs_context_alloc(struct fs_context *fc, + struct dentry *reference); Called to initialise fc->security (which is preset to NULL) and allocate any resources needed. It should return 0 on success or a negative error @@ -283,22 +325,28 @@ number of operations used by the new mount code for this purpose: non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case it indicates the automount point. - (*) int security_fs_context_dup(struct fs_context *fc, - struct fs_context *src_fc); + * :: + + int security_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc); Called to initialise fc->security (which is preset to NULL) and allocate any resources needed. The original filesystem context is pointed to by src_fc and may be used for reference. It should return 0 on success or a negative error code on failure. - (*) void security_fs_context_free(struct fs_context *fc); + * :: + + void security_fs_context_free(struct fs_context *fc); Called to clean up anything attached to fc->security. Note that the contents may have been transferred to a superblock and the pointer cleared during get_tree. - (*) int security_fs_context_parse_param(struct fs_context *fc, - struct fs_parameter *param); + * :: + + int security_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param); Called for each mount parameter, including the source. The arguments are as for the ->parse_param() method. 
It should return 0 to indicate that @@ -310,7 +358,9 @@ number of operations used by the new mount code for this purpose: (provided the value pointer is NULL'd out). If it is stolen, 1 must be returned to prevent it being passed to the filesystem. - (*) int security_fs_context_validate(struct fs_context *fc); + * :: + + int security_fs_context_validate(struct fs_context *fc); Called after all the options have been parsed to validate the collection as a whole and to do any necessary allocation so that @@ -320,36 +370,43 @@ number of operations used by the new mount code for this purpose: In the case of reconfiguration, the target superblock will be accessible via fc->root. - (*) int security_sb_get_tree(struct fs_context *fc); + * :: + + int security_sb_get_tree(struct fs_context *fc); Called during the mount procedure to verify that the specified superblock is allowed to be mounted and to transfer the security data there. It should return 0 or a negative error code. - (*) void security_sb_reconfigure(struct fs_context *fc); + * :: + + void security_sb_reconfigure(struct fs_context *fc); Called to apply any reconfiguration to an LSM's context. It must not fail. Error checking and resource allocation must be done in advance by the parameter parsing and validation hooks. - (*) int security_sb_mountpoint(struct fs_context *fc, struct path *mountpoint, - unsigned int mnt_flags); + * :: + + int security_sb_mountpoint(struct fs_context *fc, + struct path *mountpoint, + unsigned int mnt_flags); Called during the mount procedure to verify that the root dentry attached to the context is permitted to be attached to the specified mountpoint. It should return 0 on success or a negative error code on failure. -========================== -VFS FILESYSTEM CONTEXT API +VFS Filesystem context API ========================== There are four operations for creating a filesystem context and one for destroying a context: - (*) struct fs_context *fs_context_for_mount( - struct file_system_type *fs_type, - unsigned int sb_flags); + * :: + + struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags); Allocate a filesystem context for the purpose of setting up a new mount, whether that be with a new superblock or sharing an existing one. This @@ -359,7 +416,9 @@ destroying a context: fs_type specifies the filesystem type that will manage the context and sb_flags presets the superblock flags stored therein. - (*) struct fs_context *fs_context_for_reconfigure( + * :: + + struct fs_context *fs_context_for_reconfigure( struct dentry *dentry, unsigned int sb_flags, unsigned int sb_flags_mask); @@ -369,7 +428,9 @@ destroying a context: configured. sb_flags and sb_flags_mask indicate which superblock flags need changing and to what. - (*) struct fs_context *fs_context_for_submount( + * :: + + struct fs_context *fs_context_for_submount( struct file_system_type *fs_type, struct dentry *reference); @@ -382,7 +443,9 @@ destroying a context: Note that it's not a requirement that the reference dentry be of the same filesystem type as fs_type. - (*) struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc); + * :: + + struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc); Duplicate a filesystem context, copying any options noted and duplicating or additionally referencing any resources held therein. This is available @@ -392,14 +455,18 @@ destroying a context: The purpose in the new context is inherited from the old one. 
- (*) void put_fs_context(struct fs_context *fc); + * :: + + void put_fs_context(struct fs_context *fc); Destroy a filesystem context, releasing any resources it holds. This calls the ->free() operation. This is intended to be called by anyone who created a filesystem context. - [!] filesystem contexts are not refcounted, so this causes unconditional - destruction. + .. Warning:: + + filesystem contexts are not refcounted, so this causes unconditional + destruction. In all the above operations, apart from the put op, the return is a mount context pointer or a negative error code. @@ -407,8 +474,10 @@ context pointer or a negative error code. For the remaining operations, if an error occurs, a negative error code will be returned. - (*) int vfs_parse_fs_param(struct fs_context *fc, - struct fs_parameter *param); + * :: + + int vfs_parse_fs_param(struct fs_context *fc, + struct fs_parameter *param); Supply a single mount parameter to the filesystem context. This include the specification of the source/device which is specified as the "source" @@ -423,53 +492,64 @@ returned. The parameter value is typed and can be one of: - fs_value_is_flag, Parameter not given a value. - fs_value_is_string, Value is a string - fs_value_is_blob, Value is a binary blob - fs_value_is_filename, Value is a filename* + dirfd - fs_value_is_file, Value is an open file (file*) + ==================== ============================= + fs_value_is_flag Parameter not given a value + fs_value_is_string Value is a string + fs_value_is_blob Value is a binary blob + fs_value_is_filename Value is a filename* + dirfd + fs_value_is_file Value is an open file (file*) + ==================== ============================= If there is a value, that value is stored in a union in the struct in one of param->{string,blob,name,file}. Note that the function may steal and clear the pointer, but then becomes responsible for disposing of the object. - (*) int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); + * :: + + int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size); A wrapper around vfs_parse_fs_param() that copies the value string it is passed. - (*) int generic_parse_monolithic(struct fs_context *fc, void *data); + * :: + + int generic_parse_monolithic(struct fs_context *fc, void *data); Parse a sys_mount() data page, assuming the form to be a text list consisting of key[=val] options separated by commas. Each item in the list is passed to vfs_mount_option(). This is the default when the ->parse_monolithic() method is NULL. - (*) int vfs_get_tree(struct fs_context *fc); + * :: + + int vfs_get_tree(struct fs_context *fc); Get or create the mountable root and superblock, using the parameters in the filesystem context to select/configure the superblock. This invokes the ->get_tree() method. - (*) struct vfsmount *vfs_create_mount(struct fs_context *fc); + * :: + + struct vfsmount *vfs_create_mount(struct fs_context *fc); Create a mount given the parameters in the specified filesystem context. Note that this does not attach the mount to anything. -=========================== -SUPERBLOCK CREATION HELPERS +Superblock Creation Helpers =========================== A number of VFS helpers are available for use by filesystems for the creation or looking up of superblocks. 
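To see how these helpers fit together, here is a hand-rolled sketch of a ->get_tree() built directly on the core routine described next; it approximates what the vfs_get_super() wrapper does internally (the examplefs names are hypothetical, and set_anon_super_fc() is assumed as a 'set' callback that assigns an anonymous device number)::

    static int examplefs_get_tree_direct(struct fs_context *fc)
    {
            struct super_block *sb;
            int err;

            /* A NULL test function means never share an existing
             * superblock: always create a new one. */
            sb = sget_fc(fc, NULL, set_anon_super_fc);
            if (IS_ERR(sb))
                    return PTR_ERR(sb);

            err = examplefs_fill_super(sb, fc);        /* hypothetical */
            if (err) {
                    deactivate_locked_super(sb);
                    return err;
            }

            sb->s_flags |= SB_ACTIVE;
            fc->root = dget(sb->s_root);
            return 0;
    }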
- (*) struct super_block * - sget_fc(struct fs_context *fc, - int (*test)(struct super_block *sb, struct fs_context *fc), - int (*set)(struct super_block *sb, struct fs_context *fc)); + * :: + + struct super_block * + sget_fc(struct fs_context *fc, + int (*test)(struct super_block *sb, struct fs_context *fc), + int (*set)(struct super_block *sb, struct fs_context *fc)); This is the core routine. If test is non-NULL, it searches for an existing superblock matching the criteria held in the fs_context, using @@ -482,10 +562,12 @@ or looking up of superblocks. The following helpers all wrap sget_fc(): - (*) int vfs_get_super(struct fs_context *fc, - enum vfs_get_super_keying keying, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) + * :: + + int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) This creates/looks up a deviceless superblock. The keying indicates how many superblocks of this type may exist and in what manner they may be @@ -515,14 +597,14 @@ PARAMETER DESCRIPTION ===================== Parameters are described using structures defined in linux/fs_parser.h. -There's a core description struct that links everything together: +There's a core description struct that links everything together:: struct fs_parameter_description { const struct fs_parameter_spec *specs; const struct fs_parameter_enum *enums; }; -For example: +For example:: enum { Opt_autocell, @@ -539,10 +621,12 @@ For example: The members are as follows: - (1) const struct fs_parameter_specification *specs; + (1) :: + + const struct fs_parameter_specification *specs; Table of parameter specifications, terminated with a null entry, where the - entries are of type: + entries are of type:: struct fs_parameter_spec { const char *name; @@ -558,6 +642,7 @@ The members are as follows: The 'type' field indicates the desired value type and must be one of: + ======================= ======================= ===================== TYPE NAME EXPECTED VALUE RESULT IN ======================= ======================= ===================== fs_param_is_flag No value n/a @@ -573,19 +658,23 @@ The members are as follows: fs_param_is_blockdev Blockdev path * Needs lookup fs_param_is_path Path * Needs lookup fs_param_is_fd File descriptor result->int_32 + ======================= ======================= ===================== Note that if the value is of fs_param_is_bool type, fs_parse() will try to match any string value against "0", "1", "no", "yes", "false", "true". Each parameter can also be qualified with 'flags': + ======================= ================================================ fs_param_v_optional The value is optional fs_param_neg_with_no result->negated set if key is prefixed with "no" fs_param_neg_with_empty result->negated set if value is "" fs_param_deprecated The parameter is deprecated. 
+ ======================= ================================================ These are wrapped with a number of convenience wrappers: + ======================= =============================================== MACRO SPECIFIES ======================= =============================================== fsparam_flag() fs_param_is_flag @@ -602,9 +691,10 @@ The members are as follows: fsparam_bdev() fs_param_is_blockdev fsparam_path() fs_param_is_path fsparam_fd() fs_param_is_fd + ======================= =============================================== all of which take two arguments, name string and option number - for - example: + example:: static const struct fs_parameter_spec afs_param_specs[] = { fsparam_flag ("autocell", Opt_autocell), @@ -618,10 +708,12 @@ The members are as follows: of arguments to specify the type and the flags for anything that doesn't match one of the above macros. - (2) const struct fs_parameter_enum *enums; + (2) :: + + const struct fs_parameter_enum *enums; Table of enum value names to integer mappings, terminated with a null - entry. This is of type: + entry. This is of type:: struct fs_parameter_enum { u8 opt; @@ -630,7 +722,7 @@ The members are as follows: }; Where the array is an unsorted list of { parameter ID, name }-keyed - elements that indicate the value to map to, e.g.: + elements that indicate the value to map to, e.g.:: static const struct fs_parameter_enum afs_param_enums[] = { { Opt_bar, "x", 1}, @@ -648,18 +740,19 @@ CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from userspace using the fsinfo() syscall. -========================== -PARAMETER HELPER FUNCTIONS +Parameter Helper Functions ========================== A number of helper functions are provided to help a filesystem or an LSM process the parameters it is given. - (*) int lookup_constant(const struct constant_table tbl[], - const char *name, int not_found); + * :: + + int lookup_constant(const struct constant_table tbl[], + const char *name, int not_found); Look up a constant by name in a table of name -> integer mappings. The - table is an array of elements of the following type: + table is an array of elements of the following type:: struct constant_table { const char *name; @@ -669,9 +762,11 @@ process the parameters it is given. If a match is found, the corresponding value is returned. If a match isn't found, the not_found value is returned instead. - (*) bool validate_constant_table(const struct constant_table *tbl, - size_t tbl_size, - int low, int high, int special); + * :: + + bool validate_constant_table(const struct constant_table *tbl, + size_t tbl_size, + int low, int high, int special); Validate a constant table. Checks that all the elements are appropriately ordered, that there are no duplicates and that the values are between low @@ -682,16 +777,20 @@ process the parameters it is given. If all is good, true is returned. If the table is invalid, errors are logged to dmesg and false is returned. - (*) bool fs_validate_description(const struct fs_parameter_description *desc); + * :: + + bool fs_validate_description(const struct fs_parameter_description *desc); This performs some validation checks on a parameter description. It returns true if the description is good and false if it is not. It will log errors to dmesg if validation fails. 
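As a concrete illustration of lookup_constant(), a ->parse_param() implementation might translate a string-valued option into an internal constant like so (the examplefs names and EXAMPLEFS_MODE_* values are invented)::

    static const struct constant_table examplefs_modes[] = {
            { "relaxed",    EXAMPLEFS_MODE_RELAXED },
            { "strict",     EXAMPLEFS_MODE_STRICT },
            {}
    };

    static int examplefs_parse_mode(struct fs_context *fc,
                                    struct fs_parameter *param)
    {
            struct examplefs_options *opts = fc->fs_private;
            int mode;

            /* -1 serves as the not_found sentinel here; any value that
             * cannot appear in the table would do. */
            mode = lookup_constant(examplefs_modes, param->string, -1);
            if (mode == -1)
                    return invalf(fc, "examplefs: unknown mode '%s'",
                                  param->string);

            opts->mode = mode;
            return 0;
    }

During development, validate_constant_table() can be run over such a table to catch ordering mistakes and duplicate entries.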
- (*) int fs_parse(struct fs_context *fc, - const struct fs_parameter_description *desc, - struct fs_parameter *param, - struct fs_parse_result *result); + * :: + + int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *param, + struct fs_parse_result *result); This is the main interpreter of parameters. It uses the parameter description to look up a parameter by key name and to convert that to an @@ -711,14 +810,16 @@ process the parameters it is given. parameter is matched, but the value is erroneous, -EINVAL will be returned; otherwise the parameter's option number will be returned. - (*) int fs_lookup_param(struct fs_context *fc, - struct fs_parameter *value, - bool want_bdev, - struct path *_path); + * :: + + int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *value, + bool want_bdev, + struct path *_path); This takes a parameter that carries a string or filename type and attempts to do a path lookup on it. If the parameter expects a blockdev, a check is made that the inode actually represents one. - Returns 0 if successful and *_path will be set; returns a negative error - code if not. + Returns 0 if successful and ``*_path`` will be set; returns a negative + error code if not. diff --git a/Documentation/filesystems/orangefs.rst b/Documentation/filesystems/orangefs.rst index e41369709c5b..463e37694250 100644 --- a/Documentation/filesystems/orangefs.rst +++ b/Documentation/filesystems/orangefs.rst @@ -119,9 +119,7 @@ it comes to that question:: /opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf -Create an /etc/pvfs2tab file:: - -Localhost is fine for your pvfs2tab file: +Create an /etc/pvfs2tab file (localhost is fine):: echo tcp://localhost:3334/orangefs /pvfsmnt pvfs2 defaults,noauto 0 0 > \ /etc/pvfs2tab diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 092b7b44d158..430963e0e8c3 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -543,6 +543,7 @@ encoded manner. The codes are the following: hg huge page advise flag nh no huge page advise flag mg mergable advise flag + bt - arm64 BTI guarded page == ======================================= Note that there is no guarantee that every flag and associated mnemonic will @@ -1870,7 +1871,7 @@ unbindable mount is unbindable For more information on mount propagation see: - Documentation/filesystems/sharedsubtree.txt + Documentation/filesystems/sharedsubtree.rst 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.rst index 32874b06ebe9..a30cdd47c652 100644 --- a/Documentation/filesystems/quota.txt +++ b/Documentation/filesystems/quota.rst @@ -1,4 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 +=============== Quota subsystem =============== @@ -39,6 +41,7 @@ Currently, the interface supports only one message type QUOTA_NL_C_WARNING. This command is used to send a notification about any of the above mentioned events. Each message has six attributes. 
These are (type of the argument is in parentheses): + QUOTA_NL_A_QTYPE (u32) - type of quota being exceeded (one of USRQUOTA, GRPQUOTA) QUOTA_NL_A_EXCESS_ID (u64) @@ -48,20 +51,34 @@ in parentheses): - UID of a user who caused the event QUOTA_NL_A_WARNING (u32) - what kind of limit is exceeded: - QUOTA_NL_IHARDWARN - inode hardlimit - QUOTA_NL_ISOFTLONGWARN - inode softlimit is exceeded longer - than given grace period - QUOTA_NL_ISOFTWARN - inode softlimit - QUOTA_NL_BHARDWARN - space (block) hardlimit - QUOTA_NL_BSOFTLONGWARN - space (block) softlimit is exceeded - longer than given grace period. - QUOTA_NL_BSOFTWARN - space (block) softlimit + + QUOTA_NL_IHARDWARN + inode hardlimit + QUOTA_NL_ISOFTLONGWARN + inode softlimit is exceeded longer + than given grace period + QUOTA_NL_ISOFTWARN + inode softlimit + QUOTA_NL_BHARDWARN + space (block) hardlimit + QUOTA_NL_BSOFTLONGWARN + space (block) softlimit is exceeded + longer than given grace period. + QUOTA_NL_BSOFTWARN + space (block) softlimit + - four warnings are also defined for the event when user stops exceeding some limit: - QUOTA_NL_IHARDBELOW - inode hardlimit - QUOTA_NL_ISOFTBELOW - inode softlimit - QUOTA_NL_BHARDBELOW - space (block) hardlimit - QUOTA_NL_BSOFTBELOW - space (block) softlimit + + QUOTA_NL_IHARDBELOW + inode hardlimit + QUOTA_NL_ISOFTBELOW + inode softlimit + QUOTA_NL_BHARDBELOW + space (block) hardlimit + QUOTA_NL_BSOFTBELOW + space (block) softlimit + QUOTA_NL_A_DEV_MAJOR (u32) - major number of a device with the affected filesystem QUOTA_NL_A_DEV_MINOR (u32) diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.rst b/Documentation/filesystems/ramfs-rootfs-initramfs.rst index 6c576e241d86..3fddacc6bf14 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst @@ -71,7 +71,7 @@ be allowed write access to a ramfs mount. A ramfs derivative called tmpfs was created to add size limits, and the ability to write the data to swap space. Normal users can be allowed write access to -tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information. +tmpfs mounts. See Documentation/filesystems/tmpfs.rst for more information. What is rootfs? --------------- diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.rst index d412b236a9d6..fab302046b13 100644 --- a/Documentation/filesystems/seq_file.txt +++ b/Documentation/filesystems/seq_file.rst @@ -1,6 +1,11 @@ -The seq_file interface +.. SPDX-License-Identifier: GPL-2.0 + +====================== +The seq_file Interface +====================== Copyright 2003 Jonathan Corbet <corbet@lwn.net> + This file is originally from the LWN.net Driver Porting series at http://lwn.net/Articles/driver-porting/ @@ -43,7 +48,7 @@ loadable module which creates a file called /proc/sequence. The file, when read, simply produces a set of increasing integer values, one per line. The sequence will continue until the user loses patience and finds something better to do. The file is seekable, in that one can do something like the -following: +following:: dd if=/proc/sequence of=out1 count=1 dd if=/proc/sequence skip=1 of=out2 count=1 @@ -55,16 +60,18 @@ wanting to see the full source for this module can find it at http://lwn.net/Articles/22359/). Deprecated create_proc_entry +============================ Note that the above article uses create_proc_entry which was removed in -kernel 3.10. Current versions require the following update +kernel 3.10. 
Current versions require the following update:: -- entry = create_proc_entry("sequence", 0, NULL); -- if (entry) -- entry->proc_fops = &ct_file_ops; -+ entry = proc_create("sequence", 0, NULL, &ct_file_ops); + - entry = create_proc_entry("sequence", 0, NULL); + - if (entry) + - entry->proc_fops = &ct_file_ops; + + entry = proc_create("sequence", 0, NULL, &ct_file_ops); The iterator interface +====================== Modules implementing a virtual file with seq_file must implement an iterator object that allows stepping through the data of interest @@ -99,7 +106,7 @@ position. The pos passed to start() will always be either zero, or the most recent pos used in the previous session. For our simple sequence example, -the start() function looks like: +the start() function looks like:: static void *ct_seq_start(struct seq_file *s, loff_t *pos) { @@ -129,7 +136,7 @@ move the iterator forward to the next position in the sequence. The example module can simply increment the position by one; more useful modules will do what is needed to step through some data structure. The next() function returns a new iterator, or NULL if the sequence is -complete. Here's the example version: +complete. Here's the example version:: static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { @@ -141,10 +148,10 @@ complete. Here's the example version: The stop() function closes a session; its job, of course, is to clean up. If dynamic memory is allocated for the iterator, stop() is the place to free it; if a lock was taken by start(), stop() must release -that lock. The value that *pos was set to by the last next() call +that lock. The value that ``*pos`` was set to by the last next() call before stop() is remembered, and used for the first start() call of the next session unless lseek() has been called on the file; in that -case next start() will be asked to start at position zero. +case next start() will be asked to start at position zero:: static void ct_seq_stop(struct seq_file *s, void *v) { @@ -152,7 +159,7 @@ case next start() will be asked to start at position zero. } Finally, the show() function should format the object currently pointed to -by the iterator for output. The example module's show() function is: +by the iterator for output. The example module's show() function is:: static int ct_seq_show(struct seq_file *s, void *v) { @@ -169,7 +176,7 @@ generated output before returning SEQ_SKIP, that output will be dropped. We will look at seq_printf() in a moment. But first, the definition of the seq_file iterator is finished by creating a seq_operations structure with -the four functions we have just defined: +the four functions we have just defined:: static const struct seq_operations ct_seq_ops = { .start = ct_seq_start, @@ -194,6 +201,7 @@ other locks while the iterator is active. Formatted output +================ The seq_file code manages positioning within the output created by the iterator and getting it into the user's buffer. But, for that to work, that @@ -203,7 +211,7 @@ been defined which make this task easy. Most code will simply use seq_printf(), which works pretty much like printk(), but which requires the seq_file pointer as an argument. -For straight character output, the following functions may be used: +For straight character output, the following functions may be used:: seq_putc(struct seq_file *m, char c); seq_puts(struct seq_file *m, const char *s); @@ -213,7 +221,7 @@ The first two output a single character and a string, just like one would expect. 
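For instance, a show() method might combine them as follows (a hypothetical fragment; struct ex_record is an invented iterator type)::

    static int ex_seq_show(struct seq_file *m, void *v)
    {
            struct ex_record *r = v;

            seq_puts(m, "record ");
            seq_printf(m, "%d flags=%#x", r->id, r->flags);
            seq_putc(m, '\n');
            return 0;
    }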
seq_escape() is like seq_puts(), except that any character in s which is in the string esc will be represented in octal form in the output. -There are also a pair of functions for printing filenames: +There are also a pair of functions for printing filenames:: int seq_path(struct seq_file *m, const struct path *path, const char *esc); @@ -226,8 +234,10 @@ the path relative to the current process's filesystem root. If a different root is desired, it can be used with seq_path_root(). If it turns out that path cannot be reached from root, seq_path_root() returns SEQ_SKIP. -A function producing complicated output may want to check +A function producing complicated output may want to check:: + bool seq_has_overflowed(struct seq_file *m); + and avoid further seq_<output> calls if true is returned. A true return from seq_has_overflowed means that the seq_file buffer will @@ -236,6 +246,7 @@ buffer and retry printing. Making it all work +================== So far, we have a nice set of functions which can produce output within the seq_file system, but we have not yet turned them into a file that a user @@ -244,7 +255,7 @@ creation of a set of file_operations which implement the operations on that file. The seq_file interface provides a set of canned operations which do most of the work. The virtual file author still must implement the open() method, however, to hook everything up. The open function is often a single -line, as in the example module: +line, as in the example module:: static int ct_open(struct inode *inode, struct file *file) { @@ -263,7 +274,7 @@ by the iterator functions. There is also a wrapper function to seq_open() called seq_open_private(). It kmallocs a zero filled block of memory and stores a pointer to it in the private field of the seq_file structure, returning 0 on success. The -block size is specified in a third parameter to the function, e.g.: +block size is specified in a third parameter to the function, e.g.:: static int ct_open(struct inode *inode, struct file *file) { @@ -273,7 +284,7 @@ block size is specified in a third parameter to the function, e.g.: There is also a variant function, __seq_open_private(), which is functionally identical except that, if successful, it returns the pointer to the allocated -memory block, allowing further initialisation e.g.: +memory block, allowing further initialisation e.g.:: static int ct_open(struct inode *inode, struct file *file) { @@ -295,7 +306,7 @@ frees the memory allocated in the corresponding open. The other operations of interest - read(), llseek(), and release() - are all implemented by the seq_file code itself. So a virtual file's -file_operations structure will look like: +file_operations structure will look like:: static const struct file_operations ct_file_ops = { .owner = THIS_MODULE, @@ -309,7 +320,7 @@ There is also a seq_release_private() which passes the contents of the seq_file private field to kfree() before releasing the structure. The final step is the creation of the /proc file itself. In the example -code, that is done in the initialization code in the usual way: +code, that is done in the initialization code in the usual way:: static int ct_init(void) { @@ -325,9 +336,10 @@ And that is pretty much it. 
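Gathered in one place, the example module's glue amounts to the following skeleton. Only the #include lines and the module boilerplate are added here; everything else appeared piecemeal above, and the four iterator functions are assumed to sit where the comment indicates::

    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    /* ct_seq_start(), ct_seq_next(), ct_seq_stop() and ct_seq_show()
     * from the preceding sections go here. */

    static const struct seq_operations ct_seq_ops = {
            .start = ct_seq_start,
            .next  = ct_seq_next,
            .stop  = ct_seq_stop,
            .show  = ct_seq_show,
    };

    static int ct_open(struct inode *inode, struct file *file)
    {
            return seq_open(file, &ct_seq_ops);
    }

    static const struct file_operations ct_file_ops = {
            .owner   = THIS_MODULE,
            .open    = ct_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = seq_release,
    };

    static int ct_init(void)
    {
            proc_create("sequence", 0, NULL, &ct_file_ops);
            return 0;
    }

    module_init(ct_init);
    MODULE_LICENSE("GPL");

Note that kernels from v5.6 onward hook /proc files up through struct proc_ops rather than file_operations; the form shown matches the text of this document.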
seq_list +======== If your file will be iterating through a linked list, you may find these -routines useful: +routines useful:: struct list_head *seq_list_start(struct list_head *head, loff_t pos); @@ -338,15 +350,16 @@ routines useful: These helpers will interpret pos as a position within the list and iterate accordingly. Your start() and next() functions need only invoke the -seq_list_* helpers with a pointer to the appropriate list_head structure. +``seq_list_*`` helpers with a pointer to the appropriate list_head structure. The extra-simple version +======================== For extremely simple virtual files, there is an even easier interface. A module can define only the show() function, which should create all the output that the virtual file will contain. The file's open() method then -calls: +calls:: int single_open(struct file *file, int (*show)(struct seq_file *m, void *p), diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.rst index 8ccfbd55244b..d83395354250 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.rst @@ -1,7 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== Shared Subtrees ---------------- +=============== -Contents: +.. Contents: 1) Overview 2) Features 3) Setting mount states @@ -41,31 +44,38 @@ replicas continue to be exactly same. Here is an example: - Let's say /mnt has a mount that is shared. - mount --make-shared /mnt + Let's say /mnt has a mount that is shared:: + + mount --make-shared /mnt Note: mount(8) command now supports the --make-shared flag, so the sample 'smount' program is no longer needed and has been removed. - # mount --bind /mnt /tmp + :: + + # mount --bind /mnt /tmp + The above command replicates the mount at /mnt to the mountpoint /tmp and the contents of both the mounts remain identical. - #ls /mnt - a b c + :: - #ls /tmp - a b c + #ls /mnt + a b c - Now let's say we mount a device at /tmp/a - # mount /dev/sd0 /tmp/a + #ls /tmp + a b c - #ls /tmp/a - t1 t2 t3 + Now let's say we mount a device at /tmp/a:: - #ls /mnt/a - t1 t2 t3 + # mount /dev/sd0 /tmp/a + + #ls /tmp/a + t1 t2 t3 + + #ls /mnt/a + t1 t2 t3 Note that the mount has propagated to the mount at /mnt as well. @@ -123,14 +133,15 @@ replicas continue to be exactly same. 2d) A unbindable mount is a unbindable private mount - let's say we have a mount at /mnt and we make it unbindable + let's say we have a mount at /mnt and we make it unbindable:: + + # mount --make-unbindable /mnt - # mount --make-unbindable /mnt + Let's try to bind mount this mount somewhere else:: - Let's try to bind mount this mount somewhere else. - # mount --bind /mnt /tmp - mount: wrong fs type, bad option, bad superblock on /mnt, - or too many mounted file systems + # mount --bind /mnt /tmp + mount: wrong fs type, bad option, bad superblock on /mnt, + or too many mounted file systems Binding a unbindable mount is a invalid operation. @@ -138,12 +149,12 @@ replicas continue to be exactly same. 3) Setting mount states The mount command (util-linux package) can be used to set mount - states: + states:: - mount --make-shared mountpoint - mount --make-slave mountpoint - mount --make-private mountpoint - mount --make-unbindable mountpoint + mount --make-shared mountpoint + mount --make-slave mountpoint + mount --make-private mountpoint + mount --make-unbindable mountpoint 4) Use cases @@ -154,9 +165,10 @@ replicas continue to be exactly same. 
Solution: - The system administrator can make the mount at /cdrom shared - mount --bind /cdrom /cdrom - mount --make-shared /cdrom + The system administrator can make the mount at /cdrom shared:: + + mount --bind /cdrom /cdrom + mount --make-shared /cdrom Now any process that clones off a new namespace will have a mount at /cdrom which is a replica of the same mount in the @@ -172,14 +184,14 @@ replicas continue to be exactly same. Solution: To begin with, the administrator can mark the entire mount tree - as shareable. + as shareable:: - mount --make-rshared / + mount --make-rshared / A new process can clone off a new namespace. And mark some part - of its namespace as slave + of its namespace as slave:: - mount --make-rslave /myprivatetree + mount --make-rslave /myprivatetree Hence forth any mounts within the /myprivatetree done by the process will not show up in any other namespace. However mounts @@ -206,13 +218,13 @@ replicas continue to be exactly same. versions of the file depending on the path used to access that file. - An example is: + An example is:: - mount --make-shared / - mount --rbind / /view/v1 - mount --rbind / /view/v2 - mount --rbind / /view/v3 - mount --rbind / /view/v4 + mount --make-shared / + mount --rbind / /view/v1 + mount --rbind / /view/v2 + mount --rbind / /view/v3 + mount --rbind / /view/v4 and if /usr has a versioning filesystem mounted, then that mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and @@ -224,8 +236,8 @@ replicas continue to be exactly same. filesystem is being requested and return the corresponding inode. -5) Detailed semantics: -------------------- +5) Detailed semantics +--------------------- The section below explains the detailed semantics of bind, rbind, move, mount, umount and clone-namespace operations. @@ -235,6 +247,7 @@ replicas continue to be exactly same. 5a) Mount states A given mount can be in one of the following states + 1) shared 2) slave 3) shared and slave @@ -252,7 +265,8 @@ replicas continue to be exactly same. A 'shared mount' is defined as a vfsmount that belongs to a 'peer group'. - For example: + For example:: + mount --make-shared /mnt mount --bind /mnt /tmp @@ -270,7 +284,7 @@ replicas continue to be exactly same. A slave mount as the name implies has a master mount from which mount/unmount events are received. Events do not propagate from the slave mount to the master. Only a shared mount can be made - a slave by executing the following command + a slave by executing the following command:: mount --make-slave mount @@ -290,8 +304,10 @@ replicas continue to be exactly same. peer group. Only a slave vfsmount can be made as 'shared and slave' by - either executing the following command + either executing the following command:: + mount --make-shared mount + or by moving the slave vfsmount under a shared vfsmount. (4) Private mount @@ -307,30 +323,32 @@ replicas continue to be exactly same. State diagram: + The state diagram below explains the state transition of a mount, - in response to various commands. 
- ------------------------------------------------------------------------ - | |make-shared | make-slave | make-private |make-unbindab| - --------------|------------|--------------|--------------|-------------| - |shared |shared |*slave/private| private | unbindable | - | | | | | | - |-------------|------------|--------------|--------------|-------------| - |slave |shared | **slave | private | unbindable | - | |and slave | | | | - |-------------|------------|--------------|--------------|-------------| - |shared |shared | slave | private | unbindable | - |and slave |and slave | | | | - |-------------|------------|--------------|--------------|-------------| - |private |shared | **private | private | unbindable | - |-------------|------------|--------------|--------------|-------------| - |unbindable |shared |**unbindable | private | unbindable | - ------------------------------------------------------------------------ - - * if the shared mount is the only mount in its peer group, making it - slave, makes it private automatically. Note that there is no master to - which it can be slaved to. - - ** slaving a non-shared mount has no effect on the mount. + in response to various commands:: + + ----------------------------------------------------------------------- + | |make-shared | make-slave | make-private |make-unbindab| + --------------|------------|--------------|--------------|-------------| + |shared |shared |*slave/private| private | unbindable | + | | | | | | + |-------------|------------|--------------|--------------|-------------| + |slave |shared | **slave | private | unbindable | + | |and slave | | | | + |-------------|------------|--------------|--------------|-------------| + |shared |shared | slave | private | unbindable | + |and slave |and slave | | | | + |-------------|------------|--------------|--------------|-------------| + |private |shared | **private | private | unbindable | + |-------------|------------|--------------|--------------|-------------| + |unbindable |shared |**unbindable | private | unbindable | + ------------------------------------------------------------------------ + + * if the shared mount is the only mount in its peer group, making it + slave, makes it private automatically. Note that there is no master to + which it can be slaved to. + + ** slaving a non-shared mount has no effect on the mount. Apart from the commands listed below, the 'move' operation also changes the state of a mount depending on type of the destination mount. Its @@ -338,31 +356,32 @@ replicas continue to be exactly same. 5b) Bind semantics - Consider the following command + Consider the following command:: - mount --bind A/a B/b + mount --bind A/a B/b where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' is the destination mount and 'b' is the dentry in the destination mount. The outcome depends on the type of mount of 'A' and 'B'. The table - below contains quick reference. 
- --------------------------------------------------------------------------- - | BIND MOUNT OPERATION | - |************************************************************************** - |source(A)->| shared | private | slave | unbindable | - | dest(B) | | | | | - | | | | | | | - | v | | | | | - |************************************************************************** - | shared | shared | shared | shared & slave | invalid | - | | | | | | - |non-shared| shared | private | slave | invalid | - *************************************************************************** + below contains quick reference:: + + -------------------------------------------------------------------------- + | BIND MOUNT OPERATION | + |************************************************************************| + |source(A)->| shared | private | slave | unbindable | + | dest(B) | | | | | + | | | | | | | + | v | | | | | + |************************************************************************| + | shared | shared | shared | shared & slave | invalid | + | | | | | | + |non-shared| shared | private | slave | invalid | + ************************************************************************** Details: - 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' + 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . 'C' is mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... are created and mounted at the dentry 'b' on all mounts where 'B' @@ -371,7 +390,7 @@ replicas continue to be exactly same. 'B'. And finally the peer-group of 'C' is merged with the peer group of 'A'. - 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' + 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' which is clone of 'A', is created. Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... are created and mounted at the dentry 'b' on all mounts where 'B' @@ -379,7 +398,7 @@ replicas continue to be exactly same. 'C', 'C1', .., 'Cn' with exactly the same configuration as the propagation tree for 'B'. - 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new + 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', 'C3' ... are created and mounted at the dentry 'b' on all mounts where @@ -389,19 +408,19 @@ replicas continue to be exactly same. is made the slave of mount 'Z'. In other words, mount 'C' is in the state 'slave and shared'. - 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a + 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a invalid operation. - 5. 'A' is a private mount and 'B' is a non-shared(private or slave or + 5. 'A' is a private mount and 'B' is a non-shared(private or slave or unbindable) mount. A new mount 'C' which is clone of 'A', is created. Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. - 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' + 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. 'C' is made a member of the peer-group of 'A'. - 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A + 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. 
A new mount 'C' which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of @@ -409,7 +428,7 @@ replicas continue to be exactly same. mount/unmount on 'A' do not propagate anywhere else. Similarly mount/unmount on 'C' do not propagate anywhere else. - 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a + 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a invalid operation. A unbindable mount cannot be bind mounted. 5c) Rbind semantics @@ -422,7 +441,9 @@ replicas continue to be exactly same. then the subtree under the unbindable mount is pruned in the new location. - eg: let's say we have the following mount tree. + eg: + + let's say we have the following mount tree:: A / \ @@ -430,12 +451,12 @@ replicas continue to be exactly same. / \ / \ D E F G - Let's say all the mount except the mount C in the tree are - of a type other than unbindable. + Let's say all the mount except the mount C in the tree are + of a type other than unbindable. - If this tree is rbound to say Z + If this tree is rbound to say Z - We will have the following tree at the new location. + We will have the following tree at the new location:: Z | @@ -457,24 +478,26 @@ replicas continue to be exactly same. the dentry in the destination mount. The outcome depends on the type of the mount of 'A' and 'B'. The table - below is a quick reference. - --------------------------------------------------------------------------- - | MOVE MOUNT OPERATION | - |************************************************************************** - | source(A)->| shared | private | slave | unbindable | - | dest(B) | | | | | - | | | | | | | - | v | | | | | - |************************************************************************** - | shared | shared | shared |shared and slave| invalid | - | | | | | | - |non-shared| shared | private | slave | unbindable | - *************************************************************************** - NOTE: moving a mount residing under a shared mount is invalid. + below is a quick reference:: + + --------------------------------------------------------------------------- + | MOVE MOUNT OPERATION | + |************************************************************************** + | source(A)->| shared | private | slave | unbindable | + | dest(B) | | | | | + | | | | | | | + | v | | | | | + |************************************************************************** + | shared | shared | shared |shared and slave| invalid | + | | | | | | + |non-shared| shared | private | slave | unbindable | + *************************************************************************** + + .. Note:: moving a mount residing under a shared mount is invalid. Details follow: - 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is + 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' are created and mounted at dentry 'b' on all mounts that receive propagation from mount 'B'. A new propagation tree is created in the @@ -483,7 +506,7 @@ replicas continue to be exactly same. propagation tree is appended to the already existing propagation tree of 'A'. - 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is + 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 
'An' are created and mounted at dentry 'b' on all mounts that receive propagation from mount 'B'. The mount 'A' becomes a shared mount and a @@ -491,7 +514,7 @@ replicas continue to be exactly same. 'B'. This new propagation tree contains all the new mounts 'A1', 'A2'... 'An'. - 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The + 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that receive propagation from mount 'B'. A new propagation tree is created @@ -501,32 +524,32 @@ replicas continue to be exactly same. 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also becomes 'shared'. - 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation + 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation is invalid. Because mounting anything on the shared mount 'B' can create new mounts that get mounted on the mounts that receive propagation from 'B'. And since the mount 'A' is unbindable, cloning it to mount at other mountpoints is not possible. - 5. 'A' is a private mount and 'B' is a non-shared(private or slave or + 5. 'A' is a private mount and 'B' is a non-shared(private or slave or unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. - 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' + 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a shared mount. - 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. + 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a slave mount of mount 'Z'. - 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount + 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a unbindable mount. 5e) Mount semantics - Consider the following command + Consider the following command:: - mount device B/b + mount device B/b 'B' is the destination mount and 'b' is the dentry in the destination mount. @@ -537,9 +560,9 @@ replicas continue to be exactly same. 5f) Unmount semantics - Consider the following command + Consider the following command:: - umount A + umount A where 'A' is a mount mounted on mount 'B' at dentry 'b'. @@ -592,10 +615,12 @@ replicas continue to be exactly same. A. What is the result of the following command sequence? - mount --bind /mnt /mnt - mount --make-shared /mnt - mount --bind /mnt /tmp - mount --move /tmp /mnt/1 + :: + + mount --bind /mnt /mnt + mount --make-shared /mnt + mount --bind /mnt /tmp + mount --move /tmp /mnt/1 what should be the contents of /mnt /mnt/1 /mnt/1/1 should be? Should they all be identical? or should /mnt and /mnt/1 be @@ -604,23 +629,27 @@ replicas continue to be exactly same. B. What is the result of the following command sequence? - mount --make-rshared / - mkdir -p /v/1 - mount --rbind / /v/1 + :: + + mount --make-rshared / + mkdir -p /v/1 + mount --rbind / /v/1 what should be the content of /v/1/v/1 be? C. What is the result of the following command sequence? 
- mount --bind /mnt /mnt - mount --make-shared /mnt - mkdir -p /mnt/1/2/3 /mnt/1/test - mount --bind /mnt/1 /tmp - mount --make-slave /mnt - mount --make-shared /mnt - mount --bind /mnt/1/2 /tmp1 - mount --make-slave /mnt + :: + + mount --bind /mnt /mnt + mount --make-shared /mnt + mkdir -p /mnt/1/2/3 /mnt/1/test + mount --bind /mnt/1 /tmp + mount --make-slave /mnt + mount --make-shared /mnt + mount --bind /mnt/1/2 /tmp1 + mount --make-slave /mnt At this point we have the first mount at /tmp and its root dentry is 1. Let's call this mount 'A' @@ -668,7 +697,8 @@ replicas continue to be exactly same. step 1: let's say the root tree has just two directories with - one vfsmount. + one vfsmount:: + root / \ tmp usr @@ -676,14 +706,17 @@ replicas continue to be exactly same. And we want to replicate the tree at multiple mountpoints under /root/tmp - step2: - mount --make-shared /root + step 2: + :: - mkdir -p /tmp/m1 - mount --rbind /root /tmp/m1 + mount --make-shared /root - the new tree now looks like this: + mkdir -p /tmp/m1 + + mount --rbind /root /tmp/m1 + + the new tree now looks like this:: root / \ @@ -697,11 +730,13 @@ replicas continue to be exactly same. it has two vfsmounts - step3: + step 3: + :: + mkdir -p /tmp/m2 mount --rbind /root /tmp/m2 - the new tree now looks like this: + the new tree now looks like this:: root / \ @@ -724,6 +759,7 @@ replicas continue to be exactly same. it has 6 vfsmounts step 4: + :: mkdir -p /tmp/m3 mount --rbind /root /tmp/m3 @@ -740,7 +776,8 @@ replicas continue to be exactly same. step 1: let's say the root tree has just two directories with - one vfsmount. + one vfsmount:: + root / \ tmp usr @@ -748,17 +785,20 @@ replicas continue to be exactly same. How do we set up the same tree at multiple locations under /root/tmp - step2: - mount --bind /root/tmp /root/tmp + step 2: + :: - mount --make-rshared /root - mount --make-unbindable /root/tmp - mkdir -p /tmp/m1 + mount --bind /root/tmp /root/tmp - mount --rbind /root /tmp/m1 + mount --make-rshared /root + mount --make-unbindable /root/tmp - the new tree now looks like this: + mkdir -p /tmp/m1 + + mount --rbind /root /tmp/m1 + + the new tree now looks like this:: root / \ @@ -768,11 +808,13 @@ replicas continue to be exactly same. / \ tmp usr - step3: + step 3: + :: + mkdir -p /tmp/m2 mount --rbind /root /tmp/m2 - the new tree now looks like this: + the new tree now looks like this:: root / \ @@ -782,12 +824,13 @@ replicas continue to be exactly same. / \ / \ tmp usr tmp usr - step4: + step 4: + :: mkdir -p /tmp/m3 mount --rbind /root /tmp/m3 - the new tree now looks like this: + the new tree now looks like this:: root / \ @@ -801,25 +844,31 @@ replicas continue to be exactly same. 8A) Datastructure - 4 new fields are introduced to struct vfsmount - ->mnt_share - ->mnt_slave_list - ->mnt_slave - ->mnt_master + 4 new fields are introduced to struct vfsmount: + + * ->mnt_share + * ->mnt_slave_list + * ->mnt_slave + * ->mnt_master - ->mnt_share links together all the mount to/from which this vfsmount + ->mnt_share + links together all the mount to/from which this vfsmount send/receives propagation events. - ->mnt_slave_list links all the mounts to which this vfsmount propagates + ->mnt_slave_list + links all the mounts to which this vfsmount propagates to. - ->mnt_slave links together all the slaves that its master vfsmount + ->mnt_slave + links together all the slaves that its master vfsmount propagates to. 
- ->mnt_master points to the master vfsmount from which this vfsmount + ->mnt_master + points to the master vfsmount from which this vfsmount receives propagation. - ->mnt_flags takes two more flags to indicate the propagation status of + ->mnt_flags + takes two more flags to indicate the propagation status of the vfsmount. MNT_SHARE indicates that the vfsmount is a shared vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be replicated. @@ -842,7 +891,7 @@ replicas continue to be exactly same. A example propagation tree looks as shown in the figure below. [ NOTE: Though it looks like a forest, if we consider all the shared - mounts as a conceptual entity called 'pnode', it becomes a tree] + mounts as a conceptual entity called 'pnode', it becomes a tree]:: A <--> B <--> C <---> D @@ -864,14 +913,19 @@ replicas continue to be exactly same. A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' E's ->mnt_share links with ->mnt_share of K - 'E', 'K', 'F', 'G' have their ->mnt_master point to struct - vfsmount of 'A' + + 'E', 'K', 'F', 'G' have their ->mnt_master point to struct vfsmount of 'A' + 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' + K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' + J and K's ->mnt_master points to struct vfsmount of C + and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' + 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. @@ -903,6 +957,7 @@ replicas continue to be exactly same. Prepare phase: for each mount in the source tree: + a) Create the necessary number of mount trees to be attached to each of the mounts that receive propagation from the destination mount. @@ -929,11 +984,12 @@ replicas continue to be exactly same. Abort phase delete all the newly created trees. - NOTE: all the propagation related functionality resides in the file - pnode.c + .. Note:: + all the propagation related functionality resides in the file pnode.c ------------------------------------------------------------------------ version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com) + version 0.2 (Incorporated comments from Al Viro) diff --git a/Documentation/filesystems/spufs/index.rst b/Documentation/filesystems/spufs/index.rst new file mode 100644 index 000000000000..5ed4a8494967 --- /dev/null +++ b/Documentation/filesystems/spufs/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +SPU Filesystem +============== + + +.. toctree:: + :maxdepth: 1 + + spufs + spu_create + spu_run diff --git a/Documentation/filesystems/spufs/spu_create.rst b/Documentation/filesystems/spufs/spu_create.rst new file mode 100644 index 000000000000..83108c099696 --- /dev/null +++ b/Documentation/filesystems/spufs/spu_create.rst @@ -0,0 +1,131 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +spu_create +========== + +Name +==== + spu_create - create a new spu context + + +Synopsis +======== + + :: + + #include <sys/types.h> + #include <sys/spu.h> + + int spu_create(const char *pathname, int flags, mode_t mode); + +Description +=========== + The spu_create system call is used on PowerPC machines that implement + the Cell Broadband Engine Architecture in order to access Synergistic + Processor Units (SPUs). It creates a new logical context for an SPU in + pathname and returns a handle to associated with it. 
pathname must + point to a non-existing directory in the mount point of the SPU file + system (spufs). When spu_create is successful, a directory gets cre- + ated on pathname and it is populated with files. + + The returned file handle can only be passed to spu_run(2) or closed, + other operations are not defined on it. When it is closed, all associ- + ated directory entries in spufs are removed. When the last file handle + pointing either inside of the context directory or to this file + descriptor is closed, the logical SPU context is destroyed. + + The parameter flags can be zero or any bitwise or'd combination of the + following constants: + + SPU_RAWIO + Allow mapping of some of the hardware registers of the SPU into + user space. This flag requires the CAP_SYS_RAWIO capability, see + capabilities(7). + + The mode parameter specifies the permissions used for creating the new + directory in spufs. mode is modified with the user's umask(2) value + and then used for both the directory and the files contained in it. The + file permissions mask out some more bits of mode because they typically + support only read or write access. See stat(2) for a full list of the + possible mode values. + + +Return Value +============ + spu_create returns a new file descriptor. It may return -1 to indicate + an error condition and set errno to one of the error codes listed + below. + + +Errors +====== + EACCES + The current user does not have write access on the spufs mount + point. + + EEXIST An SPU context already exists at the given path name. + + EFAULT pathname is not a valid string pointer in the current address + space. + + EINVAL pathname is not a directory in the spufs mount point. + + ELOOP Too many symlinks were found while resolving pathname. + + EMFILE The process has reached its maximum open file limit. + + ENAMETOOLONG + pathname was too long. + + ENFILE The system has reached the global open file limit. + + ENOENT Part of pathname could not be resolved. + + ENOMEM The kernel could not allocate all resources required. + + ENOSPC There are not enough SPU resources available to create a new + context or the user specific limit for the number of SPU con- + texts has been reached. + + ENOSYS the functionality is not provided by the current system, because + either the hardware does not provide SPUs or the spufs module is + not loaded. + + ENOTDIR + A part of pathname is not a directory. + + + +Notes +===== + spu_create is meant to be used from libraries that implement a more + abstract interface to SPUs, not to be used from regular applications. + See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- + ommended libraries. + + +Files +===== + pathname must point to a location beneath the mount point of spufs. By + convention, it gets mounted in /spu. + + +Conforming to +============= + This call is Linux specific and only implemented by the ppc64 architec- + ture. Programs using this system call are not portable. + + +Bugs +==== + The code does not yet fully implement all features lined out here. + + +Author +====== + Arnd Bergmann <arndb@de.ibm.com> + +See Also +======== + capabilities(7), close(2), spu_run(2), spufs(7) diff --git a/Documentation/filesystems/spufs/spu_run.rst b/Documentation/filesystems/spufs/spu_run.rst new file mode 100644 index 000000000000..7fdb1c31cb91 --- /dev/null +++ b/Documentation/filesystems/spufs/spu_run.rst @@ -0,0 +1,138 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======= +spu_run +======= + + +Name +==== + spu_run - execute an spu context + + +Synopsis +======== + + :: + + #include <sys/spu.h> + + int spu_run(int fd, unsigned int *npc, unsigned int *event); + +Description +=========== + The spu_run system call is used on PowerPC machines that implement the + Cell Broadband Engine Architecture in order to access Synergistic Pro- + cessor Units (SPUs). It uses the fd that was returned from spu_cre- + ate(2) to address a specific SPU context. When the context gets sched- + uled to a physical SPU, it starts execution at the instruction pointer + passed in npc. + + Execution of SPU code happens synchronously, meaning that spu_run does + not return while the SPU is still running. If there is a need to exe- + cute SPU code in parallel with other code on either the main CPU or + other SPUs, you need to create a new thread of execution first, e.g. + using the pthread_create(3) call. + + When spu_run returns, the current value of the SPU instruction pointer + is written back to npc, so you can call spu_run again without updating + the pointers. + + event can be a NULL pointer or point to an extended status code that + gets filled when spu_run returns. It can be one of the following con- + stants: + + SPE_EVENT_DMA_ALIGNMENT + A DMA alignment error + + SPE_EVENT_SPE_DATA_SEGMENT + A DMA segmentation error + + SPE_EVENT_SPE_DATA_STORAGE + A DMA storage error + + If NULL is passed as the event argument, these errors will result in a + signal delivered to the calling process. + +Return Value +============ + spu_run returns the value of the spu_status register or -1 to indicate + an error and set errno to one of the error codes listed below. The + spu_status register value contains a bit mask of status codes and + optionally a 14 bit code returned from the stop-and-signal instruction + on the SPU. The bit masks for the status codes are: + + 0x02 + SPU was stopped by stop-and-signal. + + 0x04 + SPU was stopped by halt. + + 0x08 + SPU is waiting for a channel. + + 0x10 + SPU is in single-step mode. + + 0x20 + SPU has tried to execute an invalid instruction. + + 0x40 + SPU has tried to access an invalid channel. + + 0x3fff0000 + The bits masked with this value contain the code returned from + stop-and-signal. + + There are always one or more of the lower eight bits set or an error + code is returned from spu_run. + +Errors +====== + EAGAIN or EWOULDBLOCK + fd is in non-blocking mode and spu_run would block. + + EBADF fd is not a valid file descriptor. + + EFAULT npc is not a valid pointer or status is neither NULL nor a valid + pointer. + + EINTR A signal occurred while spu_run was in progress. The npc value + has been updated to the new program counter value if necessary. + + EINVAL fd is not a file descriptor returned from spu_create(2). + + ENOMEM Insufficient memory was available to handle a page fault result- + ing from an MFC direct memory access. + + ENOSYS the functionality is not provided by the current system, because + either the hardware does not provide SPUs or the spufs module is + not loaded. + + +Notes +===== + spu_run is meant to be used from libraries that implement a more + abstract interface to SPUs, not to be used from regular applications. + See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- + ommended libraries. + + +Conforming to +============= + This call is Linux specific and only implemented by the ppc64 architec- + ture. Programs using this system call are not portable. 
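By way of illustration, a minimal calling sequence might look like the sketch below. It is hypothetical, omits loading the SPU program into local store, and real applications should prefer the recommended libraries over raw system calls::

    #include <sys/types.h>
    #include <sys/spu.h>
    #include <unistd.h>

    int run_spu(const char *ctx_path)
    {
            unsigned int npc = 0;       /* entry point in local store */
            unsigned int event = 0;
            int ctx, status;

            ctx = spu_create(ctx_path, 0, 0755);
            if (ctx == -1)
                    return -1;

            /* Loading the SPU program is omitted here. */

            status = spu_run(ctx, &npc, &event);  /* blocks until stop */

            close(ctx);

            if (status != -1 && (status & 0x02))
                    /* stop-and-signal: extract the 14-bit stop code */
                    return (status & 0x3fff0000) >> 16;
            return status;
    }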
+ + +Bugs +==== + The code does not yet fully implement all features lined out here. + + +Author +====== + Arnd Bergmann <arndb@de.ibm.com> + +See Also +======== + capabilities(7), close(2), spu_create(2), spufs(7) diff --git a/Documentation/filesystems/spufs.txt b/Documentation/filesystems/spufs/spufs.rst index eb9e3aa63026..8a42859bb100 100644 --- a/Documentation/filesystems/spufs.txt +++ b/Documentation/filesystems/spufs/spufs.rst @@ -1,12 +1,18 @@ -SPUFS(2) Linux Programmer's Manual SPUFS(2) +.. SPDX-License-Identifier: GPL-2.0 +===== +spufs +===== +Name +==== -NAME spufs - the SPU file system -DESCRIPTION +Description +=========== + The SPU file system is used on PowerPC machines that implement the Cell Broadband Engine Architecture in order to access Synergistic Processor Units (SPUs). @@ -21,7 +27,9 @@ DESCRIPTION ally add or remove files. -MOUNT OPTIONS +Mount Options +============= + uid=<uid> set the user owning the mount point, the default is 0 (root). @@ -29,7 +37,9 @@ MOUNT OPTIONS set the group owning the mount point, the default is 0 (root). -FILES +Files +===== + The files in spufs mostly follow the standard behavior for regular sys- tem calls like read(2) or write(2), but often support only a subset of the operations supported on regular file systems. This list details the @@ -125,14 +135,12 @@ FILES space is available for writing. - /mbox_stat - /ibox_stat - /wbox_stat + /mbox_stat, /ibox_stat, /wbox_stat Read-only files that contain the length of the current queue, i.e. how many words can be read from mbox or ibox or how many words can be written to wbox without blocking. The files can be read only in 4-byte units and return a big-endian binary integer number. The possible - operations on an open *box_stat file are: + operations on an open ``*box_stat`` file are: read(2) If a count smaller than four is requested, read returns -1 and @@ -143,12 +151,7 @@ FILES in EAGAIN. - /npc - /decr - /decr_status - /spu_tag_mask - /event_mask - /srr0 + /npc, /decr, /decr_status, /spu_tag_mask, /event_mask, /srr0 Internal registers of the SPU. The representation is an ASCII string with the numeric value of the next instruction to be executed. These can be used in read/write mode for debugging, but normal operation of @@ -157,17 +160,14 @@ FILES The contents of these files are: + =================== =================================== npc Next Program Counter - decr SPU Decrementer - decr_status Decrementer Status - spu_tag_mask MFC tag mask for SPU DMA - event_mask Event mask for SPU interrupts - srr0 Interrupt Return address register + =================== =================================== The possible operations on an open npc, decr, decr_status, @@ -206,8 +206,7 @@ FILES from the data buffer, updating the value of the fpcr register. - /signal1 - /signal2 + /signal1, /signal2 The two signal notification channels of an SPU. These are read-write files that operate on a 32 bit word. Writing to one of these files triggers an interrupt on the SPU. The value written to the signal @@ -233,8 +232,7 @@ FILES file. - /signal1_type - /signal2_type + /signal1_type, /signal2_type These two files change the behavior of the signal1 and signal2 notifi- cation files. The contain a numerical ASCII string which is read as either "1" or "0". In mode 0 (overwrite), the hardware replaces the @@ -259,263 +257,17 @@ FILES the previous setting. 
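As a small illustration of the signal notification files, a user-space sketch might look like this (the context path is invented)::

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    /* Hypothetical: raise a signal notification on an SPU context
     * mounted under /spu. */
    int poke_signal1(uint32_t value)
    {
            int fd = open("/spu/example/signal1", O_WRONLY);
            ssize_t n;

            if (fd == -1)
                    return -1;
            /* Writing the 32-bit word triggers an interrupt on the SPU. */
            n = write(fd, &value, sizeof(value));
            close(fd);
            return n == sizeof(value) ? 0 : -1;
    }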
-EXAMPLES +Examples +======== /etc/fstab entry none /spu spufs gid=spu 0 0 -AUTHORS +Authors +======= Arnd Bergmann <arndb@de.ibm.com>, Mark Nutter <mnutter@us.ibm.com>, Ulrich Weigand <Ulrich.Weigand@de.ibm.com> -SEE ALSO +See Also +======== capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7) - - - -Linux 2005-09-28 SPUFS(2) - ------------------------------------------------------------------------------- - -SPU_RUN(2) Linux Programmer's Manual SPU_RUN(2) - - - -NAME - spu_run - execute an spu context - - -SYNOPSIS - #include <sys/spu.h> - - int spu_run(int fd, unsigned int *npc, unsigned int *event); - -DESCRIPTION - The spu_run system call is used on PowerPC machines that implement the - Cell Broadband Engine Architecture in order to access Synergistic Pro- - cessor Units (SPUs). It uses the fd that was returned from spu_cre- - ate(2) to address a specific SPU context. When the context gets sched- - uled to a physical SPU, it starts execution at the instruction pointer - passed in npc. - - Execution of SPU code happens synchronously, meaning that spu_run does - not return while the SPU is still running. If there is a need to exe- - cute SPU code in parallel with other code on either the main CPU or - other SPUs, you need to create a new thread of execution first, e.g. - using the pthread_create(3) call. - - When spu_run returns, the current value of the SPU instruction pointer - is written back to npc, so you can call spu_run again without updating - the pointers. - - event can be a NULL pointer or point to an extended status code that - gets filled when spu_run returns. It can be one of the following con- - stants: - - SPE_EVENT_DMA_ALIGNMENT - A DMA alignment error - - SPE_EVENT_SPE_DATA_SEGMENT - A DMA segmentation error - - SPE_EVENT_SPE_DATA_STORAGE - A DMA storage error - - If NULL is passed as the event argument, these errors will result in a - signal delivered to the calling process. - -RETURN VALUE - spu_run returns the value of the spu_status register or -1 to indicate - an error and set errno to one of the error codes listed below. The - spu_status register value contains a bit mask of status codes and - optionally a 14 bit code returned from the stop-and-signal instruction - on the SPU. The bit masks for the status codes are: - - 0x02 SPU was stopped by stop-and-signal. - - 0x04 SPU was stopped by halt. - - 0x08 SPU is waiting for a channel. - - 0x10 SPU is in single-step mode. - - 0x20 SPU has tried to execute an invalid instruction. - - 0x40 SPU has tried to access an invalid channel. - - 0x3fff0000 - The bits masked with this value contain the code returned from - stop-and-signal. - - There are always one or more of the lower eight bits set or an error - code is returned from spu_run. - -ERRORS - EAGAIN or EWOULDBLOCK - fd is in non-blocking mode and spu_run would block. - - EBADF fd is not a valid file descriptor. - - EFAULT npc is not a valid pointer or status is neither NULL nor a valid - pointer. - - EINTR A signal occurred while spu_run was in progress. The npc value - has been updated to the new program counter value if necessary. - - EINVAL fd is not a file descriptor returned from spu_create(2). - - ENOMEM Insufficient memory was available to handle a page fault result- - ing from an MFC direct memory access. - - ENOSYS the functionality is not provided by the current system, because - either the hardware does not provide SPUs or the spufs module is - not loaded. 
- - -NOTES - spu_run is meant to be used from libraries that implement a more - abstract interface to SPUs, not to be used from regular applications. - See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- - ommended libraries. - - -CONFORMING TO - This call is Linux specific and only implemented by the ppc64 architec- - ture. Programs using this system call are not portable. - - -BUGS - The code does not yet fully implement all features lined out here. - - -AUTHOR - Arnd Bergmann <arndb@de.ibm.com> - -SEE ALSO - capabilities(7), close(2), spu_create(2), spufs(7) - - - -Linux 2005-09-28 SPU_RUN(2) - ------------------------------------------------------------------------------- - -SPU_CREATE(2) Linux Programmer's Manual SPU_CREATE(2) - - - -NAME - spu_create - create a new spu context - - -SYNOPSIS - #include <sys/types.h> - #include <sys/spu.h> - - int spu_create(const char *pathname, int flags, mode_t mode); - -DESCRIPTION - The spu_create system call is used on PowerPC machines that implement - the Cell Broadband Engine Architecture in order to access Synergistic - Processor Units (SPUs). It creates a new logical context for an SPU in - pathname and returns a handle to associated with it. pathname must - point to a non-existing directory in the mount point of the SPU file - system (spufs). When spu_create is successful, a directory gets cre- - ated on pathname and it is populated with files. - - The returned file handle can only be passed to spu_run(2) or closed, - other operations are not defined on it. When it is closed, all associ- - ated directory entries in spufs are removed. When the last file handle - pointing either inside of the context directory or to this file - descriptor is closed, the logical SPU context is destroyed. - - The parameter flags can be zero or any bitwise or'd combination of the - following constants: - - SPU_RAWIO - Allow mapping of some of the hardware registers of the SPU into - user space. This flag requires the CAP_SYS_RAWIO capability, see - capabilities(7). - - The mode parameter specifies the permissions used for creating the new - directory in spufs. mode is modified with the user's umask(2) value - and then used for both the directory and the files contained in it. The - file permissions mask out some more bits of mode because they typically - support only read or write access. See stat(2) for a full list of the - possible mode values. - - -RETURN VALUE - spu_create returns a new file descriptor. It may return -1 to indicate - an error condition and set errno to one of the error codes listed - below. - - -ERRORS - EACCES - The current user does not have write access on the spufs mount - point. - - EEXIST An SPU context already exists at the given path name. - - EFAULT pathname is not a valid string pointer in the current address - space. - - EINVAL pathname is not a directory in the spufs mount point. - - ELOOP Too many symlinks were found while resolving pathname. - - EMFILE The process has reached its maximum open file limit. - - ENAMETOOLONG - pathname was too long. - - ENFILE The system has reached the global open file limit. - - ENOENT Part of pathname could not be resolved. - - ENOMEM The kernel could not allocate all resources required. - - ENOSPC There are not enough SPU resources available to create a new - context or the user specific limit for the number of SPU con- - texts has been reached. 
- - ENOSYS the functionality is not provided by the current system, because - either the hardware does not provide SPUs or the spufs module is - not loaded. - - ENOTDIR - A part of pathname is not a directory. - - - -NOTES - spu_create is meant to be used from libraries that implement a more - abstract interface to SPUs, not to be used from regular applications. - See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- - ommended libraries. - - -FILES - pathname must point to a location beneath the mount point of spufs. By - convention, it gets mounted in /spu. - - -CONFORMING TO - This call is Linux specific and only implemented by the ppc64 architec- - ture. Programs using this system call are not portable. - - -BUGS - The code does not yet fully implement all features lined out here. - - -AUTHOR - Arnd Bergmann <arndb@de.ibm.com> - -SEE ALSO - capabilities(7), close(2), spu_run(2), spufs(7) - - - -Linux 2005-09-28 SPU_CREATE(2) diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.rst index 06f1d64c6f70..a265f3e2cc80 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.rst @@ -1,8 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================ Accessing PCI device resources through sysfs --------------------------------------------- +============================================ sysfs, usually mounted at /sys, provides access to PCI resources on platforms -that support it. For example, a given bus might look like this: +that support it. For example, a given bus might look like this:: /sys/devices/pci0000:17 |-- 0000:17:00.0 @@ -30,8 +33,9 @@ This bus contains a single function device in slot 0. The domain and bus numbers are reproduced for convenience. Under the device directory are several files, each with their own function. + =================== ===================================================== file function - ---- -------- + =================== ===================================================== class PCI class (ascii, ro) config PCI config space (binary, rw) device PCI device (ascii, ro) @@ -40,13 +44,16 @@ files, each with their own function. local_cpus nearby CPU mask (cpumask, ro) remove remove device from kernel's list (ascii, wo) resource PCI resource host addresses (ascii, ro) - resource0..N PCI resource N, if present (binary, mmap, rw[1]) + resource0..N PCI resource N, if present (binary, mmap, rw\ [1]_) resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) revision PCI revision (ascii, ro) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) subsystem_vendor PCI subsystem vendor (ascii, ro) vendor PCI vendor (ascii, ro) + =================== ===================================================== + +:: ro - read only file rw - file is readable and writable @@ -56,7 +63,7 @@ files, each with their own function. binary - file contains binary data cpumask - file contains a cpumask type -[1] rw for RESOURCE_IO (I/O port) regions only +.. [1] rw for RESOURCE_IO (I/O port) regions only The read only files are informational, writes to them will be ignored, with the exception of the 'rom' file. Writable files can be used to perform @@ -67,11 +74,11 @@ don't support mmapping of certain resources, so be sure to check the return value from any attempted mmap. The most notable of these are I/O port resources, which also provide read/write access. 
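+
+For example, mapping BAR 0 of the device shown above and checking the
+result (a sketch only; the mapping length should really be taken from the
+'resource' file, and the path is illustrative)::
+
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <sys/mman.h>
+  #include <unistd.h>
+
+  void *map_bar0(size_t len)
+  {
+          const char *path =
+                  "/sys/devices/pci0000:17/0000:17:00.0/resource0";
+          int fd = open(path, O_RDWR);
+          void *bar;
+
+          if (fd < 0)
+                  return NULL;
+          bar = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+          close(fd);
+          if (bar == MAP_FAILED) {
+                  /* The platform may not support mmap on this resource. */
+                  perror("mmap");
+                  return NULL;
+          }
+          return bar;
+  }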
-The 'enable' file provides a counter that indicates how many times the device +The 'enable' file provides a counter that indicates how many times the device has been enabled. If the 'enable' file currently returns '4', and a '1' is echoed into it, it will then return '5'. Echoing a '0' into it will decrease the count. Even when it returns to 0, though, some of the initialisation -may not be reversed. +may not be reversed. The 'rom' file is special in that it provides read-only access to the device's ROM file, if available. It's disabled by default, however, so applications @@ -93,7 +100,7 @@ Accessing legacy resources through sysfs Legacy I/O port and ISA memory resources are also provided in sysfs if the underlying platform supports them. They're located in the PCI class hierarchy, -e.g. +e.g.:: /sys/class/pci_bus/0000:17/ |-- bridge -> ../../../devices/pci0000:17 diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.rst index c7c8e6438958..8888a05c398e 100644 --- a/Documentation/filesystems/sysfs-tagging.txt +++ b/Documentation/filesystems/sysfs-tagging.rst @@ -1,5 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= Sysfs tagging -------------- +============= (Taken almost verbatim from Eric Biederman's netns tagging patch commit msg) @@ -18,25 +21,28 @@ in the directories and applications only see a limited set of the network devices. Each sysfs directory entry may be tagged with a namespace via the -void *ns member of its kernfs_node. If a directory entry is tagged, -then kernfs_node->flags will have a flag between KOBJ_NS_TYPE_NONE +``void *ns member`` of its ``kernfs_node``. If a directory entry is tagged, +then ``kernfs_node->flags`` will have a flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and ns will point to the namespace to which it belongs. -Each sysfs superblock's kernfs_super_info contains an array void -*ns[KOBJ_NS_TYPES]. When a task in a tagging namespace +Each sysfs superblock's kernfs_super_info contains an array +``void *ns[KOBJ_NS_TYPES]``. When a task in a tagging namespace kobj_nstype first mounts sysfs, a new superblock is created. It will be differentiated from other sysfs mounts by having its -s_fs_info->ns[kobj_nstype] set to the new namespace. Note that +``s_fs_info->ns[kobj_nstype]`` set to the new namespace. Note that through bind mounting and mounts propagation, a task can easily view the contents of other namespaces' sysfs mounts. Therefore, when a namespace exits, it will call kobj_ns_exit() to invalidate any kernfs_node->ns pointers pointing to it. Users of this interface: -- define a type in the kobj_ns_type enumeration. -- call kobj_ns_type_register() with its kobj_ns_type_operations which has + +- define a type in the ``kobj_ns_type`` enumeration. +- call kobj_ns_type_register() with its ``kobj_ns_type_operations`` which has + - current_ns() which returns current's namespace - netlink_ns() which returns a socket's namespace - initial_ns() which returns the initial namesapce + - call kobj_ns_exit() when an individual tag is no longer valid diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst index 290891c3fecb..ab0f7795792b 100644 --- a/Documentation/filesystems/sysfs.rst +++ b/Documentation/filesystems/sysfs.rst @@ -20,7 +20,7 @@ a means to export kernel data structures, their attributes, and the linkages between them to userspace. sysfs is tied inherently to the kobject infrastructure. 
Please read -Documentation/kobject.txt for more information concerning the kobject +Documentation/core-api/kobject.rst for more information concerning the kobject interface. diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.rst index 9a6dd289b17b..464405d2801e 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.rst @@ -1,8 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== XFS Delayed Logging Design --------------------------- +========================== Introduction to Re-logging in XFS ---------------------------------- +================================= XFS logging is a combination of logical and physical logging. Some objects, such as inodes and dquots, are logged in logical format where the details @@ -25,7 +28,7 @@ changes in the new transaction that is written to the log. That is, if we have a sequence of changes A through to F, and the object was written to disk after change D, we would see in the log the following series of transactions, their contents and the log sequence number (LSN) of the -transaction: +transaction:: Transaction Contents LSN A A X @@ -85,7 +88,7 @@ IO permanently. Hence the XFS journalling subsystem can be considered to be IO bound. Delayed Logging: Concepts -------------------------- +========================= The key thing to note about the asynchronous logging combined with the relogging technique XFS uses is that we can be relogging changed objects @@ -154,9 +157,10 @@ The fundamental requirements for delayed logging in XFS are simple: 6. No performance regressions for synchronous transaction workloads. Delayed Logging: Design ------------------------ +======================= Storing Changes +--------------- The problem with accumulating changes at a logical level (i.e. just using the existing log item dirty region tracking) is that when it comes to writing the @@ -194,30 +198,30 @@ asynchronous transactions to the log. The differences between the existing formatting method and the delayed logging formatting can be seen in the diagram below. -Current format log vector: +Current format log vector:: -Object +---------------------------------------------+ -Vector 1 +----+ -Vector 2 +----+ -Vector 3 +----------+ + Object +---------------------------------------------+ + Vector 1 +----+ + Vector 2 +----+ + Vector 3 +----------+ -After formatting: +After formatting:: -Log Buffer +-V1-+-V2-+----V3----+ + Log Buffer +-V1-+-V2-+----V3----+ -Delayed logging vector: +Delayed logging vector:: -Object +---------------------------------------------+ -Vector 1 +----+ -Vector 2 +----+ -Vector 3 +----------+ + Object +---------------------------------------------+ + Vector 1 +----+ + Vector 2 +----+ + Vector 3 +----------+ -After formatting: +After formatting:: -Memory Buffer +-V1-+-V2-+----V3----+ -Vector 1 +----+ -Vector 2 +----+ -Vector 3 +----------+ + Memory Buffer +-V1-+-V2-+----V3----+ + Vector 1 +----+ + Vector 2 +----+ + Vector 3 +----------+ The memory buffer and associated vector need to be passed as a single object, but still need to be associated with the parent object so if the object is @@ -242,6 +246,7 @@ relogged in memory. Tracking Changes +---------------- Now that we can record transactional changes in memory in a form that allows them to be used without limitations, we need to be able to track and accumulate @@ -278,6 +283,7 @@ done for convenience/sanity of the developers. 
Delayed Logging: Checkpoints +---------------------------- When we have a log synchronisation event, commonly known as a "log force", all the items in the CIL must be written into the log via the log buffers. @@ -341,7 +347,7 @@ Hence log vectors need to be able to be chained together to allow them to be detached from the log items. That is, when the CIL is flushed the memory buffer and log vector attached to each log item needs to be attached to the checkpoint context so that the log item can be released. In diagrammatic form, -the CIL would look like this before the flush: +the CIL would look like this before the flush:: CIL Head | @@ -362,7 +368,7 @@ the CIL would look like this before the flush: -> vector array And after the flush the CIL head is empty, and the checkpoint context log -vector list would look like: +vector list would look like:: Checkpoint Context | @@ -411,6 +417,7 @@ compare" situation that can be done after a working and reviewed implementation is in the dev tree.... Delayed Logging: Checkpoint Sequencing +-------------------------------------- One of the key aspects of the XFS transaction subsystem is that it tags committed transactions with the log sequence number of the transaction commit. @@ -474,6 +481,7 @@ force the log at the LSN of that transaction) and so the higher level code behaves the same regardless of whether delayed logging is being used or not. Delayed Logging: Checkpoint Log Space Accounting +------------------------------------------------ The big issue for a checkpoint transaction is the log space reservation for the transaction. We don't know how big a checkpoint transaction is going to be @@ -491,7 +499,7 @@ the size of the transaction and the number of regions being logged (the number of log vectors in the transaction). An example of the differences would be logging directory changes versus logging -inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then +inode changes. If you modify lots of inode cores (e.g. ``chmod -R g+w *``), then there are lots of transactions that only contain an inode core and an inode log format structure. That is, two vectors totaling roughly 150 bytes. If we modify 10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each @@ -565,6 +573,7 @@ which is once every 30s. Delayed Logging: Log Item Pinning +--------------------------------- Currently log items are pinned during transaction commit while the items are still locked. This happens just after the items are formatted, though it could @@ -605,6 +614,7 @@ object, we have a race with CIL being flushed between the check and the pin lock to guarantee that we pin the items correctly. Delayed Logging: Concurrent Scalability +--------------------------------------- A fundamental requirement for the CIL is that accesses through transaction commits must scale to many concurrent commits. The current transaction commit @@ -683,8 +693,9 @@ woken by the wrong event. Lifecycle Changes +----------------- -The existing log item life cycle is as follows: +The existing log item life cycle is as follows:: 1. Transaction allocate 2. Transaction reserve @@ -729,7 +740,7 @@ at the same time. If the log item is in the AIL or between steps 6 and 7 and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 are entered and completed is the object considered clean. -With delayed logging, there are new steps inserted into the life cycle: +With delayed logging, there are new steps inserted into the life cycle:: 1. 
Transaction allocate 2. Transaction reserve diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.rst index 8db0121d0980..51cdafe01ab1 100644 --- a/Documentation/filesystems/xfs-self-describing-metadata.txt +++ b/Documentation/filesystems/xfs-self-describing-metadata.rst @@ -1,8 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ XFS Self Describing Metadata ----------------------------- +============================ Introduction ------------- +============ The largest scalability problem facing XFS is not one of algorithmic scalability, but of verification of the filesystem structure. Scalabilty of the @@ -34,7 +37,7 @@ required for basic forensic analysis of the filesystem structure. Self Describing Metadata ------------------------- +======================== One of the problems with the current metadata format is that apart from the magic number in the metadata block, we have no other way of identifying what it @@ -142,7 +145,7 @@ modification occurred between the corruption being written and when it was detected. Runtime Validation ------------------- +================== Validation of self-describing metadata takes place at runtime in two places: @@ -183,18 +186,18 @@ error occurs during this process, the buffer is again marked with a EFSCORRUPTED error for the higher layers to catch. Structures ----------- +========== -A typical on-disk structure needs to contain the following information: +A typical on-disk structure needs to contain the following information:: -struct xfs_ondisk_hdr { - __be32 magic; /* magic number */ - __be32 crc; /* CRC, not logged */ - uuid_t uuid; /* filesystem identifier */ - __be64 owner; /* parent object */ - __be64 blkno; /* location on disk */ - __be64 lsn; /* last modification in log, not logged */ -}; + struct xfs_ondisk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC, not logged */ + uuid_t uuid; /* filesystem identifier */ + __be64 owner; /* parent object */ + __be64 blkno; /* location on disk */ + __be64 lsn; /* last modification in log, not logged */ + }; Depending on the metadata, this information may be part of a header structure separate to the metadata contents, or may be distributed through an existing @@ -214,24 +217,24 @@ level of information is generally provided. For example: well. hence the additional metadata headers change the overall format of the metadata. 
-A typical buffer read verifier is structured as follows: +A typical buffer read verifier is structured as follows:: -#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) + #define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) -static void -xfs_foo_read_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; + static void + xfs_foo_read_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_FOO_CRC_OFF)) || - !xfs_foo_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } -} + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_FOO_CRC_OFF)) || + !xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + } The code ensures that the CRC is only checked if the filesystem has CRCs enabled by checking the superblock of the feature bit, and then if the CRC verifies OK @@ -239,83 +242,83 @@ by checking the superblock of the feature bit, and then if the CRC verifies OK The verifier function will take a couple of different forms, depending on whether the magic number can be used to determine the format of the block. In -the case it can't, the code is structured as follows: +the case it can't, the code is structured as follows:: -static bool -xfs_foo_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_ondisk_hdr *hdr = bp->b_addr; + static bool + xfs_foo_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; - if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) - return false; + if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; - if (!xfs_sb_version_hascrc(&mp->m_sb)) { - if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) - return false; - if (bp->b_bn != be64_to_cpu(hdr->blkno)) - return false; - if (hdr->owner == 0) - return false; - } + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } - /* object specific verification checks here */ + /* object specific verification checks here */ - return true; -} + return true; + } If there are different magic numbers for the different formats, the verifier -will look like: - -static bool -xfs_foo_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_ondisk_hdr *hdr = bp->b_addr; - - if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) { - if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) - return false; - if (bp->b_bn != be64_to_cpu(hdr->blkno)) - return false; - if (hdr->owner == 0) - return false; - } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) - return false; - - /* object specific verification checks here */ - - return true; -} +will look like:: + + static bool + xfs_foo_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + /* object specific verification checks here */ + + return true; + } 
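+
+The read and write verifiers are typically wired up together through the
+buffer ops structure attached to the buffer. A minimal sketch, assuming
+the xfs_foo read verifier above and the write verifier that follows (the
+exact layout of struct xfs_buf_ops varies between kernel versions)::
+
+	static const struct xfs_buf_ops xfs_foo_buf_ops = {
+		.name		= "xfs_foo",
+		.verify_read	= xfs_foo_read_verify,
+		.verify_write	= xfs_foo_write_verify,
+	};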
Write verifiers are very similar to the read verifiers, they just do things in -the opposite order to the read verifiers. A typical write verifier: +the opposite order to the read verifiers. A typical write verifier:: -static void -xfs_foo_write_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + static void + xfs_foo_write_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; - if (!xfs_foo_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } + if (!xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; - if (bip) { - struct xfs_ondisk_hdr *hdr = bp->b_addr; - hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn); - } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF); -} + if (bip) { + struct xfs_ondisk_hdr *hdr = bp->b_addr; + hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF); + } This will verify the internal structure of the metadata before we go any further, detecting corruptions that have occurred as the metadata has been @@ -324,7 +327,7 @@ update the LSN field (when it was last modified) and calculate the CRC on the metadata. Once this is done, we can issue the IO. Inodes and Dquots ------------------ +================= Inodes and dquots are special snowflakes. They have per-object CRC and self-identifiers, but they are packed so that there are multiple objects per @@ -347,4 +350,3 @@ XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of the unlinked list modifications check or update CRCs, neither during unlink nor log recovery. So, it's gone unnoticed until now. This won't matter immediately - repair will probably complain about it - but it needs to be fixed. - diff --git a/Documentation/i2c/i2c.svg b/Documentation/i2c/i2c_bus.svg index 5979405ad1c3..3170de976373 100644 --- a/Documentation/i2c/i2c.svg +++ b/Documentation/i2c/i2c_bus.svg @@ -9,7 +9,7 @@ xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - sodipodi:docname="i2c.svg" + sodipodi:docname="i2c_bus.svg" inkscape:version="0.92.3 (2405546, 2018-03-11)" version="1.1" id="svg2" diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index ce7230025b33..136c4e333be7 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -34,7 +34,7 @@ Terminology Using the terminology from the official documentation, the I2C bus connects one or more *master* chips and one or more *slave* chips. -.. kernel-figure:: i2c.svg +.. kernel-figure:: i2c_bus.svg :alt: Simple I2C bus with one master and 3 slaves Simple I2C bus diff --git a/Documentation/ia64/irq-redir.rst b/Documentation/ia64/irq-redir.rst index 39bf94484a15..6bbbbe4f73ef 100644 --- a/Documentation/ia64/irq-redir.rst +++ b/Documentation/ia64/irq-redir.rst @@ -7,7 +7,7 @@ IRQ affinity on IA64 platforms By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be controlled. The behavior on IA64 platforms is slightly different from -that described in Documentation/IRQ-affinity.txt for i386 systems. 
+that described in Documentation/core-api/irq/irq-affinity.rst for i386 systems.
 
 Because of the usage of SAPIC mode and physical destination mode the
 IRQ target is one particular CPU and cannot be a mask of several
diff --git a/Documentation/iio/iio_configfs.rst b/Documentation/iio/iio_configfs.rst
index ecbfdb3afef7..6e38cbbd2981 100644
--- a/Documentation/iio/iio_configfs.rst
+++ b/Documentation/iio/iio_configfs.rst
@@ -9,7 +9,7 @@ Configfs is a filesystem-based manager of kernel objects. IIO uses some
 objects that could be easily configured using configfs (e.g.: devices,
 triggers).
 
-See Documentation/filesystems/configfs/configfs.txt for more information
+See Documentation/filesystems/configfs.rst for more information
 about how configfs works.
 
 2. Usage
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/locking/futex-requeue-pi.rst
index 14ab5787b9a7..14ab5787b9a7 100644
--- a/Documentation/futex-requeue-pi.txt
+++ b/Documentation/locking/futex-requeue-pi.rst
diff --git a/Documentation/hwspinlock.txt b/Documentation/locking/hwspinlock.rst
index 6f03713b7003..6f03713b7003 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/locking/hwspinlock.rst
diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
index 5d6800a723dc..d785878cad65 100644
--- a/Documentation/locking/index.rst
+++ b/Documentation/locking/index.rst
@@ -16,6 +16,13 @@ locking
     rt-mutex
     spinlocks
     ww-mutex-design
+    preempt-locking
+    pi-futex
+    futex-requeue-pi
+    hwspinlock
+    percpu-rw-semaphore
+    robust-futexes
+    robust-futex-ABI
 
 .. only:: subproject and html
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
index 5bcb99ba7bd9..8012a74555e7 100644
--- a/Documentation/locking/locktorture.rst
+++ b/Documentation/locking/locktorture.rst
@@ -110,7 +110,7 @@ stutter
 	      same period of time. Defaults to "stutter=5", so as
 	      to run and pause for (roughly) five-second intervals.
 	      Specifying "stutter=0" causes the test to run continuously
-	      without pausing, which is the old default behavior.
+	      without pausing.
 
 shuffle_interval
 	      The number of seconds to keep the test threads affinitied
diff --git a/Documentation/locking/locktypes.rst b/Documentation/locking/locktypes.rst
index 09f45ce38d26..1b577a8bf982 100644
--- a/Documentation/locking/locktypes.rst
+++ b/Documentation/locking/locktypes.rst
@@ -13,6 +13,7 @@ The kernel provides a variety of locking primitives which can be divided
 into two categories:
 
  - Sleeping locks
+ - CPU local locks
  - Spinning locks
 
 This document conceptually describes these lock types and provides rules
@@ -44,9 +45,23 @@ Sleeping lock types:
 
 On PREEMPT_RT kernels, these lock types are converted to sleeping locks:
 
+ - local_lock
  - spinlock_t
  - rwlock_t
 
+
+CPU local locks
+---------------
+
+ - local_lock
+
+On non-PREEMPT_RT kernels, local_lock functions are wrappers around
+preemption and interrupt disabling primitives. Contrary to other locking
+mechanisms, disabling preemption or interrupts is a purely CPU-local
+concurrency control mechanism and is not suited for inter-CPU concurrency
+control.
+
+
 Spinning locks
 --------------
@@ -67,6 +82,7 @@ can have suffixes which apply further protections:
  _irqsave/restore()   Save and disable / restore interrupt disabled state
  ===================  ====================================================
 
+
 Owner semantics
 ===============
@@ -139,6 +155,56 @@ implementation, thus changing the fairness:
 
 writer from starving readers.
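+
+As a quick illustration of the CPU local lock category, and ahead of the
+detailed local_lock section below, a hedged usage sketch (the struct and
+variable names are made up)::
+
+  #include <linux/local_lock.h>
+  #include <linux/percpu.h>
+
+  struct foo_stats {
+	local_lock_t lock;
+	unsigned long count;
+  };
+
+  static DEFINE_PER_CPU(struct foo_stats, foo_stats) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+  };
+
+  static void foo_inc(void)
+  {
+	/* preempt_disable() on a non-PREEMPT_RT kernel,
+	 * a per-CPU spinlock_t on a PREEMPT_RT kernel.
+	 */
+	local_lock(&foo_stats.lock);
+	this_cpu_inc(foo_stats.count);
+	local_unlock(&foo_stats.lock);
+  }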
+local_lock
+==========
+
+local_lock provides a named scope to critical sections which are protected
+by disabling preemption or interrupts.
+
+On non-PREEMPT_RT kernels local_lock operations map to the preemption and
+interrupt disabling and enabling primitives:
+
+ ===============================  ======================
+ local_lock(&llock)               preempt_disable()
+ local_unlock(&llock)             preempt_enable()
+ local_lock_irq(&llock)           local_irq_disable()
+ local_unlock_irq(&llock)         local_irq_enable()
+ local_lock_irqsave(&llock)       local_irq_save()
+ local_unlock_irqrestore(&llock)  local_irq_restore()
+ ===============================  ======================
+
+The named scope of local_lock has two advantages over the regular
+primitives:
+
+  - The lock name allows static analysis and is also clear documentation
+    of the protection scope while the regular primitives are scopeless and
+    opaque.
+
+  - If lockdep is enabled the local_lock gains a lockmap which allows
+    validating the correctness of the protection. This can detect cases where
+    e.g. a function using preempt_disable() as a protection mechanism is
+    invoked from interrupt or soft-interrupt context. Aside from that,
+    lockdep_assert_held(&llock) works as with any other locking primitive.
+
+local_lock and PREEMPT_RT
+-------------------------
+
+PREEMPT_RT kernels map local_lock to a per-CPU spinlock_t, thus changing
+semantics:
+
+  - All spinlock_t changes also apply to local_lock.
+
+local_lock usage
+----------------
+
+local_lock should be used in situations where disabling preemption or
+interrupts is the appropriate form of concurrency control to protect
+per-CPU data structures on a non-PREEMPT_RT kernel.
+
+local_lock is not suitable for protecting against preemption or interrupts
+on a PREEMPT_RT kernel due to the PREEMPT_RT-specific spinlock_t semantics.
+
+
 raw_spinlock_t and spinlock_t
 =============================
@@ -258,10 +324,82 @@ implementation, thus changing semantics:
 PREEMPT_RT caveats
 ==================
 
+local_lock on RT
+----------------
+
+The mapping of local_lock to spinlock_t on PREEMPT_RT kernels has a few
+implications. For example, on a non-PREEMPT_RT kernel the following code
+sequence works as expected::
+
+   local_lock_irq(&local_lock);
+   raw_spin_lock(&lock);
+
+and is fully equivalent to::
+
+   raw_spin_lock_irq(&lock);
+
+On a PREEMPT_RT kernel this code sequence breaks because local_lock_irq()
+is mapped to a per-CPU spinlock_t which neither disables interrupts nor
+preemption. The following code sequence works correctly on both
+PREEMPT_RT and non-PREEMPT_RT kernels::
+
+   local_lock_irq(&local_lock);
+   spin_lock(&lock);
+
+Another caveat with local locks is that each local_lock has a specific
+protection scope. So the following substitution is wrong::
+
+  func1()
+  {
+    local_irq_save(flags);    -> local_lock_irqsave(&local_lock_1, flags);
+    func3();
+    local_irq_restore(flags); -> local_lock_irqrestore(&local_lock_1, flags);
+  }
+
+  func2()
+  {
+    local_irq_save(flags);    -> local_lock_irqsave(&local_lock_2, flags);
+    func3();
+    local_irq_restore(flags); -> local_lock_irqrestore(&local_lock_2, flags);
+  }
+
+  func3()
+  {
+    lockdep_assert_irqs_disabled();
+    access_protected_data();
+  }
+
+On a non-PREEMPT_RT kernel this works correctly, but on a PREEMPT_RT kernel
+local_lock_1 and local_lock_2 are distinct and cannot serialize the callers
+of func3(). Also the lockdep assert will trigger on a PREEMPT_RT kernel
+because local_lock_irqsave() does not disable interrupts due to the
+PREEMPT_RT-specific semantics of spinlock_t.
The correct substitution is::
+
+  func1()
+  {
+    local_irq_save(flags);    -> local_lock_irqsave(&local_lock, flags);
+    func3();
+    local_irq_restore(flags); -> local_lock_irqrestore(&local_lock, flags);
+  }
+
+  func2()
+  {
+    local_irq_save(flags);    -> local_lock_irqsave(&local_lock, flags);
+    func3();
+    local_irq_restore(flags); -> local_lock_irqrestore(&local_lock, flags);
+  }
+
+  func3()
+  {
+    lockdep_assert_held(&local_lock);
+    access_protected_data();
+  }
+
+
 spinlock_t and rwlock_t
 -----------------------
 
-These changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels
+The changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels
 have a few implications. For example, on a non-PREEMPT_RT kernel the
 following code sequence works as expected::
@@ -282,9 +420,61 @@ local_lock mechanism. Acquiring the local_lock pins the task to a CPU,
 allowing things like per-CPU interrupt disabled locks to be acquired.
 However, this approach should be used only where absolutely necessary.
 
+A typical scenario is protection of per-CPU variables in thread context::
 
-raw_spinlock_t
---------------
+  struct foo *p = get_cpu_ptr(&var1);
+
+  spin_lock(&p->lock);
+  p->count += this_cpu_read(var2);
+
+This is correct code on a non-PREEMPT_RT kernel, but on a PREEMPT_RT kernel
+this breaks. The PREEMPT_RT-specific change of spinlock_t semantics does
+not allow acquiring p->lock because get_cpu_ptr() implicitly disables
+preemption. The following substitution works on both kernels::
+
+  struct foo *p;
+
+  migrate_disable();
+  p = this_cpu_ptr(&var1);
+  spin_lock(&p->lock);
+  p->count += this_cpu_read(var2);
+
+On a non-PREEMPT_RT kernel migrate_disable() maps to preempt_disable()
+which makes the above code fully equivalent. On a PREEMPT_RT kernel
+migrate_disable() ensures that the task is pinned on the current CPU which
+in turn guarantees that the per-CPU accesses to var1 and var2 stay on
+the same CPU.
+
+The migrate_disable() substitution is not valid for the following
+scenario::
+
+  func()
+  {
+    struct foo *p;
+
+    migrate_disable();
+    p = this_cpu_ptr(&var1);
+    p->val = func2();
+
+While correct on a non-PREEMPT_RT kernel, this breaks on PREEMPT_RT because
+here migrate_disable() does not protect against reentrancy from a
+preempting task. A correct substitution for this case is::
+
+  func()
+  {
+    struct foo *p;
+
+    local_lock(&foo_lock);
+    p = this_cpu_ptr(&var1);
+    p->val = func2();
+
+On a non-PREEMPT_RT kernel this protects against reentrancy by disabling
+preemption. On a PREEMPT_RT kernel this is achieved by acquiring the
+underlying per-CPU spinlock.
+
+
+raw_spinlock_t on RT
+--------------------
 
 Acquiring a raw_spinlock_t disables preemption and possibly also
 interrupts, so the critical section must avoid acquiring a regular
@@ -325,22 +515,25 @@ Lock type nesting rules
 
 The most basic rules are:
 
-  - Lock types of the same lock category (sleeping, spinning) can nest
-    arbitrarily as long as they respect the general lock ordering rules to
-    prevent deadlocks.
+  - Lock types of the same lock category (sleeping, CPU local, spinning)
+    can nest arbitrarily as long as they respect the general lock ordering
+    rules to prevent deadlocks.
+
+  - Sleeping lock types cannot nest inside CPU local and spinning lock types.
 
-  - Sleeping lock types cannot nest inside spinning lock types.
+  - CPU local and spinning lock types can nest inside sleeping lock types.
 
-  - Spinning lock types can nest inside sleeping lock types.
+ - Spinning lock types can nest inside all lock types These constraints apply both in PREEMPT_RT and otherwise. The fact that PREEMPT_RT changes the lock category of spinlock_t and -rwlock_t from spinning to sleeping means that they cannot be acquired while -holding a raw spinlock. This results in the following nesting ordering: +rwlock_t from spinning to sleeping and substitutes local_lock with a +per-CPU spinlock_t means that they cannot be acquired while holding a raw +spinlock. This results in the following nesting ordering: 1) Sleeping locks - 2) spinlock_t and rwlock_t + 2) spinlock_t, rwlock_t, local_lock 3) raw_spinlock_t and bit spinlocks Lockdep will complain if these constraints are violated, both in diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/locking/percpu-rw-semaphore.rst index 247de6410855..247de6410855 100644 --- a/Documentation/percpu-rw-semaphore.txt +++ b/Documentation/locking/percpu-rw-semaphore.rst diff --git a/Documentation/pi-futex.txt b/Documentation/locking/pi-futex.rst index c33ba2befbf8..c33ba2befbf8 100644 --- a/Documentation/pi-futex.txt +++ b/Documentation/locking/pi-futex.rst diff --git a/Documentation/preempt-locking.txt b/Documentation/locking/preempt-locking.rst index dce336134e54..dce336134e54 100644 --- a/Documentation/preempt-locking.txt +++ b/Documentation/locking/preempt-locking.rst diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/locking/robust-futex-ABI.rst index f24904f1c16f..f24904f1c16f 100644 --- a/Documentation/robust-futex-ABI.txt +++ b/Documentation/locking/robust-futex-ABI.rst diff --git a/Documentation/robust-futexes.txt b/Documentation/locking/robust-futexes.rst index 6361fb01c9c1..6361fb01c9c1 100644 --- a/Documentation/robust-futexes.txt +++ b/Documentation/locking/robust-futexes.rst diff --git a/Documentation/locking/rt-mutex.rst b/Documentation/locking/rt-mutex.rst index c365dc302081..3b5097a380e6 100644 --- a/Documentation/locking/rt-mutex.rst +++ b/Documentation/locking/rt-mutex.rst @@ -4,7 +4,7 @@ RT-mutex subsystem with PI support RT-mutexes with priority inheritance are used to support PI-futexes, which enable pthread_mutex_t priority inheritance attributes -(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details +(PTHREAD_PRIO_INHERIT). [See Documentation/locking/pi-futex.rst for more details about PI-futexes.] This technology was developed in the -rt tree and streamlined for diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 11ebe3682771..77e43c8b24b4 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -7,7 +7,7 @@ The Maintainer Entry Profile supplements the top-level process documents (submitting-patches, submitting drivers...) with subsystem/device-driver-local customs as well as details about the patch submission life-cycle. A contributor uses this document to level set -their expectations and avoid common mistakes, maintainers may use these +their expectations and avoid common mistakes; maintainers may use these profiles to look across subsystems for opportunities to converge on common practices. @@ -26,7 +26,7 @@ Example questions to consider: - Does the subsystem have a patchwork instance? Are patchwork state changes notified? - Any bots or CI infrastructure that watches the list, or automated - testing feedback that the subsystem gates acceptance? + testing feedback that the subsystem uses to gate acceptance? 
- Git branches that are pulled into -next? - What branch should contributors submit against? - Links to any other Maintainer Entry Profiles? For example a @@ -54,8 +54,8 @@ One of the common misunderstandings of submitters is that patches can be sent at any time before the merge window closes and can still be considered for the next -rc1. The reality is that most patches need to be settled in soaking in linux-next in advance of the merge window -opening. Clarify for the submitter the key dates (in terms rc release -week) that patches might considered for merging and when patches need to +opening. Clarify for the submitter the key dates (in terms of -rc release +week) that patches might be considered for merging and when patches need to wait for the next -rc. At a minimum: - Last -rc for new feature submissions: @@ -70,8 +70,8 @@ wait for the next -rc. At a minimum: - Last -rc to merge features: Deadline for merge decisions Indicate to contributors the point at which an as yet un-applied patch set will need to wait for the NEXT+1 merge window. Of course there is no - obligation to ever except any given patchset, but if the review has not - concluded by this point the expectation the contributor should wait and + obligation to ever accept any given patchset, but if the review has not + concluded by this point the expectation is the contributor should wait and resubmit for the following merge window. Optional: diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e1c355e84edd..eaabc3134294 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -620,7 +620,7 @@ because the CPUs that the Linux kernel supports don't do writes until they are certain (1) that the write will actually happen, (2) of the location of the write, and (3) of the value to be written. But please carefully read the "CONTROL DEPENDENCIES" section and the -Documentation/RCU/rcu_dereference.txt file: The compiler can and does +Documentation/RCU/rcu_dereference.rst file: The compiler can and does break dependencies in a great many highly creative ways. CPU 1 CPU 2 diff --git a/Documentation/misc-devices/index.rst b/Documentation/misc-devices/index.rst index c1dcd2628911..1ecc05fbe6f4 100644 --- a/Documentation/misc-devices/index.rst +++ b/Documentation/misc-devices/index.rst @@ -21,4 +21,5 @@ fit into other categories. lis3lv02d max6875 mic/index + uacce xilinx_sdfec diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index f78d7bf27ff5..8f0347b9fb3d 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -81,7 +81,7 @@ of queues to IRQs can be determined from /proc/interrupts. By default, an IRQ may be handled on any CPU. Because a non-negligible part of packet processing takes place in receive interrupt handling, it is advantageous to spread receive interrupts between CPUs. To manually adjust the IRQ -affinity of each interrupt see Documentation/IRQ-affinity.txt. Some systems +affinity of each interrupt see Documentation/core-api/irq/irq-affinity.rst. Some systems will be running irqbalance, a daemon that dynamically optimizes IRQ assignments and as a result may override any manual settings. @@ -160,7 +160,7 @@ can be configured for each receive queue using a sysfs file entry:: This file implements a bitmap of CPUs. RPS is disabled when it is zero (the default), in which case packets are processed on the interrupting -CPU. 
Documentation/IRQ-affinity.txt explains how CPUs are assigned to +CPU. Documentation/core-api/irq/irq-affinity.rst explains how CPUs are assigned to the bitmap. diff --git a/Documentation/nvdimm/maintainer-entry-profile.rst b/Documentation/nvdimm/maintainer-entry-profile.rst index efe37adadcea..9da748e42623 100644 --- a/Documentation/nvdimm/maintainer-entry-profile.rst +++ b/Documentation/nvdimm/maintainer-entry-profile.rst @@ -4,15 +4,15 @@ LIBNVDIMM Maintainer Entry Profile Overview -------- The libnvdimm subsystem manages persistent memory across multiple -architectures. The mailing list, is tracked by patchwork here: +architectures. The mailing list is tracked by patchwork here: https://patchwork.kernel.org/project/linux-nvdimm/list/ ...and that instance is configured to give feedback to submitters on patch acceptance and upstream merge. Patches are merged to either the -'libnvdimm-fixes', or 'libnvdimm-for-next' branch. Those branches are +'libnvdimm-fixes' or 'libnvdimm-for-next' branch. Those branches are available here: https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/ -In general patches can be submitted against the latest -rc, however if +In general patches can be submitted against the latest -rc; however, if the incoming code change is dependent on other pending changes then the patch should be based on the libnvdimm-for-next branch. However, since persistent memory sits at the intersection of storage and memory there @@ -35,12 +35,12 @@ getting the test environment set up. ACPI Device Specific Methods (_DSM) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Before patches enabling for a new _DSM family will be considered it must +Before patches enabling a new _DSM family will be considered, it must be assigned a format-interface-code from the NVDIMM Sub-team of the ACPI Specification Working Group. In general, the stance of the subsystem is -to push back on the proliferation of NVDIMM command sets, do strongly +to push back on the proliferation of NVDIMM command sets, so do strongly consider implementing support for an existing command set. See -drivers/acpi/nfit/nfit.h for the set of support command sets. +drivers/acpi/nfit/nfit.h for the set of supported command sets. Key Cycle Dates @@ -48,7 +48,7 @@ Key Cycle Dates New submissions can be sent at any time, but if they intend to hit the next merge window they should be sent before -rc4, and ideally stabilized in the libnvdimm-for-next branch by -rc6. Of course if a -patch set requires more than 2 weeks of review -rc4 is already too late +patch set requires more than 2 weeks of review, -rc4 is already too late and some patches may require multiple development cycles to review. 
diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst index 572d968c5375..ebedb6c75db9 100644 --- a/Documentation/power/suspend-and-cpuhotplug.rst +++ b/Documentation/power/suspend-and-cpuhotplug.rst @@ -48,7 +48,7 @@ More details follow:: | | v - disable_nonboot_cpus() + freeze_secondary_cpus() /* start */ | v @@ -83,7 +83,7 @@ More details follow:: Release cpu_add_remove_lock | v - /* disable_nonboot_cpus() complete */ + /* freeze_secondary_cpus() complete */ | v Do suspend @@ -93,7 +93,7 @@ More details follow:: Resuming back is likewise, with the counterparts being (in the order of execution during resume): -* enable_nonboot_cpus() which involves:: +* thaw_secondary_cpus() which involves:: | Acquire cpu_add_remove_lock | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug diff --git a/Documentation/powerpc/cxl.rst b/Documentation/powerpc/cxl.rst index 920546d81326..d2d77057610e 100644 --- a/Documentation/powerpc/cxl.rst +++ b/Documentation/powerpc/cxl.rst @@ -133,6 +133,7 @@ User API ======== 1. AFU character devices +^^^^^^^^^^^^^^^^^^^^^^^^ For AFUs operating in AFU directed mode, two character device files will be created. /dev/cxl/afu0.0m will correspond to a @@ -395,6 +396,7 @@ read 2. Card character device (powerVM guest only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In a powerVM guest, an extra character device is created for the card. The device is only used to write (flash) a new image on the diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index b3f3ee135dbe..20ea8cdee0aa 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -344,7 +344,7 @@ Here is the list of files under powerpc debugfs: NOTE: - Please refer to Documentation/filesystems/debugfs.txt on + Please refer to Documentation/filesystems/debugfs.rst on how to mount the debugfs filesystem. diff --git a/Documentation/process/adding-syscalls.rst b/Documentation/process/adding-syscalls.rst index 1c3a840d06b9..a6b4a3a5bf3f 100644 --- a/Documentation/process/adding-syscalls.rst +++ b/Documentation/process/adding-syscalls.rst @@ -33,7 +33,7 @@ interface. to a somewhat opaque API. - If you're just exposing runtime system information, a new node in sysfs - (see ``Documentation/filesystems/sysfs.txt``) or the ``/proc`` filesystem may + (see ``Documentation/filesystems/sysfs.rst``) or the ``/proc`` filesystem may be more appropriate. However, access to these mechanisms requires that the relevant filesystem is mounted, which might not always be the case (e.g. in a namespaced/sandboxed/chrooted environment). Avoid adding any API to diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index 6399d92f0b21..f07c9250c3ac 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -61,6 +61,7 @@ lack of a better place. botching-up-ioctls clang-format ../riscv/patch-acceptance + unaligned-memory-access .. only:: subproject and html diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index 8e56337d422d..3f8e9d5d95c2 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches. and why. 26) If any ioctl's are added by the patch, then also update - ``Documentation/ioctl/ioctl-number.rst``. 
+ ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 27) If your modified source code depends on or uses any of the kernel APIs or features that are related to the following ``Kconfig`` symbols, diff --git a/Documentation/unaligned-memory-access.txt b/Documentation/process/unaligned-memory-access.rst index 1ee82419d8aa..1ee82419d8aa 100644 --- a/Documentation/unaligned-memory-access.txt +++ b/Documentation/process/unaligned-memory-access.rst diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst index b5c51f7c748d..367e27ec3c50 100644 --- a/Documentation/s390/vfio-ap.rst +++ b/Documentation/s390/vfio-ap.rst @@ -484,7 +484,7 @@ CARD.DOMAIN TYPE MODE 05.00ff CEX5A Accelerator =========== ===== ============ -Guest2 +Guest3 ------ =========== ===== ============ CARD.DOMAIN TYPE MODE diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index f7504226f445..5c4b7f4f0062 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -19,10 +19,12 @@ CPUs". Each scheduling domain must have one or more CPU groups (struct sched_group) which are organised as a circular one way linked list from the ->groups pointer. The union of cpumasks of these groups MUST be the same as the -domain's span. The intersection of cpumasks from any two of these groups -MUST be the empty set. The group pointed to by the ->groups pointer MUST -contain the CPU to which the domain belongs. Groups may be shared among -CPUs as they contain read only data after they have been set up. +domain's span. The group pointed to by the ->groups pointer MUST contain the CPU +to which the domain belongs. Groups may be shared among CPUs as they contain +read only data after they have been set up. The intersection of cpumasks from +any two of these groups may be non empty. If this is the case the SD_OVERLAP +flag is set on the corresponding scheduling domain and its groups may not be +shared between CPUs. Balancing within a sched domain occurs between groups. That is, each group is treated as one entity. The load of a group is defined as the sum of the diff --git a/Documentation/digsig.txt b/Documentation/security/digsig.rst index f6a8902d3ef7..f6a8902d3ef7 100644 --- a/Documentation/digsig.txt +++ b/Documentation/security/digsig.rst diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst index fc503dd689a7..8129405eb2cc 100644 --- a/Documentation/security/index.rst +++ b/Documentation/security/index.rst @@ -15,3 +15,4 @@ Security Documentation self-protection siphash tpm/index + digsig diff --git a/Documentation/security/lsm.rst b/Documentation/security/lsm.rst index aadf47c808c0..6a2a2e973080 100644 --- a/Documentation/security/lsm.rst +++ b/Documentation/security/lsm.rst @@ -35,47 +35,50 @@ desired model of security. Linus also suggested the possibility of migrating the Linux capabilities code into such a module. The Linux Security Modules (LSM) project was started by WireX to develop -such a framework. LSM is a joint development effort by several security +such a framework. LSM was a joint development effort by several security projects, including Immunix, SELinux, SGI and Janus, and several individuals, including Greg Kroah-Hartman and James Morris, to develop a -Linux kernel patch that implements this framework. The patch is -currently tracking the 2.4 series and is targeted for integration into -the 2.5 development series. 
This technical report provides an overview -of the framework and the example capabilities security module provided -by the LSM kernel patch. +Linux kernel patch that implements this framework. The work was +incorporated in the mainstream in December of 2003. This technical +report provides an overview of the framework and the capabilities +security module. LSM Framework ============= -The LSM kernel patch provides a general kernel framework to support +The LSM framework provides a general kernel framework to support security modules. In particular, the LSM framework is primarily focused on supporting access control modules, although future development is -likely to address other security needs such as auditing. By itself, the +likely to address other security needs such as sandboxing. By itself, the framework does not provide any additional security; it merely provides -the infrastructure to support security modules. The LSM kernel patch -also moves most of the capabilities logic into an optional security -module, with the system defaulting to the traditional superuser logic. +the infrastructure to support security modules. The LSM framework is +optional, requiring `CONFIG_SECURITY` to be enabled. The capabilities +logic is implemented as a security module. This capabilities module is discussed further in `LSM Capabilities Module`_. -The LSM kernel patch adds security fields to kernel data structures and -inserts calls to hook functions at critical points in the kernel code to -manage the security fields and to perform access control. It also adds -functions for registering and unregistering security modules, and adds a -general :c:func:`security()` system call to support new system calls -for security-aware applications. - -The LSM security fields are simply ``void*`` pointers. For process and -program execution security information, security fields were added to +The LSM framework includes security fields in kernel data structures and +calls to hook functions at critical points in the kernel code to +manage the security fields and to perform access control. +It also adds functions for registering security modules. +An interface `/sys/kernel/security/lsm` reports a comma separated list +of security modules that are active on the system. + +The LSM security fields are simply ``void*`` pointers. +The data is referred to as a blob, which may be managed by +the framework or by the individual security modules that use it. +Security blobs that are used by more than one security module are +typically managed by the framework. +For process and +program execution security information, security fields are included in :c:type:`struct task_struct <task_struct>` and -:c:type:`struct linux_binprm <linux_binprm>`. For filesystem -security information, a security field was added to :c:type:`struct +:c:type:`struct cred <cred>`. +For filesystem +security information, a security field is included in :c:type:`struct super_block <super_block>`. For pipe, file, and socket security -information, security fields were added to :c:type:`struct inode -<inode>` and :c:type:`struct file <file>`. For packet and -network device security information, security fields were added to -:c:type:`struct sk_buff <sk_buff>` and :c:type:`struct -net_device <net_device>`. For System V IPC security information, +information, security fields are included in :c:type:`struct inode +<inode>` and :c:type:`struct file <file>`. 
+For System V IPC security information, security fields were added to :c:type:`struct kern_ipc_perm <kern_ipc_perm>` and :c:type:`struct msg_msg <msg_msg>`; additionally, the definitions for :c:type:`struct @@ -84,118 +87,45 @@ were moved to header files (``include/linux/msg.h`` and ``include/linux/shm.h`` as appropriate) to allow the security modules to use these definitions. -Each LSM hook is a function pointer in a global table, security_ops. -This table is a :c:type:`struct security_operations -<security_operations>` structure as defined by -``include/linux/security.h``. Detailed documentation for each hook is -included in this header file. At present, this structure consists of a -collection of substructures that group related hooks based on the kernel -object (e.g. task, inode, file, sk_buff, etc) as well as some top-level -hook function pointers for system operations. This structure is likely -to be flattened in the future for performance. The placement of the hook -calls in the kernel code is described by the "called:" lines in the -per-hook documentation in the header file. The hook calls can also be -easily found in the kernel code by looking for the string -"security_ops->". - -Linus mentioned per-process security hooks in his original remarks as a -possible alternative to global security hooks. However, if LSM were to -start from the perspective of per-process hooks, then the base framework -would have to deal with how to handle operations that involve multiple -processes (e.g. kill), since each process might have its own hook for -controlling the operation. This would require a general mechanism for -composing hooks in the base framework. Additionally, LSM would still -need global hooks for operations that have no process context (e.g. -network input operations). Consequently, LSM provides global security -hooks, but a security module is free to implement per-process hooks -(where that makes sense) by storing a security_ops table in each -process' security field and then invoking these per-process hooks from -the global hooks. The problem of composition is thus deferred to the -module. - -The global security_ops table is initialized to a set of hook functions -provided by a dummy security module that provides traditional superuser -logic. A :c:func:`register_security()` function (in -``security/security.c``) is provided to allow a security module to set -security_ops to refer to its own hook functions, and an -:c:func:`unregister_security()` function is provided to revert -security_ops to the dummy module hooks. This mechanism is used to set -the primary security module, which is responsible for making the final -decision for each hook. - -LSM also provides a simple mechanism for stacking additional security -modules with the primary security module. It defines -:c:func:`register_security()` and -:c:func:`unregister_security()` hooks in the :c:type:`struct -security_operations <security_operations>` structure and -provides :c:func:`mod_reg_security()` and -:c:func:`mod_unreg_security()` functions that invoke these hooks -after performing some sanity checking. A security module can call these -functions in order to stack with other modules. However, the actual -details of how this stacking is handled are deferred to the module, -which can implement these hooks in any way it wishes (including always -returning an error if it does not wish to support stacking). In this -manner, LSM again defers the problem of composition to the module. 
- -Although the LSM hooks are organized into substructures based on kernel object, all of the hooks can be viewed as falling into two major +For packet and +network device security information, security fields were added to +:c:type:`struct sk_buff <sk_buff>` and +:c:type:`struct scm_cookie <scm_cookie>`. +Unlike the other security module data, the data used here is a +32-bit integer. The security modules are required to map or otherwise +associate these values with real security attributes. + +LSM hooks are maintained in lists. A list is maintained for each +hook, and the hooks are called in the order specified by CONFIG_LSM. +Detailed documentation for each hook is +included in the `include/linux/lsm_hooks.h` header file. + +The LSM framework provides for a close approximation of +general security module stacking. It defines +security_add_hooks() to which each security module passes a +:c:type:`struct security_hook_list <security_hook_list>`, +which is added to the lists. +The LSM framework does not provide a mechanism for removing hooks that +have been registered. The SELinux security module has implemented +a way to remove itself; however, the feature has been deprecated. + +The hooks can be viewed as falling into two major categories: hooks that are used to manage the security fields and hooks that are used to perform access control. Examples of the first category -of hooks include the :c:func:`alloc_security()` and -:c:func:`free_security()` hooks defined for each kernel data -structure that has a security field. These hooks are used to allocate -and free security structures for kernel objects. The first category of -hooks also includes hooks that set information in the security field -after allocation, such as the :c:func:`post_lookup()` hook in -:c:type:`struct inode_security_ops <inode_security_ops>`. -This hook is used to set security information for inodes after -successful lookup operations. An example of the second category of hooks -is the :c:func:`permission()` hook in :c:type:`struct -inode_security_ops <inode_security_ops>`. This hook checks -permission when accessing an inode. +of hooks include the security_inode_alloc() and security_inode_free(). +These hooks are used to allocate +and free security structures for inode objects. +An example of the second category of hooks +is the security_inode_permission() hook. +This hook checks permission when accessing an inode. LSM Capabilities Module ======================= -The LSM kernel patch moves most of the existing POSIX.1e capabilities -logic into an optional security module stored in the file -``security/capability.c``. This change allows users who do not want to -use capabilities to omit this code entirely from their kernel, instead -using the dummy module for traditional superuser logic or any other -module that they desire. This change also allows the developers of the -capabilities logic to maintain and enhance their code more freely, -without needing to integrate patches back into the base kernel. - -In addition to moving the capabilities logic, the LSM kernel patch could -move the capability-related fields from the kernel data structures into -the new security fields managed by the security modules. However, at -present, the LSM kernel patch leaves the capability fields in the kernel -data structures.
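To make the list-based registration in the new lsm.rst text concrete, here is a minimal sketch of a module handing its hooks to the framework. The module name "example" and the hook body are hypothetical; security_add_hooks(), LSM_HOOK_INIT() and DEFINE_LSM() are assumed from include/linux/lsm_hooks.h as referenced above::

    #include <linux/kernel.h>
    #include <linux/init.h>
    #include <linux/lsm_hooks.h>

    /* Hypothetical access-control hook: returning 0 means "no objection". */
    static int example_inode_permission(struct inode *inode, int mask)
    {
            return 0;
    }

    static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
            LSM_HOOK_INIT(inode_permission, example_inode_permission),
    };

    static int __init example_init(void)
    {
            /* Entries are appended to the per-hook lists; as noted above,
             * there is no interface for removing them again. */
            security_add_hooks(example_hooks, ARRAY_SIZE(example_hooks),
                               "example");
            return 0;
    }

    DEFINE_LSM(example) = {
            .name = "example",
            .init = example_init,
    };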
In his original remarks, Linus suggested that this -might be preferable so that other security modules can be easily stacked -with the capabilities module without needing to chain multiple security -structures on the security field. It also avoids imposing extra overhead -on the capabilities module to manage the security fields. However, the -LSM framework could certainly support such a move if it is determined to -be desirable, with only a few additional changes described below. - -At present, the capabilities logic for computing process capabilities on -:c:func:`execve()` and :c:func:`set\*uid()`, checking -capabilities for a particular process, saving and checking capabilities -for netlink messages, and handling the :c:func:`capget()` and -:c:func:`capset()` system calls have been moved into the -capabilities module. There are still a few locations in the base kernel -where capability-related fields are directly examined or modified, but -the current version of the LSM patch does allow a security module to -completely replace the assignment and testing of capabilities. These few -locations would need to be changed if the capability-related fields were -moved into the security field. The following is a list of known -locations that still perform such direct examination or modification of -capability-related fields: - -- ``fs/open.c``::c:func:`sys_access()` - -- ``fs/lockd/host.c``::c:func:`nlm_bind_host()` - -- ``fs/nfsd/auth.c``::c:func:`nfsd_setuser()` - -- ``fs/proc/array.c``::c:func:`task_cap()` +The POSIX.1e capabilities logic is maintained as a security module +stored in the file ``security/commoncap.c``. The capabilities +module uses the order field of the :c:type:`lsm_info` description +to identify it as the first security module to be registered. +The capabilities security module does not use the general security +blobs, unlike other modules. The reasons are historical and are +based on overhead, complexity and performance concerns. diff --git a/Documentation/sphinx/requirements.txt b/Documentation/sphinx/requirements.txt index 14e29a0ae480..489f6626de67 100644 --- a/Documentation/sphinx/requirements.txt +++ b/Documentation/sphinx/requirements.txt @@ -1,3 +1,3 @@ docutils -Sphinx==1.7.9 +Sphinx==2.4.4 sphinx_rtd_theme diff --git a/Documentation/trace/coresight/coresight-ect.rst b/Documentation/trace/coresight/coresight-ect.rst index ecc1e57012ef..a93e52abcf46 100644 --- a/Documentation/trace/coresight/coresight-ect.rst +++ b/Documentation/trace/coresight/coresight-ect.rst @@ -1,4 +1,5 @@ .. SPDX-License-Identifier: GPL-2.0 + ============================================= CoreSight Embedded Cross Trigger (CTI & CTM). ============================================= diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 4a2ebe0bd19b..f792b1959a33 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -527,8 +527,8 @@ The following commands are supported: See Documentation/trace/histogram.rst for details and examples. -6.3 In-kernel trace event API ------------------------------ +7. In-kernel trace event API +============================ In most cases, the command-line interface to trace events is more than sufficient. 
Sometimes, however, applications might find the need for @@ -560,8 +560,8 @@ following: - tracing synthetic events from in-kernel code - the low-level "dynevent_cmd" API -6.3.1 Dyamically creating synthetic event definitions ------------------------------------------------------ +7.1 Dynamically creating synthetic event definitions +--------------------------------------------------- There are a couple ways to create a new synthetic event from a kernel module or other kernel code. @@ -666,8 +666,8 @@ registered by calling the synth_event_gen_cmd_end() function:: At this point, the event object is ready to be used for tracing new events. -6.3.3 Tracing synthetic events from in-kernel code --------------------------------------------------- +7.2 Tracing synthetic events from in-kernel code ------------------------------------------------ To trace a synthetic event, there are several options. The first option is to trace the event in one call, using synth_event_trace() @@ -678,8 +678,8 @@ synth_event_trace_start() and synth_event_trace_end() along with synth_event_add_next_val() or synth_event_add_val() to add the values piecewise. -6.3.3.1 Tracing a synthetic event all at once ---------------------------------------------- +7.2.1 Tracing a synthetic event all at once ------------------------------------------- To trace a synthetic event all at once, the synth_event_trace() or synth_event_trace_array() functions can be used. @@ -780,8 +780,8 @@ remove the event:: ret = synth_event_delete("schedtest"); -6.3.3.1 Tracing a synthetic event piecewise ------------------------------------------- +7.2.2 Tracing a synthetic event piecewise ----------------------------------------- To trace a synthetic event using the piecewise method described above, the synth_event_trace_start() function is used to 'open' the synthetic @@ -864,8 +864,8 @@ Note that synth_event_trace_end() must be called at the end regardless of whether any of the add calls failed (say due to a bad field name being passed in). -6.3.4 Dyamically creating kprobe and kretprobe event definitions ----------------------------------------------------------------- +7.3 Dynamically creating kprobe and kretprobe event definitions -------------------------------------------------------------- To create a kprobe or kretprobe trace event from kernel code, the kprobe_event_gen_cmd_start() or kretprobe_event_gen_cmd_start() @@ -941,8 +941,8 @@ used to give the kprobe event file back and delete the event:: ret = kprobe_event_delete("gen_kprobe_test"); -6.3.4 The "dynevent_cmd" low-level API --------------------------------------- +7.4 The "dynevent_cmd" low-level API ------------------------------------ Both the in-kernel synthetic event and kprobe interfaces are built on top of a lower-level "dynevent_cmd" interface. This interface is diff --git a/Documentation/trace/ftrace-design.rst b/Documentation/trace/ftrace-design.rst index a8e22e0db63c..6893399157f0 100644 --- a/Documentation/trace/ftrace-design.rst +++ b/Documentation/trace/ftrace-design.rst @@ -229,14 +229,6 @@ Adding support for it is easy: just define the macro in asm/ftrace.h and pass the return address pointer as the 'retp' argument to ftrace_push_return_trace(). -HAVE_FTRACE_NMI_ENTER ---------------------- - -If you can't trace NMI functions, then skip this option.
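As a hedged illustration of the renumbered in-kernel API in the events.rst hunks above, the one-call tracing path (section 7.2.1) might look as follows. The "schedtest" event name comes from the document's own example; the saved trace_event_file pointer, the field values, and the surrounding function are assumptions::

    #include <linux/module.h>
    #include <linux/trace_events.h>

    /* Assumed to have been created earlier with the generation API
     * (synth_event_gen_cmd_start()/synth_event_gen_cmd_end()) and
     * looked up beforehand; both steps are omitted here. */
    static struct trace_event_file *schedtest_file;

    static void example_record(u64 lat, u64 pid)
    {
            int ret;

            /* One call supplies every field value of the synthetic event. */
            ret = synth_event_trace(schedtest_file, 2, lat, pid);
            if (ret)
                    pr_warn("synth_event_trace failed: %d\n", ret);
    }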
- -<details to be filled> - - HAVE_SYSCALL_TRACEPOINTS ------------------------ diff --git a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst index a4ecd8f27631..524ad86cadbb 100644 --- a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst +++ b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst @@ -515,6 +515,22 @@ internal: *[source-pattern ...]* .. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c :internal: +identifiers: *[ function/type ...]* + Include la documentazione per ogni *function* e *type* in *source*. + Se non vengono esplicitamente specificate le funzioni da includere, allora + verranno incluse tutte quelle disponibili in *source*. + + Esempi:: + + .. kernel-doc:: lib/bitmap.c + :identifiers: bitmap_parselist bitmap_parselist_user + + .. kernel-doc:: lib/idr.c + :identifiers: + +functions: *[ function ...]* + Questo è uno pseudonimo, deprecato, per la direttiva 'identifiers'. + doc: *title* Include la documentazione del paragrafo ``DOC:`` identificato dal titolo (*title*) all'interno del file sorgente (*source*). Gli spazi in *title* sono @@ -528,15 +544,6 @@ doc: *title* .. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c :doc: High Definition Audio over HDMI and Display Port -functions: *function* *[...]* - Dal file sorgente (*source*) include la documentazione per le funzioni - elencate (*function*). - - Esempio:: - - .. kernel-doc:: lib/bitmap.c - :functions: bitmap_parselist bitmap_parselist_user - Senza alcuna opzione, la direttiva kernel-doc include tutti i commenti di documentazione presenti nel file sorgente (*source*). diff --git a/Documentation/translations/it_IT/kernel-hacking/hacking.rst b/Documentation/translations/it_IT/kernel-hacking/hacking.rst index 24c592852bf1..6aab27a8d323 100644 --- a/Documentation/translations/it_IT/kernel-hacking/hacking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/hacking.rst @@ -627,6 +627,24 @@ Alcuni manutentori e sviluppatori potrebbero comunque richiedere :c:func:`EXPORT_SYMBOL_GPL()` quando si aggiungono nuove funzionalità o interfacce. +:c:func:`EXPORT_SYMBOL_NS()` +---------------------------- + +Definita in ``include/linux/export.h`` + +Questa è una variante di `EXPORT_SYMBOL()` che permette di specificare uno +spazio dei nomi. Lo spazio dei nomi è documentato in +:doc:`../core-api/symbol-namespaces` + +:c:func:`EXPORT_SYMBOL_NS_GPL()` +-------------------------------- + +Definita in ``include/linux/export.h`` + +Questa è una variante di `EXPORT_SYMBOL_GPL()` che permette di specificare uno +spazio dei nomi. Lo spazio dei nomi è documentato in +:doc:`../core-api/symbol-namespaces` + Procedure e convenzioni ======================= diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst index b9a6be4b8499..4615df5723fb 100644 --- a/Documentation/translations/it_IT/kernel-hacking/locking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst @@ -159,17 +159,17 @@ Sincronizzazione in contesto utente Se avete una struttura dati che verrà utilizzata solo dal contesto utente, allora, per proteggerla, potete utilizzare un semplice mutex (``include/linux/mutex.h``). Questo è il caso più semplice: inizializzate il -mutex; invocate :c:func:`mutex_lock_interruptible()` per trattenerlo e -:c:func:`mutex_unlock()` per rilasciarlo. C'è anche :c:func:`mutex_lock()` +mutex; invocate mutex_lock_interruptible() per trattenerlo e +mutex_unlock() per rilasciarlo.
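A minimal sketch of the namespaced exports documented in the hacking.rst hunk above; the helper and the EXAMPLE namespace are invented, and EXPORT_SYMBOL_NS()/MODULE_IMPORT_NS() are assumed from include/linux/export.h and include/linux/module.h::

    #include <linux/export.h>
    #include <linux/module.h>

    /* Hypothetical helper, exported only into the EXAMPLE namespace. */
    int example_do_work(void)
    {
            return 0;
    }
    EXPORT_SYMBOL_NS(example_do_work, EXAMPLE);

    /* Any module that calls example_do_work() must import the namespace: */
    MODULE_IMPORT_NS(EXAMPLE);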
C'è anche mutex_lock() ma questa dovrebbe essere evitata perché non ritorna in caso di segnali. Per esempio: ``net/netfilter/nf_sockopt.c`` permette la registrazione -di nuove chiamate per :c:func:`setsockopt()` e :c:func:`getsockopt()` -usando la funzione :c:func:`nf_register_sockopt()`. La registrazione e +di nuove chiamate per setsockopt() e getsockopt() +usando la funzione nf_register_sockopt(). La registrazione e la rimozione vengono eseguite solamente quando il modulo viene caricato o scaricato (e durante l'avvio del sistema, qui non abbiamo concorrenza), e la lista delle funzioni registrate viene consultata solamente quando -:c:func:`setsockopt()` o :c:func:`getsockopt()` sono sconosciute al sistema. +setsockopt() o getsockopt() sono sconosciute al sistema. In questo caso ``nf_sockopt_mutex`` è perfetto allo scopo, in particolar modo visto che setsockopt e getsockopt potrebbero dormire. @@ -179,19 +179,19 @@ Sincronizzazione fra il contesto utente e i softirq Se un softirq condivide dati col contesto utente, avete due problemi. Primo, il contesto utente corrente potrebbe essere interrotto da un softirq, e secondo, la sezione critica potrebbe essere eseguita da un altro -processore. Questo è quando :c:func:`spin_lock_bh()` +processore. Questo è quando spin_lock_bh() (``include/linux/spinlock.h``) viene utilizzato. Questo disabilita i softirq -sul processore e trattiene il *lock*. Invece, :c:func:`spin_unlock_bh()` fa +sul processore e trattiene il *lock*. Invece, spin_unlock_bh() fa l'opposto. (Il suffisso '_bh' è un residuo storico che fa riferimento al "Bottom Halves", il vecchio nome delle interruzioni software. In un mondo perfetto questa funzione si chiamerebbe 'spin_lock_softirq()'). -Da notare che in questo caso potete utilizzare anche :c:func:`spin_lock_irq()` -o :c:func:`spin_lock_irqsave()`, queste fermano anche le interruzioni hardware: +Da notare che in questo caso potete utilizzare anche spin_lock_irq() +o spin_lock_irqsave(), queste fermano anche le interruzioni hardware: vedere :ref:`Contesto di interruzione hardware <it_hardirq-context>`. Questo funziona alla perfezione anche sui sistemi monoprocessore: gli spinlock -svaniscono e questa macro diventa semplicemente :c:func:`local_bh_disable()` +svaniscono e questa macro diventa semplicemente local_bh_disable() (``include/linux/interrupt.h``), la quale impedisce ai softirq d'essere eseguiti. @@ -224,8 +224,8 @@ Differenti tasklet/timer ~~~~~~~~~~~~~~~~~~~~~~~~ Se un altro tasklet/timer vuole condividere dati col vostro tasklet o timer, -allora avrete bisogno entrambe di :c:func:`spin_lock()` e -:c:func:`spin_unlock()`. Qui :c:func:`spin_lock_bh()` è inutile, siete già +allora avrete bisogno entrambe di spin_lock() e +spin_unlock(). Qui spin_lock_bh() è inutile, siete già in un tasklet ed avete la garanzia che nessun altro verrà eseguito sullo stesso processore. @@ -243,13 +243,13 @@ processore (vedere :ref:`Dati per processore <it_per-cpu>`). Se siete arrivati fino a questo punto nell'uso dei softirq, probabilmente tenete alla scalabilità delle prestazioni abbastanza da giustificarne la complessità aggiuntiva. -Dovete utilizzare :c:func:`spin_lock()` e :c:func:`spin_unlock()` per +Dovete utilizzare spin_lock() e spin_unlock() per proteggere i dati condivisi.
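A short sketch, with invented names, of the spin_lock_bh() pattern described in the locking.rst hunks above for data shared between user context and a softirq::

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);
    static unsigned long example_events;

    /* Called from user context: softirqs are blocked on this CPU before
     * the lock is taken, so the softirq side cannot interrupt the update. */
    void example_count_event(void)
    {
            spin_lock_bh(&example_lock);
            example_events++;
            spin_unlock_bh(&example_lock);
    }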
Diversi Softirqs ~~~~~~~~~~~~~~~~ -Dovete utilizzare :c:func:`spin_lock()` e :c:func:`spin_unlock()` per +Dovete utilizzare spin_lock() e spin_unlock() per proteggere i dati condivisi, che siano timer, tasklet, diversi softirq o lo stesso o altri softirq: uno qualsiasi di essi potrebbe essere in esecuzione su un diverso processore. @@ -270,40 +270,40 @@ Se un gestore di interruzioni hardware condivide dati con un softirq, allora avrete due preoccupazioni. Primo, il softirq può essere interrotto da un'interruzione hardware, e secondo, la sezione critica potrebbe essere eseguita da un'interruzione hardware su un processore diverso. Questo è il caso -dove :c:func:`spin_lock_irq()` viene utilizzato. Disabilita le interruzioni -sul processore che l'esegue, poi trattiene il lock. :c:func:`spin_unlock_irq()` +dove spin_lock_irq() viene utilizzato. Disabilita le interruzioni +sul processore che l'esegue, poi trattiene il lock. spin_unlock_irq() fa l'opposto. -Il gestore d'interruzione hardware non usa :c:func:`spin_lock_irq()` perché -i softirq non possono essere eseguiti quando il gestore d'interruzione hardware -è in esecuzione: per questo si può usare :c:func:`spin_lock()`, che è un po' +Il gestore d'interruzione hardware non ha bisogno di usare spin_lock_irq() +perché i softirq non possono essere eseguiti quando il gestore d'interruzione +hardware è in esecuzione: per questo si può usare spin_lock(), che è un po' più veloce. L'unica eccezione è quando un altro gestore d'interruzioni -hardware utilizza lo stesso *lock*: :c:func:`spin_lock_irq()` impedirà a questo +hardware utilizza lo stesso *lock*: spin_lock_irq() impedirà a questo secondo gestore di interrompere quello in esecuzione. Questo funziona alla perfezione anche sui sistemi monoprocessore: gli spinlock -svaniscono e questa macro diventa semplicemente :c:func:`local_irq_disable()` +svaniscono e questa macro diventa semplicemente local_irq_disable() (``include/asm/smp.h``), la quale impedisce a softirq/tasklet/BH d'essere eseguiti. -:c:func:`spin_lock_irqsave()` (``include/linux/spinlock.h``) è una variante che +spin_lock_irqsave() (``include/linux/spinlock.h``) è una variante che salva lo stato delle interruzioni in una variabile, questa verrà poi passata -a :c:func:`spin_unlock_irqrestore()`. Questo significa che lo stesso codice +a spin_unlock_irqrestore(). Questo significa che lo stesso codice potrà essere utilizzato in un'interruzione hardware (dove le interruzioni sono già disabilitate) e in un softirq (dove la disabilitazione delle interruzioni è richiesta). Da notare che i softirq (e quindi tasklet e timer) sono eseguiti al ritorno -da un'interruzione hardware, quindi :c:func:`spin_lock_irq()` interrompe +da un'interruzione hardware, quindi spin_lock_irq() interrompe anche questi. Tenuto conto di questo si può dire che -:c:func:`spin_lock_irqsave()` è la funzione di sincronizzazione più generica +spin_lock_irqsave() è la funzione di sincronizzazione più generica e potente. Sincronizzazione fra due gestori d'interruzioni hardware -------------------------------------------------------- Condividere dati fra due gestori di interruzione hardware è molto raro, ma se -succede, dovreste usare :c:func:`spin_lock_irqsave()`: è una specificità +succede, dovreste usare spin_lock_irqsave(): è una specificità dell'architettura il fatto che tutte le interruzioni vengano interrotte quando si eseguono i gestori di interruzioni.
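Since the hunks above single out spin_lock_irqsave() as the most general primitive, a minimal sketch with hypothetical names::

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);
    static int example_state;

    /* Safe from user context, softirq or hardirq: the previous interrupt
     * state is saved in 'flags' and restored on unlock. */
    void example_set_state(int v)
    {
            unsigned long flags;

            spin_lock_irqsave(&example_lock, flags);
            example_state = v;
            spin_unlock_irqrestore(&example_lock, flags);
    }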
@@ -317,11 +317,11 @@ Pete Zaitcev ci offre il seguente riassunto: il mutex e dormire (``copy_from_user*()`` o ``kmalloc(x,GFP_KERNEL)``). - Altrimenti (== i dati possono essere manipolati da un'interruzione) usate - :c:func:`spin_lock_irqsave()` e :c:func:`spin_unlock_irqrestore()`. + spin_lock_irqsave() e spin_unlock_irqrestore(). - Evitate di trattenere uno spinlock per più di 5 righe di codice incluse le chiamate a funzione (ad eccezione di quelle per l'accesso come - :c:func:`readb()`). + readb()). Tabella dei requisiti minimi ---------------------------- @@ -334,7 +334,7 @@ processore alla volta, ma se deve condividere dati con un altro thread, allora la sincronizzazione è necessaria). Ricordatevi il suggerimento qui sopra: potete sempre usare -:c:func:`spin_lock_irqsave()`, che è un sovrainsieme di tutte le altre funzioni +spin_lock_irqsave(), che è un sovrainsieme di tutte le altre funzioni per spinlock. ============== ============= ============= ========= ========= ========= ========= ======= ======= ============== ============== @@ -378,13 +378,13 @@ protetti dal *lock* quando qualche altro thread lo sta già facendo trattenendo il *lock*. Potrete acquisire il *lock* più tardi se vi serve accedere ai dati protetti da questo *lock*. -La funzione :c:func:`spin_trylock()` non ritenta di acquisire il *lock*, +La funzione spin_trylock() non ritenta di acquisire il *lock*, se ci riesce al primo colpo ritorna un valore diverso da zero, altrimenti se fallisce ritorna 0. Questa funzione può essere utilizzata in un qualunque -contesto, ma come :c:func:`spin_lock()`: dovete disabilitare i contesti che +contesto, ma come spin_lock(): dovete disabilitare i contesti che potrebbero interrompervi e quindi trattenere lo spinlock. -La funzione :c:func:`mutex_trylock()` invece di sospendere il vostro processo +La funzione mutex_trylock() invece di sospendere il vostro processo ritorna un valore diverso da zero se è possibile trattenere il lock al primo colpo, altrimenti se fallisce ritorna 0. Nonostante non dorma, questa funzione non può essere usata in modo sicuro in contesti di interruzione hardware o @@ -506,7 +506,7 @@ della memoria che il suo contenuto sono protetti dal *lock*. Questo caso è semplice dato che copiamo i dati dall'utente e non permettiamo mai loro di accedere direttamente agli oggetti. -C'è una piccola ottimizzazione qui: nella funzione :c:func:`cache_add()` +C'è una piccola ottimizzazione qui: nella funzione cache_add() impostiamo i campi dell'oggetto prima di acquisire il *lock*. Questo è sicuro perché nessun altro potrà accedervi finché non lo inseriremo nella memoria. @@ -514,7 +514,7 @@ nella memoria. Accesso dal contesto utente --------------------------- -Ora consideriamo il caso in cui :c:func:`cache_find()` può essere invocata +Ora consideriamo il caso in cui cache_find() può essere invocata dal contesto d'interruzione: sia hardware che software. Un esempio potrebbe essere un timer che elimina oggetti dalla memoria. @@ -583,15 +583,15 @@ sono quelle rimosse, mentre quelle ``+`` sono quelle aggiunte. return ret; } -Da notare che :c:func:`spin_lock_irqsave()` disabiliterà le interruzioni +Da notare che spin_lock_irqsave() disabiliterà le interruzioni se erano attive, altrimenti non farà niente (quando siamo già in un contesto d'interruzione); dunque queste funzioni possono essere chiamate in sicurezza da qualsiasi contesto.
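The two trylock variants contrasted in the hunk above can be sketched as follows; both functions and the protected data are hypothetical. spin_trylock() may be attempted where sleeping is forbidden, while mutex_trylock() must not be used from interrupt context::

    #include <linux/spinlock.h>
    #include <linux/mutex.h>

    static DEFINE_SPINLOCK(example_lock);
    static DEFINE_MUTEX(example_mutex);

    /* Returns nonzero on success, 0 if the lock was already held. */
    int example_try_update(void)
    {
            if (!spin_trylock(&example_lock))
                    return 0;       /* busy: caller falls back to a slow path */
            /* ... touch the data protected by example_lock ... */
            spin_unlock(&example_lock);
            return 1;
    }

    int example_try_sleepable_update(void)
    {
            if (!mutex_trylock(&example_mutex))
                    return 0;
            /* ... may also sleep while the mutex is held ... */
            mutex_unlock(&example_mutex);
            return 1;
    }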
-Sfortunatamente, :c:func:`cache_add()` invoca :c:func:`kmalloc()` con +Sfortunatamente, cache_add() invoca kmalloc() con l'opzione ``GFP_KERNEL`` che è permessa solo in contesto utente. Ho supposto -che :c:func:`cache_add()` venga chiamata dal contesto utente, altrimenti -questa opzione deve diventare un parametro di :c:func:`cache_add()`. +che cache_add() venga chiamata dal contesto utente, altrimenti +questa opzione deve diventare un parametro di cache_add(). Esporre gli oggetti al di fuori del file ---------------------------------------- @@ -610,7 +610,7 @@ Il secondo problema è il problema del ciclo di vita: se un'altra struttura mantiene un puntatore ad un oggetto, presumibilmente si aspetta che questo puntatore rimanga valido. Sfortunatamente, questo è garantito solo mentre si trattiene il *lock*, altrimenti qualcuno potrebbe chiamare -:c:func:`cache_delete()` o peggio, aggiungere un oggetto che riutilizza lo +cache_delete() o peggio, aggiungere un oggetto che riutilizza lo stesso indirizzo. Dato che c'è un solo *lock*, non potete trattenerlo a vita: altrimenti @@ -710,9 +710,9 @@ Ecco il codice:: } Abbiamo incapsulato il contatore di riferimenti nelle tipiche funzioni -di 'get' e 'put'. Ora possiamo ritornare l'oggetto da :c:func:`cache_find()` +di 'get' e 'put'. Ora possiamo ritornare l'oggetto da cache_find() col vantaggio che l'utente può dormire trattenendo l'oggetto (per esempio, -:c:func:`copy_to_user()` per copiare il nome verso lo spazio utente). +copy_to_user() per copiare il nome verso lo spazio utente). Un altro punto da notare è che ho detto che il contatore dovrebbe incrementarsi per ogni puntatore ad un oggetto: quindi il contatore di riferimenti è 1 @@ -727,8 +727,8 @@ Ci sono un certo numero di operazioni atomiche definite in ``include/asm/atomic.h``: queste sono garantite come atomiche su qualsiasi processore del sistema, quindi non sono necessari i *lock*. In questo caso è più semplice rispetto all'uso degli spinlock, benché l'uso degli spinlock -sia più elegante per casi non banali. Le funzioni :c:func:`atomic_inc()` e -:c:func:`atomic_dec_and_test()` vengono usate al posto dei tipici operatori di +sia più elegante per casi non banali. Le funzioni atomic_inc() e +atomic_dec_and_test() vengono usate al posto dei tipici operatori di incremento e decremento, e i *lock* non sono più necessari per proteggere il contatore stesso. @@ -820,7 +820,7 @@ al nome di cambiare abbiamo tre possibilità: - Si può togliere static da ``cache_lock`` e dire agli utenti che devono trattenere il *lock* prima di modificare il nome di un oggetto. -- Si può fornire una funzione :c:func:`cache_obj_rename()` che prende il +- Si può fornire una funzione cache_obj_rename() che prende il *lock* e cambia il nome per conto del chiamante; si dirà poi agli utenti di usare questa funzione. @@ -878,11 +878,11 @@ Da notare che ho deciso che il contatore di popolarità dovesse essere protetto da ``cache_lock`` piuttosto che dal *lock* dell'oggetto; questo perché è logicamente parte dell'infrastruttura (come :c:type:`struct list_head <list_head>` nell'oggetto). In questo modo, -in :c:func:`__cache_add()`, non ho bisogno di trattenere il *lock* di ogni +in __cache_add(), non ho bisogno di trattenere il *lock* di ogni oggetto mentre si cerca il meno popolare.
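The get/put convention described above reduces to a few lines; the object type is invented, while atomic_inc() and atomic_dec_and_test() are the operations the text names::

    #include <linux/atomic.h>
    #include <linux/slab.h>

    struct example_object {
            atomic_t refcnt;
            /* ... payload ... */
    };

    static void example_get(struct example_object *obj)
    {
            atomic_inc(&obj->refcnt);
    }

    static void example_put(struct example_object *obj)
    {
            /* The decrement and the zero test are a single atomic step,
             * so only the caller dropping the last reference frees it. */
            if (atomic_dec_and_test(&obj->refcnt))
                    kfree(obj);
    }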
Ho anche deciso che il campo id è immutabile, quindi non ho bisogno di -trattenere il lock dell'oggetto quando si usa :c:func:`__cache_find()` +trattenere il lock dell'oggetto quando si usa __cache_find() per leggere questo campo; il *lock* dell'oggetto è usato solo dal chiamante che vuole leggere o scrivere il campo name. @@ -907,7 +907,7 @@ Questo è facile da diagnosticare: non è uno di quei problemi che ti tengono sveglio 5 notti a parlare da solo. Un caso un pochino più complesso; immaginate d'avere uno spazio condiviso -fra un softirq ed il contesto utente. Se usate :c:func:`spin_lock()` per +fra un softirq ed il contesto utente. Se usate spin_lock() per proteggerlo, il contesto utente potrebbe essere interrotto da un softirq mentre trattiene il lock, da qui il softirq rimarrà in attesa attiva provando ad acquisire il *lock* già trattenuto nel contesto utente. @@ -1006,12 +1006,12 @@ potreste fare come segue:: spin_unlock_bh(&list_lock); Primo o poi, questo esploderà su un sistema multiprocessore perché un -temporizzatore potrebbe essere già partito prima di :c:func:`spin_lock_bh()`, -e prenderà il *lock* solo dopo :c:func:`spin_unlock_bh()`, e cercherà +temporizzatore potrebbe essere già partito prima di spin_lock_bh(), +e prenderà il *lock* solo dopo spin_unlock_bh(), e cercherà di eliminare il suo oggetto (che però è già stato eliminato). Questo può essere evitato controllando il valore di ritorno di -:c:func:`del_timer()`: se ritorna 1, il temporizzatore è stato già +del_timer(): se ritorna 1, il temporizzatore è stato già rimosso. Se 0, significa (in questo caso) che il temporizzatore è in esecuzione, quindi possiamo fare come segue:: @@ -1032,9 +1032,9 @@ esecuzione, quindi possiamo fare come segue:: spin_unlock_bh(&list_lock); Un altro problema è l'eliminazione dei temporizzatori che si riavviano -da soli (chiamando :c:func:`add_timer()` alla fine della loro esecuzione). +da soli (chiamando add_timer() alla fine della loro esecuzione). Dato che questo è un problema abbastanza comune con una propensione -alle corse critiche, dovreste usare :c:func:`del_timer_sync()` +alle corse critiche, dovreste usare del_timer_sync() (``include/linux/timer.h``) per gestire questo caso. Questa ritorna il numero di volte che il temporizzatore è stato interrotto prima che fosse in grado di fermarlo senza che si riavviasse. @@ -1116,7 +1116,7 @@ chiamata ``list``:: wmb(); list->next = new; -La funzione :c:func:`wmb()` è una barriera di sincronizzazione delle +La funzione wmb() è una barriera di sincronizzazione delle scritture. Questa garantisce che la prima operazione (impostare l'elemento ``next`` del nuovo elemento) venga completata e vista da tutti i processori prima che venga eseguita la seconda operazione (che sarebbe quella di mettere @@ -1127,7 +1127,7 @@ completamente il nuovo elemento; oppure che lo vedano correttamente e quindi il puntatore ``next`` deve puntare al resto della lista. Fortunatamente, c'è una funzione che fa questa operazione sulle liste -:c:type:`struct list_head <list_head>`: :c:func:`list_add_rcu()` +:c:type:`struct list_head <list_head>`: list_add_rcu() (``include/linux/list.h``). Rimuovere un elemento dalla lista è anche più facile: sostituiamo il puntatore @@ -1138,7 +1138,7 @@ l'elemento o lo salteranno. list->next = old->next; -La funzione :c:func:`list_del_rcu()` (``include/linux/list.h``) fa esattamente +La funzione list_del_rcu() (``include/linux/list.h``) fa esattamente questo (la versione normale corrompe il vecchio oggetto, e non vogliamo che accada).
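A compact sketch of the RCU list pattern these hunks walk through, with invented names: writers serialize among themselves with a lock and use list_add_rcu()/list_del_rcu(), while readers (described in the next hunk) only enter an RCU read-side critical section::

    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    struct example_node {
            struct list_head list;
            int key;
    };

    static LIST_HEAD(example_list);
    static DEFINE_SPINLOCK(example_lock);   /* serializes writers only */

    void example_insert(struct example_node *n)
    {
            spin_lock(&example_lock);
            list_add_rcu(&n->list, &example_list);
            spin_unlock(&example_lock);
    }

    /* Reader: no lock taken, only rcu_read_lock()/rcu_read_unlock(). */
    bool example_contains(int key)
    {
            struct example_node *n;
            bool found = false;

            rcu_read_lock();
            list_for_each_entry_rcu(n, &example_list, list) {
                    if (n->key == key) {
                            found = true;
                            break;
                    }
            }
            rcu_read_unlock();
            return found;
    }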
@@ -1146,9 +1146,9 @@ Anche i lettori devono stare attenti: alcuni processori potrebbero leggere attraverso il puntatore ``next`` il contenuto dell'elemento successivo troppo presto, ma non accorgersi che il contenuto caricato è sbagliato quando il puntatore ``next`` viene modificato alle loro spalle. Ancora una volta -c'è una funzione che viene in vostro aiuto :c:func:`list_for_each_entry_rcu()` +c'è una funzione che viene in vostro aiuto list_for_each_entry_rcu() (``include/linux/list.h``). Ovviamente, gli scrittori possono usare -:c:func:`list_for_each_entry()` dato che non ci possono essere due scrittori +list_for_each_entry() dato che non ci possono essere due scrittori in contemporanea. Il nostro ultimo dilemma è il seguente: quando possiamo realmente distruggere @@ -1156,15 +1156,15 @@ l'elemento rimosso? Ricordate, un lettore potrebbe aver avuto accesso a questo elemento proprio ora: se eliminiamo questo elemento ed il puntatore ``next`` cambia, il lettore salterà direttamente nella spazzatura e scoppierà. Dobbiamo aspettare finché tutti i lettori che stanno attraversando la lista abbiano -finito. Utilizziamo :c:func:`call_rcu()` per registrare una funzione di +finito. Utilizziamo call_rcu() per registrare una funzione di richiamo che distrugga l'oggetto quando tutti i lettori correnti hanno terminato. In alternativa, potrebbe essere usata la funzione -:c:func:`synchronize_rcu()` che blocca l'esecuzione finché tutti i lettori +synchronize_rcu() che blocca l'esecuzione finché tutti i lettori non terminano di ispezionare la lista. Ma come fa l'RCU a sapere quando i lettori hanno finito? Il meccanismo è il seguente: innanzi tutto i lettori accedono alla lista solo fra la coppia -:c:func:`rcu_read_lock()`/:c:func:`rcu_read_unlock()` che disabilita la +rcu_read_lock()/rcu_read_unlock() che disabilita la prelazione così che i lettori non vengano sospesi mentre stanno leggendo la lista. @@ -1253,12 +1253,12 @@ codice RCU è un po' più ottimizzato di così, ma questa è l'idea di fondo. } Da notare che i lettori modificano il campo popularity nella funzione -:c:func:`__cache_find()`, e ora non trattengono alcun *lock*. Una soluzione +__cache_find(), e ora non trattengono alcun *lock*. Una soluzione potrebbe essere quella di rendere la variabile ``atomic_t``, ma per l'uso che ne abbiamo fatto qui, non ci interessano queste corse critiche perché un risultato approssimativo è comunque accettabile, quindi non l'ho cambiato. -Il risultato è che la funzione :c:func:`cache_find()` non ha bisogno di alcuna +Il risultato è che la funzione cache_find() non ha bisogno di alcuna sincronizzazione con le altre funzioni, quindi è veloce su un sistema multi-processore tanto quanto lo sarebbe su un sistema mono-processore. @@ -1271,9 +1271,9 @@ riferimenti. Ora, dato che il '*lock* di lettura' di un RCU non fa altro che disabilitare la prelazione, un chiamante che ha sempre la prelazione disabilitata fra le -chiamate :c:func:`cache_find()` e :c:func:`object_put()` non necessita +chiamate cache_find() e object_put() non necessita di incrementare e decrementare il contatore di riferimenti. Potremmo -esporre la funzione :c:func:`__cache_find()` dichiarandola non-static, +esporre la funzione __cache_find() dichiarandola non-static, e quel chiamante potrebbe usare direttamente questa funzione. Il beneficio qui sta nel fatto che il contatore di riferimenti non @@ -1293,10 +1293,10 @@ singolo contatore. Facile e pulito.
Se questo dovesse essere troppo lento (solitamente non lo è, ma se avete dimostrato che lo è davvero), potreste usare un contatore per ogni processore e quindi non sarebbe più necessaria la mutua esclusione. Vedere -:c:func:`DEFINE_PER_CPU()`, :c:func:`get_cpu_var()` e :c:func:`put_cpu_var()` +DEFINE_PER_CPU(), get_cpu_var() e put_cpu_var() (``include/linux/percpu.h``). -Il tipo di dato ``local_t``, la funzione :c:func:`cpu_local_inc()` e tutte +Il tipo di dato ``local_t``, la funzione cpu_local_inc() e tutte le altre funzioni associate, sono di particolare utilità per semplici contatori per-processore; su alcune architetture sono anche più efficienti (``include/asm/local.h``). @@ -1324,11 +1324,11 @@ da un'interruzione software. Il gestore d'interruzione non utilizza alcun enable_irq(irq); spin_unlock(&lock); -La funzione :c:func:`disable_irq()` impedisce al gestore d'interruzioni +La funzione disable_irq() impedisce al gestore d'interruzioni d'essere eseguito (e aspetta che finisca nel caso fosse in esecuzione su un altro processore). Lo spinlock, invece, previene accessi simultanei. Naturalmente, questo è più lento della semplice chiamata -:c:func:`spin_lock_irq()`, quindi ha senso solo se questo genere di accesso +spin_lock_irq(), quindi ha senso solo se questo genere di accesso è estremamente raro. .. _`it_sleeping-things`: Quali funzioni possono essere chiamate in modo sicuro dalle interruzioni? ========================================================================= -Molte funzioni del kernel dormono (in sostanza, chiamano ``schedule()``) +Molte funzioni del kernel dormono (in sostanza, chiamano schedule()) direttamente od indirettamente: non potete chiamarle se trattenete uno spinlock o avete la prelazione disabilitata, mai. Questo significa che dovete necessariamente essere nel contesto utente: chiamarle da un @@ -1354,23 +1354,23 @@ dormire. - Accessi allo spazio utente: - - :c:func:`copy_from_user()` + - copy_from_user() - - :c:func:`copy_to_user()` + - copy_to_user() - - :c:func:`get_user()` + - get_user() - - :c:func:`put_user()` + - put_user() -- :c:func:`kmalloc(GFP_KERNEL) <kmalloc>` +- kmalloc(GFP_KERNEL) -- :c:func:`mutex_lock_interruptible()` and - :c:func:`mutex_lock()` +- mutex_lock_interruptible() and + mutex_lock() - C'è anche :c:func:`mutex_trylock()` che però non dorme. + C'è anche mutex_trylock() che però non dorme. Comunque, non deve essere usata in un contesto d'interruzione dato che la sua implementazione non è sicura in quel contesto. - Anche :c:func:`mutex_unlock()` non dorme mai. Non può comunque essere + Anche mutex_unlock() non dorme mai. Non può comunque essere usata in un contesto d'interruzione perché un mutex deve essere rilasciato dallo stesso processo che l'ha acquisito. @@ -1380,11 +1380,11 @@ Alcune funzioni che non dormono Alcune funzioni possono essere chiamate tranquillamente da qualsiasi contesto, o trattenendo un qualsiasi *lock*.
-- :c:func:`printk()` +- printk() -- :c:func:`kfree()` +- kfree() -- :c:func:`add_timer()` e :c:func:`del_timer()` +- add_timer() e del_timer() Riferimento per l'API dei Mutex =============================== @@ -1444,14 +1444,14 @@ prelazione bh Bottom Half: per ragioni storiche, le funzioni che contengono '_bh' nel loro nome ora si riferiscono a qualsiasi interruzione software; per esempio, - :c:func:`spin_lock_bh()` blocca qualsiasi interuzione software sul processore + spin_lock_bh() blocca qualsiasi interruzione software sul processore corrente. I *Bottom Halves* sono deprecati, e probabilmente verranno sostituiti dai tasklet. In un dato momento potrà esserci solo un *bottom half* in esecuzione. contesto d'interruzione Non è il contesto utente: qui si processano le interruzioni hardware e - software. La macro :c:func:`in_interrupt()` ritorna vero. + software. La macro in_interrupt() ritorna vero. contesto utente Il kernel che esegue qualcosa per conto di un particolare processo (per @@ -1461,12 +1461,12 @@ contesto utente che hardware. interruzione hardware - Richiesta di interruzione hardware. :c:func:`in_irq()` ritorna vero in un + Richiesta di interruzione hardware. in_irq() ritorna vero in un gestore d'interruzioni hardware. interruzione software / softirq - Gestore di interruzioni software: :c:func:`in_irq()` ritorna falso; - :c:func:`in_softirq()` ritorna vero. I tasklet e le softirq sono entrambi + Gestore di interruzioni software: in_irq() ritorna falso; + in_softirq() ritorna vero. I tasklet e le softirq sono entrambi considerati 'interruzioni software'. In soldoni, un softirq è una delle 32 interruzioni software che possono diff --git a/Documentation/translations/it_IT/process/2.Process.rst b/Documentation/translations/it_IT/process/2.Process.rst index 9af4d01617c4..30dc172f06b0 100644 --- a/Documentation/translations/it_IT/process/2.Process.rst +++ b/Documentation/translations/it_IT/process/2.Process.rst @@ -23,18 +23,18 @@ ogni due o tre mesi viene effettuato un rilascio importante del kernel. I rilasci più recenti sono stati: ====== ================= - 4.11 Aprile 30, 2017 - 4.12 Luglio 2, 2017 - 4.13 Settembre 3, 2017 - 4.14 Novembre 12, 2017 - 4.15 Gennaio 28, 2018 - 4.16 Aprile 1, 2018 + 5.0 3 marzo, 2019 + 5.1 5 maggio, 2019 + 5.2 7 luglio, 2019 + 5.3 15 settembre, 2019 + 5.4 24 novembre, 2019 + 5.5 6 gennaio, 2020 ====== ================= -Ciascun rilascio 4.x è un importante rilascio del kernel con nuove +Ciascun rilascio 5.x è un importante rilascio del kernel con nuove funzionalità, modifiche interne dell'API, e molto altro. Un tipico -rilascio 4.x contiene quasi 13,000 gruppi di modifiche con ulteriori -modifiche a parecchie migliaia di linee di codice. La 4.x. è pertanto la +rilascio contiene quasi 13,000 gruppi di modifiche con ulteriori +modifiche a parecchie migliaia di linee di codice. La 5.x è pertanto la linea di confine nello sviluppo del kernel Linux; il kernel utilizza un sistema di sviluppo continuo che integra costantemente nuove importanti modifiche. @@ -55,8 +55,8 @@ verrà descritto dettagliatamente più avanti). La finestra di inclusione resta attiva approssimativamente per due settimane. Al termine di questo periodo, Linus Torvalds dichiarerà che la finestra è chiusa e rilascerà il primo degli "rc" del kernel. -Per il kernel che è destinato ad essere 2.6.40, per esempio, il rilascio -che emerge al termine della finestra d'inclusione si chiamerà 2.6.40-rc1.
+Per il kernel che è destinato ad essere 5.6, per esempio, il rilascio +che emerge al termine della finestra d'inclusione si chiamerà 5.6-rc1. Questo rilascio indica che il momento di aggiungere nuovi componenti è passato, e che è iniziato il periodo di stabilizzazione del prossimo kernel. @@ -76,22 +76,23 @@ Mentre le correzioni si aprono la loro strada all'interno del ramo principale, il ritmo delle modifiche rallenta col tempo. Linus rilascia un nuovo kernel -rc circa una volta alla settimana; e ne usciranno circa 6 o 9 prima che il kernel venga considerato sufficientemente stabile e che il rilascio -finale 2.6.x venga fatto. A quel punto tutto il processo ricomincerà. +finale venga fatto. A quel punto tutto il processo ricomincerà. -Esempio: ecco com'è andato il ciclo di sviluppo della versione 4.16 +Esempio: ecco com'è andato il ciclo di sviluppo della versione 5.4 (tutte le date si collocano nel 2019) ============== ======================================= - Gennaio 28 4.15 rilascio stabile - Febbraio 11 4.16-rc1, finestra di inclusione chiusa - Febbraio 18 4.16-rc2 - Febbraio 25 4.16-rc3 - Marzo 4 4.16-rc4 - Marzo 11 4.16-rc5 - Marzo 18 4.16-rc6 - Marzo 25 4.16-rc7 - Aprile 1 4.17 rilascio stabile + 15 settembre 5.3 rilascio stabile + 30 settembre 5.4-rc1, finestra di inclusione chiusa + 6 ottobre 5.4-rc2 + 13 ottobre 5.4-rc3 + 20 ottobre 5.4-rc4 + 27 ottobre 5.4-rc5 + 3 novembre 5.4-rc6 + 10 novembre 5.4-rc7 + 17 novembre 5.4-rc8 + 24 novembre 5.4 rilascio stabile ============== ======================================= In che modo gli sviluppatori decidono quando chiudere il ciclo di sviluppo e @@ -108,43 +109,44 @@ tipo di perfezione difficilmente viene raggiunta; esistono troppe variabili in un progetto di questa portata. Arriva un punto dove ritardare il rilascio finale peggiora la situazione; la quantità di modifiche in attesa della prossima finestra di inclusione crescerà enormemente, creando ancor più -regressioni al giro successivo. Quindi molti kernel 4.x escono con una +regressioni al giro successivo. Quindi molti kernel 5.x escono con una manciata di regressioni delle quali, si spera, nessuna è grave. Una volta che un rilascio stabile è fatto, il suo costante mantenimento è affidato alla "squadra stabilità", attualmente composta da Greg Kroah-Hartman. Questa squadra rilascia occasionalmente degli aggiornamenti relativi al -rilascio stabile usando la numerazione 4.x.y. Per essere presa in +rilascio stabile usando la numerazione 5.x.y. Per essere presa in considerazione per un rilascio d'aggiornamento, una modifica deve: (1) correggere un baco importante (2) essere già inserita nel ramo principale per il prossimo sviluppo del kernel. Solitamente, passato il loro rilascio iniziale, i kernel ricevono aggiornamenti per più di un ciclo di sviluppo. -Quindi, per esempio, la storia del kernel 4.13 appare così: +Quindi, per esempio, la storia del kernel 5.2 appare così (anno 2019): ============== =============================== - Settembre 3 4.13 rilascio stabile - Settembre 13 4.13.1 - Settembre 20 4.13.2 - Settembre 27 4.13.3 - Ottobre 5 4.13.4 - Ottobre 12 4.13.5 + 7 luglio 5.2 rilascio stabile + 14 luglio 5.2.1 + 21 luglio 5.2.2 + 26 luglio 5.2.3 + 28 luglio 5.2.4 + 31 luglio 5.2.5 ... ... - Novembre 24 4.13.16 + 11 ottobre 5.2.21 ============== =============================== La 5.2.21 fu l'aggiornamento finale per la versione 5.2.
Alcuni kernel sono destinati ad essere kernel a "lungo termine"; questi riceveranno assistenza per un lungo periodo di tempo. Al momento in cui scriviamo, i manutentori dei kernel stabili a lungo termine sono: - ====== ====================== ========================================== - 3.16 Ben Hutchings (kernel stabile molto più a lungo termine) - 4.1 Sasha Levin - 4.4 Greg Kroah-Hartman (kernel stabile molto più a lungo termine) - 4.9 Greg Kroah-Hartman - 4.14 Greg Kroah-Hartman - ====== ====================== ========================================== + ====== ================================ ========================================== + 3.16 Ben Hutchings (kernel stabile molto più a lungo termine) + 4.4 Greg Kroah-Hartman e Sasha Levin (kernel stabile molto più a lungo termine) + 4.9 Greg Kroah-Hartman e Sasha Levin + 4.14 Greg Kroah-Hartman e Sasha Levin + 4.19 Greg Kroah-Hartman e Sasha Levin + 5.4 Greg Kroah-Hartman e Sasha Levin + ====== ================================ ========================================== Questa selezione di kernel di lungo periodo è puramente dovuta ai loro @@ -229,12 +231,13 @@ Come le modifiche finiscono nel Kernel -------------------------------------- Esiste una sola persona che può inserire le patch nel repositorio principale -del kernel: Linus Torvalds. Ma, di tutte le 9500 patch che entrarono nella -versione 2.6.38 del kernel, solo 112 (circa l'1,3%) furono scelte direttamente -da Linus in persona. Il progetto del kernel è cresciuto fino a raggiungere -una dimensione tale per cui un singolo sviluppatore non può controllare e -selezionare indipendentemente ogni modifica senza essere supportato. -La via scelta dagli sviluppatori per indirizzare tale crescita è stata quella +del kernel: Linus Torvalds. Ma, per esempio, di tutte le 9500 patch +che entrarono nella versione 2.6.38 del kernel, solo 112 (circa +l'1,3%) furono scelte direttamente da Linus in persona. Il progetto +del kernel è cresciuto fino a raggiungere una dimensione tale per cui +un singolo sviluppatore non può controllare e selezionare +indipendentemente ogni modifica senza essere supportato. La via +scelta dagli sviluppatori per indirizzare tale crescita è stata quella di utilizzare un sistema di "sottotenenti" basato sulla fiducia. Il codice base del kernel è spezzato in una serie di sottosistemi: rete, diff --git a/Documentation/translations/it_IT/process/adding-syscalls.rst b/Documentation/translations/it_IT/process/adding-syscalls.rst index c3a3439595a6..bff0a82bf127 100644 --- a/Documentation/translations/it_IT/process/adding-syscalls.rst +++ b/Documentation/translations/it_IT/process/adding-syscalls.rst @@ -39,7 +39,7 @@ vostra interfaccia. un qualche modo opaca. - Se dovete esporre solo delle informazioni sul sistema, un nuovo nodo in - sysfs (vedere ``Documentation/filesystems/sysfs.txt``) o + sysfs (vedere ``Documentation/filesystems/sysfs.rst``) o in procfs potrebbe essere sufficiente. Tuttavia, l'accesso a questi meccanismi richiede che il filesystem sia montato, il che potrebbe non essere sempre vero (per esempio, in ambienti come namespace/sandbox/chroot).
diff --git a/Documentation/translations/it_IT/process/coding-style.rst b/Documentation/translations/it_IT/process/coding-style.rst index 8725f2b9e960..6f4f85832dee 100644 --- a/Documentation/translations/it_IT/process/coding-style.rst +++ b/Documentation/translations/it_IT/process/coding-style.rst @@ -313,7 +313,7 @@ che conta gli utenti attivi, dovreste chiamarla ``count_active_users()`` o qualcosa di simile, **non** dovreste chiamarla ``cntusr()``. Codificare il tipo di funzione nel suo nome (quella cosa chiamata notazione -ungherese) fa male al cervello - il compilatore conosce comunque il tipo e +ungherese) è stupido - il compilatore conosce comunque il tipo e può verificarlo, e inoltre confonde i programmatori. Non c'è da sorprendersi che MicroSoft faccia programmi bacati. @@ -825,8 +825,8 @@ linguaggio assembler. Agli sviluppatori del kernel piace essere visti come dotti. Tenete un occhio di riguardo per l'ortografia e farete una bella figura. In inglese, evitate -l'uso di parole mozzate come ``dont``: usate ``do not`` oppure ``don't``. -Scrivete messaggi concisi, chiari, e inequivocabili. +l'uso incorretto di abbreviazioni come ``dont``: usate ``do not`` oppure +``don't``. Scrivete messaggi concisi, chiari, e inequivocabili. I messaggi del kernel non devono terminare con un punto fermo. diff --git a/Documentation/translations/it_IT/process/deprecated.rst b/Documentation/translations/it_IT/process/deprecated.rst index 776f26732a94..e108eaf82cf6 100644 --- a/Documentation/translations/it_IT/process/deprecated.rst +++ b/Documentation/translations/it_IT/process/deprecated.rst @@ -34,6 +34,33 @@ interfaccia come 'vecchia', questa non è una soluzione completa. L'interfaccia deve essere rimossa dal kernel, o aggiunta a questo documento per scoraggiarne l'uso. +BUG() e BUG_ON() +---------------- +Al loro posto usate WARN() e WARN_ON() per gestire le +condizioni "impossibili" e gestitele come se fosse possibile farlo. +Nonostante le funzioni della famiglia BUG() siano state progettate +per asserire "situazioni impossibili" e interrompere in sicurezza un +thread del kernel, queste si sono rivelate essere troppo rischiose +(per esempio, in quale ordine rilasciare i *lock*? Ci sono stati che +sono stati ripristinati?). Molto spesso l'uso di BUG() +destabilizza il sistema o lo corrompe del tutto, il che rende +impossibile un'attività di debug o anche solo leggere un rapporto +circa l'errore. Linus ha un'opinione molto critica al riguardo: +`email 1 +<https://lore.kernel.org/lkml/CA+55aFy6jNLsywVYdGp83AMrXBo_P-pkjkphPGrO=82SPKCpLQ@mail.gmail.com/>`_, +`email 2 +<https://lore.kernel.org/lkml/CAHk-=whDHsbK3HTOpTF=ue_o04onRwTEaK_ZoJp_fjbqq4+=Jw@mail.gmail.com/>`_ + +Tenete presente che la famiglia di funzioni WARN() dovrebbe essere +usata solo per situazioni che si suppone siano "impossibili". Se +volete avvisare gli utenti riguardo a qualcosa di possibile anche se +indesiderato, usate le funzioni della famiglia pr_warn(). Chi +amministra il sistema potrebbe aver attivato l'opzione sysctl +*panic_on_warn* per essere sicuri che il sistema smetta di funzionare +in caso si verifichino delle condizioni "inaspettate".
(per esempio,
+date un'occhiata a questo `commit
+<https://git.kernel.org/linus/d4689846881d160a4d12a514e991a740bcb5d65a>`_)
+
 Calcoli codificati negli argomenti di un allocatore
 ----------------------------------------------------
 Il calcolo dinamico delle dimensioni (specialmente le moltiplicazioni) non
@@ -68,52 +95,81 @@ Invece, usate la seguente funzione::

 	header = kzalloc(struct_size(header, item, count), GFP_KERNEL);

-Per maggiori dettagli fate riferimento a :c:func:`array_size`,
-:c:func:`array3_size`, e :c:func:`struct_size`, così come la famiglia di
-funzioni :c:func:`check_add_overflow` e :c:func:`check_mul_overflow`.
+Per maggiori dettagli fate riferimento a array_size(),
+array3_size(), e struct_size(), così come la famiglia di
+funzioni check_add_overflow() e check_mul_overflow().

 simple_strtol(), simple_strtoll(), simple_strtoul(), simple_strtoull()
 ----------------------------------------------------------------------
-Le funzioni :c:func:`simple_strtol`, :c:func:`simple_strtoll`,
-:c:func:`simple_strtoul`, e :c:func:`simple_strtoull` ignorano volutamente
+Le funzioni simple_strtol(), simple_strtoll(),
+simple_strtoul(), e simple_strtoull() ignorano volutamente
 i possibili overflow, e questo può portare il chiamante a generare risultati
-inaspettati. Le rispettive funzioni :c:func:`kstrtol`, :c:func:`kstrtoll`,
-:c:func:`kstrtoul`, e :c:func:`kstrtoull` sono da considerarsi le corrette
+inaspettati. Le rispettive funzioni kstrtol(), kstrtoll(),
+kstrtoul(), e kstrtoull() sono da considerarsi le corrette
 sostitute; tuttavia va notato che queste richiedono che la stringa sia
 terminata con il carattere NUL o quello di nuova riga.

 strcpy()
 --------
-La funzione :c:func:`strcpy` non fa controlli agli estremi del buffer
+La funzione strcpy() non fa controlli agli estremi del buffer
 di destinazione. Questo può portare ad un overflow oltre i limiti del
 buffer e generare svariati tipi di malfunzionamenti. Nonostante l'opzione
 `CONFIG_FORTIFY_SOURCE=y` e svariate opzioni del compilatore aiutano a
 ridurne il rischio, non c'è alcuna buona ragione per continuare ad usare
-questa funzione. La versione sicura da usare è :c:func:`strscpy`.
+questa funzione. La versione sicura da usare è strscpy().

 strncpy() su stringhe terminate con NUL
 ---------------------------------------
-L'utilizzo di :c:func:`strncpy` non fornisce alcuna garanzia sul fatto che
+L'utilizzo di strncpy() non fornisce alcuna garanzia sul fatto che
 il buffer di destinazione verrà terminato con il carattere NUL. Questo
 potrebbe portare a diversi overflow di lettura o altri malfunzionamenti
 causati, appunto, dalla mancanza del terminatore. Questa estende la
 terminazione nel buffer di destinazione quando la stringa d'origine è più
 corta; questo potrebbe portare ad una penalizzazione delle prestazioni per
 chi usa solo stringhe terminate. La versione sicura da usare è
-:c:func:`strscpy`. (chi usa :c:func:`strscpy` e necessita di estendere la
-terminazione con NUL deve aggiungere una chiamata a :c:func:`memset`)
+strscpy(). (chi usa strscpy() e necessita di estendere la
+terminazione con NUL deve aggiungere una chiamata a memset())

-Se il chiamate no usa stringhe terminate con NUL, allore strncpy()()
+Se il chiamante non usa stringhe terminate con NUL, allora strncpy()
 può continuare ad essere usata, ma i buffer di destinazione devono essere
 marchiati con l'attributo `__nonstring <https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html>`_
 per evitare avvisi durante la compilazione.
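A hedged sketch tying the guidance above together - WARN_ON() for a recoverable "impossible" condition, struct_size() for overflow-checked size math, and strscpy() for a bounded, always-terminated copy. The structure layout and all names are hypothetical, not taken from the translated document::

	#include <linux/bug.h>
	#include <linux/overflow.h>
	#include <linux/printk.h>
	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/types.h>

	struct item {
		u32 val;
	};

	struct header {
		char name[32];
		size_t count;
		struct item item[];	/* flexible array member */
	};

	static struct header *alloc_header(size_t count, const char *src)
	{
		struct header *header;

		/* recoverable handling of an "impossible" case: warn, don't BUG() */
		if (WARN_ON(count == 0))
			return NULL;

		/* struct_size() saturates instead of wrapping on multiply overflow,
		 * so an attacker-controlled count cannot shrink the allocation */
		header = kzalloc(struct_size(header, item, count), GFP_KERNEL);
		if (!header)
			return NULL;

		header->count = count;

		/* strscpy() always NUL-terminates; it returns -E2BIG on truncation */
		if (strscpy(header->name, src, sizeof(header->name)) < 0)
			pr_warn("header name truncated\n");

		return header;
	}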
strlcpy()
---------
-La funzione :c:func:`strlcpy`, per prima cosa, legge interamente il buffer di
+La funzione strlcpy(), per prima cosa, legge interamente il buffer di
 origine, magari leggendo più di quanto verrà effettivamente copiato. Questo è
 inefficiente e può portare a overflow di lettura quando la stringa non è
-terminata con NUL. La versione sicura da usare è :c:func:`strscpy`.
+terminata con NUL. La versione sicura da usare è strscpy().
+
+Segnaposto %p nella stringa di formato
+--------------------------------------
+
+Tradizionalmente, l'uso del segnaposto "%p" nella stringa di formato
+espone un indirizzo di memoria in dmesg, proc, sysfs, eccetera. Per
+evitare che questi indirizzi vengano sfruttati da malintenzionati,
+tutti gli usi di "%p" nel kernel rappresentano l'hash dell'indirizzo,
+rendendolo di fatto inutilizzabile. Nuovi usi di "%p" non dovrebbero
+essere aggiunti al kernel. Per una rappresentazione testuale di un
+indirizzo usate "%pS", l'output è migliore perché mostrerà il nome del
+simbolo. Per tutto il resto, semplicemente non usate "%p".
+
+Parafrasando la `guida
+<https://lore.kernel.org/lkml/CA+55aFwQEd_d40g4mUCSsVRZzrFPUJt74vc6PPpb675hYNXcKw@mail.gmail.com/>`_
+di Linus:
+
+- Se il valore hash di "%p" è inutile, chiediti se il puntatore stesso
+  è importante. Forse dovrebbe essere rimosso del tutto?
+- Se credi davvero che il vero valore del puntatore sia importante,
+  perché alcuni stati del sistema o i livelli di privilegi di un
+  utente sono considerati "speciali"? Se pensi di poterlo giustificare
+  (in un commento e nel messaggio del commit) abbastanza bene da
+  affrontare il giudizio di Linus, allora forse potrai usare "%px",
+  assicurandoti anche di averne il permesso.
+
+Infine, sappi che un cambio in favore di "%p" senza hash `non verrà
+accettato
+<https://lore.kernel.org/lkml/CA+55aFwieC1-nAs+NFq9RTwaR8ef9hWa4MjNBWL41F-8wM49eA@mail.gmail.com/>`_.

 Vettori a dimensione variabile (VLA)
 ------------------------------------
@@ -127,3 +183,47 @@ Questo può portare a dei malfunzionamenti, potrebbe sovrascrivere
 dati importanti alla fine dello stack (quando il kernel è compilato senza
 `CONFIG_THREAD_INFO_IN_TASK=y`), o sovrascrivere un pezzo di memoria adiacente
 allo stack (quando il kernel è compilato senza `CONFIG_VMAP_STACK=y`).
+
+Salto implicito nell'istruzione switch-case
+-------------------------------------------
+
+Il linguaggio C permette ai casi di un'istruzione `switch` di saltare al
+prossimo caso quando l'istruzione "break" viene omessa alla fine del caso
+corrente. Tuttavia questo rende il codice ambiguo perché non è sempre ovvio se
+l'istruzione "break" viene omessa intenzionalmente o è un baco. Per esempio,
+osservando il seguente pezzo di codice non è chiaro se lo stato
+`STATE_ONE` è stato progettato apposta per eseguire anche `STATE_TWO`::
+
+	switch (value) {
+	case STATE_ONE:
+		do_something();
+	case STATE_TWO:
+		do_other();
+		break;
+	default:
+		WARN("unknown state");
+	}
+
+Dato che c'è stata una lunga lista di problemi `dovuti alla mancanza dell'istruzione
+"break" <https://cwe.mitre.org/data/definitions/484.html>`_, oggigiorno non
+permettiamo più che vi sia un "salto implicito" (*fall-through*). Per
+identificare un salto implicito intenzionale abbiamo adottato la pseudo
+parola chiave 'fallthrough' che viene espansa nell'estensione di gcc
+`__attribute__((fallthrough))` `Statement Attributes
+<https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html>`_.
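For completeness, this is how the ambiguous switch above reads once annotated with the 'fallthrough' pseudo-keyword just described (STATE_* and the helpers are the document's own example names; note that the in-kernel WARN() takes a condition before the format string)::

	switch (value) {
	case STATE_ONE:
		do_something();
		fallthrough;	/* intentional: STATE_ONE also runs the STATE_TWO handler */
	case STATE_TWO:
		do_other();
		break;
	default:
		WARN(1, "unknown state");
		break;
	}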
+(Quando la sintassi C17/C18 `[[fallthrough]]` sarà più comunemente
+supportata dai compilatori C, analizzatori statici, e dagli IDE,
+allora potremo usare quella sintassi per la pseudo parola chiave)
+
+Ne consegue che tutti i blocchi switch/case devono finire in uno dei seguenti
+modi:
+
+* ``break;``
+* ``fallthrough;``
+* ``continue;``
+* ``goto <label>;``
+* ``return [expression];``
diff --git a/Documentation/translations/it_IT/process/email-clients.rst b/Documentation/translations/it_IT/process/email-clients.rst
index 224ab031ffd3..89abf6d325f2 100644
--- a/Documentation/translations/it_IT/process/email-clients.rst
+++ b/Documentation/translations/it_IT/process/email-clients.rst
@@ -1,12 +1,334 @@
 .. include:: ../disclaimer-ita.rst

-:Original: :ref:`Documentation/process/email-clients.rst <email_clients>`
-
-.. _it_email_clients:
+:Original: :doc:`../../../process/email-clients`
+:Translator: Alessia Mantegazza <amantegazza@vaga.pv.it>

 Informazioni sui programmi di posta elettronica per Linux
 =========================================================

-.. warning::
+Git
+---
+
+Oggigiorno, la maggior parte degli sviluppatori utilizza ``git send-email``
+al posto dei classici programmi di posta elettronica. Le pagine man sono
+abbastanza buone. Dal lato del ricevente, i manutentori utilizzano ``git am``
+per applicare le patch.
+
+Se siete dei novelli utilizzatori di ``git`` allora inviate la patch a voi
+stessi. Salvatela come testo includendo tutte le intestazioni. Poi eseguite
+il comando ``git am messaggio-formato-testo.txt`` e revisionatene il risultato
+con ``git log``. Quando tutto funziona correttamente, allora potete inviare
+la patch alla lista di discussione più appropriata.
+
+Panoramica delle opzioni
+------------------------
+
+Le patch per il kernel vengono inviate per posta elettronica, preferibilmente
+come testo integrante del messaggio. Alcuni manutentori accettano gli
+allegati, ma in questo caso gli allegati devono avere il *content-type*
+impostato come ``text/plain``. Tuttavia, generalmente gli allegati non sono
+ben apprezzati perché rendono più difficile citare porzioni di patch durante
+il processo di revisione.
+
+I programmi di posta elettronica che vengono usati per inviare le patch per il
+kernel Linux dovrebbero inviarle senza alterazioni. Per esempio, non
+dovrebbero modificare o rimuovere tabulazioni o spazi, nemmeno all'inizio o
+alla fine delle righe.
+
+Non inviate patch con ``format=flowed``. Questo potrebbe introdurre
+interruzioni di riga inaspettate e indesiderate.
+
+Non lasciate che il vostro programma di posta vada a capo automaticamente.
+Questo può corrompere le patch.
+
+I programmi di posta non dovrebbero modificare la codifica dei caratteri nel
+testo. Le patch inviate per posta elettronica dovrebbero essere codificate in
+ASCII o UTF-8.
+Se configurate il vostro programma per inviare messaggi codificati con UTF-8
+eviterete possibili problemi di codifica.
+
+I programmi di posta dovrebbero generare e mantenere le intestazioni
+"References" o "In-Reply-To:" cosicché la discussione non venga interrotta.
+
+Di solito, il copia-e-incolla (o taglia-e-incolla) non funziona con le patch
+perché le tabulazioni vengono convertite in spazi. Usare xclipboard, xclip
+e/o xcutsel potrebbe funzionare, ma è meglio che lo verifichiate o meglio
+ancora: non usate il copia-e-incolla.
+
+Non usate firme PGP/GPG nei messaggi che contengono delle patch. Questo
+impedisce il corretto funzionamento di alcuni script per leggere o applicare
+patch (questo si dovrebbe poter correggere).
+
+Prima di inviare le patch sulle liste di discussione Linux, può essere una
+buona idea quella di inviare la patch a voi stessi, salvare il messaggio
+ricevuto, e applicarlo ai sorgenti con successo.
+
+
+Alcuni suggerimenti per i programmi di posta elettronica (MUA)
+--------------------------------------------------------------
+
+Qui troverete alcuni suggerimenti per configurare i vostri MUA allo scopo
+di modificare ed inviare patch per il kernel Linux. Tuttavia, questi
+suggerimenti non sono da considerarsi come un riassunto di una configurazione
+completa.
+
+Legenda:
+
+- TUI = interfaccia utente testuale (*text-based user interface*)
+- GUI = interfaccia utente grafica (*graphical user interface*)
+
+Alpine (TUI)
+************
+
+Opzioni per la configurazione:
+
+Nella sezione :menuselection:`Sending Preferences`:
+
+- :menuselection:`Do Not Send Flowed Text` deve essere ``enabled``
+- :menuselection:`Strip Whitespace Before Sending` deve essere ``disabled``
+
+Quando state scrivendo un messaggio, il cursore dev'essere posizionato
+dove volete che la patch inizi, poi premendo :kbd:`CTRL-R` vi verrà chiesto
+di selezionare il file patch da inserire nel messaggio.
+
+Claws Mail (GUI)
+****************
+
+Funziona. Alcune persone riescono ad usarlo con successo per inviare le patch.
+
+Per inserire una patch usate :menuselection:`Messaggio-->Inserisci file`
+(:kbd:`CTRL-I`) oppure un editor esterno.
+
+Se la patch che avete inserito dev'essere modificata usando la finestra di
+scrittura di Claws, allora assicuratevi che l'"auto-interruzione" sia
+disabilitata :menuselection:`Configurazione-->Preferenze-->Composizione-->Interruzione riga`.
+
+Evolution (GUI)
+***************
+
+Alcune persone riescono ad usarlo con successo per inviare le patch.
+
+Quando state scrivendo una lettera selezionate: Preformattato
+  da :menuselection:`Formato-->Stile del paragrafo-->Preformattato`
+  (:kbd:`CTRL-7`) o dalla barra degli strumenti
+
+Poi per inserire la patch usate:
+:menuselection:`Inserisci--> File di testo...` (:kbd:`ALT-N x`)
+
+Potete anche eseguire ``diff -Nru old.c new.c | xclip``, selezionare
+:menuselection:`Preformattato`, e poi usare il tasto centrale del mouse.
+
+Kmail (GUI)
+***********
+
+Alcune persone riescono ad usarlo con successo per inviare le patch.
+
+La configurazione base che disabilita la composizione di messaggi HTML è
+corretta; non abilitatela.
+
+Quando state scrivendo un messaggio, nel menu opzioni, togliete la selezione a
+"A capo automatico". L'unico svantaggio sarà che qualsiasi altra cosa scriviate
+nel messaggio non verrà mandata a capo in automatico ma dovrete farlo voi.
+Il modo più semplice per ovviare a questo problema è quello di scrivere il
+messaggio con l'opzione abilitata e poi di salvarlo nelle bozze. Riaprendo ora
+il messaggio dalle bozze, le andate a capo saranno parte integrante del
+messaggio, per cui togliendo l'opzione "A capo automatico" non perderete nulla.
+
+Alla fine del vostro messaggio, appena prima di inserire la vostra patch,
+aggiungete il delimitatore di patch: tre trattini (``---``).
+
+Ora, dal menu :menuselection:`Messaggio`, selezionate :menuselection:`Inserisci file di testo...`
+quindi scegliete la vostra patch.
+Come soluzione aggiuntiva potreste personalizzare la vostra barra degli
+strumenti aggiungendo un'icona per :menuselection:`Inserisci file di testo...`.
+
+Allargate la finestra di scrittura abbastanza da evitare andate a capo.
+Questo perché in Kmail 1.13.5 (KDE 4.5.4), Kmail aggiunge andate a capo
+automaticamente al momento dell'invio per tutte quelle righe che graficamente,
+nella vostra finestra di composizione, si sono estese su una riga successiva.
+Disabilitare l'andata a capo automatica non è sufficiente. Dunque, se la vostra
+patch contiene delle righe molto lunghe, allora dovrete allargare la finestra
+di composizione per evitare che quelle righe vadano a capo. Vedere:
+https://bugs.kde.org/show_bug.cgi?id=174034
+
+Potete firmare gli allegati con GPG, ma per le patch si preferisce aggiungerle
+al testo del messaggio per cui non usate la firma GPG. Firmare le patch
+inserite come testo del messaggio le rende più difficili da estrarre dalla loro
+codifica a 7-bit.
+
+Se dovete assolutamente inviare delle patch come allegati invece di integrarle
+nel testo del messaggio, allora premete il tasto destro sull'allegato e
+selezionate :menuselection:`Proprietà`, e poi attivate
+:menuselection:`Suggerisci visualizzazione automatica` per far sì che
+l'allegato sia più leggibile venendo visualizzato come parte del messaggio.
+
+Per salvare le patch inviate come parte di un messaggio, selezionate il
+messaggio che la contiene, premete il tasto destro e selezionate
+:menuselection:`Salva come`. Se il messaggio fu ben preparato, allora potrete
+usarlo interamente senza alcuna modifica.
+I messaggi vengono salvati con permessi di lettura-scrittura solo per l'utente;
+nel caso in cui vogliate copiarli altrove per renderli disponibili ad altri
+gruppi o al mondo, ricordatevi di usare ``chmod`` per cambiare i permessi.
+
+Lotus Notes (GUI)
+*****************
+
+Scappate finché potete.
+
+IBM Verse (Web GUI)
+*******************
+
+Vedi il commento per Lotus Notes.
+
+Mutt (TUI)
+**********
+
+Un sacco di sviluppatori Linux usano ``mutt``, per cui deve funzionare
+abbastanza bene.
+
+Mutt non ha un proprio editor, quindi qualunque sia il vostro editor dovrete
+configurarlo per non aggiungere automaticamente le andate a capo. Molti
+editor hanno un'opzione :menuselection:`Inserisci file` che inserisce il
+contenuto di un file senza alterarlo.
+
+Per usare ``vim`` come editor per mutt::
+
+  set editor="vi"
+
+Se per inserire la patch nel messaggio usate xclip, scrivete il comando::
+
+  :set paste
+
+prima di premere il tasto centrale o shift-insert. Oppure usate il
+comando::
+
+  :r filename
+
+(a)llega funziona bene senza ``set paste``
+
+Potete generare le patch con ``git format-patch`` e usare Mutt per inviarle::
+
+  $ mutt -H 0001-some-bug-fix.patch
+
+Opzioni per la configurazione:
+
+Tutto dovrebbe funzionare già nella configurazione base.
+Tuttavia, è una buona idea quella di impostare ``send_charset``::
+
+  set send_charset="us-ascii:utf-8"
+
+Mutt è molto personalizzabile.
Qui di seguito trovate la configurazione minima
+per iniziare ad usare Mutt per inviare patch usando Gmail::
+
+  # .muttrc
+  # ================ IMAP ====================
+  set imap_user = 'yourusername@gmail.com'
+  set imap_pass = 'yourpassword'
+  set spoolfile = imaps://imap.gmail.com/INBOX
+  set folder = imaps://imap.gmail.com/
+  set record="imaps://imap.gmail.com/[Gmail]/Sent Mail"
+  set postponed="imaps://imap.gmail.com/[Gmail]/Drafts"
+  set mbox="imaps://imap.gmail.com/[Gmail]/All Mail"
+
+  # ================ SMTP ====================
+  set smtp_url = "smtp://username@smtp.gmail.com:587/"
+  set smtp_pass = $imap_pass
+  set ssl_force_tls = yes # Require encrypted connection
+
+  # ================ Composition ====================
+  set editor = `echo \$EDITOR`
+  set edit_headers = yes  # See the headers when editing
+  set charset = UTF-8     # value of $LANG; also fallback for send_charset
+  # Sender, email address, and sign-off line must match
+  unset use_domain        # because joe@localhost is just embarrassing
+  set realname = "YOUR NAME"
+  set from = "username@gmail.com"
+  set use_from = yes
+
+La documentazione di Mutt contiene molte più informazioni:
+
+  https://gitlab.com/muttmua/mutt/-/wikis/UseCases/Gmail
+
+  http://www.mutt.org/doc/manual/
+
+Pine (TUI)
+**********
+
+Pine aveva alcuni problemi con gli spazi vuoti, ma questi dovrebbero essere
+stati risolti.
+
+Se potete usate alpine (il successore di pine).
+
+Opzioni di configurazione:
+
+- Nelle versioni più recenti è necessario avere ``quell-flowed-text``
+- l'opzione ``no-strip-whitespace-before-send`` è necessaria
+
+Sylpheed (GUI)
+**************
+
+- funziona bene per aggiungere testo in linea (o usando allegati)
+- permette di utilizzare editor esterni
+- è lento su cartelle grandi
+- non farà l'autenticazione TLS SMTP su una connessione non SSL
+- ha un utile righello nella finestra di scrittura
+- la rubrica non comprende correttamente il nome da visualizzare e
+  l'indirizzo associato
+
+Thunderbird (GUI)
+*****************
+
+Thunderbird è un clone di Outlook a cui piace maciullare il testo, ma esistono
+modi per impedirglielo.
+
+- permettere l'uso di editor esterni:
+  La cosa più semplice da fare con Thunderbird e le patch è quella di usare
+  l'estensione "external editor" e di usare il vostro ``$EDITOR`` preferito per
+  leggere/includere patch nel vostro messaggio. Per farlo, scaricate ed
+  installate l'estensione e aggiungete un bottone per chiamarla rapidamente
+  usando :menuselection:`Visualizza-->Barra degli strumenti-->Personalizza...`;
+  una volta fatto potrete richiamarlo premendo sul bottone mentre siete nella
+  finestra :menuselection:`Scrivi`
+
+  Tenete presente che "external editor" richiede che il vostro editor non
+  faccia alcun fork, in altre parole, l'editor non deve ritornare prima di
+  essere stato chiuso. Potreste dover passare dei parametri aggiuntivi al
+  vostro editor oppure cambiargli la configurazione. Per esempio, usando
+  gvim dovrete aggiungere l'opzione -f ``/usr/bin/gvim -f`` (se il binario
+  si trova in ``/usr/bin``) nell'apposito campo nell'interfaccia di
+  configurazione di :menuselection:`external editor`. Se usate altri editor
+  consultate il loro manuale per sapere come configurarli.
+
+Per rendere l'editor interno un po' più sensato, fate così:
+
+- Modificate le impostazioni di Thunderbird per far sì che non usi
+  ``format=flowed``. Andate in :menuselection:`Modifica-->Preferenze-->Avanzate-->Editor di configurazione`
+  per invocare il registro delle impostazioni.
+
+- impostate ``mailnews.send_plaintext_flowed`` a ``false``
+
+- impostate ``mailnews.wraplength`` da ``72`` a ``0``
+
+- :menuselection:`Visualizza-->Corpo del messaggio come-->Testo semplice`
+
+- :menuselection:`Visualizza-->Codifica del testo-->Unicode`
+
+
+TkRat (GUI)
+***********
+
+Funziona. Usare "Inserisci file..." o un editor esterno.
+
+Gmail (Web GUI)
+***************
+
+Non funziona per inviare le patch.
+
+Il programma web Gmail converte automaticamente i tab in spazi.
+
+Allo stesso tempo aggiunge un'andata a capo ogni 78 caratteri. Comunque
+il problema della conversione fra spazi e tab può essere risolto usando
+un editor esterno.
-	TODO ancora da tradurre
+Un altro problema è che Gmail usa la codifica base64 per tutti quei messaggi
+che contengono caratteri non ASCII. Questo include cose tipo i nomi europei.
diff --git a/Documentation/translations/it_IT/process/index.rst b/Documentation/translations/it_IT/process/index.rst
index 012de0f3154a..c4c867132c88 100644
--- a/Documentation/translations/it_IT/process/index.rst
+++ b/Documentation/translations/it_IT/process/index.rst
@@ -59,6 +59,7 @@ perché non si è trovato un posto migliore.
    magic-number
    volatile-considered-harmful
    clang-format
+   ../riscv/patch-acceptance

 .. only:: subproject and html
diff --git a/Documentation/translations/it_IT/process/management-style.rst b/Documentation/translations/it_IT/process/management-style.rst
index 07e68bfb8402..c709285138a7 100644
--- a/Documentation/translations/it_IT/process/management-style.rst
+++ b/Documentation/translations/it_IT/process/management-style.rst
@@ -1,12 +1,293 @@
 .. include:: ../disclaimer-ita.rst

-:Original: :ref:`Documentation/process/management-style.rst <managementstyle>`
+:Original: :doc:`../../../process/management-style`
+:Translator: Alessia Mantegazza <amantegazza@vaga.pv.it>

-.. _it_managementstyle:
+Il modello di gestione del kernel Linux
+=======================================

-Tipo di gestione del kernel Linux
-=================================
+Questo breve documento descrive il modello di gestione del kernel Linux.
+Per certi versi, esso rispecchia il documento
+:ref:`translations/it_IT/process/coding-style.rst <it_codingstyle>`,
+ed è principalmente scritto per evitare di rispondere [#f1]_ in continuazione
+alle stesse identiche (o quasi) domande.

-.. warning::
+Il modello di gestione è qualcosa di molto personale e molto più difficile da
+qualificare rispetto a delle semplici regole di codifica, quindi questo
+documento potrebbe avere più o meno a che fare con la realtà. È cominciato
+come un gioco, ma ciò non significa che non possa essere vero.
+Lo dovrete decidere voi stessi.

-	TODO ancora da tradurre
+In ogni caso, quando si parla del "dirigente del kernel", ci si riferisce
+sempre alla persona che dirige tecnicamente, e non a coloro che
+tradizionalmente hanno un ruolo direttivo all'interno delle aziende. Se vi
+occupate di convalidare acquisti o avete una qualche idea sul budget del vostro
+gruppo, probabilmente non siete un dirigente del kernel. Quindi i suggerimenti
+qui indicati potrebbero fare al caso vostro, oppure no.
+
+Prima di tutto, suggerirei di acquistare "Le sette regole per avere successo",
+e di non leggerlo. Bruciatelo, è un grande gesto simbolico.
+
+.. [#f1] Questo documento non fa molto per rispondere alla domanda, ma rende
+         così dannatamente ovvio a chi la pone che non abbiamo la minima idea
+         di come rispondere.
+
+Comunque, partiamo:
+
+..
_it_decisions:
+
+1) Le decisioni
+---------------
+
+Tutti pensano che i dirigenti decidano, e che questo prendere decisioni
+sia importante. Più grande e dolorosa è la decisione, più importante deve
+essere il dirigente che la prende. Questo è molto profondo ed ovvio, ma non è
+del tutto vero.
+
+Il gioco consiste nell'"evitare" di dover prendere decisioni. In particolare
+se qualcuno vi chiede di "Decidere" tra (a) o (b), e vi dice che ha
+davvero bisogno di voi per questo, come dirigenti siete nei guai.
+Le persone che gestite devono conoscere i dettagli più di quanto li conosciate
+voi, quindi se vengono da voi per una decisione tecnica, siete fottuti.
+Non sarete chiaramente competenti per prendere quella decisione per loro.
+
+(Corollario: se le persone che gestite non conoscono i dettagli meglio di voi,
+anche in questo caso sarete fregati, tuttavia per altre ragioni. Ossia state
+facendo il lavoro sbagliato, e invece dovrebbero essere "loro" a gestirvi)
+
+Quindi il gioco si chiama "evitare" decisioni, almeno le più grandi e
+difficili. Prendere decisioni piccole e senza conseguenze va bene, e vi fa
+sembrare competenti in quello che state facendo, quindi quello che un dirigente
+del kernel ha bisogno di fare è trasformare le decisioni grandi e difficili
+in minuzie delle quali a nessuno importa.
+
+Ciò aiuta a capire che la differenza chiave tra una grande decisione ed una
+piccola sta nella possibilità di modificare tale decisione in seguito.
+Qualsiasi decisione importante può essere ridotta in decisioni meno importanti,
+ma dovete assicurarvi che possano essere reversibili in caso di errori
+(presenti o futuri). Improvvisamente, dovrete essere doppiamente dirigenti
+per **due** decisioni non sequenziali - quella sbagliata **e** quella giusta.
+
+E le persone vedranno tutto ciò come prova di vera capacità di comando
+(*cough* cavolata *cough*)
+
+Così la chiave per evitare le decisioni difficili diviene l'evitare
+di fare cose che non possono essere disfatte. Non infilatevi in un angolo
+dal quale non potrete sfuggire. Un topo messo all'angolo può rivelarsi
+pericoloso - un dirigente messo all'angolo è solo pietoso.
+
+**In ogni caso** dato che nessuno è stupido al punto da lasciare veramente ad
+un dirigente del kernel un'enorme responsabilità, solitamente è facile fare
+marcia indietro. Annullare una decisione è molto facile: semplicemente dite a
+tutti che siete stati degli scemi incompetenti, dite che siete dispiaciuti, ed
+annullate tutto l'inutile lavoro sul quale gli altri hanno lavorato nell'ultimo
+anno. Improvvisamente la decisione che avevate preso un anno fa non era poi
+così grossa, dato che può essere facilmente annullata.
+
+È emerso che alcune persone hanno dei problemi con questo tipo di approccio,
+questo per due ragioni:
+
+  - ammettere di essere degli idioti è più difficile di quanto sembri. A tutti
+    noi piace mantenere le apparenze, ed uscire allo scoperto in pubblico per
+    ammettere che ci si è sbagliati è qualcosa di davvero impegnativo.
+  - avere qualcuno che ti dice che ciò su cui hai lavorato nell'ultimo anno
+    non era del tutto valido, può rivelarsi difficile anche per un povero ed
+    umile ingegnere, e mentre il **lavoro** vero era abbastanza facile da
+    cancellare, dall'altro canto potreste aver irrimediabilmente perso la
+    fiducia di quell'ingegnere. E ricordate che l'"irrevocabile" era quello
+    che avevamo cercato di evitare fin dall'inizio, e la vostra decisione
+    ha finito per esserlo.
+
+Fortunatamente, entrambe queste ragioni possono essere mitigate semplicemente
+ammettendo fin dal principio che non avete un cavolo di idea, dicendo
+agli altri in anticipo che la vostra decisione è puramente ipotetica, e che
+potrebbe essere sbagliata. Dovreste sempre riservarvi il diritto di cambiare
+la vostra opinione, e rendere gli altri ben **consapevoli** di ciò.
+Ed è molto più facile ammettere di essere stupidi quando non avete **ancora**
+fatto quella cosa stupida.
+
+Poi, quando è realmente emersa la vostra stupidità, le persone semplicemente
+roteranno gli occhi e diranno "Uffa, no, ancora".
+
+Questa ammissione preventiva di incompetenza potrebbe anche portare le persone
+che stanno facendo il vero lavoro, a pensarci due volte. Dopo tutto, se
+**loro** non sono certi se sia una buona idea, voi, sicuro come la morte,
+non dovreste incoraggiarli promettendogli che ciò su cui stanno lavorando
+verrà incluso. Fate sì che ci pensino due volte prima che si imbarchino in un
+grosso lavoro.
+
+Ricordate: loro devono sapere più cose sui dettagli rispetto a voi, e
+solitamente pensano di avere già la risposta a tutto. La miglior cosa che
+potete fare in qualità di dirigente è di non instillare troppa fiducia, ma
+invece fornire una salutare dose di pensiero critico su quanto stanno facendo.
+
+Comunque, un altro modo di evitare una decisione è quello di lamentarsi
+malinconicamente dicendo: "non possiamo farli entrambi e basta?" e con uno
+sguardo pietoso. Fidatevi, funziona. Se non è chiaro quale sia il miglior
+approccio, lo scopriranno. La risposta potrebbe essere data dal fatto che
+entrambi i gruppi di lavoro diventano frustrati al punto di rinunciarvi.
+
+Questo può suonare come un fallimento, ma di solito questo è un segno che
+c'era qualcosa che non andava in entrambi i progetti, e il motivo per
+il quale le persone coinvolte non abbiano potuto decidere era che entrambi
+sbagliavano. Voi ne uscirete freschi come una rosa, e avrete evitato un'altra
+decisione con la quale avreste potuto fregarvi.
+
+
+2) Le persone
+-------------
+
+Ci sono molte persone stupide, ed essere un dirigente significa che dovrete
+scendere a patti con questo, e molto più importante, che **loro** devono avere
+a che fare con **voi**.
+
+Ne emerge che mentre è facile annullare degli errori tecnici, non è invece
+così facile rimuovere i disordini della personalità. Dovrete semplicemente
+convivere con i loro, ed i vostri, problemi.
+
+Comunque, al fine di prepararvi in qualità di dirigenti del kernel, è meglio
+ricordare di non abbattere alcun ponte, bombardare alcun paesano innocente,
+o escludere troppi sviluppatori kernel. Ne emerge che escludere le persone
+è piuttosto facile, mentre includerle nuovamente è difficile. Così
+"l'esclusione" immediatamente cade sotto il titolo di "non reversibile", e
+diviene un no-no secondo la sezione :ref:`it_decisions`.
+
+Esistono alcune semplici regole qui:
+
+  (1) non chiamate le persone teste di c*** (almeno, non in pubblico)
+  (2) imparate a scusarvi quando dimenticate la regola (1)
+
+Il problema del punto numero 1 è che è molto facile da violare, dato che
+è possibile dire "sei una testa di c***" in milioni di modi differenti [#f2]_,
+a volte senza nemmeno pensarci, e praticamente sempre con la calda convinzione
+di essere nel giusto.
+
+E più convinti sarete che avete ragione (e diciamolo, potete chiamare
+praticamente **tutti** teste di c***, e spesso **sarete** nel giusto), più
+difficile sarà scusarvi successivamente.
+
+Per risolvere questo problema, avete due possibilità:
+
+  - diventare davvero bravi nello scusarsi
+  - essere amabili così che nessuno finirà col sentirsi preso di mira. Siate
+    creativi abbastanza, e potrebbero esserne divertiti.
+
+L'opzione dell'essere immancabilmente educati non esiste proprio. Nessuno
+si fiderà di qualcuno che chiaramente sta nascondendo il suo vero carattere.
+
+.. [#f2] Paul Simon cantava: "50 modi per lasciare il vostro amante", perché,
+         molto francamente, "Un milione di modi per dire ad uno sviluppatore
+         Testa di c***" non avrebbe funzionato. Ma sono sicuro che ci abbia
+         pensato.
+
+
+3) Le persone II - quelle buone
+-------------------------------
+
+Mentre emerge che la maggior parte delle persone sono stupide, il corollario
+a questo è il triste fatto che anche voi siete fra queste, e che mentre
+possiamo tutti crogiolarci nella sicurezza di essere migliori della media
+delle persone (diciamocelo, nessuno crede di essere nella media o sotto di
+essa), dovremmo anche ammettere che non siamo il "coltello più affilato" del
+circondario, e che ci saranno altre persone che sono meno stupide di quanto
+lo siete voi.
+
+Molti reagiscono male davanti alle persone intelligenti. Altri le usano a
+proprio vantaggio.
+
+Assicuratevi che voi, in qualità di manutentori del kernel, siate nel secondo
+gruppo. Inchinatevi dinanzi a loro perché saranno le persone che vi renderanno
+il lavoro più facile. In particolare, prenderanno le decisioni per voi, che è
+l'oggetto di questo gioco.
+
+Quindi quando trovate qualcuno più sveglio di voi, prendetevela comoda.
+Le vostre responsabilità dirigenziali si ridurranno in gran parte nel dire
+"Sembra una buona idea - Vai", oppure "Sembra buono, ma invece circa questo e
+quello?". La seconda versione in particolare è un gran modo per imparare
+qualcosa di nuovo circa "questo e quello" o di sembrare **extra** dirigenziali
+sottolineando qualcosa alla quale i più svegli non avevano pensato. In
+entrambi i casi, vincete.
+
+Una cosa alla quale dovete fare attenzione è che l'essere grandi in qualcosa
+non si traduce automaticamente nell'essere grandi anche in altre cose. Quindi
+dovreste dare una spintarella alle persone in una specifica direzione, ma
+diciamocelo, potrebbero essere bravi in ciò che fanno e far schifo in tutto
+il resto. La buona notizia è che le persone tendono a gravitare attorno a ciò
+in cui sono bravi, quindi non state facendo nulla di irreversibile quando li
+spingete verso una certa direzione, solo non spingete troppo.
+
+
+4) Addossare le colpe
+---------------------
+
+Le cose andranno male, e le persone vogliono qualcuno da incolpare. Sarete voi.
+
+Non è poi così difficile accettare la colpa, specialmente se le persone
+riescono a capire che non era **tutta** colpa vostra. Il che ci porta
+sulla miglior strada per assumersi la colpa: fatelo per qualcun altro.
+Vi sentirete bene nell'assumervi la responsabilità, e loro si sentiranno
+bene nel non essere incolpati, e coloro che hanno perso i loro 36GB di
+pornografia a causa della vostra incompetenza ammetteranno a malincuore che
+almeno non avete cercato di fare il furbetto.
+
+Successivamente fate in modo che gli sviluppatori che in realtà hanno fallito
+(se riuscite a trovarli) sappiano **in privato** che sono "fottuti".
+Questo non per fargli sapere che la prossima volta possono evitarselo ma per
+fargli capire che sono in debito. E, forse cosa più importante, sono loro che
+devono sistemare la cosa.
Perché, ammettiamolo, di sicuro non sarete voi a
+farlo.
+
+Assumersi la colpa è anche ciò che vi rende dirigenti in prima battuta.
+È parte di ciò che spinge gli altri a fidarsi di voi, e vi garantisce
+la gloria potenziale, perché siete gli unici a dire "Ho fatto una cavolata".
+E se avete seguito le regole precedenti, sarete decisamente bravi nel dirlo.
+
+
+5) Le cose da evitare
+---------------------
+
+Esiste una cosa che le persone odiano più che essere chiamate "teste di c****",
+ed è essere chiamate "teste di c****" con fare da bigotto. Se per il primo
+caso potrete comunque scusarvi, per il secondo non ve ne verrà data nemmeno
+l'opportunità. Probabilmente smetteranno di ascoltarvi anche se tutto sommato
+state svolgendo un buon lavoro.
+
+Tutti crediamo di essere migliori degli altri, il che significa che quando
+qualcuno inizia a darsi delle arie, ci dà **davvero** fastidio. Potreste anche
+essere moralmente ed intellettualmente superiori a tutti quelli attorno a voi,
+ma non cercate di renderlo ovvio per gli altri a meno che non **vogliate**
+veramente far arrabbiare qualcuno [#f3]_.
+
+Allo stesso modo evitate di essere troppo gentili e pacati. Le buone maniere
+facilmente finiscono per strabordare e nascondere i problemi, e come si usa
+dire, "su internet nessuno può sentire la vostra pacatezza". Usate argomenti
+diretti per farvi capire, non potete sperare che la gente capisca in altro
+modo.
+
+Un po' di umorismo può aiutare a smorzare sia la franchezza che la moralità.
+Andare oltre i limiti al punto d'essere ridicolo può portare dei punti a casa
+senza renderlo spiacevole per i riceventi, i quali penseranno che stavate
+facendo gli scemi. Può anche aiutare a lasciare andare quei blocchi mentali
+che abbiamo nei confronti delle critiche.
+
+.. [#f3] Suggerimento: i forum di discussione su internet, che non sono
+   collegati col vostro lavoro, sono ottimi modi per sfogare la frustrazione
+   verso altre persone. Di tanto in tanto scrivete messaggi offensivi col ghigno
+   in faccia per infiammare qualche discussione: vi sentirete purificati. Solo
+   cercate di non cagare troppo vicino a casa.
+
+6) Perché io?
+-------------
+
+Dato che la vostra responsabilità principale è quella di prendervi le colpe
+d'altri, e rendere dolorosamente ovvio a tutti che siete degli incompetenti,
+la domanda naturale che ne segue sarà: perché dovrei fare tutto ciò?
+
+Innanzitutto, potreste diventare o no popolari al punto da avere la fila di
+ragazzine (o ragazzini, evitiamo pregiudizi o sessismo) che gridano e bussano
+alla porta del vostro camerino, ma comunque **proverete** un immenso senso di
+realizzazione personale dall'essere "in carica". Dimenticate il fatto che voi
+state discutendo con tutti e che cercate di inseguirli il più velocemente che
+potete. Tutti continueranno a pensare che voi siete la persona in carica.
+
+È un bel lavoro se riuscite ad adattarlo a voi.
diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst
index 995ee69fab11..3e575502690f 100644
--- a/Documentation/translations/it_IT/process/submit-checklist.rst
+++ b/Documentation/translations/it_IT/process/submit-checklist.rst
@@ -117,7 +117,7 @@ sottomissione delle patch, in particolare
     sorgenti che ne spieghi la logica: cosa fanno e perché.

 25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate
-    ``Documentation/ioctl/ioctl-number.rst``.
+    ``Documentation/userspace-api/ioctl/ioctl-number.rst``.
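Checklist item 25 above is easier to apply with a sketch. Everything here is hypothetical - the magic letter, the structure, and the command names - and a real driver must pick a free code and record it in Documentation/userspace-api/ioctl/ioctl-number.rst::

	#include <linux/ioctl.h>
	#include <linux/types.h>

	/* hypothetical parameters exchanged with user space */
	struct foo_params {
		__u32 threshold;
		__u32 flags;
	};

	/* 'k' is an example only: check ioctl-number.rst for conflicts */
	#define FOO_IOC_MAGIC		'k'
	#define FOO_IOC_SET_PARAMS	_IOW(FOO_IOC_MAGIC, 0x01, struct foo_params)
	#define FOO_IOC_GET_PARAMS	_IOR(FOO_IOC_MAGIC, 0x02, struct foo_params)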
26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o
    funzionalità del kernel che è associata a uno dei seguenti simboli
diff --git a/Documentation/translations/it_IT/riscv/patch-acceptance.rst b/Documentation/translations/it_IT/riscv/patch-acceptance.rst
new file mode 100644
index 000000000000..edf67252b3fb
--- /dev/null
+++ b/Documentation/translations/it_IT/riscv/patch-acceptance.rst
@@ -0,0 +1,40 @@
+.. include:: ../disclaimer-ita.rst
+
+:Original: :doc:`../../../riscv/patch-acceptance`
+:Translator: Federico Vaga <federico.vaga@vaga.pv.it>
+
+arch/riscv linee guida alla manutenzione per gli sviluppatori
+=============================================================
+
+Introduzione
+------------
+
+L'insieme di istruzioni RISC-V è sviluppato in modo aperto: le
+bozze in fase di sviluppo sono disponibili a tutti per essere
+revisionate e per essere sperimentate nelle implementazioni. Le bozze
+dei nuovi moduli o estensioni possono cambiare in fase di sviluppo - a
+volte in modo incompatibile rispetto a bozze precedenti. Questa
+flessibilità può portare a dei problemi di manutenzione per il
+supporto RISC-V nel kernel Linux. I manutentori Linux non amano
+l'abbandono del codice, e il processo di sviluppo del kernel
+preferisce codice ben revisionato e testato rispetto a quello
+sperimentale. Desideriamo estendere questi stessi principi al codice
+relativo all'architettura RISC-V che verrà accettato per l'inclusione
+nel kernel.
+
+In aggiunta alla lista delle verifiche da fare prima di inviare una patch
+-------------------------------------------------------------------------
+
+Accetteremo le patch per un nuovo modulo o estensione se la fondazione
+RISC-V li classifica come "Frozen" o "Ratified". (Ovviamente, gli
+sviluppatori sono liberi di mantenere una copia del kernel Linux
+contenente il codice per una bozza di estensione).
+
+In aggiunta, la specifica RISC-V permette agli implementatori di
+creare le proprie estensioni. Queste estensioni non passano
+attraverso il processo di revisione della fondazione RISC-V. Per
+questo motivo, al fine di evitare complicazioni o problemi di
+prestazioni, accetteremo patch solo per quelle estensioni che sono
+state ufficialmente accettate dalla fondazione RISC-V. (Ovviamente,
+gli implementatori sono liberi di mantenere una copia del kernel Linux
+contenente il codice per queste specifiche estensioni).
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 2e831ece6e26..e50fe6541335 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -641,7 +641,7 @@ P 는 짝수 번호 캐시 라인에 저장되어 있고, 변수 B 는 홀수 번
 리눅스 커널이 지원하는 CPU 들은 (1) 쓰기가 정말로 일어날지, (2) 쓰기가 어디에
 이루어질지, 그리고 (3) 쓰여질 값을 확실히 알기 전까지는 쓰기를 수행하지 않기
 때문입니다.  하지만 "컨트롤 의존성" 섹션과
-Documentation/RCU/rcu_dereference.txt 파일을 주의 깊게 읽어 주시기 바랍니다:
+Documentation/RCU/rcu_dereference.rst 파일을 주의 깊게 읽어 주시기 바랍니다:
 컴파일러는 매우 창의적인 많은 방법으로 종속성을 깰 수 있습니다.
CPU 1 CPU 2 diff --git a/Documentation/translations/zh_CN/IRQ.txt b/Documentation/translations/zh_CN/IRQ.txt index 956026d5cf82..9aec8dca4fcf 100644 --- a/Documentation/translations/zh_CN/IRQ.txt +++ b/Documentation/translations/zh_CN/IRQ.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/IRQ.txt +Chinese translated version of Documentation/core-api/irq/index.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -9,7 +9,7 @@ or if there is a problem with the translation. Maintainer: Eric W. Biederman <ebiederman@xmission.com> Chinese maintainer: Fu Wei <tekkamanninja@gmail.com> --------------------------------------------------------------------- -Documentation/IRQ.txt çš„ä¸æ–‡ç¿»è¯‘ +Documentation/core-api/irq/index.rst çš„ä¸æ–‡ç¿»è¯‘ 如果想评论或更新本文的内容,请直接è”ç³»åŽŸæ–‡æ¡£çš„ç»´æŠ¤è€…ã€‚å¦‚æžœä½ ä½¿ç”¨è‹±æ–‡ 交æµæœ‰å›°éš¾çš„è¯ï¼Œä¹Ÿå¯ä»¥å‘ä¸æ–‡ç‰ˆç»´æŠ¤è€…求助。如果本翻译更新ä¸åŠæ—¶æˆ–者翻 diff --git a/Documentation/translations/zh_CN/filesystems/debugfs.rst b/Documentation/translations/zh_CN/filesystems/debugfs.rst new file mode 100644 index 000000000000..f8a28793c277 --- /dev/null +++ b/Documentation/translations/zh_CN/filesystems/debugfs.rst @@ -0,0 +1,221 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: :ref:`Documentation/filesystems/debugfs.txt <debugfs_index>` + +======= +Debugfs +======= + +译者 +:: + + ä¸æ–‡ç‰ˆç»´æŠ¤è€…: ç½—æ¥šæˆ Chucheng Luo <luochucheng@vivo.com> + ä¸æ–‡ç‰ˆç¿»è¯‘者: ç½—æ¥šæˆ Chucheng Luo <luochucheng@vivo.com> + ä¸æ–‡ç‰ˆæ ¡è¯‘者: ç½—æ¥šæˆ Chucheng Luo <luochucheng@vivo.com> + + + +版æƒæ‰€æœ‰2020 ç½—æ¥šæˆ <luochucheng@vivo.com> + + +Debugfsæ˜¯å†…æ ¸å¼€å‘人员在用户空间获å–ä¿¡æ¯çš„简å•æ–¹æ³•ã€‚与/procä¸åŒï¼Œprocåªæ供进程 +ä¿¡æ¯ã€‚也ä¸åƒsysfs,å…·æœ‰ä¸¥æ ¼çš„â€œæ¯ä¸ªæ–‡ä»¶ä¸€ä¸ªå€¼â€œçš„规则。debugfsæ ¹æœ¬æ²¡æœ‰è§„åˆ™,å¼€å‘ +人员å¯ä»¥åœ¨è¿™é‡Œæ”¾ç½®ä»–们想è¦çš„任何信æ¯ã€‚debugfs文件系统也ä¸èƒ½ç”¨ä½œç¨³å®šçš„ABI接å£ã€‚ +从ç†è®ºä¸Šè®²ï¼Œdebugfs导出文件的时候没有任何约æŸã€‚但是[1]实际情况并ä¸æ€»æ˜¯é‚£ä¹ˆ +简å•ã€‚å³ä½¿æ˜¯debugfs接å£ï¼Œä¹Ÿæœ€å¥½æ ¹æ®éœ€è¦è¿›è¡Œè®¾è®¡,并尽é‡ä¿æŒæŽ¥å£ä¸å˜ã€‚ + + +Debugfs通常使用以下命令安装:: + + mount -t debugfs none /sys/kernel/debug + +(或ç‰æ•ˆçš„/etc/fstab行)。 +debugfsæ ¹ç›®å½•é»˜è®¤ä»…å¯ç”±root用户访问。è¦æ›´æ”¹å¯¹æ–‡ä»¶æ ‘的访问,请使用“ uidâ€ï¼Œâ€œ gid†+和“ modeâ€æŒ‚载选项。请注æ„,debugfs API仅按照GPLå议导出到模å—。 + +使用debugfs的代ç 应包å«<linux/debugfs.h>。然åŽï¼Œé¦–先是创建至少一个目录æ¥ä¿å˜ +一组debugfs文件:: + + struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); + +如果æˆåŠŸï¼Œæ¤è°ƒç”¨å°†åœ¨æŒ‡å®šçš„父目录下创建一个å为name的目录。如果parentå‚数为空, +则会在debugfsæ ¹ç›®å½•ä¸åˆ›å»ºã€‚创建目录æˆåŠŸæ—¶ï¼Œè¿”回值是一个指å‘dentry结构体的指针。 +该dentry结构体的指针å¯ç”¨äºŽåœ¨ç›®å½•ä¸åˆ›å»ºæ–‡ä»¶ï¼ˆä»¥åŠæœ€åŽå°†å…¶æ¸…ç†å¹²å‡€ï¼‰ã€‚ERR_PTR +(-ERROR)返回值表明出错。如果返回ERR_PTR(-ENODEVï¼‰ï¼Œåˆ™è¡¨æ˜Žå†…æ ¸æ˜¯åœ¨æ²¡æœ‰debugfs +支æŒçš„情况下构建的,并且下述函数都ä¸ä¼šèµ·ä½œç”¨ã€‚ + +在debugfs目录ä¸åˆ›å»ºæ–‡ä»¶çš„最通用方法是:: + + struct dentry *debugfs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); + +在这里,name是è¦åˆ›å»ºçš„文件的å称,modeæ述了访问文件应具有的æƒé™ï¼ŒparentæŒ‡å‘ +应该ä¿å˜æ–‡ä»¶çš„目录,dataå°†å˜å‚¨åœ¨äº§ç”Ÿçš„inode结构体的i_privateå—段ä¸ï¼Œè€Œfops是 +一组文件æ“作函数,这些函数ä¸å®žçŽ°æ–‡ä»¶æ“作的具体行为。至少,read()和/或 +write()æ“作应æ供;其他å¯ä»¥æ ¹æ®éœ€è¦åŒ…括在内。åŒæ ·çš„,返回值将是指å‘创建文件 +çš„dentry指针,错误时返回ERR_PTR(-ERROR),系统ä¸æ”¯æŒdebugfs时返回值为ERR_PTR +(-ENODEV)。创建一个åˆå§‹å¤§å°çš„文件,å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‡½æ•°ä»£æ›¿:: + + struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct 
file_operations *fops, + loff_t file_size); + +file_size是åˆå§‹æ–‡ä»¶å¤§å°ã€‚其他å‚数跟函数debugfs_create_file的相åŒã€‚ + +在许多情况下,没必è¦è‡ªå·±åŽ»åˆ›å»ºä¸€ç»„文件æ“作;对于一些简å•çš„情况,debugfs代ç æä¾› +了许多帮助函数。包å«å•ä¸ªæ•´æ•°å€¼çš„文件å¯ä»¥ä½¿ç”¨ä»¥ä¸‹ä»»ä½•ä¸€é¡¹åˆ›å»º:: + + void debugfs_create_u8(const char *name, umode_t mode, + struct dentry *parent, u8 *value); + void debugfs_create_u16(const char *name, umode_t mode, + struct dentry *parent, u16 *value); + struct dentry *debugfs_create_u32(const char *name, umode_t mode, + struct dentry *parent, u32 *value); + void debugfs_create_u64(const char *name, umode_t mode, + struct dentry *parent, u64 *value); + +这些文件支æŒè¯»å–和写入给定值。如果æŸä¸ªæ–‡ä»¶ä¸æ”¯æŒå†™å…¥ï¼Œåªéœ€æ ¹æ®éœ€è¦è®¾ç½®mode +å‚æ•°ä½ã€‚这些文件ä¸çš„值以å进制表示;如果需è¦ä½¿ç”¨åå…进制,å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‡½æ•° +替代:: + + void debugfs_create_x8(const char *name, umode_t mode, + struct dentry *parent, u8 *value); + void debugfs_create_x16(const char *name, umode_t mode, + struct dentry *parent, u16 *value); + void debugfs_create_x32(const char *name, umode_t mode, + struct dentry *parent, u32 *value); + void debugfs_create_x64(const char *name, umode_t mode, + struct dentry *parent, u64 *value); + +这些功能åªæœ‰åœ¨å¼€å‘人员知é“导出值的大å°çš„时候æ‰æœ‰ç”¨ã€‚æŸäº›æ•°æ®ç±»åž‹åœ¨ä¸åŒçš„架构上 +有ä¸åŒçš„å®½åº¦ï¼Œè¿™æ ·ä¼šä½¿æƒ…å†µå˜å¾—有些å¤æ‚。在这ç§ç‰¹æ®Šæƒ…况下å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‡½æ•°:: + + void debugfs_create_size_t(const char *name, umode_t mode, + struct dentry *parent, size_t *value); + +ä¸å‡ºæ‰€æ–™ï¼Œæ¤å‡½æ•°å°†åˆ›å»ºä¸€ä¸ªdebugfs文件æ¥è¡¨ç¤ºç±»åž‹ä¸ºsize_tçš„å˜é‡ã€‚ + +åŒæ ·åœ°ï¼Œä¹Ÿæœ‰å¯¼å‡ºæ— 符å·é•¿æ•´åž‹å˜é‡çš„函数,分别以å进制和åå…进制表示如下:: + + struct dentry *debugfs_create_ulong(const char *name, umode_t mode, + struct dentry *parent, + unsigned long *value); + void debugfs_create_xul(const char *name, umode_t mode, + struct dentry *parent, unsigned long *value); + +布尔值å¯ä»¥é€šè¿‡ä»¥ä¸‹æ–¹å¼æ”¾ç½®åœ¨debugfsä¸:: + + struct dentry *debugfs_create_bool(const char *name, umode_t mode, + struct dentry *parent, bool *value); + + +读å–结果文件将产生Y(对于éžé›¶å€¼ï¼‰æˆ–N,åŽè·Ÿæ¢è¡Œç¬¦å†™å…¥çš„时候,它åªæŽ¥å—大写或å°å†™ +值或1或0。任何其他输入将被忽略。 + +åŒæ ·ï¼Œatomic_t类型的值也å¯ä»¥æ”¾ç½®åœ¨debugfsä¸:: + + void debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) + +读å–æ¤æ–‡ä»¶å°†èŽ·å¾—atomic_t值,写入æ¤æ–‡ä»¶å°†è®¾ç½®atomic_t值。 + +å¦ä¸€ä¸ªé€‰æ‹©æ˜¯é€šè¿‡ä»¥ä¸‹ç»“构体和函数导出一个任æ„二进制数æ®å—:: + + struct debugfs_blob_wrapper { + void *data; + unsigned long size; + }; + + struct dentry *debugfs_create_blob(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_blob_wrapper *blob); + +读å–æ¤æ–‡ä»¶å°†è¿”回由指针指å‘debugfs_blob_wrapper结构体的数æ®ã€‚一些驱动使用“blobs†+作为一ç§è¿”å›žå‡ è¡Œï¼ˆé™æ€ï¼‰æ ¼å¼åŒ–文本的简å•æ–¹æ³•ã€‚这个函数å¯ç”¨äºŽå¯¼å‡ºäºŒè¿›åˆ¶ä¿¡æ¯ï¼Œä½† +似乎在主线ä¸æ²¡æœ‰ä»»ä½•ä»£ç è¿™æ ·åšã€‚请注æ„,使用debugfs_create_blob()命令创建的 +所有文件是åªè¯»çš„。 + +如果您è¦è½¬å‚¨ä¸€ä¸ªå¯„å˜å™¨å—(在开å‘过程ä¸ç»å¸¸ä¼šè¿™ä¹ˆåšï¼Œä½†æ˜¯è¿™æ ·çš„调试代ç å¾ˆå°‘ä¸Šä¼ +到主线ä¸ã€‚Debugfsæ供两个函数:一个用于创建仅寄å˜å™¨æ–‡ä»¶ï¼Œå¦ä¸€ä¸ªæŠŠä¸€ä¸ªå¯„å˜å™¨å— +æ’入一个顺åºæ–‡ä»¶ä¸:: + + struct debugfs_reg32 { + char *name; + unsigned long offset; + }; + + struct debugfs_regset32 { + struct debugfs_reg32 *regs; + int nregs; + void __iomem *base; + }; + + struct dentry *debugfs_create_regset32(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset); + + void debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs, + int nregs, void __iomem *base, char *prefix); + +“baseâ€å‚æ•°å¯èƒ½ä¸º0,但您å¯èƒ½éœ€è¦ä½¿ç”¨__stringify构建reg32数组,实际上有许多寄å˜å™¨ +å称(å®ï¼‰æ˜¯å¯„å˜å™¨å—在基å€ä¸Šçš„å—节å移é‡ã€‚ 
+ +如果è¦åœ¨debugfsä¸è½¬å‚¨u32数组,å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‡½æ•°åˆ›å»ºæ–‡ä»¶:: + + void debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements); + +“arrayâ€å‚æ•°æ供数æ®ï¼Œè€Œâ€œelementsâ€å‚数为数组ä¸å…ƒç´ çš„æ•°é‡ã€‚注æ„:数组创建åŽï¼Œæ•°ç»„ +大å°æ— 法更改。 + +有一个函数æ¥åˆ›å»ºä¸Žè®¾å¤‡ç›¸å…³çš„seq_file:: + + struct dentry *debugfs_create_devm_seqfile(struct device *dev, + const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, + void *data)); + +“devâ€å‚数是与æ¤debugfs文件相关的设备,并且“read_fnâ€æ˜¯ä¸€ä¸ªå‡½æ•°æŒ‡é’ˆï¼Œè¿™ä¸ªå‡½æ•°åœ¨ +打å°seq_file内容的时候被回调。 + +还有一些其他的é¢å‘目录的函数:: + + struct dentry *debugfs_rename(struct dentry *old_dir, + struct dentry *old_dentry, + struct dentry *new_dir, + const char *new_name); + + struct dentry *debugfs_create_symlink(const char *name, + struct dentry *parent, + const char *target); + +调用debugfs_rename()将为现有的debugfs文件é‡å‘½å,å¯èƒ½åŒæ—¶åˆ‡æ¢ç›®å½•ã€‚ new_name +函数调用之å‰ä¸èƒ½å˜åœ¨ï¼›è¿”回值为old_dentry,其ä¸åŒ…å«æ›´æ–°çš„ä¿¡æ¯ã€‚å¯ä»¥ä½¿ç”¨ +debugfs_create_symlink()创建符å·é“¾æŽ¥ã€‚ + +所有debugfs用户必须考虑的一件事是: + +debugfsä¸ä¼šè‡ªåŠ¨æ¸…除在其ä¸åˆ›å»ºçš„任何目录。如果一个模å—在ä¸æ˜¾å¼åˆ 除debugfs目录的 +情况下å¸è½½æ¨¡å—,结果将会é—留很多野指针,从而导致系统ä¸ç¨³å®šã€‚å› æ¤ï¼Œæ‰€æœ‰debugfs +用户-至少是那些å¯ä»¥ä½œä¸ºæ¨¡å—构建的用户-å¿…é¡»åšæ¨¡å—å¸è½½çš„æ—¶å€™å‡†å¤‡åˆ é™¤åœ¨æ¤åˆ›å»ºçš„ +所有文件和目录。一份文件å¯ä»¥é€šè¿‡ä»¥ä¸‹æ–¹å¼åˆ 除:: + + void debugfs_remove(struct dentry *dentry); + +dentry值å¯ä»¥ä¸ºNULL或错误值,在这ç§æƒ…况下,ä¸ä¼šæœ‰ä»»ä½•æ–‡ä»¶è¢«åˆ 除。 + +很久以å‰ï¼Œå†…æ ¸å¼€å‘者使用debugfs时需è¦è®°å½•ä»–们创建的æ¯ä¸ªdentry指针,以便最åŽæ‰€æœ‰ +文件都å¯ä»¥è¢«æ¸…ç†æŽ‰ã€‚但是,现在debugfs用户能调用以下函数递归清除之å‰åˆ›å»ºçš„文件:: + + void debugfs_remove_recursive(struct dentry *dentry); + +如果将对应顶层目录的dentryä¼ é€’ç»™ä»¥ä¸Šå‡½æ•°ï¼Œåˆ™è¯¥ç›®å½•ä¸‹çš„æ•´ä¸ªå±‚æ¬¡ç»“æž„å°†ä¼šè¢«åˆ é™¤ã€‚ + +注释: +[1] http://lwn.net/Articles/309298/ diff --git a/Documentation/translations/zh_CN/filesystems/index.rst b/Documentation/translations/zh_CN/filesystems/index.rst index 14f155edaf69..186501d13bc1 100644 --- a/Documentation/translations/zh_CN/filesystems/index.rst +++ b/Documentation/translations/zh_CN/filesystems/index.rst @@ -24,4 +24,5 @@ Linux Kernelä¸çš„文件系统 :maxdepth: 2 virtiofs + debugfs diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index ee1f37da5b23..fcf620049d11 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/filesystems/sysfs.txt +Chinese translated version of Documentation/filesystems/sysfs.rst If you have any comment or update to the content, please contact the original document maintainer directly. 
However, if you have a problem @@ -10,7 +10,7 @@ Maintainer: Patrick Mochel <mochel@osdl.org> Mike Murphy <mamurph@cs.clemson.edu> Chinese maintainer: Fu Wei <tekkamanninja@gmail.com> --------------------------------------------------------------------- -Documentation/filesystems/sysfs.txt çš„ä¸æ–‡ç¿»è¯‘ +Documentation/filesystems/sysfs.rst çš„ä¸æ–‡ç¿»è¯‘ 如果想评论或更新本文的内容,请直接è”ç³»åŽŸæ–‡æ¡£çš„ç»´æŠ¤è€…ã€‚å¦‚æžœä½ ä½¿ç”¨è‹±æ–‡ 交æµæœ‰å›°éš¾çš„è¯ï¼Œä¹Ÿå¯ä»¥å‘ä¸æ–‡ç‰ˆç»´æŠ¤è€…求助。如果本翻译更新ä¸åŠæ—¶æˆ–者翻 @@ -40,7 +40,7 @@ sysfs 是一个最åˆåŸºäºŽ ramfs 且ä½äºŽå†…å˜çš„文件系统。它æ供导å æ•°æ®ç»“æž„åŠå…¶å±žæ€§ï¼Œä»¥åŠå®ƒä»¬ä¹‹é—´çš„å…³è”到用户空间的方法。 sysfs 始终与 kobject 的底层结构紧密相关。请阅读 -Documentation/kobject.txt 文档以获得更多关于 kobject 接å£çš„ +Documentation/core-api/kobject.rst 文档以获得更多关于 kobject 接å£çš„ ä¿¡æ¯ã€‚ @@ -281,7 +281,7 @@ drivers/ 包å«äº†æ¯ä¸ªå·²ä¸ºç‰¹å®šæ€»çº¿ä¸Šçš„设备而挂载的驱动程åºçš å‡å®šé©±åŠ¨æ²¡æœ‰è·¨è¶Šå¤šä¸ªæ€»çº¿ç±»åž‹)。 fs/ 包å«äº†ä¸€ä¸ªä¸ºæ–‡ä»¶ç³»ç»Ÿè®¾ç«‹çš„目录。现在æ¯ä¸ªæƒ³è¦å¯¼å‡ºå±žæ€§çš„文件系统必须 -在 fs/ 下创建自己的层次结构(å‚è§Documentation/filesystems/fuse.txt)。 +在 fs/ 下创建自己的层次结构(å‚è§Documentation/filesystems/fuse.rst)。 dev/ 包å«ä¸¤ä¸ªå目录: char/ å’Œ block/。在这两个å目录ä¸ï¼Œæœ‰ä»¥ <major>:<minor> æ ¼å¼å‘½å的符å·é“¾æŽ¥ã€‚这些符å·é“¾æŽ¥æŒ‡å‘ sysfs 目录 diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst index 8738c55e42a2..50386e0e42e7 100644 --- a/Documentation/translations/zh_CN/process/submit-checklist.rst +++ b/Documentation/translations/zh_CN/process/submit-checklist.rst @@ -97,7 +97,7 @@ Linuxå†…æ ¸è¡¥ä¸æäº¤æ¸…å• 24) 所有内å˜å±éšœä¾‹å¦‚ ``barrier()``, ``rmb()``, ``wmb()`` 都需è¦æºä»£ç ä¸çš„注 释æ¥è§£é‡Šå®ƒä»¬æ£åœ¨æ‰§è¡Œçš„æ“作åŠå…¶åŽŸå› 的逻辑。 -25) 如果补ä¸æ·»åŠ 了任何ioctl,那么也è¦æ›´æ–° ``Documentation/ioctl/ioctl-number.rst`` +25) 如果补ä¸æ·»åŠ 了任何ioctl,那么也è¦æ›´æ–° ``Documentation/userspace-api/ioctl/ioctl-number.rst`` 26) 如果修改åŽçš„æºä»£ç ä¾èµ–或使用与以下 ``Kconfig`` 符å·ç›¸å…³çš„ä»»ä½•å†…æ ¸API或 功能,则在ç¦ç”¨ç›¸å…³ ``Kconfig`` 符å·å’Œ/或 ``=m`` (如果该选项å¯ç”¨ï¼‰çš„情况 diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt index 9c39ee58ea50..a96abcdec777 100644 --- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt @@ -488,7 +488,7 @@ struct v4l2_subdev *sd = v4l2_i2c_new_subdev(v4l2_dev, adapter, è¿™ä¸ªå‡½æ•°ä¼šåŠ è½½ç»™å®šçš„æ¨¡å—(如果没有模å—需è¦åŠ 载,å¯ä»¥ä¸º NULL), 并用给定的 i2c 适é…器结构体指针(i2c_adapter)和 器件地å€ï¼ˆchip/address) -作为å‚数调用 i2c_new_device()。如果一切顺利,则就在 v4l2_device +作为å‚数调用 i2c_new_client_device()。如果一切顺利,则就在 v4l2_device ä¸æ³¨å†Œäº†å设备。 ä½ ä¹Ÿå¯ä»¥åˆ©ç”¨ v4l2_i2c_new_subdev()的最åŽä¸€ä¸ªå‚æ•°ï¼Œä¼ é€’ä¸€ä¸ªå¯èƒ½çš„ diff --git a/Documentation/usb/gadget_configfs.rst b/Documentation/usb/gadget_configfs.rst index 54fb08baae22..158e48dab586 100644 --- a/Documentation/usb/gadget_configfs.rst +++ b/Documentation/usb/gadget_configfs.rst @@ -24,7 +24,7 @@ Linux provides a number of functions for gadgets to use. Creating a gadget means deciding what configurations there will be and which functions each configuration will provide. -Configfs (please see `Documentation/filesystems/configfs/*`) lends itself nicely +Configfs (please see `Documentation/filesystems/configfs.rst`) lends itself nicely for the purpose of telling the kernel about the above mentioned decision. This document is about how to do it. @@ -354,7 +354,7 @@ the directories in general can be named at will. 
A group can have a number of its default sub-groups created automatically. For more information on configfs please see -`Documentation/filesystems/configfs/*`. +`Documentation/filesystems/configfs.rst`. The concepts described above translate to USB gadgets like this: diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index f759edafd938..52bf58417653 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -146,6 +146,7 @@ Code Seq# Include File Comments 'H' 40-4F sound/hdspm.h conflict! 'H' 40-4F sound/hdsp.h conflict! 'H' 90 sound/usb/usx2y/usb_stream.h +'H' 00-0F uapi/misc/habanalabs.h conflict! 'H' A0 uapi/linux/usb/cdc-wdm.h 'H' C0-F0 net/bluetooth/hci.h conflict! 'H' C0-DF net/bluetooth/hidp/hidp.h conflict! diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index c3129b9ba5cb..57c01f531e61 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -74,7 +74,7 @@ should point to a file descriptor that is opened on the ``/dev/sev`` device, if needed (see individual commands). On output, ``error`` is zero on success, or an error code. Error codes -are defined in ``<linux/psp-dev.h>`. +are defined in ``<linux/psp-dev.h>``. KVM implements the following commands to support common lifecycle events of SEV guests, such as launching, running, snapshotting, migrating and decommissioning. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index efbbe570aa9b..d2c1cbce1018 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2572,13 +2572,15 @@ list in 4.68. :Parameters: None :Returns: 0 on success, -1 on error -This signals to the host kernel that the specified guest is being paused by -userspace. The host will set a flag in the pvclock structure that is checked -from the soft lockup watchdog. The flag is part of the pvclock structure that -is shared between guest and host, specifically the second bit of the flags +This ioctl sets a flag accessible to the guest indicating that the specified +vCPU has been paused by the host userspace. + +The host will set a flag in the pvclock structure that is checked from the +soft lockup watchdog. The flag is part of the pvclock structure that is +shared between guest and host, specifically the second bit of the flags field of the pvclock_vcpu_time_info structure. It will be set exclusively by the host and read/cleared exclusively by the guest. The guest operation of -checking and clearing the flag must an atomic operation so +checking and clearing the flag must be an atomic operation so load-link/store-conditional, or equivalent must be used. There are two cases where the guest will clear the flag: when the soft lockup watchdog timer resets itself or when a soft lockup is detected. This ioctl can be called any time diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst index 2357dd2d8655..687b60d76ca9 100644 --- a/Documentation/virt/kvm/arm/pvtime.rst +++ b/Documentation/virt/kvm/arm/pvtime.rst @@ -76,5 +76,5 @@ It is advisable that one or more 64k pages are set aside for the purpose of these structures and not used for other purposes, this enables the guest to map the region using 64k pages and avoids conflicting attributes with other memory. 
-For the user space interface see Documentation/virt/kvm/devices/vcpu.txt +For the user space interface see Documentation/virt/kvm/devices/vcpu.rst section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL". diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 9963e680770a..ca374d3fe085 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -110,5 +110,5 @@ Returns: Specifies the base address of the stolen time structure for this VCPU. The base address must be 64 byte aligned and exist within a valid guest memory -region. See Documentation/virt/kvm/arm/pvtime.txt for more information +region. See Documentation/virt/kvm/arm/pvtime.rst for more information including the layout of the stolen time structure. diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst index dbaf207e560d..ed4fddd364ea 100644 --- a/Documentation/virt/kvm/hypercalls.rst +++ b/Documentation/virt/kvm/hypercalls.rst @@ -22,7 +22,7 @@ S390: number in R1. For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virt/kvm/s390-diag.txt. + refer to Documentation/virt/kvm/s390-diag.rst. PowerPC: It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. @@ -30,7 +30,7 @@ PowerPC: KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' property inside the device tree's /hypervisor node. - For more information refer to Documentation/virt/kvm/ppc-pv.txt + For more information refer to Documentation/virt/kvm/ppc-pv.rst MIPS: KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 60981887d20b..46126ecc70f7 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -319,7 +319,7 @@ Handling a page fault is performed as follows: - If both P bit and R/W bit of error code are set, this could possibly be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virt/kvm/locking.txt. + the description in Documentation/virt/kvm/locking.rst. - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) diff --git a/Documentation/virt/kvm/review-checklist.rst b/Documentation/virt/kvm/review-checklist.rst index 1f86a9d3f705..dc01aea4057b 100644 --- a/Documentation/virt/kvm/review-checklist.rst +++ b/Documentation/virt/kvm/review-checklist.rst @@ -10,7 +10,7 @@ Review checklist for kvm patches 2. Patches should be against kvm.git master branch. 3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/virt/kvm/api.txt + - the API must be documented in Documentation/virt/kvm/api.rst - the API must be discoverable using KVM_CHECK_EXTENSION 4. New state must include support for save/restore. diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index e8d943b21cf9..611140ffef7e 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -31,6 +31,7 @@ descriptions of data structures and algorithms. active_mm balance cleancache + free_page_reporting frontswap highmem hmm diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst index 637cc49d1b2f..7d6f9385d129 100644 --- a/Documentation/vm/page_frags.rst +++ b/Documentation/vm/page_frags.rst @@ -26,7 +26,7 @@ to be disabled when executing the fragment allocation. 
The network stack uses two separate caches per CPU to handle fragment allocation. The netdev_alloc_cache is used by callers making use of the -__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The main difference between these two calls is the context in which they may be called. The "netdev" prefixed functions are usable in any context as these diff --git a/Documentation/vm/zswap.rst b/Documentation/vm/zswap.rst index f8c6a79d7c70..d8d9fa4a1f0d 100644 --- a/Documentation/vm/zswap.rst +++ b/Documentation/vm/zswap.rst @@ -140,10 +140,10 @@ without any real benefit but with a performance drop for the system), a special parameter has been introduced to implement a sort of hysteresis to refuse taking pages into zswap pool until it has sufficient space if the limit has been hit. To set the threshold at which zswap would start accepting pages -again after it became full, use the sysfs ``accept_threhsold_percent`` +again after it became full, use the sysfs ``accept_threshold_percent`` attribute, e. g.:: - echo 80 > /sys/module/zswap/parameters/accept_threhsold_percent + echo 80 > /sys/module/zswap/parameters/accept_threshold_percent Setting this parameter to 100 will disable the hysteresis. diff --git a/Documentation/watchdog/convert_drivers_to_kernel_api.rst b/Documentation/watchdog/convert_drivers_to_kernel_api.rst index dd934cc08e40..a1c3f038ce0e 100644 --- a/Documentation/watchdog/convert_drivers_to_kernel_api.rst +++ b/Documentation/watchdog/convert_drivers_to_kernel_api.rst @@ -2,7 +2,7 @@ Converting old watchdog drivers to the watchdog framework ========================================================= -by Wolfram Sang <w.sang@pengutronix.de> +by Wolfram Sang <wsa@kernel.org> Before the watchdog framework came into the kernel, every driver had to implement the API on its own. Now, as the framework factored out the common @@ -115,7 +115,7 @@ Add the watchdog operations --------------------------- All possible callbacks are defined in 'struct watchdog_ops'. You can find it -explained in 'watchdog-kernel-api.txt' in this directory. start(), stop() and +explained in 'watchdog-kernel-api.txt' in this directory. start() and owner must be set, the rest are optional. You will easily find corresponding functions in the old driver. 
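(To make the mandatory subset concrete, here is a hedged sketch of the ops a converted driver needs, using a hypothetical foo_wdt device; only .owner and .start are required, which the watchdog-kernel-api.rst hunk below underlines by moving stop() to the optional operations:

#include <linux/module.h>
#include <linux/watchdog.h>

/* foo_wdt is a made-up device, used purely for illustration. */
static int foo_wdt_start(struct watchdog_device *wdd)
{
	/* arm (or re-arm) the hardware watchdog here */
	return 0;
}

static const struct watchdog_info foo_wdt_info = {
	.options = WDIOF_KEEPALIVEPING,
	.identity = "foo_wdt",
};

static const struct watchdog_ops foo_wdt_ops = {
	.owner = THIS_MODULE,	/* mandatory */
	.start = foo_wdt_start,	/* mandatory; also used for pings if no .ping */
	/* .stop, .ping, .status, .set_timeout are all optional */
};

static struct watchdog_device foo_wdt_dev = {
	.info = &foo_wdt_info,
	.ops = &foo_wdt_ops,
	.timeout = 30,
};
/* a probe routine would call watchdog_register_device(&foo_wdt_dev) */

Everything else, stop(), ping(), status() and set_timeout(), can be filled in as the old driver's features are carried over.)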
Note that you will now get a pointer to the watchdog_device as a parameter to these functions, so you probably have to diff --git a/Documentation/watchdog/watchdog-kernel-api.rst b/Documentation/watchdog/watchdog-kernel-api.rst index 864edbe932c1..068a55ee0d4a 100644 --- a/Documentation/watchdog/watchdog-kernel-api.rst +++ b/Documentation/watchdog/watchdog-kernel-api.rst @@ -123,8 +123,8 @@ The list of watchdog operations is defined as:: struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); - int (*stop)(struct watchdog_device *); /* optional operations */ + int (*stop)(struct watchdog_device *); int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); diff --git a/Documentation/x86/x86_64/uefi.rst b/Documentation/x86/x86_64/uefi.rst index 88c3ba32546f..3b894103a734 100644 --- a/Documentation/x86/x86_64/uefi.rst +++ b/Documentation/x86/x86_64/uefi.rst @@ -36,7 +36,7 @@ Mechanics elilo bootloader with x86_64 support, elilo configuration file, kernel image built in first step and corresponding - initrd. Instructions on building elilo and its dependencies + initrd. Instructions on building elilo and its dependencies can be found in the elilo sourceforge project. - Boot to EFI shell and invoke elilo choosing the kernel image built diff --git a/MAINTAINERS b/MAINTAINERS index 110ac191321b..ca99126a41c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3742,7 +3742,7 @@ CACHEFILES: FS-CACHE BACKEND FOR CACHING ON MOUNTED FILESYSTEMS M: David Howells <dhowells@redhat.com> L: linux-cachefs@redhat.com (moderated for non-subscribers) S: Supported -F: Documentation/filesystems/caching/cachefiles.txt +F: Documentation/filesystems/caching/cachefiles.rst F: fs/cachefiles/ CADENCE MIPI-CSI2 BRIDGES @@ -4219,7 +4219,7 @@ M: coda@cs.cmu.edu L: codalist@coda.cs.cmu.edu S: Maintained W: http://www.coda.cs.cmu.edu/ -F: Documentation/filesystems/coda.txt +F: Documentation/filesystems/coda.rst F: fs/coda/ F: include/linux/coda*.h F: include/uapi/linux/coda*.h @@ -5012,7 +5012,7 @@ M: Jan Kara <jack@suse.cz> R: Amir Goldstein <amir73il@gmail.com> L: linux-fsdevel@vger.kernel.org S: Maintained -F: Documentation/filesystems/dnotify.txt +F: Documentation/filesystems/dnotify.rst F: fs/notify/dnotify/ F: include/linux/dnotify.h @@ -5026,7 +5026,7 @@ W: http://www.win.tue.nl/~aeb/partitions/partition_types-1.html DISKQUOTA M: Jan Kara <jack@suse.com> S: Maintained -F: Documentation/filesystems/quota.txt +F: Documentation/filesystems/quota.rst F: fs/quota/ F: include/linux/quota*.h F: include/uapi/linux/quota*.h @@ -7040,13 +7040,13 @@ R: Darren Hart <dvhart@infradead.org> L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core -F: Documentation/*futex* +F: Documentation/locking/*futex* F: include/asm-generic/futex.h F: include/linux/futex.h F: include/uapi/linux/futex.h F: kernel/futex.c F: tools/perf/bench/futex* -F: tools/testing/selftests/futex/ +F: Documentation/locking/*futex* GATEWORKS SYSTEM CONTROLLER (GSC) DRIVER M: Tim Harvey <tharvey@gateworks.com> @@ -7527,7 +7527,7 @@ L: linux-remoteproc@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/andersson/remoteproc.git hwspinlock-next F: Documentation/devicetree/bindings/hwlock/ -F: Documentation/hwspinlock.txt +F: Documentation/locking/hwspinlock.rst F: drivers/hwspinlock/ F: include/linux/hwspinlock.h @@ -8934,7 +8934,7 @@ M: Corey Minyard 
<minyard@acm.org> L: openipmi-developer@lists.sourceforge.net (moderated for non-subscribers) S: Supported W: http://openipmi.sourceforge.net/ -F: Documentation/IPMI.txt +F: Documentation/driver-api/ipmi.rst F: Documentation/devicetree/bindings/ipmi/ F: drivers/char/ipmi/ F: include/linux/ipmi* @@ -8976,7 +8976,7 @@ IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY) M: Marc Zyngier <maz@kernel.org> S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core -F: Documentation/IRQ-domain.txt +F: Documentation/core-api/irq/irq-domain.rst F: include/linux/irqdomain.h F: kernel/irq/irqdomain.c F: kernel/irq/msi.c @@ -15518,6 +15518,15 @@ M: Nicolas Pitre <nico@fluxnic.net> S: Odd Fixes F: drivers/net/ethernet/smsc/smc91x.* +SECURE MONITOR CALL(SMC) CALLING CONVENTION (SMCCC) +M: Mark Rutland <mark.rutland@arm.com> +M: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> +M: Sudeep Holla <sudeep.holla@arm.com> +L: linux-arm-kernel@lists.infradead.org +S: Maintained +F: drivers/firmware/smccc/ +F: include/linux/arm-smccc.h + SMIA AND SMIA++ IMAGE SENSOR DRIVER M: Sakari Ailus <sakari.ailus@linux.intel.com> L: linux-media@vger.kernel.org @@ -15694,7 +15703,7 @@ F: drivers/ssb/ F: include/linux/ssb/ SONY IMX214 SENSOR DRIVER -M: Ricardo Ribalda <ricardo.ribalda@gmail.com> +M: Ricardo Ribalda <ribalda@kernel.org> L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -15934,7 +15943,7 @@ M: Jeremy Kerr <jk@ozlabs.org> L: linuxppc-dev@lists.ozlabs.org S: Supported W: http://www.ibm.com/developerworks/power/cell/ -F: Documentation/filesystems/spufs.txt +F: Documentation/filesystems/spufs/spufs.rst F: arch/powerpc/platforms/cell/spufs/ SQUASHFS FILE SYSTEM @@ -16681,7 +16690,7 @@ S: Maintained F: sound/soc/ti/ TEXAS INSTRUMENTS' DAC7612 DAC DRIVER -M: Ricardo Ribalda <ricardo@ribalda.com> +M: Ricardo Ribalda <ribalda@kernel.org> L: linux-iio@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/iio/dac/ti,dac7612.txt @@ -18585,8 +18594,8 @@ W: http://xfs.org/ T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-delayed-logging-design.txt -F: Documentation/filesystems/xfs-self-describing-metadata.txt +F: Documentation/filesystems/xfs-delayed-logging-design.rst +F: Documentation/filesystems/xfs-self-describing-metadata.rst F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h @@ -862,6 +862,12 @@ ifdef CONFIG_LIVEPATCH KBUILD_CFLAGS += $(call cc-option, -flive-patching=inline-clone) endif +ifdef CONFIG_SHADOW_CALL_STACK +CC_FLAGS_SCS := -fsanitize=shadow-call-stack +KBUILD_CFLAGS += $(CC_FLAGS_SCS) +export CC_FLAGS_SCS +endif + # arch Makefile may override CC so keep this after arch Makefile is included NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include) diff --git a/arch/Kconfig b/arch/Kconfig index 786a85d4ad40..2e6f843d87c4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -533,6 +533,31 @@ config STACKPROTECTOR_STRONG about 20% of all kernel functions, which increases the kernel code size by about 2%. +config ARCH_SUPPORTS_SHADOW_CALL_STACK + bool + help + An architecture should select this if it supports Clang's Shadow + Call Stack and implements runtime support for shadow stack + switching. 
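(What this option enables can be illustrated at the instruction level. Below is a sketch of Clang's AArch64 code generation under -fsanitize=shadow-call-stack with x18 reserved via -ffixed-x18, as the SHADOW_CALL_STACK entry that follows and the arm64 Makefile hunk later in this series wire up; the exact sequence may vary between compiler versions:

/*
 * caller() is ordinary C; the commented assembly shows the extra
 * prologue/epilogue Clang emits for it on AArch64. Illustrative,
 * not taken from any specific build.
 */
int callee(int v);

int caller(int v)
{
	/*
	 * Prologue:  str x30, [x18], #8    // push return address to SCS
	 * ...body, which may spill x30 to the ordinary stack...
	 * Epilogue:  ldr x30, [x18, #-8]!  // reload x30 from the SCS
	 *            ret                   // returns via the shadow copy
	 */
	return callee(v) + 1;	/* non-leaf, so x30 needs protecting */
}

A linear overflow of the regular stack can clobber a spilled copy of x30 but not the copy consumed by ret, which is the protection the help text describes, subject to the caveat below about shadow-stack addresses being discoverable in memory.)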
+ +config SHADOW_CALL_STACK + bool "Clang Shadow Call Stack" + depends on CC_IS_CLANG && ARCH_SUPPORTS_SHADOW_CALL_STACK + depends on DYNAMIC_FTRACE_WITH_REGS || !FUNCTION_GRAPH_TRACER + help + This option enables Clang's Shadow Call Stack, which uses a + shadow stack to protect function return addresses from being + overwritten by an attacker. More information can be found in + Clang's documentation: + + https://clang.llvm.org/docs/ShadowCallStack.html + + Note that security guarantees in the kernel differ from the + ones documented for user space. The kernel must store addresses + of shadow stacks in memory, which means an attacker capable of + reading and writing arbitrary memory may be able to locate them + and hijack control flow by modifying the stacks. + config HAVE_ARCH_WITHIN_STACK_FRAMES bool help diff --git a/arch/alpha/include/asm/checksum.h b/arch/alpha/include/asm/checksum.h index 473e6ccb65a3..0eac81624d01 100644 --- a/arch/alpha/include/asm/checksum.h +++ b/arch/alpha/include/asm/checksum.h @@ -41,7 +41,8 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); * here even more important to align src and dst on a 32-bit (or even * better 64-bit) boundary */ -__wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *errp); +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER +__wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *errp); __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 36d42da7466a..5ddd128d4b7a 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -477,3 +477,4 @@ # 545 reserved for clone3 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd +549 common faccessat2 sys_faccessat2 diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index e53f96e8aa6d..af1dad74e933 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -325,7 +325,7 @@ csum_partial_cfu_unaligned(const unsigned long __user * src, } __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, int len, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *errp) { unsigned long checksum = (__force u32) sum; @@ -369,7 +369,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, } return (__force __wsum)checksum; } -EXPORT_SYMBOL(csum_partial_copy_from_user); +EXPORT_SYMBOL(csum_and_copy_from_user); __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) @@ -377,7 +377,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) __wsum checksum; mm_segment_t oldfs = get_fs(); set_fs(KERNEL_DS); - checksum = csum_partial_copy_from_user((__force const void __user *)src, + checksum = csum_and_copy_from_user((__force const void __user *)src, dst, len, sum, NULL); set_fs(oldfs); return checksum; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index c77c93c485a0..16fbf74030fe 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -313,6 +313,9 @@ choice config ARCH_MULTIPLATFORM bool "Allow multiple platforms to be selected" depends on MMU + select ARCH_FLATMEM_ENABLE + select ARCH_SPARSEMEM_ENABLE + select ARCH_SELECT_MEMORY_MODEL select ARM_HAS_SG_CHAIN select ARM_PATCH_PHYS_VIRT select AUTO_ZRELADDR @@ -1516,11 +1519,15 @@ config OABI_COMPAT config 
ARCH_HAS_HOLES_MEMORYMODEL bool -config ARCH_SPARSEMEM_ENABLE +config ARCH_SELECT_MEMORY_MODEL + bool + +config ARCH_FLATMEM_ENABLE bool -config ARCH_SPARSEMEM_DEFAULT - def_bool ARCH_SPARSEMEM_ENABLE +config ARCH_SPARSEMEM_ENABLE + bool + select SPARSEMEM_STATIC if SPARSEMEM config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM @@ -1955,7 +1962,7 @@ config EFI select UCS2_STRING select EFI_PARAMS_FROM_FDT select EFI_STUB - select EFI_ARMSTUB + select EFI_GENERIC_STUB select EFI_RUNTIME_WRAPPERS ---help--- This option provides support for runtime services provided diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore index db05c6ef3e31..60606b0f378d 100644 --- a/arch/arm/boot/compressed/.gitignore +++ b/arch/arm/boot/compressed/.gitignore @@ -7,12 +7,3 @@ hyp-stub.S piggy_data vmlinux vmlinux.lds - -# borrowed libfdt files -fdt.c -fdt.h -fdt_ro.c -fdt_rw.c -fdt_wip.c -libfdt.h -libfdt_internal.h diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 9c11e7490292..00602a6fba04 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -76,29 +76,30 @@ compress-$(CONFIG_KERNEL_LZMA) = lzma compress-$(CONFIG_KERNEL_XZ) = xzkern compress-$(CONFIG_KERNEL_LZ4) = lz4 -# Borrowed libfdt files for the ATAG compatibility mode - -libfdt := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c -libfdt_hdrs := fdt.h libfdt.h libfdt_internal.h - -libfdt_objs := $(addsuffix .o, $(basename $(libfdt))) - -$(addprefix $(obj)/,$(libfdt) $(libfdt_hdrs)): $(obj)/%: $(srctree)/scripts/dtc/libfdt/% - $(call cmd,shipped) - -$(addprefix $(obj)/,$(libfdt_objs) atags_to_fdt.o): \ - $(addprefix $(obj)/,$(libfdt_hdrs)) +libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o ifeq ($(CONFIG_ARM_ATAG_DTB_COMPAT),y) OBJS += $(libfdt_objs) atags_to_fdt.o endif +# -fstack-protector-strong triggers protection checks in this code, +# but it is being used too early to link to meaningful stack_chk logic. +nossp-flags-$(CONFIG_CC_HAS_STACKPROTECTOR_NONE) := -fno-stack-protector +$(foreach o, $(libfdt_objs) atags_to_fdt.o, \ + $(eval CFLAGS_$(o) := -I $(srctree)/scripts/dtc/libfdt $(nossp-flags-y))) + +# These were previously generated C files. When you are building the kernel +# with O=, make sure to remove the stale files in the output tree. Otherwise, +# the build system wrongly compiles the stale ones. +ifdef building_out_of_srctree +$(shell rm -f $(addprefix $(obj)/, fdt_rw.c fdt_ro.c fdt_wip.c fdt.c)) +endif + targets := vmlinux vmlinux.lds piggy_data piggy.o \ lib1funcs.o ashldi3.o bswapsdi2.o \ head.o $(OBJS) -clean-files += piggy_data lib1funcs.S ashldi3.S bswapsdi2.S \ - $(libfdt) $(libfdt_hdrs) hyp-stub.S +clean-files += piggy_data lib1funcs.S ashldi3.S bswapsdi2.S hyp-stub.S KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING @@ -107,15 +108,6 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) endif -# -fstack-protector-strong triggers protection checks in this code, -# but it is being used too early to link to meaningful stack_chk logic. 
-nossp-flags-$(CONFIG_CC_HAS_STACKPROTECTOR_NONE) := -fno-stack-protector -CFLAGS_atags_to_fdt.o := $(nossp-flags-y) -CFLAGS_fdt.o := $(nossp-flags-y) -CFLAGS_fdt_ro.o := $(nossp-flags-y) -CFLAGS_fdt_rw.o := $(nossp-flags-y) -CFLAGS_fdt_wip.o := $(nossp-flags-y) - ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \ -I$(obj) $(DISABLE_ARM_SSP_PER_TASK_PLUGIN) asflags-y := -DZIMAGE diff --git a/arch/arm/boot/compressed/atags_to_fdt.c b/arch/arm/boot/compressed/atags_to_fdt.c index 64c49747f8a3..8452753efebe 100644 --- a/arch/arm/boot/compressed/atags_to_fdt.c +++ b/arch/arm/boot/compressed/atags_to_fdt.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/libfdt_env.h> #include <asm/setup.h> #include <libfdt.h> diff --git a/arch/arm/boot/compressed/efi-header.S b/arch/arm/boot/compressed/efi-header.S index 62286da318e7..c0e7a745103e 100644 --- a/arch/arm/boot/compressed/efi-header.S +++ b/arch/arm/boot/compressed/efi-header.S @@ -60,7 +60,7 @@ optional_header: .long __pecoff_code_size @ SizeOfCode .long __pecoff_data_size @ SizeOfInitializedData .long 0 @ SizeOfUninitializedData - .long efi_entry - start @ AddressOfEntryPoint + .long efi_pe_entry - start @ AddressOfEntryPoint .long start_offset @ BaseOfCode .long __pecoff_data_start - start @ BaseOfData diff --git a/arch/arm/boot/compressed/fdt.c b/arch/arm/boot/compressed/fdt.c new file mode 100644 index 000000000000..f8ea7a201ab1 --- /dev/null +++ b/arch/arm/boot/compressed/fdt.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../../../lib/fdt.c" diff --git a/arch/arm/boot/compressed/fdt_ro.c b/arch/arm/boot/compressed/fdt_ro.c new file mode 100644 index 000000000000..93970a4ad5ae --- /dev/null +++ b/arch/arm/boot/compressed/fdt_ro.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../../../lib/fdt_ro.c" diff --git a/arch/arm/boot/compressed/fdt_rw.c b/arch/arm/boot/compressed/fdt_rw.c new file mode 100644 index 000000000000..f7c6b8b7e01c --- /dev/null +++ b/arch/arm/boot/compressed/fdt_rw.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../../../lib/fdt_rw.c" diff --git a/arch/arm/boot/compressed/fdt_wip.c b/arch/arm/boot/compressed/fdt_wip.c new file mode 100644 index 000000000000..048d2c7a088d --- /dev/null +++ b/arch/arm/boot/compressed/fdt_wip.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../../../lib/fdt_wip.c" diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index e8e1c866e413..c79db44ba128 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -287,28 +287,22 @@ not_angel: */ mov r0, pc cmp r0, r4 - ldrcc r0, LC0+28 + ldrcc r0, .Lheadroom addcc r0, r0, pc cmpcc r4, r0 orrcc r4, r4, #1 @ remember we skipped cache_on blcs cache_on -restart: adr r0, LC0 - ldmia r0, {r1, r2, r3, r6, r11, r12} - ldr sp, [r0, #24] - - /* - * We might be running at a different address. We need - * to fix up various pointers. 
- */ - sub r0, r0, r1 @ calculate the delta offset - add r6, r6, r0 @ _edata +restart: adr r0, LC1 + ldr sp, [r0] + ldr r6, [r0, #4] + add sp, sp, r0 + add r6, r6, r0 get_inflated_image_size r9, r10, lr #ifndef CONFIG_ZBOOT_ROM /* malloc space is above the relocated stack (64k max) */ - add sp, sp, r0 add r10, sp, #0x10000 #else /* @@ -322,9 +316,6 @@ restart: adr r0, LC0 mov r5, #0 @ init dtb size to 0 #ifdef CONFIG_ARM_APPENDED_DTB /* - * r0 = delta - * r2 = BSS start - * r3 = BSS end * r4 = final kernel address (possibly with LSB set) * r5 = appended dtb size (still unknown) * r6 = _edata @@ -332,8 +323,6 @@ restart: adr r0, LC0 * r8 = atags/device tree pointer * r9 = size of decompressed image * r10 = end of this image, including bss/stack/malloc space if non XIP - * r11 = GOT start - * r12 = GOT end * sp = stack pointer * * if there are device trees (dtb) appended to zImage, advance r10 so that the @@ -381,7 +370,6 @@ restart: adr r0, LC0 /* temporarily relocate the stack past the DTB work space */ add sp, sp, r5 - stmfd sp!, {r0-r3, ip, lr} mov r0, r8 mov r1, r6 mov r2, r5 @@ -400,7 +388,6 @@ restart: adr r0, LC0 mov r2, r5 bleq atags_to_fdt - ldmfd sp!, {r0-r3, ip, lr} sub sp, sp, r5 #endif @@ -537,6 +524,10 @@ dtb_check_done: mov pc, r0 wont_overwrite: + adr r0, LC0 + ldmia r0, {r1, r2, r3, r11, r12} + sub r0, r0, r1 @ calculate the delta offset + /* * If delta is zero, we are running at the address we were linked at. * r0 = delta @@ -660,13 +651,18 @@ not_relocated: mov r0, #0 LC0: .word LC0 @ r1 .word __bss_start @ r2 .word _end @ r3 - .word _edata @ r6 .word _got_start @ r11 .word _got_end @ ip - .word .L_user_stack_end @ sp - .word _end - restart + 16384 + 1024*1024 .size LC0, . - LC0 + .type LC1, #object +LC1: .word .L_user_stack_end - LC1 @ sp + .word _edata - LC1 @ r6 + .size LC1, . - LC1 + +.Lheadroom: + .word _end - restart + 16384 + 1024*1024 + .Linflated_image_size_offset: .long (input_data_end - 4) - . @@ -1434,38 +1430,26 @@ reloc_code_end: #ifdef CONFIG_EFI_STUB ENTRY(efi_enter_kernel) - mov r7, r0 @ preserve image base - mov r4, r1 @ preserve DT pointer + mov r4, r0 @ preserve image base + mov r8, r1 @ preserve DT pointer - mov r0, r4 @ DT start - add r1, r4, r2 @ DT end - bl cache_clean_flush + mrc p15, 0, r0, c1, c0, 0 @ read SCTLR + tst r0, #0x1 @ MMU enabled? + orreq r4, r4, #1 @ set LSB if not - mov r0, r7 @ relocated zImage - ldr r1, =_edata @ size of zImage - add r1, r1, r0 @ end of zImage + mov r0, r8 @ DT start + add r1, r8, r2 @ DT end bl cache_clean_flush - @ The PE/COFF loader might not have cleaned the code we are - @ running beyond the PoU, and so calling cache_off below from - @ inside the PE/COFF loader allocated region is unsafe unless - @ we explicitly clean it to the PoC. - ARM( adrl r0, call_cache_fn ) - THUMB( adr r0, call_cache_fn ) @ region of code we will - adr r1, 0f @ run with MMU off - bl cache_clean_flush - bl cache_off + adr r0, 0f @ switch to our stack + ldr sp, [r0] + add sp, sp, r0 - @ Set parameters for booting zImage according to boot protocol - @ put FDT address in r2, it was returned by efi_entry() - @ r1 is the machine type, and r0 needs to be 0 - mov r0, #0 - mov r1, #0xFFFFFFFF - mov r2, r4 - add r7, r7, #(__efi_start - start) - mov pc, r7 @ no mode switch + mov r5, #0 @ appended DTB size + mov r7, #0xFFFFFFFF @ machine ID + b wont_overwrite ENDPROC(efi_enter_kernel) -0: +0: .long .L_user_stack_end - . 
#endif .align diff --git a/arch/arm/boot/compressed/libfdt_env.h b/arch/arm/boot/compressed/libfdt_env.h deleted file mode 100644 index 6a0f1f524466..000000000000 --- a/arch/arm/boot/compressed/libfdt_env.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARM_LIBFDT_ENV_H -#define _ARM_LIBFDT_ENV_H - -#include <linux/limits.h> -#include <linux/types.h> -#include <linux/string.h> -#include <asm/byteorder.h> - -#define INT32_MAX S32_MAX -#define UINT32_MAX U32_MAX - -typedef __be16 fdt16_t; -typedef __be32 fdt32_t; -typedef __be64 fdt64_t; - -#define fdt16_to_cpu(x) be16_to_cpu(x) -#define cpu_to_fdt16(x) cpu_to_be16(x) -#define fdt32_to_cpu(x) be32_to_cpu(x) -#define cpu_to_fdt32(x) cpu_to_be32(x) -#define fdt64_to_cpu(x) be64_to_cpu(x) -#define cpu_to_fdt64(x) cpu_to_be64(x) - -#endif diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S index f82b5962d97e..09ac33f52814 100644 --- a/arch/arm/boot/compressed/vmlinux.lds.S +++ b/arch/arm/boot/compressed/vmlinux.lds.S @@ -63,9 +63,11 @@ SECTIONS _etext = .; .got.plt : { *(.got.plt) } +#ifndef CONFIG_EFI_STUB _got_start = .; .got : { *(.got) } _got_end = .; +#endif /* ensure the zImage file size is always a multiple of 64 bits */ /* (without a dummy byte, ld just ignores the empty section) */ @@ -74,11 +76,14 @@ SECTIONS #ifdef CONFIG_EFI_STUB .data : ALIGN(4096) { __pecoff_data_start = .; + _got_start = .; + *(.got) + _got_end = .; /* * The EFI stub always executes from RAM, and runs strictly before the * decompressor, so we can make an exception for its r/w data, and keep it */ - *(.data.efistub) + *(.data.efistub .bss.efistub) __pecoff_data_end = .; /* diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 3546d294d55f..feac2c8b86f2 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -269,10 +269,9 @@ .endif ;\ .popsection #define ALT_UP_B(label) \ - .equ up_b_offset, label - 9998b ;\ .pushsection ".alt.smp.init", "a" ;\ .long 9998b ;\ - W(b) . + up_b_offset ;\ + W(b) . + (label - 9998b) ;\ .popsection #else #define ALT_SMP(instr...) diff --git a/arch/arm/include/asm/checksum.h b/arch/arm/include/asm/checksum.h index 20043e0ebb07..ed6073fee338 100644 --- a/arch/arm/include/asm/checksum.h +++ b/arch/arm/include/asm/checksum.h @@ -40,6 +40,20 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); __wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr); +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER +static inline +__wsum csum_and_copy_from_user (const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + if (access_ok(src, len)) + return csum_partial_copy_from_user(src, dst, len, sum, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return sum; +} + /* * Fold a partial checksum without adding pseudo headers */ diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 5ac46e2860bc..9383f236e795 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -50,14 +50,6 @@ void efi_virtmap_unload(void); /* arch specific definitions used by the stub code */ -#define efi_bs_call(func, ...) efi_system_table()->boottime->func(__VA_ARGS__) -#define efi_rt_call(func, ...) efi_system_table()->runtime->func(__VA_ARGS__) -#define efi_is_native() (true) - -#define efi_table_attr(inst, attr) (inst->attr) - -#define efi_call_proto(inst, func, ...) 
inst->func(inst, ##__VA_ARGS__) - struct screen_info *alloc_screen_info(void); void free_screen_info(struct screen_info *si); diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index deef17f34bd2..af0a8500a24e 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -55,6 +55,13 @@ void *module_alloc(unsigned long size) } #endif +bool module_init_section(const char *name) +{ + return strstarts(name, ".init") || + strstarts(name, ".ARM.extab.init") || + strstarts(name, ".ARM.exidx.init"); +} + bool module_exit_section(const char *name) { return strstarts(name, ".exit") || @@ -409,8 +416,17 @@ module_arch_cleanup(struct module *mod) #ifdef CONFIG_ARM_UNWIND int i; - for (i = 0; i < ARM_SEC_MAX; i++) - if (mod->arch.unwind[i]) - unwind_table_del(mod->arch.unwind[i]); + for (i = 0; i < ARM_SEC_MAX; i++) { + unwind_table_del(mod->arch.unwind[i]); + mod->arch.unwind[i] = NULL; + } +#endif +} + +void __weak module_arch_freeing_init(struct module *mod) +{ +#ifdef CONFIG_ARM_UNWIND + unwind_table_del(mod->arch.unwind[ARM_SEC_INIT]); + mod->arch.unwind[ARM_SEC_INIT] = NULL; #endif } diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 17bd32b22371..0203e545bbc8 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -253,20 +253,15 @@ asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd, { struct oabi_epoll_event user; struct epoll_event kernel; - mm_segment_t fs; - long ret; - if (op == EPOLL_CTL_DEL) - return sys_epoll_ctl(epfd, op, fd, NULL); - if (copy_from_user(&user, event, sizeof(user))) + if (ep_op_has_event(op) && + copy_from_user(&user, event, sizeof(user))) return -EFAULT; + kernel.events = user.events; kernel.data = user.data; - fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_epoll_ctl(epfd, op, fd, &kernel); - set_fs(fs); - return ret; + + return do_epoll_ctl(epfd, op, fd, &kernel, false); } asmlinkage long sys_oabi_epoll_wait(int epfd, diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index 5461d589a1e2..60ac7c5999a9 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -5,6 +5,7 @@ * VMA_VM_FLAGS * VM_EXEC */ +#include <linux/const.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -30,7 +31,7 @@ * act_mm - get current->active_mm */ .macro act_mm, rd - bic \rd, sp, #8128 + bic \rd, sp, #(THREAD_SIZE - 1) & ~63 bic \rd, \rd, #63 ldr \rd, [\rd, #TI_TASK] .if (TSK_ACTIVE_MM > IMM12_MASK) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 4d1cf74a2caa..d5cae5ffede0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -451,3 +451,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5d513f461957..552d36cacc05 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,6 +9,7 @@ config ARM64 select ACPI_MCFG if (ACPI && PCI) select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI + select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_PREP_COHERENT @@ -33,6 +34,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION @@ -62,9 
+64,12 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_GNU_PROPERTY select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_SYM_ANNOTATIONS select ARCH_SUPPORTS_MEMORY_FAILURE + select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) select ARCH_SUPPORTS_NUMA_BALANCING @@ -525,13 +530,13 @@ config ARM64_ERRATUM_1418040 If unsure, say Y. -config ARM64_WORKAROUND_SPECULATIVE_AT_VHE +config ARM64_WORKAROUND_SPECULATIVE_AT bool config ARM64_ERRATUM_1165522 - bool "Cortex-A76: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" + bool "Cortex-A76: 1165522: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" default y - select ARM64_WORKAROUND_SPECULATIVE_AT_VHE + select ARM64_WORKAROUND_SPECULATIVE_AT help This option adds a workaround for ARM Cortex-A76 erratum 1165522. @@ -541,10 +546,23 @@ config ARM64_ERRATUM_1165522 If unsure, say Y. +config ARM64_ERRATUM_1319367 + bool "Cortex-A57/A72: 1319537: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" + default y + select ARM64_WORKAROUND_SPECULATIVE_AT + help + This option adds work arounds for ARM Cortex-A57 erratum 1319537 + and A72 erratum 1319367 + + Cortex-A57 and A72 cores could end-up with corrupted TLBs by + speculating an AT instruction during a guest context switch. + + If unsure, say Y. + config ARM64_ERRATUM_1530923 - bool "Cortex-A55: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" + bool "Cortex-A55: 1530923: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" default y - select ARM64_WORKAROUND_SPECULATIVE_AT_VHE + select ARM64_WORKAROUND_SPECULATIVE_AT help This option adds a workaround for ARM Cortex-A55 erratum 1530923. @@ -554,6 +572,9 @@ config ARM64_ERRATUM_1530923 If unsure, say Y. +config ARM64_WORKAROUND_REPEAT_TLBI + bool + config ARM64_ERRATUM_1286807 bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation" default y @@ -570,22 +591,6 @@ config ARM64_ERRATUM_1286807 invalidated has been observed by other observers. The workaround repeats the TLBI+DSB operation. -config ARM64_WORKAROUND_SPECULATIVE_AT_NVHE - bool - -config ARM64_ERRATUM_1319367 - bool "Cortex-A57/A72: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" - default y - select ARM64_WORKAROUND_SPECULATIVE_AT_NVHE - help - This option adds work arounds for ARM Cortex-A57 erratum 1319537 - and A72 erratum 1319367 - - Cortex-A57 and A72 cores could end-up with corrupted TLBs by - speculating an AT instruction during a guest context switch. - - If unsure, say Y. - config ARM64_ERRATUM_1463225 bool "Cortex-A76: Software Step might prevent interrupt recognition" default y @@ -695,6 +700,35 @@ config CAVIUM_TX2_ERRATUM_219 If unsure, say Y. 
+config FUJITSU_ERRATUM_010001 + bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly" + default y + help + This option adds a workaround for Fujitsu-A64FX erratum E#010001. + On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory + accesses may cause undefined fault (Data abort, DFSC=0b111111). + This fault occurs under a specific hardware condition when a + load/store instruction performs an address translation using: + case-1 TTBR0_EL1 with TCR_EL1.NFD0 == 1. + case-2 TTBR0_EL2 with TCR_EL2.NFD0 == 1. + case-3 TTBR1_EL1 with TCR_EL1.NFD1 == 1. + case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1. + + The workaround is to ensure these bits are clear in TCR_ELx. + The workaround only affects the Fujitsu-A64FX. + + If unsure, say Y. + +config HISILICON_ERRATUM_161600802 + bool "Hip07 161600802: Erroneous redistributor VLPI base" + default y + help + The HiSilicon Hip07 SoC uses the wrong redistributor base + when issued ITS commands such as VMOVP and VMAPP, and requires + a 128kB offset to be applied to the target address in this commands. + + If unsure, say Y. + config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y @@ -706,9 +740,6 @@ config QCOM_FALKOR_ERRATUM_1003 is unchanged. Work around the erratum by invalidating the walk cache entries for the trampoline before entering the kernel proper. -config ARM64_WORKAROUND_REPEAT_TLBI - bool - config QCOM_FALKOR_ERRATUM_1009 bool "Falkor E1009: Prematurely complete a DSB after a TLBI" default y @@ -730,25 +761,6 @@ config QCOM_QDF2400_ERRATUM_0065 If unsure, say Y. -config SOCIONEXT_SYNQUACER_PREITS - bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" - default y - help - Socionext Synquacer SoCs implement a separate h/w block to generate - MSI doorbell writes with non-zero values for the device ID. - - If unsure, say Y. - -config HISILICON_ERRATUM_161600802 - bool "Hip07 161600802: Erroneous redistributor VLPI base" - default y - help - The HiSilicon Hip07 SoC uses the wrong redistributor base - when issued ITS commands such as VMOVP and VMAPP, and requires - a 128kB offset to be applied to the target address in this commands. - - If unsure, say Y. - config QCOM_FALKOR_ERRATUM_E1041 bool "Falkor E1041: Speculative instruction fetches might cause errant memory access" default y @@ -759,22 +771,12 @@ config QCOM_FALKOR_ERRATUM_E1041 If unsure, say Y. -config FUJITSU_ERRATUM_010001 - bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly" +config SOCIONEXT_SYNQUACER_PREITS + bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y help - This option adds a workaround for Fujitsu-A64FX erratum E#010001. - On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory - accesses may cause undefined fault (Data abort, DFSC=0b111111). - This fault occurs under a specific hardware condition when a - load/store instruction performs an address translation using: - case-1 TTBR0_EL1 with TCR_EL1.NFD0 == 1. - case-2 TTBR0_EL2 with TCR_EL2.NFD0 == 1. - case-3 TTBR1_EL1 with TCR_EL1.NFD1 == 1. - case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1. - - The workaround is to ensure these bits are clear in TCR_ELx. - The workaround only affects the Fujitsu-A64FX. + Socionext Synquacer SoCs implement a separate h/w block to generate + MSI doorbell writes with non-zero values for the device ID. If unsure, say Y. 
@@ -1026,6 +1028,10 @@ config ARCH_HAS_CACHE_LINE_SIZE config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y if PGTABLE_LEVELS > 2 +# Supported by clang >= 7.0 +config CC_HAVE_SHADOW_CALL_STACK + def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" ---help--- @@ -1585,6 +1591,48 @@ endmenu menu "ARMv8.5 architectural features" +config ARM64_BTI + bool "Branch Target Identification support" + default y + help + Branch Target Identification (part of the ARMv8.5 Extensions) + provides a mechanism to limit the set of locations to which computed + branch instructions such as BR or BLR can jump. + + To make use of BTI on CPUs that support it, say Y. + + BTI is intended to provide complementary protection to other control + flow integrity protection mechanisms, such as the Pointer + authentication mechanism provided as part of the ARMv8.3 Extensions. + For this reason, it does not make sense to enable this option without + also enabling support for pointer authentication. Thus, when + enabling this option you should also select ARM64_PTR_AUTH=y. + + Userspace binaries must also be specifically compiled to make use of + this mechanism. If you say N here or the hardware does not support + BTI, such binaries can still run, but you get no additional + enforcement of branch destinations. + +config ARM64_BTI_KERNEL + bool "Use Branch Target Identification for kernel" + default y + depends on ARM64_BTI + depends on ARM64_PTR_AUTH + depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697 + depends on !CC_IS_GCC || GCC_VERSION >= 100100 + depends on !(CC_IS_CLANG && GCOV_KERNEL) + depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) + help + Build the kernel with Branch Target Identification annotations + and enable enforcement of this for kernel code. When this option + is enabled and the system supports BTI all kernel code including + modular code must have BTI enabled. + +config CC_HAS_BRANCH_PROT_PAC_RET_BTI + # GCC 9 or later, clang 8 or later + def_bool $(cc-option,-mbranch-protection=pac-ret+leaf+bti) + config ARM64_E0PD bool "Enable support for E0PD" default y @@ -1786,7 +1834,7 @@ config EFI select EFI_PARAMS_FROM_FDT select EFI_RUNTIME_WRAPPERS select EFI_STUB - select EFI_ARMSTUB + select EFI_GENERIC_STUB default y help This option provides support for runtime services provided diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 85e4149cc5d5..650e1185c190 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -12,7 +12,6 @@ LDFLAGS_vmlinux :=--no-undefined -X CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) -GZFLAGS :=-9 ifeq ($(CONFIG_RELOCATABLE), y) # Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour @@ -71,7 +70,14 @@ branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) ifeq ($(CONFIG_ARM64_PTR_AUTH),y) branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all +# We enable additional protection for leaf functions as there is some +# narrow potential for ROP protection benefits and no substantial +# performance impact has been observed. 
+ifeq ($(CONFIG_ARM64_BTI_KERNEL),y) +branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI) := -mbranch-protection=pac-ret+leaf+bti +else branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf +endif # -march=armv8.3-a enables the non-nops instructions for PAC, to avoid the # compiler to generate them and consequently to break the single image contract # we pass it only to the assembler. This option is utilized only in case of non @@ -81,6 +87,10 @@ endif KBUILD_CFLAGS += $(branch-prot-flags-y) +ifeq ($(CONFIG_SHADOW_CALL_STACK), y) +KBUILD_CFLAGS += -ffixed-x18 +endif + ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ @@ -118,7 +128,7 @@ TEXT_OFFSET := $(shell awk "BEGIN {srand(); printf \"0x%06x\n\", \ int(2 * 1024 * 1024 / (2 ^ $(CONFIG_ARM64_PAGE_SHIFT)) * \ rand()) * (2 ^ $(CONFIG_ARM64_PAGE_SHIFT))}") else -TEXT_OFFSET := 0x00080000 +TEXT_OFFSET := 0x0 endif ifeq ($(CONFIG_KASAN_SW_TAGS), y) @@ -131,7 +141,7 @@ KBUILD_CFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_CPPFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_AFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) -export TEXT_OFFSET GZFLAGS +export TEXT_OFFSET core-y += arch/arm64/ libs-y := arch/arm64/lib/ $(libs-y) diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h index ce2a8486992b..52dead2a8640 100644 --- a/arch/arm64/include/asm/asm_pointer_auth.h +++ b/arch/arm64/include/asm/asm_pointer_auth.h @@ -39,25 +39,58 @@ alternative_if ARM64_HAS_GENERIC_AUTH alternative_else_nop_endif .endm - .macro ptrauth_keys_install_kernel tsk, sync, tmp1, tmp2, tmp3 -alternative_if ARM64_HAS_ADDRESS_AUTH + .macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 mov \tmp1, #THREAD_KEYS_KERNEL add \tmp1, \tsk, \tmp1 ldp \tmp2, \tmp3, [\tmp1, #PTRAUTH_KERNEL_KEY_APIA] msr_s SYS_APIAKEYLO_EL1, \tmp2 msr_s SYS_APIAKEYHI_EL1, \tmp3 - .if \sync == 1 + .endm + + .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 +alternative_if ARM64_HAS_ADDRESS_AUTH + __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3 +alternative_else_nop_endif + .endm + + .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3 +alternative_if ARM64_HAS_ADDRESS_AUTH + __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3 isb - .endif alternative_else_nop_endif .endm + .macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3 + mrs \tmp1, id_aa64isar1_el1 + ubfx \tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8 + cbz \tmp1, .Lno_addr_auth\@ + mov_q \tmp1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ + SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) + mrs \tmp2, sctlr_el1 + orr \tmp2, \tmp2, \tmp1 + msr sctlr_el1, \tmp2 + __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3 + isb +.Lno_addr_auth\@: + .endm + + .macro ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3 +alternative_if_not ARM64_HAS_ADDRESS_AUTH + b .Lno_addr_auth\@ +alternative_else_nop_endif + __ptrauth_keys_init_cpu \tsk, \tmp1, \tmp2, \tmp3 +.Lno_addr_auth\@: + .endm + #else /* CONFIG_ARM64_PTR_AUTH */ .macro ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3 .endm - .macro ptrauth_keys_install_kernel tsk, sync, tmp1, tmp2, tmp3 + .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 + .endm + + .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3 .endm #endif /* CONFIG_ARM64_PTR_AUTH */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 
0bff325117b4..54d181177656 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -736,4 +736,54 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .Lyield_out_\@ : .endm +/* + * This macro emits a program property note section identifying + * architecture features which require special handling, mainly for + * use in assembly files included in the VDSO. + */ + +#define NT_GNU_PROPERTY_TYPE_0 5 +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) + +#ifdef CONFIG_ARM64_BTI_KERNEL +#define GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT \ + ((GNU_PROPERTY_AARCH64_FEATURE_1_BTI | \ + GNU_PROPERTY_AARCH64_FEATURE_1_PAC)) +#endif + +#ifdef GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT +.macro emit_aarch64_feature_1_and, feat=GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT + .pushsection .note.gnu.property, "a" + .align 3 + .long 2f - 1f + .long 6f - 3f + .long NT_GNU_PROPERTY_TYPE_0 +1: .string "GNU" +2: + .align 3 +3: .long GNU_PROPERTY_AARCH64_FEATURE_1_AND + .long 5f - 4f +4: + /* + * This is described with an array of char in the Linux API + * spec but the text and all other usage (including binutils, + * clang and GCC) treat this as a 32 bit value so no swizzling + * is required for big endian. + */ + .long \feat +5: + .align 3 +6: + .popsection +.endm + +#else +.macro emit_aarch64_feature_1_and, feat=0 +.endm + +#endif /* GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT */ + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index e6cca3d4acf7..ce50c1f1f1ea 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -79,7 +79,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) * IPI all online CPUs so that they undergo a context synchronization * event and are forced to refetch the new instructions. */ -#ifdef CONFIG_KGDB + /* * KGDB performs cache maintenance with interrupts disabled, so we * will deadlock trying to IPI the secondary CPUs. In theory, we can @@ -89,9 +89,9 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) * the patching operation, so we don't need extra IPIs here anyway. * In which case, add a KGDB-specific bodge and return early. */ - if (kgdb_connected && irqs_disabled()) + if (in_dbg_master()) return; -#endif + kick_all_cpus_sync(); } diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h index eece20d2c55f..51a7ce87cdfe 100644 --- a/arch/arm64/include/asm/compiler.h +++ b/arch/arm64/include/asm/compiler.h @@ -2,8 +2,6 @@ #ifndef __ASM_COMPILER_H #define __ASM_COMPILER_H -#if defined(CONFIG_ARM64_PTR_AUTH) - /* * The EL0/EL1 pointer bits used by a pointer authentication code. * This is dependent on TBI0/TBI1 being enabled, or bits 63:56 would also apply. 
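(As a reading aid for the emit_aarch64_feature_1_and macro above: rendered as a C structure, the note it assembles has roughly the following byte layout. The type and field names here are invented, and the sizes assume the single 4-byte property the macro emits, padded to 8 bytes:

#include <stdint.h>

struct aarch64_feature_note {	/* lands in .note.gnu.property */
	uint32_t n_namesz;	/* 4: "GNU" plus NUL (2f - 1f) */
	uint32_t n_descsz;	/* 16: padded descriptor (6f - 3f) */
	uint32_t n_type;	/* 5: NT_GNU_PROPERTY_TYPE_0 */
	char	 n_name[4];	/* "GNU" */
	uint32_t pr_type;	/* 0xc0000000: ..._FEATURE_1_AND */
	uint32_t pr_datasz;	/* 4 (5f - 4f) */
	uint32_t pr_data;	/* BTI (bit 0) | PAC (bit 1) by default */
	uint32_t pr_pad;	/* the .align 3 padding after 5: */
};

Because the linker ANDs the pr_data words of all input objects, a single object without the note clears the feature bits for the whole output, which is why each assembly file included in the VDSO has to emit it explicitly.)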
@@ -19,6 +17,4 @@ #define __builtin_return_address(val) \ (void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val))) -#endif /* CONFIG_ARM64_PTR_AUTH */ - #endif /* __ASM_COMPILER_H */ diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b4a40535a3d8..7faae6ff3ab4 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -33,6 +33,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64zfr0; u32 reg_id_dfr0; + u32 reg_id_dfr1; u32 reg_id_isar0; u32 reg_id_isar1; u32 reg_id_isar2; @@ -44,8 +45,11 @@ struct cpuinfo_arm64 { u32 reg_id_mmfr1; u32 reg_id_mmfr2; u32 reg_id_mmfr3; + u32 reg_id_mmfr4; + u32 reg_id_mmfr5; u32 reg_id_pfr0; u32 reg_id_pfr1; + u32 reg_id_pfr2; u32 reg_mvfr0; u32 reg_mvfr1; diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 8eb5a088ae65..d7b3bb0cb180 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -44,7 +44,7 @@ #define ARM64_SSBS 34 #define ARM64_WORKAROUND_1418040 35 #define ARM64_HAS_SB 36 -#define ARM64_WORKAROUND_SPECULATIVE_AT_VHE 37 +#define ARM64_WORKAROUND_SPECULATIVE_AT 37 #define ARM64_HAS_ADDRESS_AUTH_ARCH 38 #define ARM64_HAS_ADDRESS_AUTH_IMP_DEF 39 #define ARM64_HAS_GENERIC_AUTH_ARCH 40 @@ -55,13 +55,14 @@ #define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45 #define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46 #define ARM64_WORKAROUND_1542419 47 -#define ARM64_WORKAROUND_SPECULATIVE_AT_NVHE 48 -#define ARM64_HAS_E0PD 49 -#define ARM64_HAS_RNG 50 -#define ARM64_HAS_AMU_EXTN 51 -#define ARM64_HAS_ADDRESS_AUTH 52 -#define ARM64_HAS_GENERIC_AUTH 53 +#define ARM64_HAS_E0PD 48 +#define ARM64_HAS_RNG 49 +#define ARM64_HAS_AMU_EXTN 50 +#define ARM64_HAS_ADDRESS_AUTH 51 +#define ARM64_HAS_GENERIC_AUTH 52 +#define ARM64_HAS_32BIT_EL1 53 +#define ARM64_BTI 54 -#define ARM64_NCAPS 54 +#define ARM64_NCAPS 55 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index afe08251ff95..5d1f4ae42799 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -551,6 +551,13 @@ static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1; } +static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); + + return val == ID_AA64PFR0_EL1_32BIT_64BIT; +} + static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); @@ -680,6 +687,11 @@ static inline bool system_has_prio_mask_debugging(void) system_uses_irq_prio_masking(); } +static inline bool system_supports_bti(void) +{ + return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI); +} + #define ARM64_BP_HARDEN_UNKNOWN -1 #define ARM64_BP_HARDEN_WA_NEEDED 0 #define ARM64_BP_HARDEN_NOT_REQUIRED 1 @@ -745,6 +757,24 @@ static inline bool cpu_has_hw_af(void) extern bool cpu_has_amu_feat(int cpu); #endif +static inline unsigned int get_vmid_bits(u64 mmfr1) +{ + int vmid_bits; + + vmid_bits = cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_VMIDBITS_SHIFT); + if (vmid_bits == ID_AA64MMFR1_VMIDBITS_16) + return 16; + + /* + * Return the default here even if any reserved + * value is fetched from the system register. 
+ */ + return 8; +} + +u32 get_kvm_ipa_limit(void); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 7619f473155f..e5ceea213e39 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -125,5 +125,7 @@ static inline int reinstall_suspended_bps(struct pt_regs *regs) int aarch32_break_handler(struct pt_regs *regs); +void debug_traps_init(void); + #endif /* __ASSEMBLY */ #endif /* __ASM_DEBUG_MONITORS_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 45e821222774..d4ab3f73e7a3 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -86,14 +86,6 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS_MIN - 1)); } -#define efi_bs_call(func, ...) efi_system_table()->boottime->func(__VA_ARGS__) -#define efi_rt_call(func, ...) efi_system_table()->runtime->func(__VA_ARGS__) -#define efi_is_native() (true) - -#define efi_table_attr(inst, attr) (inst->attr) - -#define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) - #define alloc_screen_info(x...) &screen_info static inline void free_screen_info(struct screen_info *si) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index b618017205a3..4f00d50585a4 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -114,7 +114,11 @@ #ifndef __ASSEMBLY__ +#include <uapi/linux/elf.h> #include <linux/bug.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/types.h> #include <asm/processor.h> /* for signal_minsigstksz, used by ARCH_DLINFO */ typedef unsigned long elf_greg_t; @@ -224,6 +228,52 @@ extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +struct arch_elf_state { + int flags; +}; + +#define ARM64_ELF_BTI (1 << 0) + +#define INIT_ARCH_ELF_STATE { \ + .flags = 0, \ +} + +static inline int arch_parse_elf_property(u32 type, const void *data, + size_t datasz, bool compat, + struct arch_elf_state *arch) +{ + /* No known properties for AArch32 yet */ + if (IS_ENABLED(CONFIG_COMPAT) && compat) + return 0; + + if (type == GNU_PROPERTY_AARCH64_FEATURE_1_AND) { + const u32 *p = data; + + if (datasz != sizeof(*p)) + return -ENOEXEC; + + if (system_supports_bti() && + (*p & GNU_PROPERTY_AARCH64_FEATURE_1_BTI)) + arch->flags |= ARM64_ELF_BTI; + } + + return 0; +} + +static inline int arch_elf_pt_proc(void *ehdr, void *phdr, + struct file *f, bool is_interp, + struct arch_elf_state *state) +{ + return 0; +} + +static inline int arch_check_elf(void *ehdr, bool has_interp, + void *interp_ehdr, + struct arch_elf_state *state) +{ + return 0; +} + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 6a395a7e6707..035003acfa87 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -22,7 +22,7 @@ #define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ #define ESR_ELx_EC_CP14_64 (0x0C) -/* Unallocated EC: 0x0d */ +#define ESR_ELx_EC_BTI (0x0D) #define ESR_ELx_EC_ILL (0x0E) /* Unallocated EC: 0x0F - 0x10 */ #define ESR_ELx_EC_SVC32 (0x11) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 7a6e81ca23a8..7577a754d443 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -34,6 +34,7 @@ static inline u32 
disr_to_esr(u64 disr) asmlinkage void enter_from_user_mode(void); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); +void do_bti(struct pt_regs *regs); asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, struct pt_regs *regs); diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index 87ad961f3c97..985493af704b 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -32,30 +32,70 @@ u64 smp_irq_stat_cpu(unsigned int cpu); struct nmi_ctx { u64 hcr; + unsigned int cnt; }; DECLARE_PER_CPU(struct nmi_ctx, nmi_contexts); -#define arch_nmi_enter() \ - do { \ - if (is_kernel_in_hyp_mode()) { \ - struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts); \ - nmi_ctx->hcr = read_sysreg(hcr_el2); \ - if (!(nmi_ctx->hcr & HCR_TGE)) { \ - write_sysreg(nmi_ctx->hcr | HCR_TGE, hcr_el2); \ - isb(); \ - } \ - } \ - } while (0) +#define arch_nmi_enter() \ +do { \ + struct nmi_ctx *___ctx; \ + u64 ___hcr; \ + \ + if (!is_kernel_in_hyp_mode()) \ + break; \ + \ + ___ctx = this_cpu_ptr(&nmi_contexts); \ + if (___ctx->cnt) { \ + ___ctx->cnt++; \ + break; \ + } \ + \ + ___hcr = read_sysreg(hcr_el2); \ + if (!(___hcr & HCR_TGE)) { \ + write_sysreg(___hcr | HCR_TGE, hcr_el2); \ + isb(); \ + } \ + /* \ + * Make sure the sysreg write is performed before ___ctx->cnt \ + * is set to 1. NMIs that see cnt == 1 will rely on us. \ + */ \ + barrier(); \ + ___ctx->cnt = 1; \ + /* \ + * Make sure ___ctx->cnt is set before we save ___hcr. We \ + * don't want ___ctx->hcr to be overwritten. \ + */ \ + barrier(); \ + ___ctx->hcr = ___hcr; \ +} while (0) -#define arch_nmi_exit() \ - do { \ - if (is_kernel_in_hyp_mode()) { \ - struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts); \ - if (!(nmi_ctx->hcr & HCR_TGE)) \ - write_sysreg(nmi_ctx->hcr, hcr_el2); \ - } \ - } while (0) +#define arch_nmi_exit() \ +do { \ + struct nmi_ctx *___ctx; \ + u64 ___hcr; \ + \ + if (!is_kernel_in_hyp_mode()) \ + break; \ + \ + ___ctx = this_cpu_ptr(&nmi_contexts); \ + ___hcr = ___ctx->hcr; \ + /* \ + * Make sure we read ___ctx->hcr before we release \ + * ___ctx->cnt as it makes ___ctx->hcr updatable again. \ + */ \ + barrier(); \ + ___ctx->cnt--; \ + /* \ + * Make sure ___ctx->cnt release is visible before we \ + * restore the sysreg. Otherwise a new NMI occurring \ + * right after write_sysreg() can be fooled and think \ + * we secured things for it. 
\ + */ \ + barrier(); \ + if (!___ctx->cnt && !(___hcr & HCR_TGE)) \ + write_sysreg(___hcr, hcr_el2); \ +} while (0) static inline void ack_bad_irq(unsigned int irq) { diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 0f00265248b5..d683bcbf1e7c 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -94,6 +94,7 @@ #define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16) #define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) #define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) +#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index bb313dde58a4..0bc46149e491 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -39,13 +39,37 @@ enum aarch64_insn_encoding_class { * system instructions */ }; -enum aarch64_insn_hint_op { +enum aarch64_insn_hint_cr_op { AARCH64_INSN_HINT_NOP = 0x0 << 5, AARCH64_INSN_HINT_YIELD = 0x1 << 5, AARCH64_INSN_HINT_WFE = 0x2 << 5, AARCH64_INSN_HINT_WFI = 0x3 << 5, AARCH64_INSN_HINT_SEV = 0x4 << 5, AARCH64_INSN_HINT_SEVL = 0x5 << 5, + + AARCH64_INSN_HINT_XPACLRI = 0x07 << 5, + AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5, + AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5, + AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5, + AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5, + AARCH64_INSN_HINT_PACIAZ = 0x18 << 5, + AARCH64_INSN_HINT_PACIASP = 0x19 << 5, + AARCH64_INSN_HINT_PACIBZ = 0x1A << 5, + AARCH64_INSN_HINT_PACIBSP = 0x1B << 5, + AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5, + AARCH64_INSN_HINT_AUTIASP = 0x1D << 5, + AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5, + AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5, + + AARCH64_INSN_HINT_ESB = 0x10 << 5, + AARCH64_INSN_HINT_PSB = 0x11 << 5, + AARCH64_INSN_HINT_TSB = 0x12 << 5, + AARCH64_INSN_HINT_CSDB = 0x14 << 5, + + AARCH64_INSN_HINT_BTI = 0x20 << 5, + AARCH64_INSN_HINT_BTIC = 0x22 << 5, + AARCH64_INSN_HINT_BTIJ = 0x24 << 5, + AARCH64_INSN_HINT_BTIJC = 0x26 << 5, }; enum aarch64_insn_imm_type { @@ -344,7 +368,7 @@ __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) #undef __AARCH64_INSN_FUNCS -bool aarch64_insn_is_nop(u32 insn); +bool aarch64_insn_is_steppable_hint(u32 insn); bool aarch64_insn_is_branch_imm(u32 insn); static inline bool aarch64_insn_is_adr_adrp(u32 insn) @@ -370,7 +394,7 @@ u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, enum aarch64_insn_branch_type type); u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, enum aarch64_insn_condition cond); -u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_op op); +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op); u32 aarch64_insn_gen_nop(void); u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, enum aarch64_insn_branch_type type); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a30b4eec7cb4..6ea53e6e8b26 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -507,10 +507,12 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) { - if (vcpu_mode_is_32bit(vcpu)) + if (vcpu_mode_is_32bit(vcpu)) { kvm_skip_instr32(vcpu, is_wide_instr); - else + } else { *vcpu_pc(vcpu) += 4; + *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK; + } /* advance the singlestep state machine */ *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h index 32c8a675e5a4..57c0afcf9dcf 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -573,10 +573,6 @@ static inline bool kvm_arch_requires_vhe(void) if (system_supports_sve()) return true; - /* Some implementations have defects that confine them to VHE */ - if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) - return true; - return false; } @@ -670,7 +666,7 @@ static inline int kvm_arm_have_ssbd(void) void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); -void kvm_set_ipa_limit(void); +int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fe57f60f06a8..015883671ec3 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -10,10 +10,9 @@ #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/alternative.h> -#include <asm/kvm_mmu.h> #include <asm/sysreg.h> -#define __hyp_text __section(.hyp.text) notrace +#define __hyp_text __section(.hyp.text) notrace __noscs #define read_sysreg_elx(r,nvh,vh) \ ({ \ @@ -88,22 +87,5 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); -/* - * Must be called from hyp code running at EL2 with an updated VTTBR - * and interrupts disabled. - */ -static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) -{ - write_sysreg(kvm->arch.vtcr, vtcr_el2); - write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); - - /* - * ARM errata 1165522 and 1530923 require the actual execution of the - * above before we can switch to the EL1/EL0 translation regime used by - * the guest. - */ - asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT_VHE)); -} - #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 30b0e8d6b895..85da6befe76e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -416,7 +416,7 @@ static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; + return get_vmid_bits(reg); } /* @@ -604,5 +604,22 @@ static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } +/* + * Must be called from hyp code running at EL2 with an updated VTTBR + * and interrupts disabled. + */ +static __always_inline void __load_guest_stage2(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vtcr, vtcr_el2); + write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); + + /* + * ARM errata 1165522 and 1530923 require the actual execution of the + * above before we can switch to the EL1/EL0 translation regime used by + * the guest. 
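Both get_vmid_bits() above and much of the cpufeature rework later in this patch boil down to pulling a 4-bit field out of a sanitised ID register. A standalone sketch of that extraction, assuming shift 4 for ID_AA64MMFR1_EL1.VMIDBits (this is not the kernel's cpuid_feature_extract_unsigned_field(), just the same arithmetic):

    #include <stdint.h>
    #include <stdio.h>

    /* ID register fields are 4 bits wide at a field-specific shift. */
    static unsigned int extract_field(uint64_t reg, unsigned int shift)
    {
            return (reg >> shift) & 0xf;
    }

    int main(void)
    {
            uint64_t id_aa64mmfr1 = 0x2 << 4;       /* VMIDBits = 2 at bits [7:4] */

            /* Same decision the helper makes: field value 2 means 16 VMID
             * bits, anything else means 8. */
            printf("%d VMID bits\n", extract_field(id_aa64mmfr1, 4) == 2 ? 16 : 8);
            return 0;
    }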
+ */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); +} + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index ebee3113a62f..81fefd2a1d02 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -4,6 +4,52 @@ #define __ALIGN .align 2 #define __ALIGN_STR ".align 2" +#if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__) + +/* + * Since current versions of gas reject the BTI instruction unless we + * set the architecture version to v8.5 we use the hint instruction + * instead. + */ +#define BTI_C hint 34 ; +#define BTI_J hint 36 ; + +/* + * When using in-kernel BTI we need to ensure that PCS-conformant assembly + * functions have suitable annotations. Override SYM_FUNC_START to insert + * a BTI landing pad at the start of everything. + */ +#define SYM_FUNC_START(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + BTI_C + +#define SYM_FUNC_START_NOALIGN(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \ + BTI_C + +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ + BTI_C + +#define SYM_FUNC_START_LOCAL_NOALIGN(name) \ + SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \ + BTI_C + +#define SYM_FUNC_START_WEAK(name) \ + SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ + BTI_C + +#define SYM_FUNC_START_WEAK_NOALIGN(name) \ + SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \ + BTI_C + +#define SYM_INNER_LABEL(name, linkage) \ + .type name SYM_T_NONE ASM_NL \ + SYM_ENTRY(name, linkage, SYM_A_NONE) \ + BTI_J + +#endif + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h new file mode 100644 index 000000000000..081ec8de9ea6 --- /dev/null +++ b/arch/arm64/include/asm/mman.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#include <linux/compiler.h> +#include <linux/types.h> +#include <uapi/asm/mman.h> + +static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, + unsigned long pkey __always_unused) +{ + if (system_supports_bti() && (prot & PROT_BTI)) + return VM_ARM64_BTI; + + return 0; +} +#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) + +static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) +{ + return (vm_flags & VM_ARM64_BTI) ? __pgprot(PTE_GP) : __pgprot(0); +} +#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) + +static inline bool arch_validate_prot(unsigned long prot, + unsigned long addr __always_unused) +{ + unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; + + if (system_supports_bti()) + supported |= PROT_BTI; + + return (prot & ~supported) == 0; +} +#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) + +#endif /* ! 
__ASM_MMAN_H__ */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 6bf5e650da78..9c91a8f93a0e 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -151,6 +151,7 @@ #define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define PTE_AF (_AT(pteval_t, 1) << 10) /* Access Flag */ #define PTE_NG (_AT(pteval_t, 1) << 11) /* nG */ +#define PTE_GP (_AT(pteval_t, 1) << 50) /* BTI guarded */ #define PTE_DBM (_AT(pteval_t, 1) << 51) /* Dirty Bit Management */ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ @@ -190,7 +191,6 @@ * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) -#define PTE_S2_MEMATTR_MASK (_AT(pteval_t, 0xf) << 2) /* * EL2/HYP PTE/PMD definitions diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 1305e28225fc..2e7e0f452301 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -21,6 +21,7 @@ #ifndef __ASSEMBLY__ +#include <asm/cpufeature.h> #include <asm/pgtable-types.h> extern bool arm64_use_ng_mappings; @@ -31,6 +32,16 @@ extern bool arm64_use_ng_mappings; #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) +/* + * If we have userspace only BTI we don't want to mark kernel pages + * guarded even if the system does support BTI. + */ +#ifdef CONFIG_ARM64_BTI_KERNEL +#define PTE_MAYBE_GP (system_supports_bti() ? PTE_GP : 0) +#else +#define PTE_MAYBE_GP 0 +#endif + #define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG) #define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 47095216d6a8..dae0466d19d6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -460,6 +460,7 @@ extern pgd_t init_pg_dir[PTRS_PER_PGD]; extern pgd_t init_pg_end[]; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t idmap_pg_end[]; extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); @@ -511,7 +512,7 @@ static inline void pte_unmap(pte_t *pte) { } #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) #define pte_clear_fixmap() clear_fixmap(FIX_PTE) -#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(__pmd_to_phys(pmd))) +#define pmd_page(pmd) phys_to_page(__pmd_to_phys(pmd)) /* use ONLY for statically allocated translation tables */ #define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) @@ -569,7 +570,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) #define pmd_clear_fixmap() clear_fixmap(FIX_PMD) -#define pud_page(pud) pfn_to_page(__phys_to_pfn(__pud_to_phys(pud))) +#define pud_page(pud) phys_to_page(__pud_to_phys(pud)) /* use ONLY for statically allocated translation tables */ #define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) @@ -627,7 +628,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) #define pud_clear_fixmap() clear_fixmap(FIX_PUD) -#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) +#define 
pgd_page(pgd) phys_to_page(__pgd_to_phys(pgd)) /* use ONLY for statically allocated translation tables */ #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) @@ -663,7 +664,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | - PTE_PROT_NONE | PTE_VALID | PTE_WRITE; + PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = pte_mkdirty(pte); diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index bf57308fcd63..2172ec7594ba 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -35,6 +35,7 @@ #define GIC_PRIO_PSR_I_SET (1 << 4) /* Additional SPSR bits not exposed in the UABI */ + #define PSR_IL_BIT (1 << 20) /* AArch32-specific ptrace requests */ diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h new file mode 100644 index 000000000000..eaa2cd92e4c1 --- /dev/null +++ b/arch/arm64/include/asm/scs.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_SCS_H +#define _ASM_SCS_H + +#ifdef __ASSEMBLY__ + +#include <asm/asm-offsets.h> + +#ifdef CONFIG_SHADOW_CALL_STACK + scs_sp .req x18 + + .macro scs_load tsk, tmp + ldr scs_sp, [\tsk, #TSK_TI_SCS_SP] + .endm + + .macro scs_save tsk, tmp + str scs_sp, [\tsk, #TSK_TI_SCS_SP] + .endm +#else + .macro scs_load tsk, tmp + .endm + + .macro scs_save tsk, tmp + .endm +#endif /* CONFIG_SHADOW_CALL_STACK */ + +#endif /* __ASSEMBLY __ */ + +#endif /* _ASM_SCS_H */ diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 40d5ba029615..ea268d88b6f7 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -23,14 +23,6 @@ #define CPU_STUCK_REASON_52_BIT_VA (UL(1) << CPU_STUCK_REASON_SHIFT) #define CPU_STUCK_REASON_NO_GRAN (UL(2) << CPU_STUCK_REASON_SHIFT) -/* Possible options for __cpu_setup */ -/* Option to setup primary cpu */ -#define ARM64_CPU_BOOT_PRIMARY (1) -/* Option to setup secondary cpus */ -#define ARM64_CPU_BOOT_SECONDARY (2) -/* Option to setup cpus for different cpu run time services */ -#define ARM64_CPU_RUNTIME (3) - #ifndef __ASSEMBLY__ #include <asm/percpu.h> @@ -96,9 +88,6 @@ asmlinkage void secondary_start_kernel(void); struct secondary_data { void *stack; struct task_struct *task; -#ifdef CONFIG_ARM64_PTR_AUTH - struct ptrauth_keys_kernel ptrauth_key; -#endif long status; }; diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 4d9b1f48dc39..5017b531a415 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -68,12 +68,10 @@ extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_irq_stack(unsigned long sp, +static inline bool on_stack(unsigned long sp, unsigned long low, + unsigned long high, enum stack_type type, struct stack_info *info) { - unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); - unsigned long high = low + IRQ_STACK_SIZE; - if (!low) return false; @@ -83,12 +81,20 @@ static inline bool on_irq_stack(unsigned long sp, if (info) { info->low = low; info->high = high; - info->type = STACK_TYPE_IRQ; + info->type = type; } - return true; } +static inline bool on_irq_stack(unsigned long sp, + struct stack_info *info) +{ + unsigned long low = (unsigned 
long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + return on_stack(sp, low, high, STACK_TYPE_IRQ, info); +} + static inline bool on_task_stack(const struct task_struct *tsk, unsigned long sp, struct stack_info *info) @@ -96,16 +102,7 @@ static inline bool on_task_stack(const struct task_struct *tsk, unsigned long low = (unsigned long)task_stack_page(tsk); unsigned long high = low + THREAD_SIZE; - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_TASK; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_TASK, info); } #ifdef CONFIG_VMAP_STACK @@ -117,16 +114,7 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); unsigned long high = low + OVERFLOW_STACK_SIZE; - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_OVERFLOW; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_OVERFLOW, info); } #else static inline bool on_overflow_stack(unsigned long sp, diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 8939c87c4dce..0cde2f473971 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -2,7 +2,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 12 +#define NR_CTX_REGS 13 #define NR_CALLEE_SAVED_REGS 12 /* diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c4ac0ac25a00..463175f80341 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -105,6 +105,10 @@ #define SYS_DC_CSW sys_insn(1, 0, 7, 10, 2) #define SYS_DC_CISW sys_insn(1, 0, 7, 14, 2) +/* + * System registers, organised loosely by encoding but grouped together + * where the architected name contains an index. e.g. ID_MMFR<n>_EL1. + */ #define SYS_OSDTRRX_EL1 sys_reg(2, 0, 0, 0, 2) #define SYS_MDCCINT_EL1 sys_reg(2, 0, 0, 2, 0) #define SYS_MDSCR_EL1 sys_reg(2, 0, 0, 2, 2) @@ -134,12 +138,16 @@ #define SYS_ID_PFR0_EL1 sys_reg(3, 0, 0, 1, 0) #define SYS_ID_PFR1_EL1 sys_reg(3, 0, 0, 1, 1) +#define SYS_ID_PFR2_EL1 sys_reg(3, 0, 0, 3, 4) #define SYS_ID_DFR0_EL1 sys_reg(3, 0, 0, 1, 2) +#define SYS_ID_DFR1_EL1 sys_reg(3, 0, 0, 3, 5) #define SYS_ID_AFR0_EL1 sys_reg(3, 0, 0, 1, 3) #define SYS_ID_MMFR0_EL1 sys_reg(3, 0, 0, 1, 4) #define SYS_ID_MMFR1_EL1 sys_reg(3, 0, 0, 1, 5) #define SYS_ID_MMFR2_EL1 sys_reg(3, 0, 0, 1, 6) #define SYS_ID_MMFR3_EL1 sys_reg(3, 0, 0, 1, 7) +#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6) +#define SYS_ID_MMFR5_EL1 sys_reg(3, 0, 0, 3, 6) #define SYS_ID_ISAR0_EL1 sys_reg(3, 0, 0, 2, 0) #define SYS_ID_ISAR1_EL1 sys_reg(3, 0, 0, 2, 1) @@ -147,7 +155,6 @@ #define SYS_ID_ISAR3_EL1 sys_reg(3, 0, 0, 2, 3) #define SYS_ID_ISAR4_EL1 sys_reg(3, 0, 0, 2, 4) #define SYS_ID_ISAR5_EL1 sys_reg(3, 0, 0, 2, 5) -#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6) #define SYS_ID_ISAR6_EL1 sys_reg(3, 0, 0, 2, 7) #define SYS_MVFR0_EL1 sys_reg(3, 0, 0, 3, 0) @@ -552,6 +559,8 @@ #endif /* SCTLR_EL1 specific flags. 
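The stacktrace.h hunk above folds three copies of the same bounds check into one on_stack() helper, so each stack classifier only has to supply its limits and type. A sketch of how a further stack type could reuse it (standalone C; STACK_TYPE_HYP and the bounds are invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    enum stack_type { STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_HYP };

    struct stack_info { unsigned long low, high; enum stack_type type; };

    /* The shared predicate: one range check, optionally filling *info. */
    static bool on_stack(unsigned long sp, unsigned long low, unsigned long high,
                         enum stack_type type, struct stack_info *info)
    {
            if (!low || sp < low || sp >= high)
                    return false;
            if (info) {
                    info->low = low;
                    info->high = high;
                    info->type = type;
            }
            return true;
    }

    /* A hypothetical extra classifier: bounds in, everything else shared. */
    static bool on_hyp_stack(unsigned long sp, struct stack_info *info)
    {
            return on_stack(sp, 0x1000, 0x5000, STACK_TYPE_HYP, info);
    }

    int main(void)
    {
            struct stack_info info;

            printf("%d\n", on_hyp_stack(0x2000, &info));    /* prints 1 */
            return 0;
    }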
*/ +#define SCTLR_EL1_BT1 (BIT(36)) +#define SCTLR_EL1_BT0 (BIT(35)) #define SCTLR_EL1_UCI (BIT(26)) #define SCTLR_EL1_E0E (BIT(24)) #define SCTLR_EL1_SPAN (BIT(23)) @@ -594,6 +603,7 @@ /* id_aa64isar0 */ #define ID_AA64ISAR0_RNDR_SHIFT 60 +#define ID_AA64ISAR0_TLB_SHIFT 56 #define ID_AA64ISAR0_TS_SHIFT 52 #define ID_AA64ISAR0_FHM_SHIFT 48 #define ID_AA64ISAR0_DP_SHIFT 44 @@ -637,6 +647,8 @@ #define ID_AA64PFR0_CSV2_SHIFT 56 #define ID_AA64PFR0_DIT_SHIFT 48 #define ID_AA64PFR0_AMU_SHIFT 44 +#define ID_AA64PFR0_MPAM_SHIFT 40 +#define ID_AA64PFR0_SEL2_SHIFT 36 #define ID_AA64PFR0_SVE_SHIFT 32 #define ID_AA64PFR0_RAS_SHIFT 28 #define ID_AA64PFR0_GIC_SHIFT 24 @@ -655,15 +667,21 @@ #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 #define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 +#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 #define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 #define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 /* id_aa64pfr1 */ +#define ID_AA64PFR1_MPAMFRAC_SHIFT 16 +#define ID_AA64PFR1_RASFRAC_SHIFT 12 +#define ID_AA64PFR1_MTE_SHIFT 8 #define ID_AA64PFR1_SSBS_SHIFT 4 +#define ID_AA64PFR1_BT_SHIFT 0 #define ID_AA64PFR1_SSBS_PSTATE_NI 0 #define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 +#define ID_AA64PFR1_BT_BTI 0x1 /* id_aa64zfr0 */ #define ID_AA64ZFR0_F64MM_SHIFT 56 @@ -688,6 +706,9 @@ #define ID_AA64ZFR0_SVEVER_SVE2 0x1 /* id_aa64mmfr0 */ +#define ID_AA64MMFR0_TGRAN4_2_SHIFT 40 +#define ID_AA64MMFR0_TGRAN64_2_SHIFT 36 +#define ID_AA64MMFR0_TGRAN16_2_SHIFT 32 #define ID_AA64MMFR0_TGRAN4_SHIFT 28 #define ID_AA64MMFR0_TGRAN64_SHIFT 24 #define ID_AA64MMFR0_TGRAN16_SHIFT 20 @@ -752,6 +773,25 @@ #define ID_DFR0_PERFMON_8_1 0x4 +#define ID_ISAR4_SWP_FRAC_SHIFT 28 +#define ID_ISAR4_PSR_M_SHIFT 24 +#define ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT 20 +#define ID_ISAR4_BARRIER_SHIFT 16 +#define ID_ISAR4_SMC_SHIFT 12 +#define ID_ISAR4_WRITEBACK_SHIFT 8 +#define ID_ISAR4_WITHSHIFTS_SHIFT 4 +#define ID_ISAR4_UNPRIV_SHIFT 0 + +#define ID_DFR1_MTPMU_SHIFT 0 + +#define ID_ISAR0_DIVIDE_SHIFT 24 +#define ID_ISAR0_DEBUG_SHIFT 20 +#define ID_ISAR0_COPROC_SHIFT 16 +#define ID_ISAR0_CMPBRANCH_SHIFT 12 +#define ID_ISAR0_BITFIELD_SHIFT 8 +#define ID_ISAR0_BITCOUNT_SHIFT 4 +#define ID_ISAR0_SWAP_SHIFT 0 + #define ID_ISAR5_RDM_SHIFT 24 #define ID_ISAR5_CRC32_SHIFT 16 #define ID_ISAR5_SHA2_SHIFT 12 @@ -767,6 +807,22 @@ #define ID_ISAR6_DP_SHIFT 4 #define ID_ISAR6_JSCVT_SHIFT 0 +#define ID_MMFR4_EVT_SHIFT 28 +#define ID_MMFR4_CCIDX_SHIFT 24 +#define ID_MMFR4_LSM_SHIFT 20 +#define ID_MMFR4_HPDS_SHIFT 16 +#define ID_MMFR4_CNP_SHIFT 12 +#define ID_MMFR4_XNX_SHIFT 8 +#define ID_MMFR4_SPECSEI_SHIFT 0 + +#define ID_MMFR5_ETS_SHIFT 0 + +#define ID_PFR0_DIT_SHIFT 24 +#define ID_PFR0_CSV2_SHIFT 16 + +#define ID_PFR2_SSBS_SHIFT 4 +#define ID_PFR2_CSV3_SHIFT 0 + #define MVFR0_FPROUND_SHIFT 28 #define MVFR0_FPSHVEC_SHIFT 24 #define MVFR0_FPSQRT_SHIFT 20 @@ -785,17 +841,14 @@ #define MVFR1_FPDNAN_SHIFT 4 #define MVFR1_FPFTZ_SHIFT 0 - -#define ID_AA64MMFR0_TGRAN4_SHIFT 28 -#define ID_AA64MMFR0_TGRAN64_SHIFT 24 -#define ID_AA64MMFR0_TGRAN16_SHIFT 20 - -#define ID_AA64MMFR0_TGRAN4_NI 0xf -#define ID_AA64MMFR0_TGRAN4_SUPPORTED 0x0 -#define ID_AA64MMFR0_TGRAN64_NI 0xf -#define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0 -#define ID_AA64MMFR0_TGRAN16_NI 0x0 -#define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1 +#define ID_PFR1_GIC_SHIFT 28 +#define ID_PFR1_VIRT_FRAC_SHIFT 24 +#define ID_PFR1_SEC_FRAC_SHIFT 20 +#define ID_PFR1_GENTIMER_SHIFT 16 +#define ID_PFR1_VIRTUALIZATION_SHIFT 12 +#define ID_PFR1_MPROGMOD_SHIFT 8 +#define 
ID_PFR1_SECURITY_SHIFT 4 +#define ID_PFR1_PROGMOD_SHIFT 0 #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 512174a8e789..6ea8b6a26ae9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,10 @@ struct thread_info { #endif } preempt; }; +#ifdef CONFIG_SHADOW_CALL_STACK + void *scs_base; + void *scs_sp; +#endif }; #define thread_saved_pc(tsk) \ @@ -100,11 +104,20 @@ void arch_release_task_struct(struct task_struct *tsk); _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#ifdef CONFIG_SHADOW_CALL_STACK +#define INIT_SCS \ + .scs_base = init_shadow_call_stack, \ + .scs_sp = init_shadow_call_stack, +#else +#define INIT_SCS +#endif + #define INIT_THREAD_INFO(tsk) \ { \ .flags = _TIF_FOREIGN_FPSTATE, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ + INIT_SCS \ } #endif /* __ASM_THREAD_INFO_H */ diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 803039d504de..3b859596840d 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 439 +#define __NR_compat_syscalls 440 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index c1c61635f89c..6d95d0c8bf2f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -883,6 +883,8 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 7752d93bb50f..2d6ba1c2592e 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -73,5 +73,6 @@ #define HWCAP2_BF16 (1 << 14) #define HWCAP2_DGH (1 << 15) #define HWCAP2_RNG (1 << 16) +#define HWCAP2_BTI (1 << 17) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h new file mode 100644 index 000000000000..6fdd71eb644f --- /dev/null +++ b/arch/arm64/include/uapi/asm/mman.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__ASM_MMAN_H +#define _UAPI__ASM_MMAN_H + +#include <asm-generic/mman.h> + +#define PROT_BTI 0x10 /* BTI guarded page */ + +#endif /* ! 
_UAPI__ASM_MMAN_H */ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index d1bb5b69f1ce..42cbe34d95ce 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -46,6 +46,7 @@ #define PSR_I_BIT 0x00000080 #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 +#define PSR_BTYPE_MASK 0x00000c00 #define PSR_SSBS_BIT 0x00001000 #define PSR_PAN_BIT 0x00400000 #define PSR_UAO_BIT 0x00800000 @@ -55,6 +56,8 @@ #define PSR_Z_BIT 0x40000000 #define PSR_N_BIT 0x80000000 +#define PSR_BTYPE_SHIFT 10 + /* * Groups of PSR bits */ @@ -63,6 +66,12 @@ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ +/* Convenience names for the values of PSTATE.BTYPE */ +#define PSR_BTYPE_NONE (0b00 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_JC (0b01 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_C (0b10 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_J (0b11 << PSR_BTYPE_SHIFT) + /* syscall emulation path in ptrace */ #define PTRACE_SYSEMU 31 #define PTRACE_SYSEMU_SINGLESTEP 32 diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 4e5b8ee31442..151f28521f1e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-y += vdso/ probes/ obj-$(CONFIG_COMPAT_VDSO) += vdso32/ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 9981a0a5a87f..3539d7092612 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -34,6 +34,10 @@ int main(void) #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif +#ifdef CONFIG_SHADOW_CALL_STACK + DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base)); + DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp)); +#endif DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); #ifdef CONFIG_STACKPROTECTOR DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary)); @@ -92,9 +96,6 @@ int main(void) BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); -#ifdef CONFIG_ARM64_PTR_AUTH - DEFINE(CPU_BOOT_PTRAUTH_KEY, offsetof(struct secondary_data, ptrauth_key)); -#endif BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 38087b4c0432..4a18055b2ff9 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -29,7 +29,7 @@ * branch to what would be the reset vector. It must be executed with the * flat identity mapping. */ -ENTRY(__cpu_soft_restart) +SYM_CODE_START(__cpu_soft_restart) /* Clear sctlr_el1 flags. 
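With PROT_BTI now in the uapi, a user-space loader can request guarded pages when mapping BTI-annotated code. A hedged sketch of the expected call sequence (assumes a BTI-capable kernel and CPU; on anything else arch_validate_prot() rejects the unknown bit and mprotect() fails with EINVAL, which a loader can treat as "no BTI"):

    #include <sys/mman.h>
    #include <stdio.h>

    #ifndef PROT_BTI
    #define PROT_BTI 0x10   /* value from the new uapi asm/mman.h */
    #endif

    int main(void)
    {
            /* One page to hold generated/loaded code; fixed 4K for brevity. */
            void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;

            /* ... write BTI-annotated code into the page here ... */

            /* Flip it to guarded-executable: indirect branches into this
             * page must now land on a BTI (or PACIxSP) landing pad. */
            if (mprotect(p, 4096, PROT_READ | PROT_EXEC | PROT_BTI) != 0)
                    perror("mprotect(PROT_EXEC | PROT_BTI)");
            return 0;
    }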
*/ mrs x12, sctlr_el1 mov_q x13, SCTLR_ELx_FLAGS @@ -47,6 +47,6 @@ ENTRY(__cpu_soft_restart) mov x1, x3 // arg1 mov x2, x4 // arg2 br x8 -ENDPROC(__cpu_soft_restart) +SYM_CODE_END(__cpu_soft_restart) .popsection diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index df56d2295d16..b0ce6bf14f6a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -635,7 +635,7 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, return is_midr_in_range(midr, &range) && has_dic; } -#if defined(CONFIG_HARDEN_EL2_VECTORS) || defined(CONFIG_ARM64_ERRATUM_1319367) +#if defined(CONFIG_HARDEN_EL2_VECTORS) static const struct midr_range ca57_a72[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), @@ -757,12 +757,16 @@ static const struct arm64_cpu_capabilities erratum_843419_list[] = { }; #endif -#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT_VHE -static const struct midr_range erratum_speculative_at_vhe_list[] = { +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT +static const struct midr_range erratum_speculative_at_list[] = { #ifdef CONFIG_ARM64_ERRATUM_1165522 /* Cortex A76 r0p0 to r2p0 */ MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), #endif +#ifdef CONFIG_ARM64_ERRATUM_1319367 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), +#endif #ifdef CONFIG_ARM64_ERRATUM_1530923 /* Cortex A55 r0p0 to r2p0 */ MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 2, 0), @@ -774,7 +778,7 @@ static const struct midr_range erratum_speculative_at_vhe_list[] = { const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { - .desc = "ARM errata 826319, 827319, 824069, 819472", + .desc = "ARM errata 826319, 827319, 824069, or 819472", .capability = ARM64_WORKAROUND_CLEAN_CACHE, ERRATA_MIDR_RANGE_LIST(workaround_clean_cache), .cpu_enable = cpu_enable_cache_maint_trap, @@ -856,7 +860,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI { - .desc = "Qualcomm erratum 1009, ARM erratum 1286807", + .desc = "Qualcomm erratum 1009, or ARM erratum 1286807", .capability = ARM64_WORKAROUND_REPEAT_TLBI, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = cpucap_multi_entry_cap_matches, @@ -897,11 +901,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_1418040_list), }, #endif -#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT_VHE +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT { - .desc = "ARM errata 1165522, 1530923", - .capability = ARM64_WORKAROUND_SPECULATIVE_AT_VHE, - ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_vhe_list), + .desc = "ARM errata 1165522, 1319367, or 1530923", + .capability = ARM64_WORKAROUND_SPECULATIVE_AT, + ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_list), }, #endif #ifdef CONFIG_ARM64_ERRATUM_1463225 @@ -935,13 +939,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .cpu_enable = cpu_enable_trap_ctr_access, }, #endif -#ifdef CONFIG_ARM64_ERRATUM_1319367 - { - .desc = "ARM erratum 1319367", - .capability = ARM64_WORKAROUND_SPECULATIVE_AT_NVHE, - ERRATA_MIDR_RANGE_LIST(ca57_a72), - }, -#endif { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9fac745aa7bb..4ae41670c2e6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3,6 +3,61 @@ * Contains CPU feature definitions * * Copyright (C) 2015 ARM Ltd. + * + * A note for the weary kernel hacker: the code here is confusing and hard to + * follow! 
That's partly because it's solving a nasty problem, but also because + * there's a little bit of over-abstraction that tends to obscure what's going + * on behind a maze of helper functions and macros. + * + * The basic problem is that hardware folks have started gluing together CPUs + * with distinct architectural features; in some cases even creating SoCs where + * user-visible instructions are available only on a subset of the available + * cores. We try to address this by snapshotting the feature registers of the + * boot CPU and comparing these with the feature registers of each secondary + * CPU when bringing them up. If there is a mismatch, then we update the + * snapshot state to indicate the lowest-common denominator of the feature, + * known as the "safe" value. This snapshot state can be queried to view the + * "sanitised" value of a feature register. + * + * The sanitised register values are used to decide which capabilities we + * have in the system. These may be in the form of traditional "hwcaps" + * advertised to userspace or internal "cpucaps" which are used to configure + * things like alternative patching and static keys. While a feature mismatch + * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch + * may prevent a CPU from being onlined at all. + * + * Some implementation details worth remembering: + * + * - Mismatched features are *always* sanitised to a "safe" value, which + * usually indicates that the feature is not supported. + * + * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK" + * warning when onlining an offending CPU and the kernel will be tainted + * with TAINT_CPU_OUT_OF_SPEC. + * + * - Features marked as FTR_VISIBLE have their sanitised value visible to + * userspace. FTR_VISIBLE features in registers that are only visible + * to EL0 by trapping *must* have a corresponding HWCAP so that late + * onlining of CPUs cannot lead to features disappearing at runtime. + * + * - A "feature" is typically a 4-bit register field. A "capability" is the + * high-level description derived from the sanitised field value. + * + * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID + * scheme for fields in ID registers") to understand when feature fields + * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly). + * + * - KVM exposes its own view of the feature registers to guest operating + * systems regardless of FTR_VISIBLE. This is typically driven from the + * sanitised register values to allow virtual CPUs to be migrated between + * arbitrary physical CPUs, but some features not present on the host are + * also advertised and emulated. Look at sys_reg_descs[] for the gory + * details. + * + * - If the arm64_ftr_bits[] for a register has a missing field, then this + * field is treated as STRICT RES0, including for read_sanitised_ftr_reg(). + * This is stronger than FTR_HIDDEN and can be used to hide features from + * KVM guests. 
*/ #define pr_fmt(fmt) "CPU features: " fmt @@ -124,6 +179,7 @@ static bool __system_matches_cap(unsigned int n); */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), @@ -166,22 +222,27 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_DIT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_AMU_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_MPAM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SEL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), - /* Linux doesn't care about the EL3 */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -209,6 +270,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { /* + * Page size not being supported at Stage-2 is not fatal. You + * just give up KVM if PAGE_SIZE isn't supported there. Go fix + * your favourite nesting hypervisor. + * + * There is a small corner case where the hypervisor explicitly + * advertises a given granule size at Stage-2 (value 2) on some + * vCPUs, and uses the fallback to Stage-1 (value 0) for other + * vCPUs. Although this is not forbidden by the architecture, it + * indicates that the hypervisor is being silly (or buggy). 
+ * + * We make no effort to cope with this and pretend that if these + * fields are inconsistent across vCPUs, then it isn't worth + * trying to bring KVM up. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN4_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN64_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN16_2_SHIFT, 4, 1), + /* * We already refuse to boot CPUs that don't support our configured * page size, so we can only detect mismatches for a page size other * than the one we're currently using. Unfortunately, SoCs like this @@ -247,7 +326,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_UAO_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CNP_SHIFT, 4, 0), @@ -289,7 +368,7 @@ static const struct arm64_ftr_bits ftr_id_mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 36, 28, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 36, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), @@ -316,6 +395,16 @@ static const struct arm64_ftr_bits ftr_dczid[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_isar0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DIVIDE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DEBUG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_COPROC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_CMPBRANCH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITFIELD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITCOUNT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_SWAP_SHIFT, 4, 0), + ARM64_FTR_END, +}; static const struct arm64_ftr_bits ftr_id_isar5[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0), @@ -328,7 +417,37 @@ static const struct arm64_ftr_bits ftr_id_isar5[] = { }; static const struct arm64_ftr_bits ftr_id_mmfr4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, 
FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */ + /* + * SpecSEI = 1 indicates that the PE might generate an SError on an + * external abort on speculative read. It is safer to assume that an + * SError might be generated than that it will not be. Hence it has been + * classified as FTR_HIGHER_SAFE. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_SPECSEI_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SWP_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_PSR_M_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_BARRIER_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SMC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WRITEBACK_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WITHSHIFTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_UNPRIV_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_mmfr5[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_ETS_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -344,6 +463,8 @@ static const struct arm64_ftr_bits ftr_id_isar6[] = { }; static const struct arm64_ftr_bits ftr_id_pfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */ @@ -351,8 +472,26 @@ ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRT_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SEC_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GENTIMER_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRTUALIZATION_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_MPROGMOD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SECURITY_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_PROGMOD_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), + /* [31:28] TraceFilt */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), @@ -363,6 +502,11 @@ static const struct arm64_ftr_bits ftr_id_dfr0[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_dfr1[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT,
FTR_LOWER_SAFE, ID_DFR1_MTPMU_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_zcr[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */ @@ -373,7 +517,7 @@ static const struct arm64_ftr_bits ftr_zcr[] = { * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. Covers the following 32bit registers: - * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] + * id_isar[1-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), @@ -411,7 +555,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 1 */ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0), - ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1), ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0), ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0), ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits), @@ -419,11 +563,11 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits), /* Op1 = 0, CRn = 0, CRm = 2 */ - ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0), ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4), ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5), ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4), ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), @@ -432,6 +576,9 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), + ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), + ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), + ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), @@ -468,16 +615,16 @@ static int search_cmp_ftr_reg(const void *id, const void *regp) } /* - * get_arm64_ftr_reg - Lookup a feature register entry using its - * sys_reg() encoding. With the array arm64_ftr_regs sorted in the - * ascending order of sys_id , we use binary search to find a matching + * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using + * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the + * ascending order of sys_id, we use binary search to find a matching * entry. * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure. It is up to the caller to decide * the impact of a failure. */ -static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) { const struct __ftr_reg_entry *ret; @@ -491,6 +638,27 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) return NULL; } +/* + * get_arm64_ftr_reg - Looks up a feature register entry using + * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn(). + * + * returns - Upon success, matching ftr_reg entry for id. + * - NULL on failure but with a WARN_ON(). + */ +static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +{ + struct arm64_ftr_reg *reg; + + reg = get_arm64_ftr_reg_nowarn(sys_id); + + /* + * Requesting a search for a non-existent register is an error.
Warn + * and let the caller handle it. + */ + WARN_ON(!reg); + return reg; +} + static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val) { @@ -552,7 +720,8 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg); - BUG_ON(!reg); + if (!reg) + return; for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); @@ -625,6 +794,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); @@ -636,8 +806,11 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); + init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); @@ -682,7 +855,9 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); - BUG_ON(!regp); + if (!regp) + return 0; + update_cpu_ftr_reg(regp, val); if ((boot & regp->strict_mask) == (val & regp->strict_mask)) return 0; @@ -691,6 +866,104 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) return 1; } +static void relax_cpu_ftr_reg(u32 sys_id, int field) +{ + const struct arm64_ftr_bits *ftrp; + struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); + + if (!regp) + return; + + for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) { + if (ftrp->shift == field) { + regp->strict_mask &= ~arm64_ftr_mask(ftrp); + break; + } + } + + /* Bogus field? */ + WARN_ON(!ftrp->width); +} + +static int update_32bit_cpu_features(int cpu, struct cpuinfo_arm64 *info, + struct cpuinfo_arm64 *boot) +{ + int taint = 0; + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* + * If we don't have AArch32 at all then skip the checks entirely + * as the register values may be UNKNOWN and we're not going to be + * using them for anything. + */ + if (!id_aa64pfr0_32bit_el0(pfr0)) + return taint; + + /* + * If we don't have AArch32 at EL1, then relax the strictness of + * EL1-dependent register fields to avoid spurious sanity check fails. 
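"Relaxing" a field, as relax_cpu_ftr_reg() does below, just clears that field's bits out of the register's strict_mask, so a later cross-CPU mismatch in the field no longer warns or taints. A standalone sketch of the mechanism (the 4-bit width and the ID_ISAR4_SMC_SHIFT value of 12 match this patch; the rest is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    struct ftr_reg { uint64_t strict_mask; };

    /* Drop one 4-bit field from the strict comparison. */
    static void relax_field(struct ftr_reg *reg, unsigned int shift)
    {
            reg->strict_mask &= ~(0xfULL << shift);
    }

    /* Mirrors check_update_ftr_reg(): mismatch only under the strict mask. */
    static int mismatch(const struct ftr_reg *reg, uint64_t boot, uint64_t val)
    {
            return (boot & reg->strict_mask) != (val & reg->strict_mask);
    }

    int main(void)
    {
            struct ftr_reg isar4 = { .strict_mask = ~0ULL };
            uint64_t boot = 0x1ULL << 12, secondary = 0;    /* SMC differs */

            printf("%d\n", mismatch(&isar4, boot, secondary));      /* 1 */
            relax_field(&isar4, 12);        /* ID_ISAR4_SMC_SHIFT */
            printf("%d\n", mismatch(&isar4, boot, secondary));      /* 0 */
            return 0;
    }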
+ */ + if (!id_aa64pfr0_32bit_el1(pfr0)) { + relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_SMC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRT_FRAC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SEC_FRAC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRTUALIZATION_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SECURITY_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_PROGMOD_SHIFT); + } + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + info->reg_id_dfr0, boot->reg_id_dfr0); + taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu, + info->reg_id_dfr1, boot->reg_id_dfr1); + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + info->reg_id_isar0, boot->reg_id_isar0); + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + info->reg_id_isar1, boot->reg_id_isar1); + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + info->reg_id_isar2, boot->reg_id_isar2); + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + info->reg_id_isar3, boot->reg_id_isar3); + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + info->reg_id_isar4, boot->reg_id_isar4); + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, + info->reg_id_isar6, boot->reg_id_isar6); + + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + info->reg_id_mmfr0, boot->reg_id_mmfr0); + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + info->reg_id_mmfr1, boot->reg_id_mmfr1); + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + info->reg_id_mmfr2, boot->reg_id_mmfr2); + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + info->reg_id_mmfr3, boot->reg_id_mmfr3); + taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu, + info->reg_id_mmfr4, boot->reg_id_mmfr4); + taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu, + info->reg_id_mmfr5, boot->reg_id_mmfr5); + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + info->reg_id_pfr0, boot->reg_id_pfr0); + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + info->reg_id_pfr1, boot->reg_id_pfr1); + taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu, + info->reg_id_pfr2, boot->reg_id_pfr2); + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + info->reg_mvfr0, boot->reg_mvfr0); + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + info->reg_mvfr1, boot->reg_mvfr1); + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + info->reg_mvfr2, boot->reg_mvfr2); + + return taint; +} + /* * Update system wide CPU feature registers with the values from a * non-boot CPU. Also performs SANITY checks to make sure that there @@ -753,9 +1026,6 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); - /* - * EL3 is not our concern. - */ taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, @@ -764,55 +1034,6 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); - /* - * If we have AArch32, we care about 32-bit features for compat. - * If the system doesn't support AArch32, don't update them. 
- */ - if (id_aa64pfr0_32bit_el0(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, - info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, - info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, - info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, - info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, - info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, - info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, - info->reg_id_isar5, boot->reg_id_isar5); - taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, - info->reg_id_isar6, boot->reg_id_isar6); - - /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. - */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, - info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, - info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, - info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, - info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, - info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, - info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, - info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, - info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, - info->reg_mvfr2, boot->reg_mvfr2); - } - if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, info->reg_zcr, boot->reg_zcr); @@ -824,6 +1045,12 @@ void update_cpu_features(int cpu, } /* + * This relies on a sanitised view of the AArch64 ID registers + * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. + */ + taint |= update_32bit_cpu_features(cpu, info, boot); + + /* * Mismatched CPU features are a recipe for disaster. Don't even * pretend to support them. 
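Once sanitisation yields KERNEL_HWCAP_BTI (the HWCAP_CAP() entry below), user space sees it via the auxiliary vector. A sketch of the probe, assuming a libc with getauxval(); the HWCAP2_BTI value matches the uapi header added earlier in this patch:

    #include <sys/auxv.h>
    #include <stdio.h>

    #ifndef HWCAP2_BTI
    #define HWCAP2_BTI (1 << 17)    /* from the new uapi asm/hwcap.h */
    #endif

    int main(void)
    {
            /* On arm64, AT_HWCAP2 carries the second hwcap word. */
            if (getauxval(AT_HWCAP2) & HWCAP2_BTI)
                    puts("BTI supported");
            else
                    puts("BTI not supported");
            return 0;
    }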
*/ @@ -837,8 +1064,8 @@ u64 read_sanitised_ftr_reg(u32 id) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id); - /* We shouldn't get a request for an unsupported register */ - BUG_ON(!regp); + if (!regp) + return 0; return regp->sys_val; } @@ -854,11 +1081,15 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); + read_sysreg_case(SYS_ID_PFR2_EL1); read_sysreg_case(SYS_ID_DFR0_EL1); + read_sysreg_case(SYS_ID_DFR1_EL1); read_sysreg_case(SYS_ID_MMFR0_EL1); read_sysreg_case(SYS_ID_MMFR1_EL1); read_sysreg_case(SYS_ID_MMFR2_EL1); read_sysreg_case(SYS_ID_MMFR3_EL1); + read_sysreg_case(SYS_ID_MMFR4_EL1); + read_sysreg_case(SYS_ID_MMFR5_EL1); read_sysreg_case(SYS_ID_ISAR0_EL1); read_sysreg_case(SYS_ID_ISAR1_EL1); read_sysreg_case(SYS_ID_ISAR2_EL1); @@ -1409,6 +1640,21 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, } #endif +#ifdef CONFIG_ARM64_BTI +static void bti_enable(const struct arm64_cpu_capabilities *__unused) +{ + /* + * Use of X16/X17 for tail-calls and trampolines that jump to + * function entry points using BR is a requirement for + * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. + * So, be strict and forbid other BRs using other registers to + * jump onto a PACIxSP instruction: + */ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); + isb(); +} +#endif /* CONFIG_ARM64_BTI */ + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -1511,6 +1757,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL0_SHIFT, .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, }, +#ifdef CONFIG_KVM + { + .desc = "32-bit EL1 Support", + .capability = ARM64_HAS_32BIT_EL1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL1_SHIFT, + .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + }, +#endif { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, @@ -1779,6 +2037,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 1, }, #endif +#ifdef CONFIG_ARM64_BTI + { + .desc = "Branch Target Identification", + .capability = ARM64_BTI, +#ifdef CONFIG_ARM64_BTI_KERNEL + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, +#else + .type = ARM64_CPUCAP_SYSTEM_FEATURE, +#endif + .matches = has_cpuid_feature, + .cpu_enable = bti_enable, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_BT_SHIFT, + .min_field_value = ID_AA64PFR1_BT_BTI, + .sign = FTR_UNSIGNED, + }, +#endif {}, }; @@ -1888,6 +2163,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F64MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_F64MM, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), #endif HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), +#ifdef CONFIG_ARM64_BTI + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_BT_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_BT_BTI, CAP_HWCAP, KERNEL_HWCAP_BTI), +#endif #ifdef CONFIG_ARM64_PTR_AUTH HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), @@ -2181,6 +2459,36 @@ static void verify_sve_features(void) /* Add checks on other ZCR bits here if necessary */ } +static void 
verify_hyp_capabilities(void) +{ + u64 safe_mmfr1, mmfr0, mmfr1; + int parange, ipa_max; + unsigned int safe_vmid_bits, vmid_bits; + + if (!IS_ENABLED(CONFIG_KVM) || !IS_ENABLED(CONFIG_KVM_ARM_HOST)) + return; + + safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + + /* Verify VMID bits */ + safe_vmid_bits = get_vmid_bits(safe_mmfr1); + vmid_bits = get_vmid_bits(mmfr1); + if (vmid_bits < safe_vmid_bits) { + pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + /* Verify IPA range */ + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); + ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + if (ipa_max < get_kvm_ipa_limit()) { + pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); + cpu_die_early(); + } +} /* * Run through the enabled system capabilities and enable() it on this CPU. @@ -2206,6 +2514,9 @@ static void verify_local_cpu_capabilities(void) if (system_supports_sve()) verify_sve_features(); + + if (is_hyp_mode_available()) + verify_hyp_capabilities(); } void check_local_cpu_capabilities(void) @@ -2394,7 +2705,7 @@ static int emulate_sys_reg(u32 id, u64 *valp) if (sys_reg_CRm(id) == 0) return emulate_id_reg(id, valp); - regp = get_arm64_ftr_reg(id); + regp = get_arm64_ftr_reg_nowarn(id); if (regp) *valp = arm64_ftr_reg_user_value(regp); else diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 86136075ae41..86637466daa8 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -92,6 +92,7 @@ static const char *const hwcap_str[] = { "bf16", "dgh", "rng", + "bti", NULL }; @@ -311,6 +312,8 @@ static int __init cpuinfo_regs_init(void) } return 0; } +device_initcall(cpuinfo_regs_init); + static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) { unsigned int cpu = smp_processor_id(); @@ -362,6 +365,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) /* Update the 32bit ID registers only if AArch32 is implemented */ if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); @@ -373,8 +377,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); + info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); info->reg_mvfr0 = read_cpuid(MVFR0_EL1); info->reg_mvfr1 = read_cpuid(MVFR1_EL1); @@ -403,5 +410,3 @@ void __init cpuinfo_store_boot_cpu(void) boot_cpu_data = *info; init_cpu_features(&boot_cpu_data); } - -device_initcall(cpuinfo_regs_init); diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index ca4c3e12d8c5..1f646b07e3e9 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -5,6 +5,7 @@ */ #include <linux/crash_core.h> +#include <asm/cpufeature.h> #include <asm/memory.h> void arch_crash_save_vmcoreinfo(void) @@ -16,4 +17,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", PHYS_OFFSET); 
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", + system_supports_address_auth() ? + ptrauth_kernel_pac_mask() : 0); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 48222a4760c2..15e80c876d46 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -376,15 +376,13 @@ int aarch32_break_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(aarch32_break_handler); -static int __init debug_traps_init(void) +void __init debug_traps_init(void) { hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, TRAP_TRACE, "single-step handler"); hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, TRAP_BRKPT, "ptrace BRK handler"); - return 0; } -arch_initcall(debug_traps_init); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 1a03618df0df..0073b24b5d25 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -14,12 +14,12 @@ SYM_CODE_START(efi_enter_kernel) /* - * efi_entry() will have copied the kernel image if necessary and we + * efi_pe_entry() will have copied the kernel image if necessary and we * end up here with device tree address in x1 and the kernel entry * point stored in x0. Save those values in registers which are * callee preserved. */ - ldr w2, =stext_offset + ldr w2, =primary_entry_offset add x19, x0, x2 // relocated Image entrypoint mov x20, x1 // DTB address diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 914999ccaf8a..df67c0f2a077 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -27,12 +27,12 @@ optional_header: .long __initdata_begin - efi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_efi_entry - _head // AddressOfEntryPoint + .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint .long efi_header_end - _head // BaseOfCode extra_header_fields: .quad 0 // ImageBase - .long SZ_4K // SectionAlignment + .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment .short 0 // MajorOperatingSystemVersion .short 0 // MinorOperatingSystemVersion diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S index 3fc71106cb2b..75691a2641c1 100644 --- a/arch/arm64/kernel/efi-rt-wrapper.S +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -5,7 +5,7 @@ #include <linux/linkage.h> -ENTRY(__efi_rt_asm_wrapper) +SYM_FUNC_START(__efi_rt_asm_wrapper) stp x29, x30, [sp, #-32]! mov x29, sp @@ -34,5 +34,14 @@ ENTRY(__efi_rt_asm_wrapper) ldp x29, x30, [sp], #32 b.ne 0f ret -0: b efi_handle_corrupted_x18 // tail call -ENDPROC(__efi_rt_asm_wrapper) +0: + /* + * With CONFIG_SHADOW_CALL_STACK, the kernel uses x18 to store a + * shadow stack pointer, which we need to restore before returning to + * potentially instrumented code. This is safe because the wrapper is + * called with preemption disabled and a separate shadow stack is used + * for interrupts. 
+ */ + mov x18, x2 + b efi_handle_corrupted_x18 // tail call +SYM_FUNC_END(__efi_rt_asm_wrapper) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index c839b5bf1904..3dbdf9752b11 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -94,7 +94,7 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) break; default: el1_inv(regs, esr); - }; + } } NOKPROBE_SYMBOL(el1_sync_handler); @@ -188,6 +188,14 @@ static void notrace el0_undef(struct pt_regs *regs) } NOKPROBE_SYMBOL(el0_undef); +static void notrace el0_bti(struct pt_regs *regs) +{ + user_exit_irqoff(); + local_daif_restore(DAIF_PROCCTX); + do_bti(regs); +} +NOKPROBE_SYMBOL(el0_bti); + static void notrace el0_inv(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); @@ -255,6 +263,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_UNKNOWN: el0_undef(regs); break; + case ESR_ELx_EC_BTI: + el0_bti(regs); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 0f24eae8f3cc..f880dd63ddc3 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -16,34 +16,34 @@ * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_save_state) +SYM_FUNC_START(fpsimd_save_state) fpsimd_save x0, 8 ret -ENDPROC(fpsimd_save_state) +SYM_FUNC_END(fpsimd_save_state) /* * Load the FP registers. * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_load_state) +SYM_FUNC_START(fpsimd_load_state) fpsimd_restore x0, 8 ret -ENDPROC(fpsimd_load_state) +SYM_FUNC_END(fpsimd_load_state) #ifdef CONFIG_ARM64_SVE -ENTRY(sve_save_state) +SYM_FUNC_START(sve_save_state) sve_save 0, x1, 2 ret -ENDPROC(sve_save_state) +SYM_FUNC_END(sve_save_state) -ENTRY(sve_load_state) +SYM_FUNC_START(sve_load_state) sve_load 0, x1, x2, 3, x4 ret -ENDPROC(sve_load_state) +SYM_FUNC_END(sve_load_state) -ENTRY(sve_get_vl) +SYM_FUNC_START(sve_get_vl) _sve_rdvl 0, 1 ret -ENDPROC(sve_get_vl) +SYM_FUNC_END(sve_get_vl) #endif /* CONFIG_ARM64_SVE */ diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 833d48c9acb5..a338f40e64d3 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -23,8 +23,9 @@ * * ... where <entry> is either ftrace_caller or ftrace_regs_caller. * - * Each instrumented function follows the AAPCS, so here x0-x8 and x19-x30 are - * live, and x9-x18 are safe to clobber. + * Each instrumented function follows the AAPCS, so here x0-x8 and x18-x30 are + * live (x18 holds the Shadow Call Stack pointer), and x9-x17 are safe to + * clobber. * * We save the callsite's context into a pt_regs before invoking any ftrace * callbacks. 
So that we can get a sensible backtrace, we create a stack record diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ddcde093c433..5304d193c79d 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -23,6 +23,7 @@ #include <asm/mmu.h> #include <asm/processor.h> #include <asm/ptrace.h> +#include <asm/scs.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> @@ -178,7 +179,9 @@ alternative_cb_end apply_ssbd 1, x22, x23 - ptrauth_keys_install_kernel tsk, 1, x20, x22, x23 + ptrauth_keys_install_kernel tsk, x20, x22, x23 + + scs_load tsk, x20 .else add x21, sp, #S_FRAME_SIZE get_current_task tsk @@ -343,6 +346,8 @@ alternative_else_nop_endif msr cntkctl_el1, x1 4: #endif + scs_save tsk, x0 + /* No kernel C function calls after this as user keys are set. */ ptrauth_keys_install_user tsk, x0, x1, x2 @@ -388,6 +393,9 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 .macro irq_stack_entry mov x19, sp // preserve the original sp +#ifdef CONFIG_SHADOW_CALL_STACK + mov x24, scs_sp // preserve the original shadow stack +#endif /* * Compare sp with the base of the task stack. @@ -405,15 +413,25 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 /* switch to the irq stack */ mov sp, x26 + +#ifdef CONFIG_SHADOW_CALL_STACK + /* also switch to the irq shadow stack */ + adr_this_cpu scs_sp, irq_shadow_call_stack, x26 +#endif + 9998: .endm /* - * x19 should be preserved between irq_stack_entry and - * irq_stack_exit. + * The callee-saved regs (x19-x29) should be preserved between + * irq_stack_entry and irq_stack_exit, but note that kernel_entry + * uses x20-x23 to store data for later use. */ .macro irq_stack_exit mov sp, x19 +#ifdef CONFIG_SHADOW_CALL_STACK + mov scs_sp, x24 +#endif .endm /* GPRs used by entry code */ @@ -728,20 +746,9 @@ el0_error_naked: SYM_CODE_END(el0_error) /* - * Ok, we need to do extra processing, enter the slow path. - */ -work_pending: - mov x0, sp // 'regs' - bl do_notify_resume -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on // enabled while in userspace -#endif - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step - b finish_ret_to_user -/* * "slow" syscall return path. */ -ret_to_user: +SYM_CODE_START_LOCAL(ret_to_user) disable_daif gic_prio_kentry_setup tmp=x3 ldr x1, [tsk, #TSK_TI_FLAGS] @@ -753,7 +760,19 @@ finish_ret_to_user: bl stackleak_erase #endif kernel_exit 0 -ENDPROC(ret_to_user) + +/* + * Ok, we need to do extra processing, enter the slow path. + */ +work_pending: + mov x0, sp // 'regs' + bl do_notify_resume +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_on // enabled while in userspace +#endif + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step + b finish_ret_to_user +SYM_CODE_END(ret_to_user) .popsection // .entry.text @@ -900,7 +919,9 @@ SYM_FUNC_START(cpu_switch_to) ldr lr, [x8] mov sp, x9 msr sp_el0, x1 - ptrauth_keys_install_kernel x1, 1, x8, x9, x10 + ptrauth_keys_install_kernel x1, x8, x9, x10 + scs_save x0, x8 + scs_load x1, x8 ret SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) @@ -1029,13 +1050,16 @@ SYM_CODE_START(__sdei_asm_handler) mov x19, x1 +#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK) + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] +#endif + #ifdef CONFIG_VMAP_STACK /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate * stack for this CPU. 
*/ - ldrb w4, [x19, #SDEI_EVENT_PRIORITY] cbnz w4, 1f ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 b 2f @@ -1045,6 +1069,15 @@ SYM_CODE_START(__sdei_asm_handler) mov sp, x5 #endif +#ifdef CONFIG_SHADOW_CALL_STACK + /* Use a separate shadow call stack for normal and critical events */ + cbnz w4, 3f + adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal, tmp=x6 + b 4f +3: adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical, tmp=x6 +4: +#endif + /* * We may have interrupted userspace, or a guest, or exit-from or * return-to either of these. We can't trust sp_el0, restore it. diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 57a91032b4c2..632702146813 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/irqchip/arm-gic-v3.h> +#include <asm/asm_pointer_auth.h> #include <asm/assembler.h> #include <asm/boot.h> #include <asm/ptrace.h> @@ -27,6 +28,7 @@ #include <asm/pgtable-hwdef.h> #include <asm/pgtable.h> #include <asm/page.h> +#include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> #include <asm/thread_info.h> @@ -70,9 +72,9 @@ _head: * its opcode forms the magic "MZ" signature required by UEFI. */ add x13, x18, #0x16 - b stext + b primary_entry #else - b stext // branch to kernel start, magic + b primary_entry // branch to kernel start, magic .long 0 // reserved #endif le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian @@ -98,14 +100,13 @@ pe_header: * primary lowlevel boot path: * * Register Scope Purpose - * x21 stext() .. start_kernel() FDT pointer passed at boot in x0 - * x23 stext() .. start_kernel() physical misalignment/KASLR offset - * x28 __create_page_tables() callee preserved temp register - * x19/x20 __primary_switch() callee preserved temp registers - * x24 __primary_switch() .. relocate_kernel() - * current RELR displacement + * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 + * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset + * x28 __create_page_tables() callee preserved temp register + * x19/x20 __primary_switch() callee preserved temp registers + * x24 __primary_switch() .. relocate_kernel() current RELR displacement */ -SYM_CODE_START(stext) +SYM_CODE_START(primary_entry) bl preserve_boot_args bl el2_setup // Drop to EL1, w0=cpu_boot_mode adrp x23, __PHYS_OFFSET @@ -118,10 +119,9 @@ SYM_CODE_START(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - mov x0, #ARM64_CPU_BOOT_PRIMARY bl __cpu_setup // initialise processor b __primary_switch -SYM_CODE_END(stext) +SYM_CODE_END(primary_entry) /* * Preserve the arguments passed by the bootloader in x0 .. x3 @@ -394,13 +394,19 @@ SYM_FUNC_START_LOCAL(__create_page_tables) /* * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate the idmap and swapper page - * tables again to remove any speculatively loaded cache lines. + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. 
*/ + dmb sy + adrp x0, idmap_pg_dir + adrp x1, idmap_pg_end + sub x1, x1, x0 + bl __inval_dcache_area + + adrp x0, init_pg_dir adrp x1, init_pg_end sub x1, x1, x0 - dmb sy bl __inval_dcache_area ret x28 @@ -417,6 +423,10 @@ SYM_FUNC_START_LOCAL(__primary_switched) adr_l x5, init_task msr sp_el0, x5 // Save thread_info +#ifdef CONFIG_ARM64_PTR_AUTH + __ptrauth_keys_init_cpu x5, x6, x7, x8 +#endif + adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -424,6 +434,10 @@ SYM_FUNC_START_LOCAL(__primary_switched) stp xzr, x30, [sp, #-16]! mov x29, sp +#ifdef CONFIG_SHADOW_CALL_STACK + adr_l scs_sp, init_shadow_call_stack // Set shadow call stack +#endif + str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between @@ -717,7 +731,6 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. */ bl __cpu_secondary_check52bitva - mov x0, #ARM64_CPU_BOOT_SECONDARY bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir bl __enable_mmu @@ -737,8 +750,14 @@ SYM_FUNC_START_LOCAL(__secondary_switched) ldr x2, [x0, #CPU_BOOT_TASK] cbz x2, __secondary_too_slow msr sp_el0, x2 + scs_load x2, x3 mov x29, #0 mov x30, #0 + +#ifdef CONFIG_ARM64_PTR_AUTH + ptrauth_keys_init_cpu x2, x3, x4, x5 +#endif + b secondary_start_kernel SYM_FUNC_END(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 6532105b3e32..8ccca660034e 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -65,7 +65,7 @@ * x5: physical address of a zero page that remains zero after resume */ .pushsection ".hibernate_exit.text", "ax" -ENTRY(swsusp_arch_suspend_exit) +SYM_CODE_START(swsusp_arch_suspend_exit) /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page @@ -110,7 +110,7 @@ ENTRY(swsusp_arch_suspend_exit) cbz x24, 3f /* Do we need to re-initialise EL2? */ hvc #0 3: ret -ENDPROC(swsusp_arch_suspend_exit) +SYM_CODE_END(swsusp_arch_suspend_exit) /* * Restore the hyp stub. @@ -119,15 +119,15 @@ ENDPROC(swsusp_arch_suspend_exit) * * x24: The physical address of __hyp_stub_vectors */ -el1_sync: +SYM_CODE_START_LOCAL(el1_sync) msr vbar_el2, x24 eret -ENDPROC(el1_sync) +SYM_CODE_END(el1_sync) .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -141,7 +141,7 @@ ENDPROC(\label) /* el2 vectors - switch el2 here while we restore the memory image. 
*/ .align 11 -ENTRY(hibernate_el2_vectors) +SYM_CODE_START(hibernate_el2_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t @@ -161,6 +161,6 @@ ENTRY(hibernate_el2_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -END(hibernate_el2_vectors) +SYM_CODE_END(hibernate_el2_vectors) .popsection diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index e473ead806ed..160f5881a0b7 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -21,7 +21,7 @@ .align 11 -ENTRY(__hyp_stub_vectors) +SYM_CODE_START(__hyp_stub_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t @@ -41,11 +41,11 @@ ENTRY(__hyp_stub_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -ENDPROC(__hyp_stub_vectors) +SYM_CODE_END(__hyp_stub_vectors) .align 11 -el1_sync: +SYM_CODE_START_LOCAL(el1_sync) cmp x0, #HVC_SET_VECTORS b.ne 2f msr vbar_el2, x1 @@ -68,12 +68,12 @@ el1_sync: 9: mov x0, xzr eret -ENDPROC(el1_sync) +SYM_CODE_END(el1_sync) .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -106,15 +106,15 @@ ENDPROC(\label) * initialisation entry point. */ -ENTRY(__hyp_set_vectors) +SYM_FUNC_START(__hyp_set_vectors) mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 ret -ENDPROC(__hyp_set_vectors) +SYM_FUNC_END(__hyp_set_vectors) -ENTRY(__hyp_reset_vectors) +SYM_FUNC_START(__hyp_reset_vectors) mov x0, #HVC_RESET_VECTORS hvc #0 ret -ENDPROC(__hyp_reset_vectors) +SYM_FUNC_END(__hyp_reset_vectors) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 7f06ad93fc95..be0a63ffed23 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -13,7 +13,7 @@ #ifdef CONFIG_EFI __efistub_kernel_size = _edata - _text; -__efistub_stext_offset = stext - _text; +__efistub_primary_entry_offset = primary_entry - _text; /* diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 4a9e773a177f..684d871ae38d 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -51,21 +51,33 @@ enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; } -/* NOP is an alias of HINT */ -bool __kprobes aarch64_insn_is_nop(u32 insn) +bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) { if (!aarch64_insn_is_hint(insn)) return false; switch (insn & 0xFE0) { - case AARCH64_INSN_HINT_YIELD: - case AARCH64_INSN_HINT_WFE: - case AARCH64_INSN_HINT_WFI: - case AARCH64_INSN_HINT_SEV: - case AARCH64_INSN_HINT_SEVL: - return false; - default: + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_AUTIA_1716: + case AARCH64_INSN_HINT_AUTIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_AUTIAZ: + case AARCH64_INSN_HINT_AUTIASP: + case AARCH64_INSN_HINT_AUTIBZ: + case AARCH64_INSN_HINT_AUTIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: return true; + default: + return false; } } 
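
The insn.c hunk above replaces the old NOP-only test with a whitelist of HINT-space instructions that are safe to single-step out of line: PAC and BTI instructions are architecturally hints, so kprobes can now step over them, while YIELD/WFE/WFI/SEV/SEVL stay blocked. As a rough illustration, the following minimal user-space sketch mirrors the shape of aarch64_insn_is_steppable_hint(); the constant names, the abbreviated case list, and the main() harness are illustrative, not the kernel's. The HINT immediate (CRm:op2) lives in bits [11:5] of the encoding, which is why the switch masks with 0xFE0:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define HINT_MASK  0xFFFFF01Fu  /* fixed bits of the HINT #imm encoding */
    #define HINT_VALUE 0xD503201Fu  /* imm occupies bits [11:5] */

    static bool insn_is_hint(uint32_t insn)
    {
            return (insn & HINT_MASK) == HINT_VALUE;
    }

    static bool insn_is_steppable_hint(uint32_t insn)
    {
            if (!insn_is_hint(insn))
                    return false;

            switch (insn & 0xFE0) {         /* CRm:op2, already shifted */
            case 0x000:                     /* NOP */
            case 0x320:                     /* PACIASP */
            case 0x400:                     /* BTI */
            case 0x440:                     /* BTI c */
                    /* (the kernel whitelists the full PAC/BTI hint set) */
                    return true;
            default:                        /* YIELD, WFE, WFI, SEV, SEVL */
                    return false;
            }
    }

    int main(void)
    {
            printf("NOP     -> %d\n", insn_is_steppable_hint(0xD503201F)); /* 1 */
            printf("PACIASP -> %d\n", insn_is_steppable_hint(0xD503233F)); /* 1 */
            printf("WFI     -> %d\n", insn_is_steppable_hint(0xD503207F)); /* 0 */
            return 0;
    }

The immediate values match the AARCH64_INSN_HINT_* definitions the hunk adds (NOP = 0x0, PACIASP = 0x320, BTI = 0x400, BTI c = 0x440), so a steppable hint is exactly one that executes as a plain hint when run from an XOL slot.
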
@@ -574,7 +586,7 @@ u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, offset >> 2); } -u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op) +u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) { return aarch64_insn_get_hint_value() | op; } @@ -1535,16 +1547,10 @@ static u32 aarch64_encode_immediate(u64 imm, u32 insn) { unsigned int immr, imms, n, ones, ror, esz, tmp; - u64 mask = ~0UL; - - /* Can't encode full zeroes or full ones */ - if (!imm || !~imm) - return AARCH64_BREAK_FAULT; + u64 mask; switch (variant) { case AARCH64_INSN_VARIANT_32BIT: - if (upper_32_bits(imm)) - return AARCH64_BREAK_FAULT; esz = 32; break; case AARCH64_INSN_VARIANT_64BIT: @@ -1556,6 +1562,12 @@ static u32 aarch64_encode_immediate(u64 imm, return AARCH64_BREAK_FAULT; } + mask = GENMASK(esz - 1, 0); + + /* Can't encode full zeroes, full ones, or value wider than the mask */ + if (!imm || imm == mask || imm & ~mask) + return AARCH64_BREAK_FAULT; + /* * Inverse of Replicate(). Try to spot a repeating pattern * with a pow2 stride. diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index b40c3b0def92..522e6f517ec0 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -138,12 +138,12 @@ static int setup_dtb(struct kimage *image, /* add rng-seed */ if (rng_is_initialized()) { - u8 rng_seed[RNG_SEED_SIZE]; - get_random_bytes(rng_seed, RNG_SEED_SIZE); - ret = fdt_setprop(dtb, off, FDT_PROP_RNG_SEED, rng_seed, - RNG_SEED_SIZE); + void *rng_seed; + ret = fdt_setprop_placeholder(dtb, off, FDT_PROP_RNG_SEED, + RNG_SEED_SIZE, &rng_seed); if (ret) goto out; + get_random_bytes(rng_seed, RNG_SEED_SIZE); } else { pr_notice("RNG is not initialised: omitting \"%s\" property\n", FDT_PROP_RNG_SEED); @@ -284,7 +284,7 @@ int load_other_segments(struct kimage *image, image->arch.elf_headers_sz = headers_sz; pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_headers_mem, headers_sz, headers_sz); + image->arch.elf_headers_mem, kbuf.bufsz, kbuf.memsz); } /* load initrd */ @@ -305,7 +305,7 @@ int load_other_segments(struct kimage *image, initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - initrd_load_addr, initrd_len, initrd_len); + initrd_load_addr, kbuf.bufsz, kbuf.memsz); } /* load dtb */ @@ -332,7 +332,7 @@ int load_other_segments(struct kimage *image, image->arch.dtb_mem = kbuf.mem; pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kbuf.mem, dtb_len, dtb_len); + kbuf.mem, kbuf.bufsz, kbuf.memsz); return 0; diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 1ef702b0be2d..295d66490584 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -120,7 +120,7 @@ static bool has_pv_steal_clock(void) struct arm_smccc_res res; /* To detect the presence of PV time support we require SMCCC 1.1+ */ - if (psci_ops.smccc_version < SMCCC_VERSION_1_1) + if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) return false; arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index b78fac9e546c..263d5fba4c8a 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -46,7 +46,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * except for the NOP case. 
*/ if (aarch64_insn_is_hint(insn)) - return aarch64_insn_is_nop(insn); + return aarch64_insn_is_steppable_hint(insn); return true; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 45dce03aaeaf..890ca72c5a51 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -61,7 +61,7 @@ ldp x28, x29, [sp, #S_X28] .endm -ENTRY(kretprobe_trampoline) +SYM_CODE_START(kretprobe_trampoline) sub sp, sp, #S_FRAME_SIZE save_all_base_regs @@ -79,4 +79,4 @@ ENTRY(kretprobe_trampoline) add sp, sp, #S_FRAME_SIZE ret -ENDPROC(kretprobe_trampoline) +SYM_CODE_END(kretprobe_trampoline) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 56be4cbf771f..eade7807e819 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -11,6 +11,7 @@ #include <linux/compat.h> #include <linux/efi.h> +#include <linux/elf.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> @@ -18,6 +19,7 @@ #include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/lockdep.h> +#include <linux/mman.h> #include <linux/mm.h> #include <linux/stddef.h> #include <linux/sysctl.h> @@ -209,6 +211,15 @@ void machine_restart(char *cmd) while (1); } +#define bstr(suffix, str) [PSR_BTYPE_ ## suffix >> PSR_BTYPE_SHIFT] = str +static const char *const btypes[] = { + bstr(NONE, "--"), + bstr( JC, "jc"), + bstr( C, "-c"), + bstr( J , "j-") +}; +#undef bstr + static void print_pstate(struct pt_regs *regs) { u64 pstate = regs->pstate; @@ -227,7 +238,10 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_AA32_I_BIT ? 'I' : 'i', pstate & PSR_AA32_F_BIT ? 'F' : 'f'); } else { - printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO)\n", + const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >> + PSR_BTYPE_SHIFT]; + + printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO BTYPE=%s)\n", pstate, pstate & PSR_N_BIT ? 'N' : 'n', pstate & PSR_Z_BIT ? 'Z' : 'z', @@ -238,7 +252,8 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_I_BIT ? 'I' : 'i', pstate & PSR_F_BIT ? 'F' : 'f', pstate & PSR_PAN_BIT ? '+' : '-', - pstate & PSR_UAO_BIT ? '+' : '-'); + pstate & PSR_UAO_BIT ? '+' : '-', + btype_str); } } @@ -655,3 +670,25 @@ asmlinkage void __sched arm64_preempt_schedule_irq(void) if (system_capabilities_finalized()) preempt_schedule_irq(); } + +#ifdef CONFIG_BINFMT_ELF +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + /* + * For dynamically linked executables the interpreter is + * responsible for setting PROT_BTI on everything except + * itself. 
+ */ + if (is_interp != has_interp) + return prot; + + if (!(state->flags & ARM64_ELF_BTI)) + return prot; + + if (prot & PROT_EXEC) + prot |= PROT_BTI; + + return prot; +} +#endif diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index e7b01904f180..76790a5f2a0d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1875,7 +1875,7 @@ void syscall_trace_exit(struct pt_regs *regs) */ #define SPSR_EL1_AARCH64_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ - GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5)) + GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) diff --git a/arch/arm64/kernel/reloc_test_syms.S b/arch/arm64/kernel/reloc_test_syms.S index 16a34f188f26..c50f45fa29fa 100644 --- a/arch/arm64/kernel/reloc_test_syms.S +++ b/arch/arm64/kernel/reloc_test_syms.S @@ -5,81 +5,81 @@ #include <linux/linkage.h> -ENTRY(absolute_data64) +SYM_FUNC_START(absolute_data64) ldr x0, 0f ret 0: .quad sym64_abs -ENDPROC(absolute_data64) +SYM_FUNC_END(absolute_data64) -ENTRY(absolute_data32) +SYM_FUNC_START(absolute_data32) ldr w0, 0f ret 0: .long sym32_abs -ENDPROC(absolute_data32) +SYM_FUNC_END(absolute_data32) -ENTRY(absolute_data16) +SYM_FUNC_START(absolute_data16) adr x0, 0f ldrh w0, [x0] ret 0: .short sym16_abs, 0 -ENDPROC(absolute_data16) +SYM_FUNC_END(absolute_data16) -ENTRY(signed_movw) +SYM_FUNC_START(signed_movw) movz x0, #:abs_g2_s:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(signed_movw) +SYM_FUNC_END(signed_movw) -ENTRY(unsigned_movw) +SYM_FUNC_START(unsigned_movw) movz x0, #:abs_g3:sym64_abs movk x0, #:abs_g2_nc:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(unsigned_movw) +SYM_FUNC_END(unsigned_movw) .align 12 .space 0xff8 -ENTRY(relative_adrp) +SYM_FUNC_START(relative_adrp) adrp x0, sym64_rel add x0, x0, #:lo12:sym64_rel ret -ENDPROC(relative_adrp) +SYM_FUNC_END(relative_adrp) .align 12 .space 0xffc -ENTRY(relative_adrp_far) +SYM_FUNC_START(relative_adrp_far) adrp x0, memstart_addr add x0, x0, #:lo12:memstart_addr ret -ENDPROC(relative_adrp_far) +SYM_FUNC_END(relative_adrp_far) -ENTRY(relative_adr) +SYM_FUNC_START(relative_adr) adr x0, sym64_rel ret -ENDPROC(relative_adr) +SYM_FUNC_END(relative_adr) -ENTRY(relative_data64) +SYM_FUNC_START(relative_data64) adr x1, 0f ldr x0, [x1] add x0, x0, x1 ret 0: .quad sym64_rel - . -ENDPROC(relative_data64) +SYM_FUNC_END(relative_data64) -ENTRY(relative_data32) +SYM_FUNC_START(relative_data32) adr x1, 0f ldr w0, [x1] add x0, x0, x1 ret 0: .long sym64_rel - . -ENDPROC(relative_data32) +SYM_FUNC_END(relative_data32) -ENTRY(relative_data16) +SYM_FUNC_START(relative_data16) adr x1, 0f ldrsh w0, [x1] add x0, x0, x1 ret 0: .short sym64_rel - ., 0 -ENDPROC(relative_data16) +SYM_FUNC_END(relative_data16) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index c40ce496c78b..542d6edc6806 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -26,7 +26,7 @@ * control_code_page, a special page which has been set up to be preserved * during the copy operation. */ -ENTRY(arm64_relocate_new_kernel) +SYM_CODE_START(arm64_relocate_new_kernel) /* Setup the list loop variables. 
*/ mov x18, x2 /* x18 = dtb address */ @@ -111,7 +111,7 @@ ENTRY(arm64_relocate_new_kernel) mov x3, xzr br x17 -ENDPROC(arm64_relocate_new_kernel) +SYM_CODE_END(arm64_relocate_new_kernel) .align 3 /* To keep the 64-bit values below naturally aligned. */ diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c new file mode 100644 index 000000000000..e8f7ff45dd8f --- /dev/null +++ b/arch/arm64/kernel/scs.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include <linux/percpu.h> +#include <linux/scs.h> + +DEFINE_SCS(irq_shadow_call_stack); + +#ifdef CONFIG_ARM_SDE_INTERFACE +DEFINE_SCS(sdei_shadow_call_stack_normal); +DEFINE_SCS(sdei_shadow_call_stack_critical); +#endif diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index d6259dac62b6..dab88260b137 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -95,19 +95,7 @@ static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); unsigned long high = low + SDEI_STACK_SIZE; - if (!low) - return false; - - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_NORMAL; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_SDEI_NORMAL, info); } static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) @@ -115,19 +103,7 @@ static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); unsigned long high = low + SDEI_STACK_SIZE; - if (!low) - return false; - - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_CRITICAL; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_SDEI_CRITICAL, info); } bool _on_sdei_stack(unsigned long sp, struct stack_info *info) @@ -251,22 +227,12 @@ asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; - bool do_nmi_exit = false; - /* - * nmi_enter() deals with printk() re-entrance and use of RCU when - * RCU believed this CPU was idle. Because critical events can - * interrupt normal events, we may already be in_nmi(). - */ - if (!in_nmi()) { - nmi_enter(); - do_nmi_exit = true; - } + nmi_enter(); ret = _sdei_handler(regs, arg); - if (do_nmi_exit) - nmi_exit(); + nmi_exit(); return ret; } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 339882db5a91..801d56cdf701 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -732,6 +732,22 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->pc = (unsigned long)ka->sa.sa_handler; + /* + * Signal delivery is a (wacky) indirect function call in + * userspace, so simulate the same setting of BTYPE as a BLR + * <register containing the signal handler entry point>. + * Signal delivery to a location in a PROT_BTI guarded page + * that is not a function entry point will now trigger a + * SIGILL in userspace. + * + * If the signal handler entry point is not in a PROT_BTI + * guarded page, this is harmless. 
+ */ + if (system_supports_bti()) { + regs->pstate &= ~PSR_BTYPE_MASK; + regs->pstate |= PSR_BTYPE_C; + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 7b2f2e650c44..ba40d57757d6 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -62,7 +62,7 @@ * * x0 = struct sleep_stack_data area */ -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] @@ -95,23 +95,22 @@ ENTRY(__cpu_suspend_enter) ldp x29, lr, [sp], #16 mov x0, #1 ret -ENDPROC(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" -ENTRY(cpu_resume) +SYM_CODE_START(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly - mov x0, #ARM64_CPU_RUNTIME bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir bl __enable_mmu ldr x8, =_cpu_resume br x8 -ENDPROC(cpu_resume) +SYM_CODE_END(cpu_resume) .ltorg .popsection -ENTRY(_cpu_resume) +SYM_FUNC_START(_cpu_resume) mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -147,4 +146,4 @@ ENTRY(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -ENDPROC(_cpu_resume) +SYM_FUNC_END(_cpu_resume) diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 54655273d1e0..1f93809528a4 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -30,9 +30,9 @@ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_smc) +SYM_FUNC_START(__arm_smccc_smc) SMCCC smc -ENDPROC(__arm_smccc_smc) +SYM_FUNC_END(__arm_smccc_smc) EXPORT_SYMBOL(__arm_smccc_smc) /* @@ -41,7 +41,7 @@ EXPORT_SYMBOL(__arm_smccc_smc) * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_hvc) +SYM_FUNC_START(__arm_smccc_hvc) SMCCC hvc -ENDPROC(__arm_smccc_hvc) +SYM_FUNC_END(__arm_smccc_hvc) EXPORT_SYMBOL(__arm_smccc_hvc) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index bb813d06114a..04b1ca0d7aba 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. 
*/ -int cpus_stuck_in_kernel; +static int cpus_stuck_in_kernel; enum ipi_msg_type { IPI_RESCHEDULE, @@ -114,10 +114,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) */ secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; -#if defined(CONFIG_ARM64_PTR_AUTH) - secondary_data.ptrauth_key.apia.lo = idle->thread.keys_kernel.apia.lo; - secondary_data.ptrauth_key.apia.hi = idle->thread.keys_kernel.apia.hi; -#endif update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -140,10 +136,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; secondary_data.stack = NULL; -#if defined(CONFIG_ARM64_PTR_AUTH) - secondary_data.ptrauth_key.apia.lo = 0; - secondary_data.ptrauth_key.apia.hi = 0; -#endif __flush_dcache_area(&secondary_data, sizeof(secondary_data)); status = READ_ONCE(secondary_data.status); if (status == CPU_MMU_OFF) diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index a12c0c88d345..5f5b868292f5 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -98,6 +98,24 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; + /* + * BTI note: + * The architecture does not guarantee that SPSR.BTYPE is zero + * on taking an SVC, so we could return to userspace with a + * non-zero BTYPE after the syscall. + * + * This shouldn't matter except when userspace is explicitly + * doing something stupid, such as setting PROT_BTI on a page + * that lacks conforming BTI/PACIxSP instructions, falling + * through from one executable page to another with differing + * PROT_BTI, or messing with BTYPE via ptrace: in such cases, + * userspace should not be surprised if a SIGILL occurs on + * syscall return. + * + * So, don't touch regs->pstate & PSR_BTYPE_MASK here. + * (Similarly for HVC and SMC elsewhere.) + */ + cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); user_exit(); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cf402be5c573..d332590f5978 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -272,6 +272,61 @@ void arm64_notify_die(const char *str, struct pt_regs *regs, } } +#ifdef CONFIG_COMPAT +#define PSTATE_IT_1_0_SHIFT 25 +#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) +#define PSTATE_IT_7_2_SHIFT 10 +#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) + +static u32 compat_get_it_state(struct pt_regs *regs) +{ + u32 it, pstate = regs->pstate; + + it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; + it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; + + return it; +} + +static void compat_set_it_state(struct pt_regs *regs, u32 it) +{ + u32 pstate_it; + + pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; + pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; + + regs->pstate &= ~PSR_AA32_IT_MASK; + regs->pstate |= pstate_it; +} + +static void advance_itstate(struct pt_regs *regs) +{ + u32 it; + + /* ARM mode */ + if (!(regs->pstate & PSR_AA32_T_BIT) || + !(regs->pstate & PSR_AA32_IT_MASK)) + return; + + it = compat_get_it_state(regs); + + /* + * If this is the last instruction of the block, wipe the IT + * state. Otherwise advance it. 
+ */ + if (!(it & 7)) + it = 0; + else + it = (it & 0xe0) | ((it << 1) & 0x1f); + + compat_set_it_state(regs, it); +} +#else +static void advance_itstate(struct pt_regs *regs) +{ +} +#endif + void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) { regs->pc += size; @@ -282,6 +337,11 @@ void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) */ if (user_mode(regs)) user_fastforward_single_step(current); + + if (compat_user_mode(regs)) + advance_itstate(regs); + else + regs->pstate &= ~PSR_BTYPE_MASK; } static LIST_HEAD(undef_hook); @@ -411,6 +471,13 @@ void do_undefinstr(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_undefinstr); +void do_bti(struct pt_regs *regs) +{ + BUG_ON(!user_mode(regs)); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); +} +NOKPROBE_SYMBOL(do_bti); + #define __user_cache_maint(insn, address, res) \ if (address >= user_addr_max()) { \ res = -EFAULT; \ @@ -566,34 +633,7 @@ static const struct sys64_hook sys64_hooks[] = { {}, }; - #ifdef CONFIG_COMPAT -#define PSTATE_IT_1_0_SHIFT 25 -#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) -#define PSTATE_IT_7_2_SHIFT 10 -#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) - -static u32 compat_get_it_state(struct pt_regs *regs) -{ - u32 it, pstate = regs->pstate; - - it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; - it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; - - return it; -} - -static void compat_set_it_state(struct pt_regs *regs, u32 it) -{ - u32 pstate_it; - - pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; - pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; - - regs->pstate &= ~PSR_AA32_IT_MASK; - regs->pstate |= pstate_it; -} - static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) { int cond; @@ -614,42 +654,12 @@ static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) return aarch32_opcode_cond_checks[cond](regs->pstate); } -static void advance_itstate(struct pt_regs *regs) -{ - u32 it; - - /* ARM mode */ - if (!(regs->pstate & PSR_AA32_T_BIT) || - !(regs->pstate & PSR_AA32_IT_MASK)) - return; - - it = compat_get_it_state(regs); - - /* - * If this is the last instruction of the block, wipe the IT - * state. Otherwise advance it. - */ - if (!(it & 7)) - it = 0; - else - it = (it & 0xe0) | ((it << 1) & 0x1f); - - compat_set_it_state(regs, it); -} - -static void arm64_compat_skip_faulting_instruction(struct pt_regs *regs, - unsigned int sz) -{ - advance_itstate(regs); - arm64_skip_faulting_instruction(regs, sz); -} - static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; pt_regs_write_reg(regs, reg, arch_timer_get_rate()); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_32_hooks[] = { @@ -669,7 +679,7 @@ static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_64_hooks[] = { @@ -690,7 +700,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs) * There is no T16 variant of a CP access, so we * always advance PC by 4 bytes. 
*/ - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); return; } @@ -753,6 +763,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", + [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", @@ -906,17 +917,13 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) { - const bool was_in_nmi = in_nmi(); - - if (!was_in_nmi) - nmi_enter(); + nmi_enter(); /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - if (!was_in_nmi) - nmi_exit(); + nmi_exit(); } asmlinkage void enter_from_user_mode(void) @@ -1047,11 +1054,11 @@ int __init early_brk64(unsigned long addr, unsigned int esr, return bug_handler(regs, esr) != DBG_HOOK_HANDLED; } -/* This registration must happen early, before debug_traps_init(). */ void __init trap_init(void) { register_kernel_break_hook(&bug_break_hook); #ifdef CONFIG_KASAN_SW_TAGS register_kernel_break_hook(&kasan_break_hook); #endif + debug_traps_init(); } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 033a48f30dbb..d51a898fd60f 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -33,20 +33,14 @@ extern char vdso_start[], vdso_end[]; extern char vdso32_start[], vdso32_end[]; #endif /* CONFIG_COMPAT_VDSO */ -/* vdso_lookup arch_index */ -enum arch_vdso_type { - ARM64_VDSO = 0, +enum vdso_abi { + VDSO_ABI_AA64, #ifdef CONFIG_COMPAT_VDSO - ARM64_VDSO32 = 1, + VDSO_ABI_AA32, #endif /* CONFIG_COMPAT_VDSO */ }; -#ifdef CONFIG_COMPAT_VDSO -#define VDSO_TYPES (ARM64_VDSO32 + 1) -#else -#define VDSO_TYPES (ARM64_VDSO + 1) -#endif /* CONFIG_COMPAT_VDSO */ -struct __vdso_abi { +struct vdso_abi_info { const char *name; const char *vdso_code_start; const char *vdso_code_end; @@ -57,14 +51,14 @@ struct __vdso_abi { struct vm_special_mapping *cm; }; -static struct __vdso_abi vdso_lookup[VDSO_TYPES] __ro_after_init = { - { +static struct vdso_abi_info vdso_info[] __ro_after_init = { + [VDSO_ABI_AA64] = { .name = "vdso", .vdso_code_start = vdso_start, .vdso_code_end = vdso_end, }, #ifdef CONFIG_COMPAT_VDSO - { + [VDSO_ABI_AA32] = { .name = "vdso32", .vdso_code_start = vdso32_start, .vdso_code_end = vdso32_end, @@ -81,13 +75,13 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int __vdso_remap(enum arch_vdso_type arch_index, +static int __vdso_remap(enum vdso_abi abi, const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_lookup[arch_index].vdso_code_end - - vdso_lookup[arch_index].vdso_code_start; + unsigned long vdso_size = vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start; if (vdso_size != new_size) return -EINVAL; @@ -97,24 +91,24 @@ static int __vdso_remap(enum arch_vdso_type arch_index, return 0; } -static int __vdso_init(enum arch_vdso_type arch_index) +static int __vdso_init(enum vdso_abi abi) { int i; struct page **vdso_pagelist; unsigned long pfn; - if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) { + if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) { pr_err("vDSO is not a valid ELF object!\n"); return -EINVAL; } - 
vdso_lookup[arch_index].vdso_pages = ( - vdso_lookup[arch_index].vdso_code_end - - vdso_lookup[arch_index].vdso_code_start) >> + vdso_info[abi].vdso_pages = ( + vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1, + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) @@ -125,26 +119,27 @@ static int __vdso_init(enum arch_vdso_type arch_index) /* Grab the vDSO code pages. */ - pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start); + pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); - for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++) + for (i = 0; i < vdso_info[abi].vdso_pages; i++) vdso_pagelist[i + 1] = pfn_to_page(pfn + i); - vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0]; - vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1]; + vdso_info[abi].dm->pages = &vdso_pagelist[0]; + vdso_info[abi].cm->pages = &vdso_pagelist[1]; return 0; } -static int __setup_additional_pages(enum arch_vdso_type arch_index, +static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, int uses_interp) { unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + unsigned long gp_flags = 0; void *ret; - vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT; + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ vdso_mapping_len = vdso_text_len + PAGE_SIZE; @@ -156,16 +151,19 @@ static int __setup_additional_pages(enum arch_vdso_type arch_index, ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, VM_READ|VM_MAYREAD, - vdso_lookup[arch_index].dm); + vdso_info[abi].dm); if (IS_ERR(ret)) goto up_fail; + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) + gp_flags = VM_ARM64_BTI; + vdso_base += PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_lookup[arch_index].cm); + vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -184,46 +182,42 @@ up_fail: static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - return __vdso_remap(ARM64_VDSO32, sm, new_vma); + return __vdso_remap(VDSO_ABI_AA32, sm, new_vma); } #endif /* CONFIG_COMPAT_VDSO */ -/* - * aarch32_vdso_pages: - * 0 - kuser helpers - * 1 - sigreturn code - * or (CONFIG_COMPAT_VDSO): - * 0 - kuser helpers - * 1 - vdso data - * 2 - vdso code - */ -#define C_VECTORS 0 +enum aarch32_map { + AA32_MAP_VECTORS, /* kuser helpers */ #ifdef CONFIG_COMPAT_VDSO -#define C_VVAR 1 -#define C_VDSO 2 -#define C_PAGES (C_VDSO + 1) + AA32_MAP_VVAR, + AA32_MAP_VDSO, #else -#define C_SIGPAGE 1 -#define C_PAGES (C_SIGPAGE + 1) -#endif /* CONFIG_COMPAT_VDSO */ -static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init; -static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { - { + AA32_MAP_SIGPAGE +#endif +}; + +static struct page *aarch32_vectors_page __ro_after_init; +#ifndef CONFIG_COMPAT_VDSO +static struct page *aarch32_sig_page __ro_after_init; +#endif + +static struct vm_special_mapping aarch32_vdso_maps[] = { + [AA32_MAP_VECTORS] = { .name = "[vectors]", /* ABI */ - .pages = &aarch32_vdso_pages[C_VECTORS], + .pages = &aarch32_vectors_page, }, #ifdef CONFIG_COMPAT_VDSO - { + [AA32_MAP_VVAR] = { .name = "[vvar]", }, - { + 
[AA32_MAP_VDSO] = { .name = "[vdso]", .mremap = aarch32_vdso_mremap, }, #else - { + [AA32_MAP_SIGPAGE] = { .name = "[sigpage]", /* ABI */ - .pages = &aarch32_vdso_pages[C_SIGPAGE], + .pages = &aarch32_sig_page, }, #endif /* CONFIG_COMPAT_VDSO */ }; @@ -243,8 +237,8 @@ static int aarch32_alloc_kuser_vdso_page(void) memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, kuser_sz); - aarch32_vdso_pages[C_VECTORS] = virt_to_page(vdso_page); - flush_dcache_page(aarch32_vdso_pages[C_VECTORS]); + aarch32_vectors_page = virt_to_page(vdso_page); + flush_dcache_page(aarch32_vectors_page); return 0; } @@ -253,10 +247,10 @@ static int __aarch32_alloc_vdso_pages(void) { int ret; - vdso_lookup[ARM64_VDSO32].dm = &aarch32_vdso_spec[C_VVAR]; - vdso_lookup[ARM64_VDSO32].cm = &aarch32_vdso_spec[C_VDSO]; + vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; + vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; - ret = __vdso_init(ARM64_VDSO32); + ret = __vdso_init(VDSO_ABI_AA32); if (ret) return ret; @@ -275,8 +269,8 @@ static int __aarch32_alloc_vdso_pages(void) return -ENOMEM; memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz); - aarch32_vdso_pages[C_SIGPAGE] = virt_to_page(sigpage); - flush_dcache_page(aarch32_vdso_pages[C_SIGPAGE]); + aarch32_sig_page = virt_to_page(sigpage); + flush_dcache_page(aarch32_sig_page); ret = aarch32_alloc_kuser_vdso_page(); if (ret) @@ -306,7 +300,7 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, - &aarch32_vdso_spec[C_VECTORS]); + &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); } @@ -330,7 +324,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - &aarch32_vdso_spec[C_SIGPAGE]); + &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; @@ -354,7 +348,7 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; #ifdef CONFIG_COMPAT_VDSO - ret = __setup_additional_pages(ARM64_VDSO32, + ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, uses_interp); @@ -371,22 +365,19 @@ out: static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - return __vdso_remap(ARM64_VDSO, sm, new_vma); + return __vdso_remap(VDSO_ABI_AA64, sm, new_vma); } -/* - * aarch64_vdso_pages: - * 0 - vvar - * 1 - vdso - */ -#define A_VVAR 0 -#define A_VDSO 1 -#define A_PAGES (A_VDSO + 1) -static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { - { +enum aarch64_map { + AA64_MAP_VVAR, + AA64_MAP_VDSO, +}; + +static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { + [AA64_MAP_VVAR] = { .name = "[vvar]", }, - { + [AA64_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, }, @@ -394,10 +385,10 @@ static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { static int __init vdso_init(void) { - vdso_lookup[ARM64_VDSO].dm = &vdso_spec[A_VVAR]; - vdso_lookup[ARM64_VDSO].cm = &vdso_spec[A_VDSO]; + vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; - return __vdso_init(ARM64_VDSO); + return __vdso_init(VDSO_ABI_AA64); } arch_initcall(vdso_init); @@ -410,7 +401,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = 
__setup_additional_pages(ARM64_VDSO, + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 3862cad2410c..556d424c6f52 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -17,15 +17,19 @@ obj-vdso := vgettimeofday.o note.o sigreturn.o targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) +btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti + +# -Bsymbolic has been added for consistency with arm, the compat vDSO and +# potential future proofing if we end up with internal calls to the exported +# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so +# preparation in build-time C")). ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ - --build-id -n -T + -Bsymbolic --eh-frame-hdr --build-id -n $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING -VDSO_LDFLAGS := -Bsymbolic - -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n UBSAN_SANITIZE := n diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S index 0ce6ec75a525..3d4e82290c80 100644 --- a/arch/arm64/kernel/vdso/note.S +++ b/arch/arm64/kernel/vdso/note.S @@ -12,9 +12,12 @@ #include <linux/version.h> #include <linux/elfnote.h> #include <linux/build-salt.h> +#include <asm/assembler.h> ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END BUILD_SALT + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/sigreturn.S b/arch/arm64/kernel/vdso/sigreturn.S index 12324863d5c2..620a3ef837b7 100644 --- a/arch/arm64/kernel/vdso/sigreturn.S +++ b/arch/arm64/kernel/vdso/sigreturn.S @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Sigreturn trampoline for returning from a signal when the SA_RESTORER - * flag is not set. + * flag is not set. It serves primarily as a hall of shame for crappy + * unwinders and features an exciting but mysterious NOP instruction. + * + * It's also fragile as hell, so please think twice before changing anything + * in here. * * Copyright (C) 2012 ARM Limited * @@ -9,18 +13,54 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/unistd.h> .text - nop -SYM_FUNC_START(__kernel_rt_sigreturn) +/* Ensure that the mysterious NOP can be associated with a function. */ .cfi_startproc + +/* + * .cfi_signal_frame causes the corresponding Frame Description Entry in the + * .eh_frame section to be annotated as a signal frame. This allows DWARF + * unwinders (e.g. libstdc++) to implement _Unwind_GetIPInfo(), which permits + * unwinding out of the signal trampoline without the need for the mysterious + * NOP. + */ .cfi_signal_frame - .cfi_def_cfa x29, 0 - .cfi_offset x29, 0 * 8 - .cfi_offset x30, 1 * 8 + +/* + * Tell the unwinder where to locate the frame record linking back to the + * interrupted context. We don't provide unwind info for registers other + * than the frame pointer and the link register here; in practice, this + * is sufficient for unwinding in C/C++ based runtimes and the values in + * the sigcontext may have been modified by this point anyway. Debuggers + * already have baked-in strategies for attempting to unwind out of signals. 
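A rough user-space illustration (not part of this patch) of why the .cfi_signal_frame annotation above matters: GCC-family unwinders expose the signal-frame property through _Unwind_GetIPInfo(), and a consumer that honours its ip_before_insn flag needs no mysterious NOP at all. The control flow below is a sketch, not libstdc++'s actual implementation.

#include <stdint.h>
#include <stdio.h>
#include <unwind.h>

static _Unwind_Reason_Code trace_one(struct _Unwind_Context *ctx, void *arg)
{
	int ip_before_insn = 0;
	uintptr_t ip = _Unwind_GetIPInfo(ctx, &ip_before_insn);

	(void)arg;
	/*
	 * For frames annotated with .cfi_signal_frame, ip_before_insn is
	 * non-zero and ip already identifies the interrupted instruction,
	 * so the usual "subtract one to land inside the caller" adjustment
	 * must be skipped.
	 */
	if (!ip_before_insn)
		ip--;

	printf("frame pc: %#lx\n", (unsigned long)ip);
	return _URC_NO_REASON;
}

int main(void)
{
	_Unwind_Backtrace(trace_one, NULL);
	return 0;
}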
+ */ + .cfi_def_cfa x29, 0 + .cfi_offset x29, 0 * 8 + .cfi_offset x30, 1 * 8 + +/* + * This mysterious NOP is required for some unwinders (e.g. libc++) that + * unconditionally subtract one from the result of _Unwind_GetIP() in order to + * identify the calling function. + * Hack borrowed from arch/powerpc/kernel/vdso64/sigtramp.S. + */ + nop // Mysterious NOP + +/* + * GDB relies on being able to identify the sigreturn instruction sequence to + * unwind from signal handlers. We cannot, therefore, use SYM_FUNC_START() + * here, as it will emit a BTI C instruction and break the unwinder. Thankfully, + * this function is only ever called from a RET and so omitting the landing pad + * is perfectly fine. + */ +SYM_CODE_START(__kernel_rt_sigreturn) mov x8, #__NR_rt_sigreturn svc #0 .cfi_endproc -SYM_FUNC_END(__kernel_rt_sigreturn) +SYM_CODE_END(__kernel_rt_sigreturn) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S index d1414fee5274..c4b1990bf2be 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso/vdso.S @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/linkage.h> #include <linux/const.h> +#include <asm/assembler.h> #include <asm/page.h> .globl vdso_start, vdso_end @@ -19,3 +20,5 @@ vdso_start: vdso_end: .previous + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso32/sigreturn.S b/arch/arm64/kernel/vdso32/sigreturn.S index 620524969696..b0091064c3d6 100644 --- a/arch/arm64/kernel/vdso32/sigreturn.S +++ b/arch/arm64/kernel/vdso32/sigreturn.S @@ -3,6 +3,9 @@ * This file provides both A32 and T32 versions, in accordance with the * arm sigreturn code. * + * Please read the comments in arch/arm64/kernel/vdso/sigreturn.S to + * understand some of the craziness in here. 
+ * * Copyright (C) 2018 ARM Limited */ @@ -17,39 +20,39 @@ .save {r0-r15} .pad #COMPAT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_sigreturn_arm) +SYM_CODE_START(__kernel_sigreturn_arm) mov r7, #__NR_compat_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_sigreturn_arm) +SYM_CODE_END(__kernel_sigreturn_arm) .fnstart .save {r0-r15} .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_rt_sigreturn_arm) +SYM_CODE_START(__kernel_rt_sigreturn_arm) mov r7, #__NR_compat_rt_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_rt_sigreturn_arm) +SYM_CODE_END(__kernel_rt_sigreturn_arm) .thumb .fnstart .save {r0-r15} .pad #COMPAT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_sigreturn_thumb) +SYM_CODE_START(__kernel_sigreturn_thumb) mov r7, #__NR_compat_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_sigreturn_thumb) +SYM_CODE_END(__kernel_sigreturn_thumb) .fnstart .save {r0-r15} .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_rt_sigreturn_thumb) +SYM_CODE_START(__kernel_rt_sigreturn_thumb) mov r7, #__NR_compat_rt_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_rt_sigreturn_thumb) +SYM_CODE_END(__kernel_rt_sigreturn_thumb) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 497f9675071d..3be632177631 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -17,10 +17,6 @@ #include "image.h" -/* .exit.text needed in case of alternative patching */ -#define ARM_EXIT_KEEP(x) x -#define ARM_EXIT_DISCARD(x) - OUTPUT_ARCH(aarch64) ENTRY(_text) @@ -72,8 +68,8 @@ jiffies = jiffies_64; /* * The size of the PE/COFF section that covers the kernel image, which - * runs from stext to _edata, must be a round multiple of the PE/COFF - * FileAlignment, which we set to its minimum value of 0x200. 'stext' + * runs from _stext to _edata, must be a round multiple of the PE/COFF + * FileAlignment, which we set to its minimum value of 0x200. '_stext' * itself is 4 KB aligned, so padding out _edata to a 0x200 aligned * boundary should be sufficient. */ @@ -95,8 +91,6 @@ SECTIONS * order of matching. */ /DISCARD/ : { - ARM_EXIT_DISCARD(EXIT_TEXT) - ARM_EXIT_DISCARD(EXIT_DATA) EXIT_CALL *(.discard) *(.discard.*) @@ -139,6 +133,7 @@ SECTIONS idmap_pg_dir = .; . += IDMAP_DIR_SIZE; + idmap_pg_end = .; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; @@ -161,7 +156,7 @@ SECTIONS __exittext_begin = .; .exit.text : { - ARM_EXIT_KEEP(EXIT_TEXT) + EXIT_TEXT } __exittext_end = .; @@ -175,7 +170,7 @@ SECTIONS *(.altinstr_replacement) } - . = ALIGN(PAGE_SIZE); + . = ALIGN(SEGMENT_ALIGN); __inittext_end = .; __initdata_begin = .; @@ -188,7 +183,7 @@ SECTIONS *(.init.rodata.* .init.bss) /* from the EFI stub */ } .exit.data : { - ARM_EXIT_KEEP(EXIT_DATA) + EXIT_DATA } PERCPU_SECTION(L1_CACHE_BYTES) @@ -246,6 +241,7 @@ SECTIONS . += INIT_DIR_SIZE; init_pg_end = .; + . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. 
- __initdata_begin); _end = .; diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8a1e81a400e0..1336e6f0acdf 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -138,7 +138,7 @@ static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu) write_sysreg(val, cptr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; isb(); @@ -181,7 +181,7 @@ static void deactivate_traps_vhe(void) * above before we can switch to the EL2/EL0 translation regime used by * the host. */ - asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT_VHE)); + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); write_sysreg(vectors, vbar_el1); @@ -192,7 +192,7 @@ static void __hyp_text __deactivate_traps_nvhe(void) { u64 mdcr_el2 = read_sysreg(mdcr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 6d2df9fe0b5d..ea5d22fbdacf 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -107,7 +107,8 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); - if (!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (has_vhe() || + !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR); write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR); } else if (!ctxt->__hyp_running_vcpu) { @@ -138,7 +139,8 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1); write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE) && + if (!has_vhe() && + cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) && ctxt->__hyp_running_vcpu) { /* * Must only be done for host registers, hence the context diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index ceaddbe4279f..d063a576d511 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -23,7 +23,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, local_irq_save(cxt->flags); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* * For CPUs that are affected by ARM errata 1165522 or 1530923, * we cannot trust stage-1 to be in a correct state at that @@ -63,7 +63,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, struct tlb_inv_context *cxt) { - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* @@ -79,8 +79,9 @@ static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, isb(); } + /* __load_guest_stage2() includes an ISB for the workaround. 
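Restated in plain C for orientation (illustrative names, not kernel API): with the VHE and nVHE variants folded into the single ARM64_WORKAROUND_SPECULATIVE_AT capability, the sysreg-sr.c hunks above recover the old split by testing has_vhe() at the call site. The nVHE guest path is the only one that takes the deferred SCTLR_EL1/TCR_EL1 restore:

#include <stdbool.h>

/*
 * Sketch of the restore-path decision in __sysreg_restore_el1_state():
 * restoring_host_context stands in for ctxt->__hyp_running_vcpu != NULL.
 */
static bool defer_el1_mmu_sysreg_restore(bool vhe, bool speculative_at_cap,
					 bool restoring_host_context)
{
	return !vhe && speculative_at_cap && !restoring_host_context;
}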
*/ __load_guest_stage2(kvm); - isb(); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } static void __hyp_text __tlb_switch_to_guest(struct kvm *kvm, @@ -103,7 +104,7 @@ static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm, write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); isb(); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Restore the registers to what they were */ write_sysreg_el1(cxt->tcr, SYS_TCR); write_sysreg_el1(cxt->sctlr, SYS_SCTLR); @@ -117,7 +118,7 @@ static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm, { write_sysreg(0, vttbr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Ensure write of the host VMID */ isb(); /* Restore the host's TCR_EL1 */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 30b7ea680f66..70cd7bcca433 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -46,14 +46,6 @@ static const struct kvm_regs default_regs_reset32 = { PSR_AA32_I_BIT | PSR_AA32_F_BIT), }; -static bool cpu_has_32bit_el1(void) -{ - u64 pfr0; - - pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - return !!(pfr0 & 0x20); -} - /** * kvm_arch_vm_ioctl_check_extension * @@ -66,7 +58,7 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ARM_EL1_32BIT: - r = cpu_has_32bit_el1(); + r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); break; case KVM_CAP_GUEST_DEBUG_HW_BPS: r = get_num_brps(); @@ -288,7 +280,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) switch (vcpu->arch.target) { default: if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - if (!cpu_has_32bit_el1()) + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) goto out; cpu_reset = &default_regs_reset32; } else { @@ -340,11 +332,50 @@ out: return ret; } -void kvm_set_ipa_limit(void) +u32 get_kvm_ipa_limit(void) +{ + return kvm_ipa_limit; +} + +int kvm_set_ipa_limit(void) { - unsigned int ipa_max, pa_max, va_max, parange; + unsigned int ipa_max, pa_max, va_max, parange, tgran_2; + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); + + /* + * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at + * Stage-2. If not, things will stop very quickly. 
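For reference, a minimal stand-alone sketch of what the cpuid_feature_extract_unsigned_field() calls in the reset.c hunk above compute for 4-bit ID register fields (PARange lives in bits [3:0] of ID_AA64MMFR0_EL1, the TGranX_2 fields higher up). The helper name and the hard-coded width are illustrative assumptions, not the kernel's exact implementation.

#include <stdint.h>
#include <stdio.h>

/* Illustrative 4-bit unsigned ID-register field extraction. */
static unsigned int id_field(uint64_t reg, unsigned int shift)
{
	return (unsigned int)((reg >> shift) & 0xf);
}

int main(void)
{
	uint64_t mmfr0 = 0x0000000000000001ULL;	/* made-up register value */

	/* PARange is bits [3:0]: 0b0001 means 36-bit physical addresses. */
	printf("PARange field: %u\n", id_field(mmfr0, 0));
	return 0;
}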
+ */ + switch (PAGE_SIZE) { + default: + case SZ_4K: + tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT; + break; + case SZ_16K: + tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT; + break; + case SZ_64K: + tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT; + break; + } + + switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { + default: + case 1: + kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); + return -EINVAL; + case 0: + kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); + break; + case 2: + kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); + break; + } - parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); /* Clamp the IPA limit to the PA size supported by the kernel */ @@ -378,6 +409,8 @@ void kvm_set_ipa_limit(void) "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); kvm_ipa_limit = ipa_max; kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); + + return 0; } /* @@ -390,7 +423,7 @@ void kvm_set_ipa_limit(void) */ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) { - u64 vtcr = VTCR_EL2_FLAGS; + u64 vtcr = VTCR_EL2_FLAGS, mmfr0; u32 parange, phys_shift; u8 lvls; @@ -406,7 +439,9 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) phys_shift = KVM_PHYS_SHIFT; } - parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); if (parange > ID_AA64MMFR0_PARANGE_MAX) parange = ID_AA64MMFR0_PARANGE_MAX; vtcr |= parange << VTCR_EL2_PS_SHIFT; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 51db934702b6..7d7a39b01135 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1456,9 +1456,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_SANITISED(MVFR1_EL1), ID_SANITISED(MVFR2_EL1), ID_UNALLOCATED(3,3), - ID_UNALLOCATED(3,4), - ID_UNALLOCATED(3,5), - ID_UNALLOCATED(3,6), + ID_SANITISED(ID_PFR2_EL1), + ID_HIDDEN(ID_DFR1_EL1), + ID_SANITISED(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 8e25e89ad01f..0f8a3a9e3795 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -20,36 +20,36 @@ * x0 - bytes not copied */ - .macro ldrb1 ptr, regB, val - uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val + .macro ldrb1 reg, ptr, val + uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val .endm - .macro strb1 ptr, regB, val - strb \ptr, [\regB], \val + .macro strb1 reg, ptr, val + strb \reg, [\ptr], \val .endm - .macro ldrh1 ptr, regB, val - uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val + .macro ldrh1 reg, ptr, val + uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val .endm - .macro strh1 ptr, regB, val - strh \ptr, [\regB], \val + .macro strh1 reg, ptr, val + strh \reg, [\ptr], \val .endm - .macro ldr1 ptr, regB, val - uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val + .macro ldr1 reg, ptr, val + uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val .endm - .macro str1 ptr, regB, val - str \ptr, [\regB], \val + .macro str1 reg, ptr, val + str \reg, [\ptr], \val .endm - .macro ldp1 ptr, regB, regC, val - uao_ldp 9998f, \ptr, \regB, \regC, \val + .macro ldp1 reg1, reg2, ptr, val + uao_ldp 9998f, \reg1, \reg2, \ptr, \val .endm - .macro stp1 ptr, regB, regC, val - stp \ptr, \regB, [\regC], \val + .macro stp1 reg1, reg2, ptr, val + stp \reg1, 
\reg2, [\ptr], \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 667139013ed1..80e37ada0ee1 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -21,36 +21,36 @@ * Returns: * x0 - bytes not copied */ - .macro ldrb1 ptr, regB, val - uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val + .macro ldrb1 reg, ptr, val + uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val .endm - .macro strb1 ptr, regB, val - uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val + .macro strb1 reg, ptr, val + uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val .endm - .macro ldrh1 ptr, regB, val - uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val + .macro ldrh1 reg, ptr, val + uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val .endm - .macro strh1 ptr, regB, val - uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val + .macro strh1 reg, ptr, val + uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val .endm - .macro ldr1 ptr, regB, val - uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val + .macro ldr1 reg, ptr, val + uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val .endm - .macro str1 ptr, regB, val - uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val + .macro str1 reg, ptr, val + uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val .endm - .macro ldp1 ptr, regB, regC, val - uao_ldp 9998f, \ptr, \regB, \regC, \val + .macro ldp1 reg1, reg2, ptr, val + uao_ldp 9998f, \reg1, \reg2, \ptr, \val .endm - .macro stp1 ptr, regB, regC, val - uao_stp 9998f, \ptr, \regB, \regC, \val + .macro stp1 reg1, reg2, ptr, val + uao_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 1a104d0089f3..4ec59704b8f2 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -19,36 +19,36 @@ * Returns: * x0 - bytes not copied */ - .macro ldrb1 ptr, regB, val - ldrb \ptr, [\regB], \val + .macro ldrb1 reg, ptr, val + ldrb \reg, [\ptr], \val .endm - .macro strb1 ptr, regB, val - uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val + .macro strb1 reg, ptr, val + uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val .endm - .macro ldrh1 ptr, regB, val - ldrh \ptr, [\regB], \val + .macro ldrh1 reg, ptr, val + ldrh \reg, [\ptr], \val .endm - .macro strh1 ptr, regB, val - uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val + .macro strh1 reg, ptr, val + uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val .endm - .macro ldr1 ptr, regB, val - ldr \ptr, [\regB], \val + .macro ldr1 reg, ptr, val + ldr \reg, [\ptr], \val .endm - .macro str1 ptr, regB, val - uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val + .macro str1 reg, ptr, val + uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val .endm - .macro ldp1 ptr, regB, regC, val - ldp \ptr, \regB, [\regC], \val + .macro ldp1 reg1, reg2, ptr, val + ldp \reg1, \reg2, [\ptr], \val .endm - .macro stp1 ptr, regB, regC, val - uao_stp 9998f, \ptr, \regB, \regC, \val + .macro stp1 reg1, reg2, ptr, val + uao_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 243e107e9896..0f9e10ecda23 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -9,7 +9,7 @@ #include <asm/alternative.h> #include <asm/assembler.h> - .cpu generic+crc + .arch armv8-a+crc .macro __crc32, c cmp x2, #16 diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 
9f382adfa88a..e0bf83d556f2 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -24,36 +24,36 @@ * Returns: * x0 - dest */ - .macro ldrb1 ptr, regB, val - ldrb \ptr, [\regB], \val + .macro ldrb1 reg, ptr, val + ldrb \reg, [\ptr], \val .endm - .macro strb1 ptr, regB, val - strb \ptr, [\regB], \val + .macro strb1 reg, ptr, val + strb \reg, [\ptr], \val .endm - .macro ldrh1 ptr, regB, val - ldrh \ptr, [\regB], \val + .macro ldrh1 reg, ptr, val + ldrh \reg, [\ptr], \val .endm - .macro strh1 ptr, regB, val - strh \ptr, [\regB], \val + .macro strh1 reg, ptr, val + strh \reg, [\ptr], \val .endm - .macro ldr1 ptr, regB, val - ldr \ptr, [\regB], \val + .macro ldr1 reg, ptr, val + ldr \reg, [\ptr], \val .endm - .macro str1 ptr, regB, val - str \ptr, [\regB], \val + .macro str1 reg, ptr, val + str \reg, [\ptr], \val .endm - .macro ldp1 ptr, regB, regC, val - ldp \ptr, \regB, [\regC], \val + .macro ldp1 reg1, reg2, ptr, val + ldp \reg1, \reg2, [\ptr], \val .endm - .macro stp1 ptr, regB, regC, val - stp \ptr, \regB, [\regC], \val + .macro stp1 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr], \val .endm .weak memcpy diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 9b26f9a88724..d702d60e64da 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -92,6 +92,9 @@ static void set_reserved_asid_bits(void) bitmap_clear(asid_map, 0, NUM_USER_ASIDS); } +#define asid_gen_match(asid) \ + (!(((asid) ^ atomic64_read(&asid_generation)) >> asid_bits)) + static void flush_context(void) { int i; @@ -220,8 +223,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) * because atomic RmWs are totally ordered for a given location. */ old_active_asid = atomic64_read(&per_cpu(active_asids, cpu)); - if (old_active_asid && - !((asid ^ atomic64_read(&asid_generation)) >> asid_bits) && + if (old_active_asid && asid_gen_match(asid) && atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu), old_active_asid, asid)) goto switch_mm_fastpath; @@ -229,7 +231,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_lock_irqsave(&cpu_asid_lock, flags); /* Check that our ASID belongs to the current generation. 
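The asid_gen_match() macro in the context.c hunk above is compact enough to deserve a spelled-out version. A stand-alone sketch with a fixed example width (the kernel probes 8 or 16 bits at boot; names here are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ASID_BITS 16	/* example width only */

/*
 * An "asid" value carries the allocation generation in its upper bits.
 * XOR-ing with the current generation and shifting out the low
 * ASID_BITS leaves zero exactly when the generations agree, so one
 * XOR-and-shift replaces a mask-and-compare.
 */
static bool asid_gen_match(uint64_t asid, uint64_t generation)
{
	return !((asid ^ generation) >> ASID_BITS);
}

int main(void)
{
	uint64_t gen = 3ULL << ASID_BITS;

	printf("%d\n", asid_gen_match((3ULL << ASID_BITS) | 42, gen)); /* 1 */
	printf("%d\n", asid_gen_match((2ULL << ASID_BITS) | 42, gen)); /* 0 */
	return 0;
}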
*/ asid = atomic64_read(&mm->context.id); - if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) { + if (!asid_gen_match(asid)) { asid = new_context(mm); atomic64_set(&mm->context.id, asid); } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index d4313bc0c4c1..0da020c563e6 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -146,6 +146,11 @@ static const struct prot_bits pte_bits[] = { .set = "UXN", .clear = " ", }, { + .mask = PTE_GP, + .val = PTE_GP, + .set = "GP", + .clear = " ", + }, { .mask = PTE_ATTRINDX_MASK, .val = PTE_ATTRINDX(MT_DEVICE_nGnRnE), .set = "DEVICE/nGnRnE", diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e42727e3568e..d2df416b840e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -272,7 +272,7 @@ int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - if (!valid_section(__nr_to_section(pfn_to_section_nr(pfn)))) + if (!valid_section(__pfn_to_section(pfn))) return 0; #endif return memblock_is_map_memory(addr); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a374e4f51a62..c299b73dd5e4 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -610,6 +610,22 @@ core_initcall(map_entry_trampoline); #endif /* + * Open coded check for BTI, only for use to determine configuration + * for early mappings for before the cpufeature code has run. + */ +static bool arm64_early_this_cpu_has_bti(void) +{ + u64 pfr1; + + if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + return false; + + pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1); + return cpuid_feature_extract_unsigned_field(pfr1, + ID_AA64PFR1_BT_SHIFT); +} + +/* * Create fine-grained mappings for the kernel. */ static void __init map_kernel(pgd_t *pgdp) @@ -625,6 +641,14 @@ static void __init map_kernel(pgd_t *pgdp) pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + if (arm64_early_this_cpu_has_bti()) + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + + /* * Only rodata will be remapped with different permissions later on, * all other segments are allowed to use contiguous mappings. */ diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 250c49008d73..bde08090b838 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -126,13 +126,13 @@ int set_memory_nx(unsigned long addr, int numpages) { return change_memory_common(addr, numpages, __pgprot(PTE_PXN), - __pgprot(0)); + __pgprot(PTE_MAYBE_GP)); } int set_memory_x(unsigned long addr, int numpages) { return change_memory_common(addr, numpages, - __pgprot(0), + __pgprot(PTE_MAYBE_GP), __pgprot(PTE_PXN)); } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 197a9ba2d5ea..b7bebb12a56d 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -58,6 +58,8 @@ * cpu_do_suspend - save CPU registers context * * x0: virtual address of context pointer + * + * This must be kept in sync with struct cpu_suspend_ctx in <asm/suspend.h>. */ SYM_FUNC_START(cpu_do_suspend) mrs x2, tpidr_el0 @@ -82,6 +84,11 @@ alternative_endif stp x8, x9, [x0, #48] stp x10, x11, [x0, #64] stp x12, x13, [x0, #80] + /* + * Save x18 as it may be used as a platform register, e.g. by shadow + * call stack. 
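The store offsets in cpu_do_suspend (and the str x18, [x0, #96] that follows) are only meaningful against the layout of struct cpu_suspend_ctx in <asm/suspend.h>. Roughly, as a sketch; the field name and the register count of 13 are assumptions for illustration:

#include <stdint.h>

/*
 * Approximate shape of the suspend context. Offset #96 in the assembly
 * is 12 * 8, i.e. the 13th slot, which now holds x18 so that a
 * shadow-call-stack pointer survives a suspend/resume cycle.
 */
struct cpu_suspend_ctx_sketch {
	uint64_t ctx_regs[13];	/* tpidr_el0 .. x18 at offsets #0..#96 */
	uint64_t sp;
} __attribute__((aligned(16)));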
+ */ + str x18, [x0, #96] ret SYM_FUNC_END(cpu_do_suspend) @@ -98,6 +105,13 @@ SYM_FUNC_START(cpu_do_resume) ldp x9, x10, [x0, #48] ldp x11, x12, [x0, #64] ldp x13, x14, [x0, #80] + /* + * Restore x18, as it may be used as a platform register, and clear + * the buffer to minimize the risk of exposure when used for shadow + * call stack. + */ + ldr x18, [x0, #96] + str xzr, [x0, #96] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -139,7 +153,7 @@ alternative_if ARM64_HAS_RAS_EXTN msr_s SYS_DISR_EL1, xzr alternative_else_nop_endif - ptrauth_keys_install_kernel x14, 0, x1, x2, x3 + ptrauth_keys_install_kernel_nosync x14, x1, x2, x3 isb ret SYM_FUNC_END(cpu_do_resume) @@ -386,8 +400,6 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings) * * Initialise the processor for turning the MMU on. * - * Input: - * x0 with a flag ARM64_CPU_BOOT_PRIMARY/ARM64_CPU_BOOT_SECONDARY/ARM64_CPU_RUNTIME. * Output: * Return in x0 the value of the SCTLR_EL1 register. */ @@ -446,51 +458,9 @@ SYM_FUNC_START(__cpu_setup) 1: #endif /* CONFIG_ARM64_HW_AFDBM */ msr tcr_el1, x10 - mov x1, x0 /* * Prepare SCTLR */ mov_q x0, SCTLR_EL1_SET - -#ifdef CONFIG_ARM64_PTR_AUTH - /* No ptrauth setup for run time cpus */ - cmp x1, #ARM64_CPU_RUNTIME - b.eq 3f - - /* Check if the CPU supports ptrauth */ - mrs x2, id_aa64isar1_el1 - ubfx x2, x2, #ID_AA64ISAR1_APA_SHIFT, #8 - cbz x2, 3f - - /* - * The primary cpu keys are reset here and can be - * re-initialised with some proper values later. - */ - msr_s SYS_APIAKEYLO_EL1, xzr - msr_s SYS_APIAKEYHI_EL1, xzr - - /* Just enable ptrauth for primary cpu */ - cmp x1, #ARM64_CPU_BOOT_PRIMARY - b.eq 2f - - /* if !system_supports_address_auth() then skip enable */ -alternative_if_not ARM64_HAS_ADDRESS_AUTH - b 3f -alternative_else_nop_endif - - /* Install ptrauth key for secondary cpus */ - adr_l x2, secondary_data - ldr x3, [x2, #CPU_BOOT_TASK] // get secondary_data.task - cbz x3, 2f // check for slow booting cpus - ldp x3, x4, [x2, #CPU_BOOT_PTRAUTH_KEY] - msr_s SYS_APIAKEYLO_EL1, x3 - msr_s SYS_APIAKEYHI_EL1, x4 - -2: /* Enable ptrauth instructions */ - ldr x2, =SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ - SCTLR_ELx_ENDA | SCTLR_ELx_ENDB - orr x0, x0, x2 -3: -#endif ret // return to head.S SYM_FUNC_END(__cpu_setup) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index eb73f9f72c46..cc0cf0f5c7c3 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -100,6 +100,14 @@ /* Rd = Rn OP imm12 */ #define A64_ADD_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD) #define A64_SUB_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB) +#define A64_ADDS_I(sf, Rd, Rn, imm12) \ + A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD_SETFLAGS) +#define A64_SUBS_I(sf, Rd, Rn, imm12) \ + A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB_SETFLAGS) +/* Rn + imm12; set condition flags */ +#define A64_CMN_I(sf, Rn, imm12) A64_ADDS_I(sf, A64_ZR, Rn, imm12) +/* Rn - imm12; set condition flags */ +#define A64_CMP_I(sf, Rn, imm12) A64_SUBS_I(sf, A64_ZR, Rn, imm12) /* Rd = Rn */ #define A64_MOV(sf, Rd, Rn) A64_ADD_I(sf, Rd, Rn, 0) @@ -189,4 +197,26 @@ /* Rn & Rm; set condition flags */ #define A64_TST(sf, Rn, Rm) A64_ANDS(sf, A64_ZR, Rn, Rm) +/* Logical (immediate) */ +#define A64_LOGIC_IMM(sf, Rd, Rn, imm, type) ({ \ + u64 imm64 = (sf) ? 
(u64)imm : (u64)(u32)imm; \ + aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_##type, \ + A64_VARIANT(sf), Rn, Rd, imm64); \ +}) +/* Rd = Rn OP imm */ +#define A64_AND_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND) +#define A64_ORR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, ORR) +#define A64_EOR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, EOR) +#define A64_ANDS_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND_SETFLAGS) +/* Rn & imm; set condition flags */ +#define A64_TST_I(sf, Rn, imm) A64_ANDS_I(sf, A64_ZR, Rn, imm) + +/* HINTs */ +#define A64_HINT(x) aarch64_insn_gen_hint(x) + +/* BTI */ +#define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC) +#define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ) +#define A64_BTI_JC A64_HINT(AARCH64_INSN_HINT_BTIJC) + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index cdc79de0c794..3cb25b43b368 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -167,11 +167,21 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) return to - from; } +static bool is_addsub_imm(u32 imm) +{ + /* Either imm12 or shifted imm12. */ + return !(imm & ~0xfff) || !(imm & ~0xfff000); +} + /* Stack must be multiples of 16B */ #define STACK_ALIGN(sz) (((sz) + 15) & ~15) /* Tail call offset to jump into */ +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) +#define PROLOGUE_OFFSET 8 +#else #define PROLOGUE_OFFSET 7 +#endif static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) { @@ -208,6 +218,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * */ + /* BTI landing pad */ + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + emit(A64_BTI_C, ctx); + /* Save FP and LR registers to stay align with ARM64 AAPCS */ emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx); emit(A64_MOV(1, A64_FP, A64_SP), ctx); @@ -230,6 +244,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) cur_offset, PROLOGUE_OFFSET); return -1; } + + /* BTI landing pad for the tail call, done with a BR */ + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + emit(A64_BTI_J, ctx); } ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth); @@ -356,6 +374,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const bool isdw = BPF_SIZE(code) == BPF_DW; u8 jmp_cond, reg; s32 jmp_offset; + u32 a64_insn; #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -478,28 +497,55 @@ emit_bswap_uxt: /* dst = dst OP imm */ case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_ADD | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_ADD(is64, dst, dst, tmp), ctx); + if (is_addsub_imm(imm)) { + emit(A64_ADD_I(is64, dst, dst, imm), ctx); + } else if (is_addsub_imm(-imm)) { + emit(A64_SUB_I(is64, dst, dst, -imm), ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_ADD(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_SUB(is64, dst, dst, tmp), ctx); + if (is_addsub_imm(imm)) { + emit(A64_SUB_I(is64, dst, dst, imm), ctx); + } else if (is_addsub_imm(-imm)) { + emit(A64_ADD_I(is64, dst, dst, -imm), ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_SUB(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_AND(is64, dst, dst, tmp), ctx); + a64_insn = A64_AND_I(is64, dst, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { 
+ emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_AND(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_ORR(is64, dst, dst, tmp), ctx); + a64_insn = A64_ORR_I(is64, dst, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_ORR(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_EOR(is64, dst, dst, tmp), ctx); + a64_insn = A64_EOR_I(is64, dst, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_EOR(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: @@ -623,13 +669,24 @@ emit_cond_jmp: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_CMP(is64, dst, tmp), ctx); + if (is_addsub_imm(imm)) { + emit(A64_CMP_I(is64, dst, imm), ctx); + } else if (is_addsub_imm(-imm)) { + emit(A64_CMN_I(is64, dst, -imm), ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_CMP(is64, dst, tmp), ctx); + } goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_TST(is64, dst, tmp), ctx); + a64_insn = A64_TST_I(is64, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_TST(is64, dst, tmp), ctx); + } goto emit_cond_jmp; /* function call */ case BPF_JMP | BPF_CALL: diff --git a/arch/c6x/lib/checksum.c b/arch/c6x/lib/checksum.c index 46940844c553..335ca4900808 100644 --- a/arch/c6x/lib/checksum.c +++ b/arch/c6x/lib/checksum.c @@ -4,28 +4,6 @@ #include <linux/module.h> #include <net/checksum.h> -#include <asm/byteorder.h> - -/* - * copy from fs while checksumming, otherwise like csum_partial - */ -__wsum -csum_partial_copy_from_user(const void __user *src, void *dst, int len, - __wsum sum, int *csum_err) -{ - int missing; - - missing = __copy_from_user(dst, src, len); - if (missing) { - memset(dst + len - missing, 0, missing); - *csum_err = -EFAULT; - } else - *csum_err = 0; - - return csum_partial(dst, len, sum); -} -EXPORT_SYMBOL(csum_partial_copy_from_user); - /* These are from csum_64plus.S */ EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(csum_partial_copy); diff --git a/arch/ia64/include/asm/checksum.h b/arch/ia64/include/asm/checksum.h index 0ed18bc3f6cf..2a1c64629cdc 100644 --- a/arch/ia64/include/asm/checksum.h +++ b/arch/ia64/include/asm/checksum.h @@ -37,16 +37,6 @@ extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); -/* - * Same as csum_partial, but copies from src while it checksums. - * - * Here it is even more important to align src and dst on a 32-bit (or - * even better 64-bit) boundary. 
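Returning to the bpf_jit_comp.c hunk above: the encodability test is worth spelling out. AArch64 ADD/SUB (immediate) takes a 12-bit value, optionally shifted left by 12, so only 0x0..0xfff and 4 KiB multiples up to 0xfff000 fit in a single instruction. A stand-alone restatement:

#include <stdbool.h>
#include <stdint.h>

/* True when imm encodes as an AArch64 ADD/SUB immediate: a plain
 * 12-bit value, or a 12-bit value shifted left by 12. */
static bool is_addsub_imm(uint32_t imm)
{
	return !(imm & ~0xfffU) || !(imm & ~0xfff000U);
}

Negative constants are handled in the JIT by flipping the operation (ADD with -imm becomes SUB), while the logical-immediate macros take the other escape hatch: A64_LOGIC_IMM() yields AARCH64_BREAK_FAULT when the constant is not a valid bitmask immediate, and the JIT falls back to materialising it in a temporary register.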
- */ -extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, - int *errp); - extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f69f3fe0532e..a54eacbc61a9 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -57,12 +57,12 @@ unsigned long hcdp_phys = EFI_INVALID_TABLE_ADDR; unsigned long sal_systab_phys = EFI_INVALID_TABLE_ADDR; static const efi_config_table_type_t arch_tables[] __initconst = { - {ESI_TABLE_GUID, "ESI", &esi_phys}, - {HCDP_TABLE_GUID, "HCDP", &hcdp_phys}, - {MPS_TABLE_GUID, "MPS", &mps_phys}, - {PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, "PALO", &palo_phys}, - {SAL_SYSTEM_TABLE_GUID, "SALsystab", &sal_systab_phys}, - {NULL_GUID, NULL, 0}, + {ESI_TABLE_GUID, &esi_phys, "ESI" }, + {HCDP_TABLE_GUID, &hcdp_phys, "HCDP" }, + {MPS_TABLE_GUID, &mps_phys, "MPS" }, + {PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, &palo_phys, "PALO" }, + {SAL_SYSTEM_TABLE_GUID, &sal_systab_phys, "SALsystab" }, + {}, }; extern efi_status_t efi_call_phys (void *, ...); diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 042911e670b8..49e325b604b3 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -358,3 +358,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c index bf9396b1ed32..5d147a33d648 100644 --- a/arch/ia64/lib/csum_partial_copy.c +++ b/arch/ia64/lib/csum_partial_copy.c @@ -103,39 +103,11 @@ out: * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. * But it's very tricky to get right even in C. */ -extern unsigned long do_csum(const unsigned char *, long); - -__wsum -csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum psum, int *errp) -{ - unsigned long result; - - /* XXX Fixme - * for now we separate the copy from checksum for obvious - * alignment difficulties. Look at the Alpha code and you'll be - * scared. - */ - - if (__copy_from_user(dst, src, len) != 0 && errp) - *errp = -EFAULT; - - result = do_csum(dst, len); - - /* add in old sum, and carry.. 
*/ - result += (__force u32)psum; - /* 32+c bits -> 32 bits */ - result = (result & 0xffffffff) + (result >> 32); - return (__force __wsum)result; -} - -EXPORT_SYMBOL(csum_partial_copy_from_user); - __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) { - return csum_partial_copy_from_user((__force const void __user *)src, - dst, len, sum, NULL); + memcpy(dst, src, len); + return csum_partial(dst, len, sum); } EXPORT_SYMBOL(csum_partial_copy_nocheck); diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index c32ab8041cf6..4eb911d64e8d 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -221,6 +221,7 @@ static void __init amiga_identify(void) case AMI_1200: AMIGAHW_SET(A1200_IDE); AMIGAHW_SET(PCMCIA); + fallthrough; case AMI_500: case AMI_500PLUS: case AMI_1000: @@ -233,7 +234,7 @@ static void __init amiga_identify(void) case AMI_3000T: AMIGAHW_SET(AMBER_FF); AMIGAHW_SET(MAGIC_REKICK); - /* fall through */ + fallthrough; case AMI_3000PLUS: AMIGAHW_SET(A3000_SCSI); AMIGAHW_SET(A3000_CLK); @@ -242,7 +243,7 @@ static void __init amiga_identify(void) case AMI_4000T: AMIGAHW_SET(A4000_SCSI); - /* fall through */ + fallthrough; case AMI_4000: AMIGAHW_SET(A4000_IDE); AMIGAHW_SET(A3000_CLK); @@ -628,7 +629,7 @@ struct savekmsg { unsigned long magic2; /* SAVEKMSG_MAGIC2 */ unsigned long magicptr; /* address of magic1 */ unsigned long size; - char data[0]; + char data[]; }; static struct savekmsg *savekmsg; diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 5b3a273ae3da..888b75e7fd79 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -100,7 +100,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -381,6 +380,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -452,6 +452,7 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -472,6 +473,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -619,9 +621,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 0bf0907a7c80..45303846b659 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -96,7 +96,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -360,6 +359,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -408,6 +408,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -428,6 
+429,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -575,9 +577,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 876e69292294..de824c1bc3d3 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -103,7 +103,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -376,6 +375,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -430,6 +430,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -450,6 +451,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -597,9 +599,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index aa59c242e715..071839ca6a59 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -93,7 +93,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -358,6 +357,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -401,6 +401,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -421,6 +422,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -568,9 +570,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 308cd93929a9..37ac7b019ec1 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -95,7 +95,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -359,6 +358,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m 
+CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -410,6 +410,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -430,6 +431,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -577,9 +579,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0bc210ace870..608779866260 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -94,7 +94,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -375,6 +374,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -432,6 +432,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -452,6 +453,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -599,9 +601,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 3b3b832dee80..0abb53c38c20 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -114,7 +114,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -419,6 +418,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -518,6 +518,7 @@ CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -538,6 +539,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -685,9 +687,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index e3633c66926f..cb14c234d3ad 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -92,7 +92,6
@@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -357,6 +356,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -400,6 +400,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -420,6 +421,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -567,9 +569,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 88b3f7f9f146..e8a1920aded7 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -93,7 +93,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -358,6 +357,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -401,6 +401,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -421,6 +422,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -568,9 +570,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 3dd5b536921e..2cbf416fc725 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -94,7 +94,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -365,6 +364,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -419,6 +419,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -439,6 +440,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -586,9 +588,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m 
CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 715e015ed270..fed3cc7abcc4 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -90,7 +90,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -355,6 +354,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -403,6 +403,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -423,6 +424,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -570,8 +572,10 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index f9ff129ac7c2..0954fde256e6 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -90,7 +90,6 @@ CONFIG_NF_CONNTRACK_SANE=m CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_SET=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_NUMGEN=m @@ -355,6 +354,7 @@ CONFIG_IPVLAN=m CONFIG_IPVTAP=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_BAREUDP=m CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -402,6 +402,7 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y @@ -422,6 +423,7 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_PROC_KCORE=y CONFIG_PROC_CHILDREN=y CONFIG_TMPFS=y @@ -569,9 +571,11 @@ CONFIG_XZ_DEC_TEST=m CONFIG_STRING_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_TEST_LIST_SORT=m +CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_SORT=m CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m diff --git a/arch/m68k/include/asm/checksum.h b/arch/m68k/include/asm/checksum.h index f9b94e4b94f9..3f2c15d6f18c 100644 --- a/arch/m68k/include/asm/checksum.h +++ b/arch/m68k/include/asm/checksum.h @@ -30,7 +30,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum); * better 64-bit) boundary */ -extern __wsum csum_partial_copy_from_user(const void __user *src, +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *csum_err); diff --git a/arch/m68k/include/asm/mac_via.h b/arch/m68k/include/asm/mac_via.h index de1470c4d829..1149251ea58d 100644 --- a/arch/m68k/include/asm/mac_via.h +++ b/arch/m68k/include/asm/mac_via.h @@ -257,6 +257,7 @@ extern int rbv_present,via_alt_mapping; struct irq_desc; +extern void via_l2_flush(int writeback); extern void via_register_interrupts(void); extern void via_irq_enable(int); extern void via_irq_disable(int); 
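The small-looking uaccess_mm.h fix in the next hunk is a sparse annotation: __user marks a pointer as living in the user address space, and sparse refuses to let such a pointer be dereferenced directly. A minimal illustration of the rule (hypothetical helper, not from this patch):

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Hypothetical helper: uptr is tagged __user, so sparse rejects a bare
 * *uptr dereference; copy_from_user() is the sanctioned accessor. */
static int read_user_word(const u32 __user *uptr, u32 *out)
{
	return copy_from_user(out, uptr, sizeof(*out)) ? -EFAULT : 0;
}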
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h index 7e85de984df1..9ae9f8d05925 100644 --- a/arch/m68k/include/asm/uaccess_mm.h +++ b/arch/m68k/include/asm/uaccess_mm.h @@ -142,7 +142,7 @@ asm volatile ("\n" \ __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ break; \ case 8: { \ - const void *__gu_ptr = (ptr); \ + const void __user *__gu_ptr = (ptr); \ union { \ u64 l; \ __typeof__(*(ptr)) t; \ diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f4f49fcb76d0..f71b1bbcc198 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 435 common clone3 __sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/m68k/lib/checksum.c b/arch/m68k/lib/checksum.c index 5fa3d392e181..31797be9a3dc 100644 --- a/arch/m68k/lib/checksum.c +++ b/arch/m68k/lib/checksum.c @@ -129,7 +129,7 @@ EXPORT_SYMBOL(csum_partial); */ __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *csum_err) { /* @@ -316,7 +316,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, return(sum); } -EXPORT_SYMBOL(csum_partial_copy_from_user); +EXPORT_SYMBOL(csum_and_copy_from_user); /* diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 611f73bfc87c..d0126ab01360 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -59,7 +59,6 @@ extern void iop_preinit(void); extern void iop_init(void); extern void via_init(void); extern void via_init_clock(irq_handler_t func); -extern void via_flush_cache(void); extern void oss_init(void); extern void psc_init(void); extern void baboon_init(void); @@ -130,21 +129,6 @@ int __init mac_parse_bootinfo(const struct bi_record *record) return unknown; } -/* - * Flip into 24bit mode for an instant - flushes the L2 cache card. We - * have to disable interrupts for this. Our IRQ handlers will crap - * themselves if they take an IRQ in 24bit mode! - */ - -static void mac_cache_card_flush(int writeback) -{ - unsigned long flags; - - local_irq_save(flags); - via_flush_cache(); - local_irq_restore(flags); -} - void __init config_mac(void) { if (!MACH_IS_MAC) @@ -175,9 +159,8 @@ void __init config_mac(void) * not. */ - if (macintosh_config->ident == MAC_MODEL_IICI - || macintosh_config->ident == MAC_MODEL_IIFX) - mach_l2_flush = mac_cache_card_flush; + if (macintosh_config->ident == MAC_MODEL_IICI) + mach_l2_flush = via_l2_flush; } diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index 9bfa17015768..d3775afb0f07 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -299,7 +299,6 @@ void __init iop_init(void) /* * Register the interrupt handler for the IOPs. - * TODO: might be wrong for non-OSS machines. Anyone? 
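The iop_ism_irq() rework just below converts a check-once handler into a loop that re-samples the status register after servicing. The pattern, reduced to a stand-alone sketch with invented names (EV_SEND/EV_RECV stand in for IOP_INT0/IOP_INT1, status_reg for iop->status_ctrl):

#include <stdint.h>

#define EV_SEND 0x01
#define EV_RECV 0x02

static volatile uint8_t status_reg;

static void service(uint8_t events)
{
	/* acknowledging and walking the message channels would go here */
	status_reg &= (uint8_t)~events;
}

int main(void)
{
	uint8_t events;

	status_reg = EV_SEND | EV_RECV;	/* pretend both events latched */

	events = status_reg & (EV_SEND | EV_RECV);
	/*
	 * Re-sample after servicing so an event that latches while the
	 * handler runs is processed in the same invocation instead of
	 * being lost until the next (possibly never-arriving) edge.
	 */
	do {
		service(events);
		events = status_reg & (EV_SEND | EV_RECV);
	} while (events);

	return 0;
}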
*/ void __init iop_register_interrupts(void) @@ -566,36 +565,42 @@ irqreturn_t iop_ism_irq(int irq, void *dev_id) uint iop_num = (uint) dev_id; volatile struct mac_iop *iop = iop_base[iop_num]; int i,state; + u8 events = iop->status_ctrl & (IOP_INT0 | IOP_INT1); iop_pr_debug("status %02X\n", iop->status_ctrl); - /* INT0 indicates a state change on an outgoing message channel */ - - if (iop->status_ctrl & IOP_INT0) { - iop->status_ctrl = IOP_INT0 | IOP_RUN | IOP_AUTOINC; - iop_pr_debug("new status %02X, send states", iop->status_ctrl); - for (i = 0 ; i < NUM_IOP_CHAN ; i++) { - state = iop_readb(iop, IOP_ADDR_SEND_STATE + i); - iop_pr_cont(" %02X", state); - if (state == IOP_MSG_COMPLETE) { - iop_handle_send(iop_num, i); + do { + /* INT0 indicates state change on an outgoing message channel */ + if (events & IOP_INT0) { + iop->status_ctrl = IOP_INT0 | IOP_RUN | IOP_AUTOINC; + iop_pr_debug("new status %02X, send states", + iop->status_ctrl); + for (i = 0; i < NUM_IOP_CHAN; i++) { + state = iop_readb(iop, IOP_ADDR_SEND_STATE + i); + iop_pr_cont(" %02X", state); + if (state == IOP_MSG_COMPLETE) + iop_handle_send(iop_num, i); } + iop_pr_cont("\n"); } - iop_pr_cont("\n"); - } - if (iop->status_ctrl & IOP_INT1) { /* INT1 for incoming msgs */ - iop->status_ctrl = IOP_INT1 | IOP_RUN | IOP_AUTOINC; - iop_pr_debug("new status %02X, recv states", iop->status_ctrl); - for (i = 0 ; i < NUM_IOP_CHAN ; i++) { - state = iop_readb(iop, IOP_ADDR_RECV_STATE + i); - iop_pr_cont(" %02X", state); - if (state == IOP_MSG_NEW) { - iop_handle_recv(iop_num, i); + /* INT1 for incoming messages */ + if (events & IOP_INT1) { + iop->status_ctrl = IOP_INT1 | IOP_RUN | IOP_AUTOINC; + iop_pr_debug("new status %02X, recv states", + iop->status_ctrl); + for (i = 0; i < NUM_IOP_CHAN; i++) { + state = iop_readb(iop, IOP_ADDR_RECV_STATE + i); + iop_pr_cont(" %02X", state); + if (state == IOP_MSG_NEW) + iop_handle_recv(iop_num, i); } + iop_pr_cont("\n"); } - iop_pr_cont("\n"); - } + + events = iop->status_ctrl & (IOP_INT0 | IOP_INT1); + } while (events); + return IRQ_HANDLED; } diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 3c2cfcb74982..1f0fad2a98a0 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -294,10 +294,14 @@ void via_debug_dump(void) * the system into 24-bit mode for an instant. 
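The iop_ism_irq() rework above turns two one-shot checks into a service loop that re-samples INT0/INT1 after each pass, so an event raised while a channel is being drained is not lost. The skeleton of that pattern, with hypothetical names::

  #define EV_OUT 0x40                     /* made-up status bits */
  #define EV_IN  0x80
  static void drain_outgoing(void);
  static void drain_incoming(void);

  static irqreturn_t replaying_irq(void __iomem *status)
  {
          u8 events = readb(status) & (EV_OUT | EV_IN);

          do {
                  if (events & EV_OUT)
                          drain_outgoing();
                  if (events & EV_IN)
                          drain_incoming();
                  /* re-sample: new work may have arrived meanwhile */
                  events = readb(status) & (EV_OUT | EV_IN);
          } while (events);

          return IRQ_HANDLED;
  }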
*/ -void via_flush_cache(void) +void via_l2_flush(int writeback) { + unsigned long flags; + + local_irq_save(flags); via2[gBufB] &= ~VIA2B_vMode32; via2[gBufB] |= VIA2B_vMode32; + local_irq_restore(flags); } /* diff --git a/arch/m68k/tools/amiga/dmesg.c b/arch/m68k/tools/amiga/dmesg.c index 7340f5b6cf6d..f8005a7efb0b 100644 --- a/arch/m68k/tools/amiga/dmesg.c +++ b/arch/m68k/tools/amiga/dmesg.c @@ -34,7 +34,7 @@ struct savekmsg { u_long magic2; /* SAVEKMSG_MAGIC2 */ u_long magicptr; /* address of magic1 */ u_long size; - char data[0]; + char data[]; }; diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 4c67b11f9c9e..edacc4561f2b 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 1f9e8ad636cc..f777141f5256 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -376,3 +376,4 @@ 435 n32 clone3 __sys_clone3 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd +439 n32 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c0b9d802dbf6..da8c76394e17 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -352,3 +352,4 @@ 435 n64 clone3 __sys_clone3 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd +439 n64 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index ac586774c980..13280625d312 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -425,3 +425,4 @@ 435 o32 clone3 __sys_clone3 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd +439 o32 faccessat2 sys_faccessat2 diff --git a/arch/nios2/include/asm/checksum.h b/arch/nios2/include/asm/checksum.h index 703c5ee63421..ec39698d3bea 100644 --- a/arch/nios2/include/asm/checksum.h +++ b/arch/nios2/include/asm/checksum.h @@ -14,8 +14,6 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum); -extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, int *csum_err); #define csum_partial_copy_nocheck(src, dst, len, sum) \ csum_partial_copy((src), (dst), (len), (sum)) diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h index c1c22819a04d..fe8c63b2d2c3 100644 --- a/arch/parisc/include/asm/checksum.h +++ b/arch/parisc/include/asm/checksum.h @@ -27,13 +27,6 @@ extern __wsum csum_partial(const void *, int, __wsum); extern __wsum csum_partial_copy_nocheck(const void *, void *, int, __wsum); /* - * this is a new version of the above that records errors it finds in *errp, - * but continues and zeros the rest of the buffer. - */ -extern __wsum csum_partial_copy_from_user(const void __user *src, - void *dst, int len, __wsum sum, int *errp); - -/* * Optimized for IP headers, which always checksum on 4 octet boundaries. 
* * Written by Randolph Chung <tausq@debian.org>, and then mucked with by diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index e1a8fee3ad49..d46b6709ec56 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -300,7 +300,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf, else return -EFAULT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; if (count != sizeof(uint32_t)) diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 52a15f5cd130..5a758fa6ec52 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -435,3 +435,4 @@ 435 common clone3 sys_clone3_wrapper 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c index 256322c7b648..c6f161583549 100644 --- a/arch/parisc/lib/checksum.c +++ b/arch/parisc/lib/checksum.c @@ -123,23 +123,3 @@ __wsum csum_partial_copy_nocheck(const void *src, void *dst, return sum; } EXPORT_SYMBOL(csum_partial_copy_nocheck); - -/* - * Copy from userspace and compute checksum. If we catch an exception - * then zero the rest of the buffer. - */ -__wsum csum_partial_copy_from_user(const void __user *src, - void *dst, int len, - __wsum sum, int *err_ptr) -{ - int missing; - - missing = copy_from_user(dst, src, len); - if (missing) { - memset(dst + len - missing, 0, missing); - *err_ptr = -EFAULT; - } - - return csum_partial(dst, len, sum); -} -EXPORT_SYMBOL(csum_partial_copy_from_user); diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index be48c2215fa2..a809b1b44ddf 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virt/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.rst */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 220ae11555f2..f833a3190822 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -527,3 +527,4 @@ 435 spu clone3 sys_ni_syscall 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 3fca22276bb1..b44dd75de517 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -441,15 +441,9 @@ nonrecoverable: void system_reset_exception(struct pt_regs *regs) { unsigned long hsrr0, hsrr1; - bool nested = in_nmi(); bool saved_hsrrs = false; - /* - * Avoid crashes in case of nested NMI exceptions. Recoverability - * is determined by RI and in_nmi - */ - if (!nested) - nmi_enter(); + nmi_enter(); /* * System reset can interrupt code where HSRRs are live and MSR[RI]=1. @@ -521,8 +515,7 @@ out: mtspr(SPRN_HSRR1, hsrr1); } - if (!nested) - nmi_exit(); + nmi_exit(); /* What should we do here? We could issue a shutdown or hard reset. 
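The capable(CAP_SYS_ADMIN) to perfmon_capable() conversions in parisc perf.c here (and in imc-pmu.c below) belong to the CAP_PERFMON series: the helper accepts the new, narrower capability while still honouring CAP_SYS_ADMIN. Its definition is roughly::

  static inline bool perfmon_capable(void)
  {
          return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
  }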
*/ } @@ -823,9 +816,8 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { int recover = 0; - bool nested = in_nmi(); - if (!nested) - nmi_enter(); + + nmi_enter(); __this_cpu_inc(irq_stat.mce_exceptions); @@ -851,8 +843,7 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - if (!nested) - nmi_exit(); + nmi_exit(); die("Machine check", regs, SIGBUS); @@ -863,8 +854,7 @@ void machine_check_exception(struct pt_regs *regs) return; bail: - if (!nested) - nmi_exit(); + nmi_exit(); } void SMIException(struct pt_regs *regs) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 31a0f201fb6f..a1706b63b82d 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -90,6 +90,7 @@ SECTIONS #ifdef CONFIG_PPC64 *(.tramp.ftrace.text); #endif + NOINSTR_TEXT SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index eb82dda884e5..0edcfd0b491d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -976,7 +976,7 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* Sampling not supported */ @@ -1412,7 +1412,7 @@ static int trace_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* Return if this is a couting event */ diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 8b3296b62f65..3b75e8f60609 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -21,22 +21,6 @@ #include "spufs.h" -static ssize_t do_coredump_read(int num, struct spu_context *ctx, void *buffer, - size_t size, loff_t *off) -{ - u64 data; - int ret; - - if (spufs_coredump_read[num].read) - return spufs_coredump_read[num].read(ctx, buffer, size, off); - - data = spufs_coredump_read[num].get(ctx); - ret = snprintf(buffer, size, "0x%.16llx", data); - if (ret >= size) - return size; - return ++ret; /* count trailing NULL */ -} - static int spufs_ctx_note_size(struct spu_context *ctx, int dfd) { int i, sz, total = 0; @@ -118,58 +102,43 @@ int spufs_coredump_extra_notes_size(void) static int spufs_arch_write_note(struct spu_context *ctx, int i, struct coredump_params *cprm, int dfd) { - loff_t pos = 0; - int sz, rc, total = 0; - const int bufsz = PAGE_SIZE; - char *name; - char fullname[80], *buf; + size_t sz = spufs_coredump_read[i].size; + char fullname[80]; struct elf_note en; - size_t skip; - - buf = (void *)get_zeroed_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; + size_t ret; - name = spufs_coredump_read[i].name; - sz = spufs_coredump_read[i].size; - - sprintf(fullname, "SPU/%d/%s", dfd, name); + sprintf(fullname, "SPU/%d/%s", dfd, spufs_coredump_read[i].name); en.n_namesz = strlen(fullname) + 1; en.n_descsz = sz; en.n_type = NT_SPU; if (!dump_emit(cprm, &en, sizeof(en))) - goto Eio; - + return -EIO; if (!dump_emit(cprm, fullname, en.n_namesz)) - goto Eio; - + return -EIO; if (!dump_align(cprm, 4)) - goto Eio; - - do { - rc = do_coredump_read(i, ctx, buf, bufsz, &pos); - if (rc > 0) { - if (!dump_emit(cprm, buf, rc)) - goto Eio; - total += rc; - } - } while (rc == bufsz && total < sz); - - if (rc < 0) - goto out; - - skip = 
roundup(cprm->pos - total + sz, 4) - cprm->pos; - if (!dump_skip(cprm, skip)) - goto Eio; - - rc = 0; -out: - free_page((unsigned long)buf); - return rc; -Eio: - free_page((unsigned long)buf); - return -EIO; + return -EIO; + + if (spufs_coredump_read[i].dump) { + ret = spufs_coredump_read[i].dump(ctx, cprm); + if (ret < 0) + return ret; + } else { + char buf[32]; + + ret = snprintf(buf, sizeof(buf), "0x%.16llx", + spufs_coredump_read[i].get(ctx)); + if (ret >= sizeof(buf)) + return sizeof(buf); + + /* count trailing the NULL: */ + if (!dump_emit(cprm, buf, ret + 1)) + return -EIO; + } + + if (!dump_skip(cprm, roundup(cprm->pos - ret + sz, 4) - cprm->pos)) + return -EIO; + return 0; } int spufs_coredump_extra_notes_write(struct coredump_params *cprm) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index c0f950a3f4e1..e44427c24585 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -9,6 +9,7 @@ #undef DEBUG +#include <linux/coredump.h> #include <linux/fs.h> #include <linux/ioctl.h> #include <linux/export.h> @@ -129,6 +130,14 @@ out: return ret; } +static ssize_t spufs_dump_emit(struct coredump_params *cprm, void *buf, + size_t size) +{ + if (!dump_emit(cprm, buf, size)) + return -EIO; + return size; +} + #define DEFINE_SPUFS_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ @@ -172,12 +181,9 @@ spufs_mem_release(struct inode *inode, struct file *file) } static ssize_t -__spufs_mem_read(struct spu_context *ctx, char __user *buffer, - size_t size, loff_t *pos) +spufs_mem_dump(struct spu_context *ctx, struct coredump_params *cprm) { - char *local_store = ctx->ops->get_ls(ctx); - return simple_read_from_buffer(buffer, size, pos, local_store, - LS_SIZE); + return spufs_dump_emit(cprm, ctx->ops->get_ls(ctx), LS_SIZE); } static ssize_t @@ -190,7 +196,8 @@ spufs_mem_read(struct file *file, char __user *buffer, ret = spu_acquire(ctx); if (ret) return ret; - ret = __spufs_mem_read(ctx, buffer, size, pos); + ret = simple_read_from_buffer(buffer, size, pos, ctx->ops->get_ls(ctx), + LS_SIZE); spu_release(ctx); return ret; @@ -459,12 +466,10 @@ spufs_regs_open(struct inode *inode, struct file *file) } static ssize_t -__spufs_regs_read(struct spu_context *ctx, char __user *buffer, - size_t size, loff_t *pos) +spufs_regs_dump(struct spu_context *ctx, struct coredump_params *cprm) { - struct spu_lscsa *lscsa = ctx->csa.lscsa; - return simple_read_from_buffer(buffer, size, pos, - lscsa->gprs, sizeof lscsa->gprs); + return spufs_dump_emit(cprm, ctx->csa.lscsa->gprs, + sizeof(ctx->csa.lscsa->gprs)); } static ssize_t @@ -482,7 +487,8 @@ spufs_regs_read(struct file *file, char __user *buffer, ret = spu_acquire_saved(ctx); if (ret) return ret; - ret = __spufs_regs_read(ctx, buffer, size, pos); + ret = simple_read_from_buffer(buffer, size, pos, ctx->csa.lscsa->gprs, + sizeof(ctx->csa.lscsa->gprs)); spu_release_saved(ctx); return ret; } @@ -517,12 +523,10 @@ static const struct file_operations spufs_regs_fops = { }; static ssize_t -__spufs_fpcr_read(struct spu_context *ctx, char __user * buffer, - size_t size, loff_t * pos) +spufs_fpcr_dump(struct spu_context *ctx, struct coredump_params *cprm) { - struct spu_lscsa *lscsa = ctx->csa.lscsa; - return simple_read_from_buffer(buffer, size, pos, - &lscsa->fpcr, sizeof(lscsa->fpcr)); + return spufs_dump_emit(cprm, &ctx->csa.lscsa->fpcr, + sizeof(ctx->csa.lscsa->fpcr)); } static ssize_t @@ -535,7 +539,8 
@@ spufs_fpcr_read(struct file *file, char __user * buffer, ret = spu_acquire_saved(ctx); if (ret) return ret; - ret = __spufs_fpcr_read(ctx, buffer, size, pos); + ret = simple_read_from_buffer(buffer, size, pos, &ctx->csa.lscsa->fpcr, + sizeof(ctx->csa.lscsa->fpcr)); spu_release_saved(ctx); return ret; } @@ -590,17 +595,12 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; - u32 mbox_data, __user *udata; + u32 mbox_data, __user *udata = (void __user *)buf; ssize_t count; if (len < 4) return -EINVAL; - if (!access_ok(buf, len)) - return -EFAULT; - - udata = (void __user *)buf; - count = spu_acquire(ctx); if (count) return count; @@ -616,7 +616,7 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf, * but still need to return the data we have * read successfully so far. */ - ret = __put_user(mbox_data, udata); + ret = put_user(mbox_data, udata); if (ret) { if (!count) count = -EFAULT; @@ -698,17 +698,12 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; - u32 ibox_data, __user *udata; + u32 ibox_data, __user *udata = (void __user *)buf; ssize_t count; if (len < 4) return -EINVAL; - if (!access_ok(buf, len)) - return -EFAULT; - - udata = (void __user *)buf; - count = spu_acquire(ctx); if (count) goto out; @@ -727,7 +722,7 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf, } /* if we can't write at all, return -EFAULT */ - count = __put_user(ibox_data, udata); + count = put_user(ibox_data, udata); if (count) goto out_unlock; @@ -741,7 +736,7 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf, * but still need to return the data we have * read successfully so far. 
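A pattern worth naming in the spufs file.c hunks here: the open-coded access_ok() checks go away because put_user()/get_user(), unlike the double-underscore variants, validate the user pointer themselves. Schematically (illustrative helpers, not spufs code)::

  static int push_word_before(u32 val, u32 __user *p, size_t len)
  {
          if (!access_ok(p, len))         /* manual range check ... */
                  return -EFAULT;
          return __put_user(val, p);      /* ... then unchecked store */
  }

  static int push_word_after(u32 val, u32 __user *p)
  {
          return put_user(val, p);        /* checked accessor does both */
  }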
*/ - ret = __put_user(ibox_data, udata); + ret = put_user(ibox_data, udata); if (ret) break; } @@ -836,17 +831,13 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; - u32 wbox_data, __user *udata; + u32 wbox_data, __user *udata = (void __user *)buf; ssize_t count; if (len < 4) return -EINVAL; - udata = (void __user *)buf; - if (!access_ok(buf, len)) - return -EFAULT; - - if (__get_user(wbox_data, udata)) + if (get_user(wbox_data, udata)) return -EFAULT; count = spu_acquire(ctx); @@ -873,7 +864,7 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf, /* write as much as possible */ for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) { int ret; - ret = __get_user(wbox_data, udata); + ret = get_user(wbox_data, udata); if (ret) break; @@ -967,28 +958,26 @@ spufs_signal1_release(struct inode *inode, struct file *file) return 0; } -static ssize_t __spufs_signal1_read(struct spu_context *ctx, char __user *buf, - size_t len, loff_t *pos) +static ssize_t spufs_signal1_dump(struct spu_context *ctx, + struct coredump_params *cprm) { - int ret = 0; - u32 data; + if (!ctx->csa.spu_chnlcnt_RW[3]) + return 0; + return spufs_dump_emit(cprm, &ctx->csa.spu_chnldata_RW[3], + sizeof(ctx->csa.spu_chnldata_RW[3])); +} - if (len < 4) +static ssize_t __spufs_signal1_read(struct spu_context *ctx, char __user *buf, + size_t len) +{ + if (len < sizeof(ctx->csa.spu_chnldata_RW[3])) return -EINVAL; - - if (ctx->csa.spu_chnlcnt_RW[3]) { - data = ctx->csa.spu_chnldata_RW[3]; - ret = 4; - } - - if (!ret) - goto out; - - if (copy_to_user(buf, &data, 4)) + if (!ctx->csa.spu_chnlcnt_RW[3]) + return 0; + if (copy_to_user(buf, &ctx->csa.spu_chnldata_RW[3], + sizeof(ctx->csa.spu_chnldata_RW[3]))) return -EFAULT; - -out: - return ret; + return sizeof(ctx->csa.spu_chnldata_RW[3]); } static ssize_t spufs_signal1_read(struct file *file, char __user *buf, @@ -1000,7 +989,7 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf, ret = spu_acquire_saved(ctx); if (ret) return ret; - ret = __spufs_signal1_read(ctx, buf, len, pos); + ret = __spufs_signal1_read(ctx, buf, len); spu_release_saved(ctx); return ret; @@ -1104,28 +1093,26 @@ spufs_signal2_release(struct inode *inode, struct file *file) return 0; } -static ssize_t __spufs_signal2_read(struct spu_context *ctx, char __user *buf, - size_t len, loff_t *pos) +static ssize_t spufs_signal2_dump(struct spu_context *ctx, + struct coredump_params *cprm) { - int ret = 0; - u32 data; + if (!ctx->csa.spu_chnlcnt_RW[4]) + return 0; + return spufs_dump_emit(cprm, &ctx->csa.spu_chnldata_RW[4], + sizeof(ctx->csa.spu_chnldata_RW[4])); +} - if (len < 4) +static ssize_t __spufs_signal2_read(struct spu_context *ctx, char __user *buf, + size_t len) +{ + if (len < sizeof(ctx->csa.spu_chnldata_RW[4])) return -EINVAL; - - if (ctx->csa.spu_chnlcnt_RW[4]) { - data = ctx->csa.spu_chnldata_RW[4]; - ret = 4; - } - - if (!ret) - goto out; - - if (copy_to_user(buf, &data, 4)) + if (!ctx->csa.spu_chnlcnt_RW[4]) + return 0; + if (copy_to_user(buf, &ctx->csa.spu_chnldata_RW[4], + sizeof(ctx->csa.spu_chnldata_RW[4]))) return -EFAULT; - -out: - return ret; + return sizeof(ctx->csa.spu_chnldata_RW[4]); } static ssize_t spufs_signal2_read(struct file *file, char __user *buf, @@ -1137,7 +1124,7 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf, ret = spu_acquire_saved(ctx); if (ret) return ret; - ret = __spufs_signal2_read(ctx, buf, len, 
pos); + ret = __spufs_signal2_read(ctx, buf, len); spu_release_saved(ctx); return ret; @@ -1961,38 +1948,36 @@ static const struct file_operations spufs_caps_fops = { .release = single_release, }; -static ssize_t __spufs_mbox_info_read(struct spu_context *ctx, - char __user *buf, size_t len, loff_t *pos) +static ssize_t spufs_mbox_info_dump(struct spu_context *ctx, + struct coredump_params *cprm) { - u32 data; - - /* EOF if there's no entry in the mbox */ if (!(ctx->csa.prob.mb_stat_R & 0x0000ff)) return 0; - - data = ctx->csa.prob.pu_mb_R; - - return simple_read_from_buffer(buf, len, pos, &data, sizeof data); + return spufs_dump_emit(cprm, &ctx->csa.prob.pu_mb_R, + sizeof(ctx->csa.prob.pu_mb_R)); } static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { - int ret; struct spu_context *ctx = file->private_data; - - if (!access_ok(buf, len)) - return -EFAULT; + u32 stat, data; + int ret; ret = spu_acquire_saved(ctx); if (ret) return ret; spin_lock(&ctx->csa.register_lock); - ret = __spufs_mbox_info_read(ctx, buf, len, pos); + stat = ctx->csa.prob.mb_stat_R; + data = ctx->csa.prob.pu_mb_R; spin_unlock(&ctx->csa.register_lock); spu_release_saved(ctx); - return ret; + /* EOF if there's no entry in the mbox */ + if (!(stat & 0x0000ff)) + return 0; + + return simple_read_from_buffer(buf, len, pos, &data, sizeof(data)); } static const struct file_operations spufs_mbox_info_fops = { @@ -2001,38 +1986,36 @@ static const struct file_operations spufs_mbox_info_fops = { .llseek = generic_file_llseek, }; -static ssize_t __spufs_ibox_info_read(struct spu_context *ctx, - char __user *buf, size_t len, loff_t *pos) +static ssize_t spufs_ibox_info_dump(struct spu_context *ctx, + struct coredump_params *cprm) { - u32 data; - - /* EOF if there's no entry in the ibox */ if (!(ctx->csa.prob.mb_stat_R & 0xff0000)) return 0; - - data = ctx->csa.priv2.puint_mb_R; - - return simple_read_from_buffer(buf, len, pos, &data, sizeof data); + return spufs_dump_emit(cprm, &ctx->csa.priv2.puint_mb_R, + sizeof(ctx->csa.priv2.puint_mb_R)); } static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; + u32 stat, data; int ret; - if (!access_ok(buf, len)) - return -EFAULT; - ret = spu_acquire_saved(ctx); if (ret) return ret; spin_lock(&ctx->csa.register_lock); - ret = __spufs_ibox_info_read(ctx, buf, len, pos); + stat = ctx->csa.prob.mb_stat_R; + data = ctx->csa.priv2.puint_mb_R; spin_unlock(&ctx->csa.register_lock); spu_release_saved(ctx); - return ret; + /* EOF if there's no entry in the ibox */ + if (!(stat & 0xff0000)) + return 0; + + return simple_read_from_buffer(buf, len, pos, &data, sizeof(data)); } static const struct file_operations spufs_ibox_info_fops = { @@ -2041,41 +2024,36 @@ static const struct file_operations spufs_ibox_info_fops = { .llseek = generic_file_llseek, }; -static ssize_t __spufs_wbox_info_read(struct spu_context *ctx, - char __user *buf, size_t len, loff_t *pos) +static size_t spufs_wbox_info_cnt(struct spu_context *ctx) { - int i, cnt; - u32 data[4]; - u32 wbox_stat; - - wbox_stat = ctx->csa.prob.mb_stat_R; - cnt = 4 - ((wbox_stat & 0x00ff00) >> 8); - for (i = 0; i < cnt; i++) { - data[i] = ctx->csa.spu_mailbox_data[i]; - } + return (4 - ((ctx->csa.prob.mb_stat_R & 0x00ff00) >> 8)) * sizeof(u32); +} - return simple_read_from_buffer(buf, len, pos, &data, - cnt * sizeof(u32)); +static ssize_t spufs_wbox_info_dump(struct spu_context *ctx, + struct coredump_params *cprm) +{ 
+ return spufs_dump_emit(cprm, &ctx->csa.spu_mailbox_data, + spufs_wbox_info_cnt(ctx)); } static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; - int ret; - - if (!access_ok(buf, len)) - return -EFAULT; + u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)]; + int ret, count; ret = spu_acquire_saved(ctx); if (ret) return ret; spin_lock(&ctx->csa.register_lock); - ret = __spufs_wbox_info_read(ctx, buf, len, pos); + count = spufs_wbox_info_cnt(ctx); + memcpy(&data, &ctx->csa.spu_mailbox_data, sizeof(data)); spin_unlock(&ctx->csa.register_lock); spu_release_saved(ctx); - return ret; + return simple_read_from_buffer(buf, len, pos, &data, + count * sizeof(u32)); } static const struct file_operations spufs_wbox_info_fops = { @@ -2084,50 +2062,53 @@ static const struct file_operations spufs_wbox_info_fops = { .llseek = generic_file_llseek, }; -static ssize_t __spufs_dma_info_read(struct spu_context *ctx, - char __user *buf, size_t len, loff_t *pos) +static void spufs_get_dma_info(struct spu_context *ctx, + struct spu_dma_info *info) { - struct spu_dma_info info; - struct mfc_cq_sr *qp, *spuqp; int i; - info.dma_info_type = ctx->csa.priv2.spu_tag_status_query_RW; - info.dma_info_mask = ctx->csa.lscsa->tag_mask.slot[0]; - info.dma_info_status = ctx->csa.spu_chnldata_RW[24]; - info.dma_info_stall_and_notify = ctx->csa.spu_chnldata_RW[25]; - info.dma_info_atomic_command_status = ctx->csa.spu_chnldata_RW[27]; + info->dma_info_type = ctx->csa.priv2.spu_tag_status_query_RW; + info->dma_info_mask = ctx->csa.lscsa->tag_mask.slot[0]; + info->dma_info_status = ctx->csa.spu_chnldata_RW[24]; + info->dma_info_stall_and_notify = ctx->csa.spu_chnldata_RW[25]; + info->dma_info_atomic_command_status = ctx->csa.spu_chnldata_RW[27]; for (i = 0; i < 16; i++) { - qp = &info.dma_info_command_data[i]; - spuqp = &ctx->csa.priv2.spuq[i]; + struct mfc_cq_sr *qp = &info->dma_info_command_data[i]; + struct mfc_cq_sr *spuqp = &ctx->csa.priv2.spuq[i]; qp->mfc_cq_data0_RW = spuqp->mfc_cq_data0_RW; qp->mfc_cq_data1_RW = spuqp->mfc_cq_data1_RW; qp->mfc_cq_data2_RW = spuqp->mfc_cq_data2_RW; qp->mfc_cq_data3_RW = spuqp->mfc_cq_data3_RW; } +} - return simple_read_from_buffer(buf, len, pos, &info, - sizeof info); +static ssize_t spufs_dma_info_dump(struct spu_context *ctx, + struct coredump_params *cprm) +{ + struct spu_dma_info info; + + spufs_get_dma_info(ctx, &info); + return spufs_dump_emit(cprm, &info, sizeof(info)); } static ssize_t spufs_dma_info_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; + struct spu_dma_info info; int ret; - if (!access_ok(buf, len)) - return -EFAULT; - ret = spu_acquire_saved(ctx); if (ret) return ret; spin_lock(&ctx->csa.register_lock); - ret = __spufs_dma_info_read(ctx, buf, len, pos); + spufs_get_dma_info(ctx, &info); spin_unlock(&ctx->csa.register_lock); spu_release_saved(ctx); - return ret; + return simple_read_from_buffer(buf, len, pos, &info, + sizeof(info)); } static const struct file_operations spufs_dma_info_fops = { @@ -2136,52 +2117,55 @@ static const struct file_operations spufs_dma_info_fops = { .llseek = no_llseek, }; -static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx, - char __user *buf, size_t len, loff_t *pos) +static void spufs_get_proxydma_info(struct spu_context *ctx, + struct spu_proxydma_info *info) { - struct spu_proxydma_info info; - struct mfc_cq_sr *qp, *puqp; - int ret = sizeof info; int i; - if (len < ret) 
- return -EINVAL; - - if (!access_ok(buf, len)) - return -EFAULT; + info->proxydma_info_type = ctx->csa.prob.dma_querytype_RW; + info->proxydma_info_mask = ctx->csa.prob.dma_querymask_RW; + info->proxydma_info_status = ctx->csa.prob.dma_tagstatus_R; - info.proxydma_info_type = ctx->csa.prob.dma_querytype_RW; - info.proxydma_info_mask = ctx->csa.prob.dma_querymask_RW; - info.proxydma_info_status = ctx->csa.prob.dma_tagstatus_R; for (i = 0; i < 8; i++) { - qp = &info.proxydma_info_command_data[i]; - puqp = &ctx->csa.priv2.puq[i]; + struct mfc_cq_sr *qp = &info->proxydma_info_command_data[i]; + struct mfc_cq_sr *puqp = &ctx->csa.priv2.puq[i]; qp->mfc_cq_data0_RW = puqp->mfc_cq_data0_RW; qp->mfc_cq_data1_RW = puqp->mfc_cq_data1_RW; qp->mfc_cq_data2_RW = puqp->mfc_cq_data2_RW; qp->mfc_cq_data3_RW = puqp->mfc_cq_data3_RW; } +} - return simple_read_from_buffer(buf, len, pos, &info, - sizeof info); +static ssize_t spufs_proxydma_info_dump(struct spu_context *ctx, + struct coredump_params *cprm) +{ + struct spu_proxydma_info info; + + spufs_get_proxydma_info(ctx, &info); + return spufs_dump_emit(cprm, &info, sizeof(info)); } static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf, size_t len, loff_t *pos) { struct spu_context *ctx = file->private_data; + struct spu_proxydma_info info; int ret; + if (len < sizeof(info)) + return -EINVAL; + ret = spu_acquire_saved(ctx); if (ret) return ret; spin_lock(&ctx->csa.register_lock); - ret = __spufs_proxydma_info_read(ctx, buf, len, pos); + spufs_get_proxydma_info(ctx, &info); spin_unlock(&ctx->csa.register_lock); spu_release_saved(ctx); - return ret; + return simple_read_from_buffer(buf, len, pos, &info, + sizeof(info)); } static const struct file_operations spufs_proxydma_info_fops = { @@ -2625,23 +2609,23 @@ const struct spufs_tree_descr spufs_dir_debug_contents[] = { }; const struct spufs_coredump_reader spufs_coredump_read[] = { - { "regs", __spufs_regs_read, NULL, sizeof(struct spu_reg128[128])}, - { "fpcr", __spufs_fpcr_read, NULL, sizeof(struct spu_reg128) }, + { "regs", spufs_regs_dump, NULL, sizeof(struct spu_reg128[128])}, + { "fpcr", spufs_fpcr_dump, NULL, sizeof(struct spu_reg128) }, { "lslr", NULL, spufs_lslr_get, 19 }, { "decr", NULL, spufs_decr_get, 19 }, { "decr_status", NULL, spufs_decr_status_get, 19 }, - { "mem", __spufs_mem_read, NULL, LS_SIZE, }, - { "signal1", __spufs_signal1_read, NULL, sizeof(u32) }, + { "mem", spufs_mem_dump, NULL, LS_SIZE, }, + { "signal1", spufs_signal1_dump, NULL, sizeof(u32) }, { "signal1_type", NULL, spufs_signal1_type_get, 19 }, - { "signal2", __spufs_signal2_read, NULL, sizeof(u32) }, + { "signal2", spufs_signal2_dump, NULL, sizeof(u32) }, { "signal2_type", NULL, spufs_signal2_type_get, 19 }, { "event_mask", NULL, spufs_event_mask_get, 19 }, { "event_status", NULL, spufs_event_status_get, 19 }, - { "mbox_info", __spufs_mbox_info_read, NULL, sizeof(u32) }, - { "ibox_info", __spufs_ibox_info_read, NULL, sizeof(u32) }, - { "wbox_info", __spufs_wbox_info_read, NULL, 4 * sizeof(u32)}, - { "dma_info", __spufs_dma_info_read, NULL, sizeof(struct spu_dma_info)}, - { "proxydma_info", __spufs_proxydma_info_read, + { "mbox_info", spufs_mbox_info_dump, NULL, sizeof(u32) }, + { "ibox_info", spufs_ibox_info_dump, NULL, sizeof(u32) }, + { "wbox_info", spufs_wbox_info_dump, NULL, 4 * sizeof(u32)}, + { "dma_info", spufs_dma_info_dump, NULL, sizeof(struct spu_dma_info)}, + { "proxydma_info", spufs_proxydma_info_dump, NULL, sizeof(struct spu_proxydma_info)}, { "object-id", NULL, spufs_object_id_get, 19 }, { 
"npc", NULL, spufs_npc_get, 19 }, diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h index 413c89afe112..1ba4d884febf 100644 --- a/arch/powerpc/platforms/cell/spufs/spufs.h +++ b/arch/powerpc/platforms/cell/spufs/spufs.h @@ -337,8 +337,7 @@ void spufs_dma_callback(struct spu *spu, int type); extern struct spu_coredump_calls spufs_coredump_calls; struct spufs_coredump_reader { char *name; - ssize_t (*read)(struct spu_context *ctx, - char __user *buffer, size_t size, loff_t *pos); + ssize_t (*dump)(struct spu_context *ctx, struct coredump_params *cprm); u64 (*get)(struct spu_context *ctx); size_t size; }; diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index 91e376b0d28c..6d01c96aeb5c 100644 --- a/arch/s390/include/asm/checksum.h +++ b/arch/s390/include/asm/checksum.h @@ -39,25 +39,6 @@ csum_partial(const void *buff, int len, __wsum sum) return sum; } -/* - * the same as csum_partial_copy, but copies from user space. - * - * here even more important to align src and dst on a 32-bit (or even - * better 64-bit) boundary - * - * Copy from userspace and compute checksum. - */ -static inline __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, - int *err_ptr) -{ - if (unlikely(copy_from_user(dst, src, len))) - *err_ptr = -EFAULT; - return csum_partial(dst, len, sum); -} - - static inline __wsum csum_partial_copy_nocheck (const void *src, void *dst, int len, __wsum sum) { diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bd7bd3581a0f..bfdcb7633957 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 435 common clone3 sys_clone3 sys_clone3 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b4f0e37b83eb..97656d20b9ea 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,7 +71,6 @@ config SUPERH32 select HAVE_FUNCTION_TRACER select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select ARCH_WANT_IPC_PARSE_VERSION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_ARCH_KGDB diff --git a/arch/sh/include/asm/checksum_32.h b/arch/sh/include/asm/checksum_32.h index 36b84cfd3f67..91571a42e44e 100644 --- a/arch/sh/include/asm/checksum_32.h +++ b/arch/sh/include/asm/checksum_32.h @@ -48,12 +48,17 @@ __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); } +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static inline -__wsum csum_partial_copy_from_user(const void __user *src, void *dst, +__wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) { - return csum_partial_copy_generic((__force const void *)src, dst, + if (access_ok(src, len)) + return csum_partial_copy_generic((__force const void *)src, dst, len, sum, err_ptr, NULL); + if (len) + *err_ptr = -EFAULT; + return sum; } /* diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c7a30fcd135f..acc35daa1b79 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git 
a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 63cf17bc760d..2130381c9d57 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -170,11 +170,21 @@ BUILD_TRAP_HANDLER(bug) force_sig(SIGTRAP); } +#ifdef CONFIG_DYNAMIC_FTRACE +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + BUILD_TRAP_HANDLER(nmi) { unsigned int cpu = smp_processor_id(); TRAP_HANDLER_DECL; + arch_ftrace_nmi_enter(); + nmi_enter(); nmi_count(cpu)++; @@ -190,4 +200,6 @@ BUILD_TRAP_HANDLER(nmi) } nmi_exit(); + + arch_ftrace_nmi_exit(); } diff --git a/arch/sparc/include/asm/checksum.h b/arch/sparc/include/asm/checksum.h index c3be56e2e768..a6256cb6fc5c 100644 --- a/arch/sparc/include/asm/checksum.h +++ b/arch/sparc/include/asm/checksum.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef ___ASM_SPARC_CHECKSUM_H #define ___ASM_SPARC_CHECKSUM_H +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER #if defined(__sparc__) && defined(__arch64__) #include <asm/checksum_64.h> #else diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h index 5fc98d80b03b..479a0b812af5 100644 --- a/arch/sparc/include/asm/checksum_32.h +++ b/arch/sparc/include/asm/checksum_32.h @@ -60,7 +60,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) } static inline __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, int len, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err) { register unsigned long ret asm("o0") = (unsigned long)src; @@ -68,6 +68,12 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, register int l asm("g1") = len; register __wsum s asm("g7") = sum; + if (unlikely(!access_ok(src, len))) { + if (len) + *err = -EFAULT; + return sum; + } + __asm__ __volatile__ ( ".section __ex_table,#alloc\n\t" ".align 4\n\t" @@ -83,8 +89,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, return (__force __wsum)ret; } +#define HAVE_CSUM_COPY_USER + static inline __wsum -csum_partial_copy_to_user(const void *src, void __user *dst, int len, +csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, int *err) { if (!access_ok(dst, len)) { @@ -113,9 +121,6 @@ csum_partial_copy_to_user(const void *src, void __user *dst, int len, } } -#define HAVE_CSUM_COPY_USER -#define csum_and_copy_to_user csum_partial_copy_to_user - /* ihl is always 5 or greater, almost always is 5, and iph is word aligned * the majority of the time. 
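The sh/kernel/traps.c hunk above uses the standard stub idiom so the NMI path can call the ftrace hooks unconditionally: real externs when CONFIG_DYNAMIC_FTRACE is set, empty inlines otherwise. In general form, with placeholder names::

  #ifdef CONFIG_SOME_FEATURE
  extern void feature_nmi_enter(void);
  extern void feature_nmi_exit(void);
  #else
  static inline void feature_nmi_enter(void) { }
  static inline void feature_nmi_exit(void) { }
  #endif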
*/ diff --git a/arch/sparc/include/asm/checksum_64.h b/arch/sparc/include/asm/checksum_64.h index e52450930e4e..0fa4433f5662 100644 --- a/arch/sparc/include/asm/checksum_64.h +++ b/arch/sparc/include/asm/checksum_64.h @@ -46,7 +46,7 @@ long __csum_partial_copy_from_user(const void __user *src, __wsum sum); static inline __wsum -csum_partial_copy_from_user(const void __user *src, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err) { diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index f13615ecdecc..8004a276cb74 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -483,3 +483,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2d3f963fd6f1..6e23037f3b51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -92,6 +92,7 @@ config X86 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_SYM_ANNOTATIONS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT @@ -1611,19 +1612,10 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config ARCH_HAVE_MEMORY_PRESENT - def_bool y - depends on X86_32 && DISCONTIGMEM - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA -config ARCH_DISCONTIGMEM_ENABLE - def_bool n - depends on NUMA && X86_32 - depends on BROKEN - config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD @@ -1888,10 +1880,10 @@ config X86_UMIP results are dummy. config X86_INTEL_MEMORY_PROTECTION_KEYS - prompt "Intel Memory Protection Keys" + prompt "Memory Protection Keys" def_bool y # Note: only available in 64-bit mode - depends on CPU_SUP_INTEL && X86_64 + depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD) select ARCH_USES_HIGH_VMA_FLAGS select ARCH_HAS_PKEYS ---help--- diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2e74690b028a..f909d3ce36e6 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -99,15 +99,6 @@ config DEBUG_WX If in doubt, say "Y". -config DOUBLEFAULT - default y - bool "Enable doublefault exception handler" if EXPERT && X86_32 - ---help--- - This option allows trapping of rare doublefault exceptions that - would otherwise cause a system to silently reboot. Disabling this - option saves about 4k and might cause you much additional grey - hair. 
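The X86_INTEL_MEMORY_PROTECTION_KEYS change above widens the dependency to CPU_SUP_AMD, reflecting that recent AMD parts implement protection keys as well. From userspace the feature is reached through the usual syscalls; a minimal sketch using the glibc wrappers, error handling omitted::

  #define _GNU_SOURCE
  #include <sys/mman.h>

  static void *map_write_protected(size_t len)
  {
          void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

          pkey_mprotect(p, len, PROT_READ | PROT_WRITE, pkey);
          return p;   /* stores fault until the key's rights are relaxed */
  }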
- config DEBUG_TLBFLUSH bool "Set upper limit of TLB entries to flush one-by-one" depends on DEBUG_KERNEL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b65ec63c7db7..00e378de8bc0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -246,7 +246,7 @@ drivers-$(CONFIG_FB) += arch/x86/video/ boot := arch/x86/boot -BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage +BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 isoimage PHONY += bzImage $(BOOT_TARGETS) @@ -267,8 +267,8 @@ endif $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ -PHONY += install -install: +PHONY += install bzlilo +install bzlilo: $(Q)$(MAKE) $(build)=$(boot) $@ PHONY += vdso_install diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index e17be90ab312..4c5355684321 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -57,11 +57,10 @@ $(obj)/cpu.o: $(obj)/cpustr.h quiet_cmd_cpustr = CPUSTR $@ cmd_cpustr = $(obj)/mkcpustr > $@ -targets += cpustr.h $(obj)/cpustr.h: $(obj)/mkcpustr FORCE $(call if_changed,cpustr) endif -clean-files += cpustr.h +targets += cpustr.h # --------------------------------------------------------------------------- @@ -129,6 +128,8 @@ quiet_cmd_genimage = GENIMAGE $3 cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD) +PHONY += bzdisk fdimage fdimage144 fdimage288 isoimage bzlilo install + # This requires write access to /dev/fd0 bzdisk: $(obj)/bzImage $(obj)/mtools.conf $(call cmd,genimage,bzdisk,/dev/fd0) @@ -146,7 +147,7 @@ isoimage: $(obj)/bzImage $(call cmd,genimage,isoimage,$(obj)/image.iso) @$(kecho) 'Kernel: $(obj)/image.iso is ready' -bzlilo: $(obj)/bzImage +bzlilo: if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index ef2ad7253cd5..8bcbcee54aa1 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -280,9 +280,9 @@ acpi_physical_address get_rsdp_addr(void) */ #define MAX_ADDR_LEN 19 -static acpi_physical_address get_cmdline_acpi_rsdp(void) +static unsigned long get_cmdline_acpi_rsdp(void) { - acpi_physical_address addr = 0; + unsigned long addr = 0; #ifdef CONFIG_KEXEC char val[MAX_ADDR_LEN] = { }; @@ -292,7 +292,7 @@ static acpi_physical_address get_cmdline_acpi_rsdp(void) if (ret < 0) return 0; - if (kstrtoull(val, 16, &addr)) + if (boot_kstrtoul(val, 16, &addr)) return 0; #endif return addr; @@ -314,7 +314,6 @@ static unsigned long get_acpi_srat_table(void) * different ideas about whether to trust a command-line parameter. */ rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); - if (!rsdp) rsdp = (struct acpi_table_rsdp *)(long) boot_params->acpi_rsdp_addr; diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index 2b2049259619..c4bb0f9363f5 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -28,8 +28,6 @@ SYM_FUNC_START(__efi64_thunk) push %rbx leaq 1f(%rip), %rbp - leaq efi_gdt64(%rip), %rbx - movl %ebx, 2(%rbx) /* Fixup the gdt base address */ movl %ds, %eax push %rax @@ -48,7 +46,8 @@ SYM_FUNC_START(__efi64_thunk) movl %r8d, 0xc(%rsp) movl %r9d, 0x10(%rsp) - sgdt 0x14(%rsp) + leaq 0x14(%rsp), %rbx + sgdt (%rbx) /* * Switch to gdt with 32-bit segments. 
This is the firmware GDT @@ -68,8 +67,7 @@ SYM_FUNC_START(__efi64_thunk) pushq %rax lretq -1: lgdt 0x14(%rsp) - addq $32, %rsp +1: addq $32, %rsp movq %rdi, %rax pop %rbx @@ -175,14 +173,3 @@ SYM_DATA_END(efi32_boot_cs) SYM_DATA_START(efi32_boot_ds) .word 0 SYM_DATA_END(efi32_boot_ds) - -SYM_DATA_START(efi_gdt64) - .word efi_gdt64_end - efi_gdt64 - .long 0 /* Filled out by user */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -SYM_DATA_END_LABEL(efi_gdt64, SYM_L_LOCAL, efi_gdt64_end) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index ab3307036ba4..03557f2174bf 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -49,16 +49,17 @@ * Position Independent Executable (PIE) so that linker won't optimize * R_386_GOT32X relocation to its fixed symbol address. Older * linkers generate R_386_32 relocations against locally defined symbols, - * _bss, _ebss, _got and _egot, in PIE. It isn't wrong, just less + * _bss, _ebss, _got, _egot and _end, in PIE. It isn't wrong, just less * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle * R_386_32 relocations when relocating the kernel. To generate - * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as + * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as * hidden: */ .hidden _bss .hidden _ebss .hidden _got .hidden _egot + .hidden _end __HEAD SYM_FUNC_START(startup_32) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4f7e6b84be07..e821a7d7d5c4 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -42,6 +42,7 @@ .hidden _ebss .hidden _got .hidden _egot + .hidden _end __HEAD .code32 @@ -393,6 +394,14 @@ SYM_CODE_START(startup_64) addq %rax, 2(%rax) lgdt (%rax) + /* Reload CS so IRET returns to a CS actually in the GDT */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 508cfa6828c5..8f1025d1f681 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -52,6 +52,7 @@ SECTIONS _data = . ; *(.data) *(.data.*) + *(.bss.efistub) _edata = . ; } . = ALIGN(L1_CACHE_BYTES); @@ -73,4 +74,6 @@ SECTIONS #endif . 
= ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; + + DISCARDS } diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 8272a4492844..8a3fff9128bb 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -117,7 +117,6 @@ static unsigned int simple_guess_base(const char *cp) * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use */ - unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { unsigned long long result = 0; @@ -335,3 +334,45 @@ int kstrtoull(const char *s, unsigned int base, unsigned long long *res) s++; return _kstrtoull(s, base, res); } + +static int _kstrtoul(const char *s, unsigned int base, unsigned long *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +/** + * kstrtoul - convert a string to an unsigned long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the simple_strtoull. + */ +int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res) +{ + /* + * We want to shortcut function call, but + * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0. 
+ */ + if (sizeof(unsigned long) == sizeof(unsigned long long) && + __alignof__(unsigned long) == __alignof__(unsigned long long)) + return kstrtoull(s, base, (unsigned long long *)res); + else + return _kstrtoul(s, base, res); +} diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 38d8f2f5e47e..995f7b7ad512 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -30,4 +30,5 @@ extern unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); int kstrtoull(const char *s, unsigned int base, unsigned long long *res); +int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res); #endif /* BOOT_STRING_H */ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index cad6e1bfa7d5..54e7d15dbd0d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2758,7 +2758,7 @@ SYM_FUNC_START(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - CALL_NOSPEC %r11 + CALL_NOSPEC r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2803,7 +2803,7 @@ SYM_FUNC_START(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - CALL_NOSPEC %r11 + CALL_NOSPEC r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index d01ddd73de65..ecc0a9a905c4 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -1228,7 +1228,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - CALL_NOSPEC %r9; + CALL_NOSPEC r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 563ef6e83cdd..0907243c501c 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -1339,7 +1339,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - CALL_NOSPEC %r9; + CALL_NOSPEC r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 0e6690e3618c..8501ec4532f4 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -75,7 +75,7 @@ .text SYM_FUNC_START(crc_pcl) -#define bufp %rdi +#define bufp rdi #define bufp_dw %edi #define bufp_w %di #define bufp_b %dil @@ -105,9 +105,9 @@ SYM_FUNC_START(crc_pcl) ## 1) ALIGN: ################################################################ - mov bufp, bufptmp # rdi = *buf - neg bufp - and $7, bufp # calculate the unalignment amount of + mov %bufp, bufptmp # rdi = *buf + neg %bufp + and $7, %bufp # calculate the unalignment amount of # the address je proc_block # Skip if aligned @@ -123,13 +123,13 @@ SYM_FUNC_START(crc_pcl) do_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer - add bufp, bufptmp # align buffer pointer for quadword + add %bufp, bufptmp # align buffer pointer for quadword # processing - sub bufp, len # update buffer length + sub %bufp, len # update buffer length align_loop: crc32b %bl, crc_init_dw # compute crc32 of 1-byte shr $8, tmp # get next byte - dec bufp + dec %bufp jne align_loop proc_block: @@ -169,10 +169,10 @@ continue_block: xor crc2, crc2 ## branch into array - lea jump_table(%rip), bufp - movzxw (bufp, %rax, 2), len - lea crc_array(%rip), bufp - lea (bufp, 
len, 1), bufp + lea jump_table(%rip), %bufp + movzxw (%bufp, %rax, 2), len + lea crc_array(%rip), %bufp + lea (%bufp, len, 1), %bufp JMP_NOSPEC bufp ################################################################ @@ -218,9 +218,9 @@ LABEL crc_ %i ## 4) Combine three results: ################################################################ - lea (K_table-8)(%rip), bufp # first entry is for idx 1 + lea (K_table-8)(%rip), %bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 - pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2 + pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 leal (%eax,%eax,2), %eax # rax *= 3 (total *24) subq %rax, tmp # tmp -= rax*24 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b67bae7091d7..ac232f456396 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -816,7 +816,7 @@ SYM_CODE_START(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - CALL_NOSPEC %ebx + CALL_NOSPEC ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -1501,7 +1501,7 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi + CALL_NOSPEC edi jmp ret_from_exception SYM_CODE_END(common_exception_read_cr2) @@ -1522,7 +1522,7 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi + CALL_NOSPEC edi jmp ret_from_exception SYM_CODE_END(common_exception) @@ -1536,7 +1536,6 @@ SYM_CODE_START(debug) jmp common_exception SYM_CODE_END(debug) -#ifdef CONFIG_DOUBLEFAULT SYM_CODE_START(double_fault) 1: /* @@ -1576,7 +1575,6 @@ SYM_CODE_START(double_fault) hlt jmp 1b SYM_CODE_END(double_fault) -#endif /* * NMI is doubly nasty. It can happen on the first instruction of diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3063aa9090f9..64fe3d82157e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -348,7 +348,7 @@ SYM_CODE_START(ret_from_fork) /* kernel thread */ UNWIND_HINT_EMPTY movq %r12, %rdi - CALL_NOSPEC %rbx + CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). 
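The CALL_NOSPEC operand change threaded through the crypto and entry code here (CALL_NOSPEC %rbx becoming CALL_NOSPEC rbx) is presumably about macro mechanics: a bare register name can be spliced into a thunk symbol, while a %-prefixed operand cannot. A C-preprocessor analogy, using the real retpoline symbol naming::

  #define INDIRECT_THUNK(reg) __x86_indirect_thunk_ ## reg

  extern void INDIRECT_THUNK(rax)(void);  /* __x86_indirect_thunk_rax */
  /* INDIRECT_THUNK(%rax) could never paste into a valid symbol name. */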
Exit to userspace to complete the execve() diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..d8f8a1a69ed1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -442,3 +442,4 @@ 435 i386 clone3 sys_clone3 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..78847b32e137 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -359,6 +359,7 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 433a1259f61d..54e03ab26ff3 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -24,6 +24,8 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o +vobjs32-y += vdso32/vclock_gettime.o # files to link into kernel obj-y += vma.o @@ -37,10 +39,12 @@ vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) +vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) +targets += vdso32/vdso32.lds $(vobjs32-y) # Build the vDSO image C files and link them in. vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) @@ -130,10 +134,6 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 -targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o -targets += vdso32/vclock_gettime.o - KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 @@ -158,12 +158,7 @@ endif $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(obj)/vdso32.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/system_call.o \ - $(obj)/vdso32/sigreturn.o +$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE $(call if_changed,vdso_and_check) # diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 3842873b3ae3..7380908045c7 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -187,7 +187,7 @@ static void map_input(const char *name, void **addr, size_t *len, int prot) int fd = open(name, O_RDONLY); if (fd == -1) - err(1, "%s", name); + err(1, "open(%s)", name); tmp_len = lseek(fd, 0, SEEK_END); if (tmp_len == (off_t)-1) @@ -240,7 +240,7 @@ int main(int argc, char **argv) outfilename = argv[3]; outfile = fopen(outfilename, "w"); if (!outfile) - err(1, "%s", argv[2]); + err(1, "fopen(%s)", outfilename); go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index a20b134de2a8..6f46e11ce539 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -13,8 +13,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, unsigned long load_size = 
-1; /* Work around bogus warning */ unsigned long mapping_size; ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; - int i; - unsigned long j; + unsigned long i, syms_nr; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; @@ -86,11 +85,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); /* Walk the symbol table */ - for (i = 0; - i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); - i++) { - int k; + for (i = 0; i < syms_nr; i++) { + unsigned int k; ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; const char *sym_name = raw_addr + @@ -150,11 +148,11 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); - for (j = 0; j < stripped_len; j++) { - if (j % 10 == 0) + for (i = 0; i < stripped_len; i++) { + if (i % 10 == 0) fprintf(outfile, "\n\t"); fprintf(outfile, "0x%02X, ", - (int)((unsigned char *)stripped_addr)[j]); + (int)((unsigned char *)stripped_addr)[i]); } fprintf(outfile, "\n};\n\n"); diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 9a7a1446cb3a..4a809c6cbd2f 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -10,11 +10,11 @@ config PERF_EVENTS_INTEL_UNCORE available on NehalemEX and more modern processors. config PERF_EVENTS_INTEL_RAPL - tristate "Intel rapl performance events" - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + tristate "Intel/AMD rapl performance events" + depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI default y ---help--- - Include support for Intel rapl performance events for power + Include support for Intel and AMD rapl performance events for power monitoring on modern processors. 
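The RAPL counters this option enables are read through the regular perf syscall interface. A minimal userspace sketch of reading the package energy counter follows; the sysfs path, the 0x02 energy-pkg config value, and the privilege requirements are assumptions based on the usual power PMU layout, not something this patch guarantees:

        /* Hypothetical userspace reader for the RAPL "energy-pkg" perf event. */
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/types.h>
        #include <sys/syscall.h>
        #include <linux/perf_event.h>

        static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                    int cpu, int group_fd, unsigned long flags)
        {
                return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
        }

        int main(void)
        {
                struct perf_event_attr attr;
                unsigned long long count;
                FILE *f;
                int type, fd;

                /* The power PMU gets a dynamic type id; read it from sysfs. */
                f = fopen("/sys/bus/event_source/devices/power/type", "r");
                if (!f || fscanf(f, "%d", &type) != 1)
                        return 1;
                fclose(f);

                memset(&attr, 0, sizeof(attr));
                attr.type = type;
                attr.size = sizeof(attr);
                attr.config = 0x02;     /* energy-pkg, per .../power/events/energy-pkg */

                /* RAPL is a package-scope PMU: system-wide (pid == -1), one CPU */
                fd = perf_event_open(&attr, -1, 0, -1, 0);
                if (fd < 0)
                        return 1;

                sleep(1);
                if (read(fd, &count, sizeof(count)) != sizeof(count))
                        return 1;
                printf("energy-pkg ticks in ~1s: %llu\n", count);
                close(fd);
                return 0;
        }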
config PERF_EVENTS_INTEL_CSTATE diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9e07f554333f..12c42eba77ec 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += core.o probe.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ +obj-$(CONFIG_CPU_SUP_CENTAUR) += zhaoxin/ +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin/ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a619763e96e1..9e63ee50b19a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + err = zhaoxin_pmu_init(); + break; default: err = -ENOTSUPP; } diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3468b0c1dc7c..e67a5886336c 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -2,8 +2,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o -intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6a3b599ee0fe..731dd8d0dbb1 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[0]; + struct bts_phys buf[]; }; static struct pmu bts_pmu; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 332954cccece..ca35c8b5ee10 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1892,8 +1892,8 @@ static __initconst const u64 tnt_hw_cache_extra_regs static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1), EVENT_EXTRA_END }; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1db7a51d9792..e94af4a54d0d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -226,8 +226,6 @@ static int __init pt_pmu_hw_init(void) pt_pmu.vmx = true; } - attrs = NULL; - for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0da4a4605536..b469ddd45515 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -130,7 +130,7 @@ struct intel_uncore_box { struct list_head list; struct list_head active_list; void __iomem *io_addr; - struct intel_uncore_extra_reg shared_regs[0]; + struct intel_uncore_extra_reg shared_regs[]; }; /* CFL uncore 8th cbox MSRs */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f1cd1ca1a77b..e17a3d8a47ed 100644 --- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h @@ -618,6 +618,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, + enabled_ack :1, counter_freezing :1; /* * sysfs attrs @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void) return 0; } #endif /* CONFIG_CPU_SUP_INTEL */ + +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN)) +int zhaoxin_pmu_init(void); +#else +static inline int zhaoxin_pmu_init(void) +{ + return 0; +} +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/ diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b277..136a1e847254 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. + */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -24,8 +29,16 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + /* skip entry with no group */ + if (!grp) + continue; + grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/rapl.c index a5dbd25852cb..0f2bf59f4354 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/rapl.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Support Intel RAPL energy consumption counters + * Support Intel/AMD RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b * section 14.7.1 (September 2013) * + * AMD RAPL interface for Fam17h is described in the public PPR: + * https://bugzilla.kernel.org/show_bug.cgi?id=206537 + * * RAPL provides more controls than just reporting energy consumption * however here we only expose the 3 energy consumption free running * counters (pp0, pkg, dram). 
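The raw counts those three counters return are in hardware energy units, not Joules. The scale comes from the power-unit MSR that rapl_check_hw_unit() reads in the hunks below (MSR_RAPL_POWER_UNIT on Intel, MSR_AMD_RAPL_POWER_UNIT on Fam17h AMD): bits 12:8 hold the Energy Status Unit (ESU), and one tick corresponds to 1/2^ESU Joules. A small sketch of that decoding, using a made-up sample MSR value:

        /* Same (msr >> 8) & 0x1f extraction as rapl_check_hw_unit(). */
        #include <stdio.h>

        static double rapl_joules_per_tick(unsigned long long unit_msr)
        {
                unsigned int esu = (unit_msr >> 8) & 0x1f;

                return 1.0 / (double)(1ULL << esu);
        }

        int main(void)
        {
                /* 0xa0e03 is illustrative: ESU = 14, i.e. ~61 microjoules/tick */
                printf("%.9f J per tick\n", rapl_joules_per_tick(0xa0e03));
                return 0;
        }

Note the hsx/knl quirk visible further down: on those parts the DRAM domain ignores the MSR value and uses a fixed 2^-16 J unit.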
@@ -58,8 +61,8 @@ #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> -#include "../perf_event.h" -#include "../probe.h" +#include "perf_event.h" +#include "probe.h" MODULE_LICENSE("GPL"); @@ -128,7 +131,9 @@ struct rapl_pmus { }; struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; + unsigned int msr_power_unit; bool apply_quirk; }; @@ -138,7 +143,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -455,9 +460,16 @@ static struct attribute *rapl_events_cores[] = { NULL, }; +static umode_t +rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -470,6 +482,7 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -482,6 +495,7 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -494,6 +508,7 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -506,6 +521,7 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, + .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) @@ -513,7 +529,7 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { +static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, @@ -521,6 +537,16 @@ static struct perf_msr rapl_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, }; +/* + * Force to PERF_RAPL_MAX size due to: + * - perf_msr_probe(PERF_RAPL_MAX) + * - want to use same event codes across both architectures + */ +static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +}; + + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -575,13 +601,13 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; @@ -592,7 
+618,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + if (rm->apply_quirk) rapl_hw_unit[PERF_RAPL_RAM] = 16; /* @@ -673,6 +699,8 @@ static struct rapl_model model_snb = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { @@ -680,6 +708,8 @@ BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -688,6 +718,8 @@ BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { @@ -695,12 +727,16 @@ BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -710,6 +746,15 @@ BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, +}; + +static struct rapl_model model_amd_fam17h = { + .events = BIT(PERF_RAPL_PKG), + .apply_quirk = false, + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -738,8 +783,11 @@ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); @@ -755,10 +803,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile new file mode 100644 index 000000000000..642c1174d662 --- /dev/null +++ b/arch/x86/events/zhaoxin/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += core.o diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c new file mode 100644 index 000000000000..898fa1ae9ceb --- /dev/null +++ b/arch/x86/events/zhaoxin/core.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Zhaoxin PMU; like Intel Architectural PerfMon-v2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/nmi.h> + +#include
<asm/cpufeature.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "../perf_event.h" + +/* + * Zhaoxin PerfMon, used on zxc and later. + */ +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { + + [PERF_COUNT_HW_CPU_CYCLES] = 0x0082, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515, + [PERF_COUNT_HW_CACHE_MISSES] = 0x051a, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0083, +}; + +static struct event_constraint zxc_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint zxd_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */ + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */ + EVENT_CONSTRAINT_END +}; + +static __initconst const u64 zxd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x0538, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0700, + [C(RESULT_MISS)] = 0x0709, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static __initconst const u64 zxe_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x054b, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + 
[C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0028, + [C(RESULT_MISS)] = 0x0029, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static void zhaoxin_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void zhaoxin_pmu_enable_all(int added) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static inline u64 zhaoxin_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void zhaoxin_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void zxc_pmu_ack_status(u64 ack) +{ + /* + * ZXC needs global control enabled in order to clear status bits. 
+ */ + zhaoxin_pmu_enable_all(0); + zhaoxin_pmu_ack_status(ack); + zhaoxin_pmu_disable_all(); +} + +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + u64 status; + int bit; + + cpuc = this_cpu_ptr(&cpu_hw_events); + apic_write(APIC_LVTPC, APIC_DM_NMI); + zhaoxin_pmu_disable_all(); + status = zhaoxin_pmu_get_status(); + if (!status) + goto done; + +again: + if (x86_pmu.enabled_ack) + zxc_pmu_ack_status(status); + else + zhaoxin_pmu_ack_status(status); + + inc_irq_stat(apic_perf_irqs); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. 
+ */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + x86_perf_event_update(event); + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = zhaoxin_pmu_get_status(); + if (status) + goto again; + +done: + zhaoxin_pmu_enable_all(0); + return handled; +} + +static u64 zhaoxin_pmu_event_map(int hw_event) +{ + return zx_pmon_event_map[hw_event]; +} + +static struct event_constraint * +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + +PMU_FORMAT_ATTR(event, "config:0-7"); +PMU_FORMAT_ATTR(umask, "config:8-15"); +PMU_FORMAT_ATTR(edge, "config:18"); +PMU_FORMAT_ATTR(inv, "config:23"); +PMU_FORMAT_ATTR(cmask, "config:24-31"); + +static struct attribute *zx_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static const struct x86_pmu zhaoxin_pmu __initconst = { + .name = "zhaoxin", + .handle_irq = zhaoxin_pmu_handle_irq, + .disable_all = zhaoxin_pmu_disable_all, + .enable_all = zhaoxin_pmu_enable_all, + .enable = zhaoxin_pmu_enable_event, + .disable = zhaoxin_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = zhaoxin_pmu_event_map, + .max_events = ARRAY_SIZE(zx_pmon_event_map), + .apic = 1, + /* + * For zxd/zxe, read/write operation for PMCx MSR is 48 bits. 
+ */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = zhaoxin_get_event_constraints, + + .format_attrs = zx_arch_formats_attr, + .events_sysfs_show = zhaoxin_event_sysfs_show, +}; + +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void zhaoxin_arch_events_quirk(void) +{ + int bit; + + /* disable events reported as not present by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { + zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + zx_arch_events_map[bit].name); + } +} + +__init int zhaoxin_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + int version; + + pr_info("Welcome to zhaoxin pmu!\n"); + + /* + * Check whether the Architectural PerfMon supports + * hw_event or not. + */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1) + return -ENODEV; + + version = eax.split.version_id; + if (version != 2) + return -ENODEV; + + x86_pmu = zhaoxin_pmu; + pr_info("Version check pass!\n"); + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_add_quirk(zhaoxin_arch_events_quirk); + + switch (boot_cpu_data.x86) { + case 0x06: + if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) { + + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; + + /* Clearing status works only if the global control is enabled on zxc. 
*/ + x86_pmu.enabled_ack = 1; + + x86_pmu.event_constraints = zxc_event_constraints; + zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0; + + pr_cont("ZXC events, "); + break; + } + return -ENODEV; + + case 0x07: + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01); + + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0); + + switch (boot_cpu_data.x86_model) { + case 0x1b: + memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709; + + pr_cont("ZXD events, "); + break; + case 0x3b: + memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029; + + pr_cont("ZXE events, "); + break; + default: + return -ENODEV; + } + break; + + default: + return -ENODEV; + } + + x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; + x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + return 0; +} + diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 3d21eab7aaed..6efe6cb3768a 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/unistd_32.h> +#include <asm/audit.h> unsigned ia32_dir_class[] = { #include <asm-generic/audit_dir_write.h> diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9d8804144d0..81cf22398cd1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -350,7 +350,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); user_access_end(); - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h new file mode 100644 index 000000000000..1b07fb102c4e --- /dev/null +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -0,0 +1,25 @@ +#ifdef CONFIG_64BIT +GEN(rax) +GEN(rbx) +GEN(rcx) +GEN(rdx) +GEN(rsi) +GEN(rdi) +GEN(rbp) +GEN(r8) +GEN(r9) +GEN(r10) +GEN(r11) +GEN(r12) +GEN(r13) +GEN(r14) +GEN(r15) +#else +GEN(eax) +GEN(ebx) +GEN(ecx) +GEN(edx) +GEN(esi) +GEN(edi) +GEN(ebp) +#endif diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 99bb207fc04c..87ce8e963215 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -25,11 +25,7 @@ #define APBT_MIN_FREQ 1000000 #define APBT_MMAP_SIZE 1024 -#define APBT_DEV_USED 1 - extern void apbt_time_init(void); -extern unsigned long apbt_quick_calibrate(void); -extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern 
void apbt_setup_secondary_clock(void); extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); @@ -38,7 +34,6 @@ extern int sfi_mtimer_num; #else /* CONFIG_APB_TIMER */ -static inline unsigned long apbt_quick_calibrate(void) {return 0; } static inline void apbt_time_init(void) { } #endif diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 7a4bb1bd4bdb..ebc248e49549 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -15,16 +15,6 @@ #define RDRAND_RETRY_LOOPS 10 -#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" -#define RDSEED_INT ".byte 0x0f,0xc7,0xf8" -#ifdef CONFIG_X86_64 -# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" -# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" -#else -# define RDRAND_LONG RDRAND_INT -# define RDSEED_LONG RDSEED_INT -#endif - /* Unconditional execution of RDRAND and RDSEED */ static inline bool __must_check rdrand_long(unsigned long *v) @@ -32,9 +22,9 @@ static inline bool __must_check rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG + asm volatile("rdrand %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); @@ -46,9 +36,9 @@ static inline bool __must_check rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT + asm volatile("rdrand %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); @@ -58,18 +48,18 @@ static inline bool __must_check rdrand_int(unsigned int *v) static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG + asm volatile("rdseed %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } static inline bool __must_check rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT + asm volatile("rdseed %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ce92c4acc913..9bf2620ce817 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,24 +17,19 @@ extern void cmpxchg8b_emu(void); #endif #ifdef CONFIG_RETPOLINE -#ifdef CONFIG_X86_32 -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); -#else -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); -INDIRECT_THUNK(8) -INDIRECT_THUNK(9) -INDIRECT_THUNK(10) -INDIRECT_THUNK(11) -INDIRECT_THUNK(12) -INDIRECT_THUNK(13) -INDIRECT_THUNK(14) -INDIRECT_THUNK(15) -#endif -INDIRECT_THUNK(ax) -INDIRECT_THUNK(bx) -INDIRECT_THUNK(cx) -INDIRECT_THUNK(dx) -INDIRECT_THUNK(si) -INDIRECT_THUNK(di) -INDIRECT_THUNK(bp) + +#define DECL_INDIRECT_THUNK(reg) \ + extern asmlinkage void __x86_indirect_thunk_ ## reg (void); + +#define DECL_RETPOLINE(reg) \ + extern asmlinkage void __x86_retpoline_ ## reg (void); + +#undef GEN +#define GEN(reg) DECL_INDIRECT_THUNK(reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) DECL_RETPOLINE(reg) +#include <asm/GEN-for-each-reg.h> + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h new file mode 100644 index 000000000000..36aec57ea7a3 --- /dev/null +++ b/arch/x86/include/asm/audit.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
+#ifndef _ASM_X86_AUDIT_H +#define _ASM_X86_AUDIT_H + +int ia32_classify_syscall(unsigned int syscall); + +#endif /* _ASM_X86_AUDIT_H */ diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index d79d1e622dcf..0ada98d5d09f 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +#define HAVE_CSUM_COPY_USER #ifdef CONFIG_X86_32 # include <asm/checksum_32.h> #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index f57b94e02c57..11624c8a9d8d 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -44,18 +44,21 @@ static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); } -static inline __wsum csum_partial_copy_from_user(const void __user *src, - void *dst, - int len, __wsum sum, - int *err_ptr) +static inline __wsum csum_and_copy_from_user(const void __user *src, + void *dst, int len, + __wsum sum, int *err_ptr) { __wsum ret; might_sleep(); - stac(); + if (!user_access_begin(src, len)) { + if (len) + *err_ptr = -EFAULT; + return sum; + } ret = csum_partial_copy_generic((__force void *)src, dst, len, sum, err_ptr, NULL); - clac(); + user_access_end(); return ret; } @@ -173,7 +176,6 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, /* * Copy and checksum to user */ -#define HAVE_CSUM_COPY_USER static inline __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, @@ -182,11 +184,10 @@ static inline __wsum csum_and_copy_to_user(const void *src, __wsum ret; might_sleep(); - if (access_ok(dst, len)) { - stac(); + if (user_access_begin(dst, len)) { ret = csum_partial_copy_generic(src, (__force void *)dst, len, sum, NULL, err_ptr); - clac(); + user_access_end(); return ret; } diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 3ec6d3267cf9..0a289b87e872 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -129,27 +129,19 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 -#define HAVE_CSUM_COPY_USER 1 - - /* Do not call this directly. Use the wrappers below */ extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, int *src_err_ptr, int *dst_err_ptr); -extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp); -extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst, +extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp); extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); -/* Old names. To be removed. */ -#define csum_and_copy_to_user csum_partial_copy_to_user -#define csum_and_copy_from_user csum_partial_copy_from_user - /** * ip_compute_csum - Compute a 16bit IP checksum. * @buff: buffer address. 
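For reference, the operation behind ip_compute_csum() and the csum_and_copy_*() helpers renamed above is the plain 16-bit Internet checksum: one's-complement addition with end-around carry (RFC 1071). A portable, unoptimized sketch for comparison with the hand-tuned kernel code; it assumes little-endian handling of a trailing odd byte:

        #include <stdio.h>
        #include <stdint.h>
        #include <stddef.h>

        static uint16_t ip_csum_ref(const void *buf, size_t len)
        {
                const uint16_t *p = buf;
                uint32_t sum = 0;

                while (len > 1) {               /* sum 16-bit words */
                        sum += *p++;
                        len -= 2;
                }
                if (len)                        /* odd trailing byte, zero-padded */
                        sum += *(const uint8_t *)p;

                while (sum >> 16)               /* fold carries back in */
                        sum = (sum & 0xffff) + (sum >> 16);

                return (uint16_t)~sum;          /* one's complement */
        }

        int main(void)
        {
                const uint8_t pkt[] = { 0x45, 0x00, 0x00, 0x1c };

                printf("%#06x\n", ip_csum_ref(pkt, sizeof(pkt)));
                return 0;
        }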
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 52e9f3480f69..d4edf281fff4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -214,7 +214,11 @@ static inline bool in_compat_syscall(void) #endif struct compat_siginfo; -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const kernel_siginfo_t *from, bool x32_ABI); + +#ifdef CONFIG_X86_X32_ABI +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#define copy_siginfo_to_user32 copy_siginfo_to_user32 +#endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index cf3d621c6892..eb8fcede9e3b 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -20,12 +20,14 @@ #define X86_CENTAUR_FAM6_C7_D 0xd #define X86_CENTAUR_FAM6_NANO 0xf +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Base macro for CPU matching + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@_vendor * @_family: The family number or X86_FAMILY_ANY * @_model: The model number, model constant or X86_MODEL_ANY + * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY * @_data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer @@ -37,16 +39,35 @@ * into another macro at the usage site for good reasons, then please * start this local macro with X86_MATCH to allow easy grepping. */ -#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(_vendor, _family, _model, \ - _feature, _data) { \ +#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ .vendor = X86_VENDOR_##_vendor, \ .family = _family, \ .model = _model, \ + .steppings = _steppings, \ .feature = _feature, \ .driver_data = (unsigned long) _data \ } /** + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. + * + * The steppings argument of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ + X86_STEPPING_ANY, feature, data) + +/** * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@vendor @@ -139,6 +160,10 @@ #define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) +#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ + steppings, X86_FEATURE_ANY, data) + /* * Match specific microcode revisions. 
* diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h index af9a14ac8962..54a6e4a2e132 100644 --- a/arch/x86/include/asm/doublefault.h +++ b/arch/x86/include/asm/doublefault.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_DOUBLEFAULT_H #define _ASM_X86_DOUBLEFAULT_H -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 extern void doublefault_init_cpu_tss(void); #else static inline void doublefault_init_cpu_tss(void) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8391c115c0ec..89dcc7aa7e2c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -9,6 +9,7 @@ #include <asm/nospec-branch.h> #include <asm/mmu_context.h> #include <linux/build_bug.h> +#include <linux/kernel.h> extern unsigned long efi_fw_vendor, efi_config_table; @@ -225,14 +226,21 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, /* arch specific definitions used by the stub code */ -__attribute_const__ bool efi_is_64bit(void); +#ifdef CONFIG_EFI_MIXED + +#define ARCH_HAS_EFISTUB_WRAPPERS + +static inline bool efi_is_64bit(void) +{ + extern const bool efi_is64; + + return efi_is64; +} static inline bool efi_is_native(void) { if (!IS_ENABLED(CONFIG_X86_64)) return true; - if (!IS_ENABLED(CONFIG_EFI_MIXED)) - return true; return efi_is_64bit(); } @@ -286,6 +294,15 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_allocate_pool(type, size, buffer) \ ((type), (size), efi64_zero_upper(buffer)) +#define __efi64_argmap_create_event(type, tpl, f, c, event) \ + ((type), (tpl), (f), (c), efi64_zero_upper(event)) + +#define __efi64_argmap_set_timer(event, type, time) \ + ((event), (type), lower_32_bits(time), upper_32_bits(time)) + +#define __efi64_argmap_wait_for_event(num, event, index) \ + ((num), (event), efi64_zero_upper(index)) + #define __efi64_argmap_handle_protocol(handle, protocol, interface) \ ((handle), (protocol), efi64_zero_upper(interface)) @@ -307,6 +324,10 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_load_file(protocol, path, policy, bufsize, buf) \ ((protocol), (path), (policy), efi64_zero_upper(bufsize), (buf)) +/* Graphics Output Protocol */ +#define __efi64_argmap_query_mode(gop, mode, size, info) \ + ((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro @@ -335,15 +356,26 @@ static inline u32 efi64_convert_status(efi_status_t status) #define efi_bs_call(func, ...) \ (efi_is_native() \ - ? efi_system_table()->boottime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table(), \ - boottime), func, __VA_ARGS__)) + ? efi_system_table->boottime->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_table_attr(efi_system_table, \ + boottime), \ + func, __VA_ARGS__)) #define efi_rt_call(func, ...) \ (efi_is_native() \ - ? efi_system_table()->runtime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table(), \ - runtime), func, __VA_ARGS__)) + ? 
efi_system_table->runtime->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_table_attr(efi_system_table, \ + runtime), \ + func, __VA_ARGS__)) + +#else /* CONFIG_EFI_MIXED */ + +static inline bool efi_is_64bit(void) +{ + return IS_ENABLED(CONFIG_X86_64); +} + +#endif /* CONFIG_EFI_MIXED */ extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 44c48e34d799..42159f45bf9c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -31,7 +31,8 @@ extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -extern void fpu__clear(struct fpu *fpu); +extern void fpu__clear_user_states(struct fpu *fpu); +extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); @@ -92,7 +93,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } static inline void fpstate_init_fxstate(struct fxregs_state *fx) @@ -399,7 +400,10 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) u32 hmask = mask >> 32; int err; - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + if (static_cpu_has(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c6136d79f8c0..422d8369012a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,19 +21,29 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) -/* Supervisor features */ -#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE | \ - XFEATURE_MASK_YMM | \ - XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM | \ - XFEATURE_MASK_PKRU | \ - XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) +/* All currently supported user features */ +#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + +/* All currently supported supervisor features */ +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) + +/* + * Unsupported supervisor features. When a supervisor feature in this mask is + * supported in the future, move it to the supported supervisor feature mask. + */ +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) + +/* All supervisor states including supported and unsupported states. 
*/ +#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " @@ -41,7 +51,18 @@ #define REX_PREFIX #endif -extern u64 xfeatures_mask; +extern u64 xfeatures_mask_all; + +static inline u64 xfeatures_mask_supervisor(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_user(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, @@ -54,8 +75,9 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); +void copy_supervisor_to_kernel(struct xregs_state *xsave); /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -extern int validate_xstate_header(const struct xstate_header *hdr); +int validate_user_xstate_header(const struct xstate_header *hdr); #endif diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h index 989cfa86de85..734482afbf81 100644 --- a/arch/x86/include/asm/invpcid.h +++ b/arch/x86/include/asm/invpcid.h @@ -12,12 +12,9 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * stale TLB entries and, especially if we're flushing global * mappings, we don't want the compiler to reorder any subsequent * memory accesses before the TLB flush. - * - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and - * invpcid (%rcx), %rax in long mode. 
*/ - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); + asm volatile("invpcid %[desc], %[type]" + :: [desc] "m" (desc), [type] "r" (type) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73d8dd14dda2..2d4515e8b7df 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -14,43 +14,4 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #endif /* CONFIG_NUMA */ -#ifdef CONFIG_DISCONTIGMEM - -/* - * generic node memory support, the following assumptions apply: - * - * 1) memory comes in 64Mb contiguous chunks which are either present or not - * 2) we will not have more than 64Gb in total - * - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_SECTIONS 1024 -#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS) - -extern s8 physnode_map[]; - -static inline int pfn_to_nid(unsigned long pfn) -{ -#ifdef CONFIG_NUMA - return((int) physnode_map[(pfn) / PAGES_PER_SECTION]); -#else - return 0; -#endif -} - -static inline int pfn_valid(int pfn) -{ - int nid = pfn_to_nid(pfn); - - if (nid >= 0) - return (pfn < node_end_pfn(nid)); - return 0; -} - -#define early_pfn_valid(pfn) pfn_valid((pfn)) - -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..ef452b817f44 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -301,6 +301,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b +#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 + /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 #define MSR_CONFIG_TDP_LEVEL_1 0x00000649 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e9a281e2660..d52d1aacdd97 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -4,20 +4,13 @@ #define _ASM_X86_NOSPEC_BRANCH_H_ #include <linux/static_key.h> +#include <linux/frame.h> #include <asm/alternative.h> #include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> - -/* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - ANNOTATE_IGNORE_ALTERNATIVE +#include <asm/unwind_hints.h> /* * Fill the CPU return stack buffer. @@ -46,21 +39,25 @@ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ mov $(nr/2), reg; \ 771: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 772f; \ 773: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 773b; \ 772: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 774f; \ 775: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 775b; \ 774: \ + add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; + jnz 771b; #ifdef __ASSEMBLY__ @@ -77,57 +74,27 @@ .endm /* - * These are the bare retpoline primitives for indirect jmp and call. - * Do not use these directly; they only exist to make the ALTERNATIVE - * invocation below less ugly. 
- */ -.macro RETPOLINE_JMP reg:req - call .Ldo_rop_\@ -.Lspec_trap_\@: - pause - lfence - jmp .Lspec_trap_\@ -.Ldo_rop_\@: - mov \reg, (%_ASM_SP) - ret -.endm - -/* - * This is a wrapper around RETPOLINE_JMP so the called function in reg - * returns to the instruction after the macro. - */ -.macro RETPOLINE_CALL reg:req - jmp .Ldo_call_\@ -.Ldo_retpoline_jmp_\@: - RETPOLINE_JMP \reg -.Ldo_call_\@: - call .Ldo_retpoline_jmp_\@ -.endm - -/* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ - __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else - jmp *\reg + jmp *%\reg #endif .endm .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ - __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ + __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else - call *\reg + call *%\reg #endif .endm @@ -137,10 +104,8 @@ */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr + ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) .Lskip_rsb_\@: #endif .endm @@ -161,16 +126,16 @@ * which is ensured when CONFIG_RETPOLINE is defined. */ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ + "call __x86_retpoline_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_AMD) + # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -180,7 +145,6 @@ * here, anyway. 
*/ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 6e060907c163..d25534940bde 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -58,8 +58,7 @@ #define ORC_TYPE_CALL 0 #define ORC_TYPE_REGS 1 #define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_SAVE 3 -#define UNWIND_HINT_TYPE_RESTORE 4 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0dca7f7aeff2..be7b19646897 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -66,8 +66,7 @@ do { \ #endif /* !__ASSEMBLY__ */ /* - * kern_addr_valid() is (1) for FLATMEM and (0) for - * SPARSEMEM and DISCONTIGMEM + * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM */ #ifdef CONFIG_FLATMEM #define kern_addr_valid(addr) (1) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c4e8fd709cf6..29ee0c088009 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -728,7 +728,6 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( - UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -738,7 +737,6 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" - UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 27c47d183f4b..8b58d6975d5d 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -57,8 +57,10 @@ static __always_inline unsigned long smap_save(void) { unsigned long flags; - asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, - X86_FEATURE_SMAP) + asm volatile ("# smap_save\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "pushf; pop %0; " __ASM_CLAC "\n\t" + "1:" : "=rm" (flags) : : "memory", "cc"); return flags; @@ -66,7 +68,10 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { - asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + asm volatile ("# smap_restore\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "push %0; popf\n\t" + "1:" : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index bf3e34b25afc..323db6c5852a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -3,29 +3,7 @@ #define _ASM_X86_SPINLOCK_TYPES_H #include <linux/types.h> - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define __TICKET_LOCK_INC 2 -#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) -#else -#define __TICKET_LOCK_INC 1 -#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) -#endif - -#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) -typedef u8 __ticket_t; -typedef u16 __ticketpair_t; -#else -typedef u16 __ticket_t; -typedef u32 __ticketpair_t; -#endif - -#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) - -#define TICKET_SHIFT (sizeof(__ticket_t) * 8) - #include <asm-generic/qspinlock_types.h> - #include <asm-generic/qrwlock_types.h> #endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c26a7e1d8a2c..2ae904bf25e4 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -69,9 +69,7 @@ 
dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -#endif dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); @@ -118,11 +116,6 @@ void smp_spurious_interrupt(struct pt_regs *regs); void smp_error_interrupt(struct pt_regs *regs); asmlinkage void smp_irq_move_cleanup_interrupt(void); -extern void ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs); -extern void ist_begin_non_atomic(struct pt_regs *regs); -extern void ist_end_non_atomic(void); - #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index f5e2eb12cb71..7d903fdb3f43 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -86,32 +86,15 @@ UNWIND_HINT sp_offset=\sp_offset .endm -.macro UNWIND_HINT_SAVE - UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE -.endm - -.macro UNWIND_HINT_RESTORE - UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +/* + * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN + * and sibling calls. On these, sp_offset denotes the expected offset from + * initial_func_cfi. + */ +.macro UNWIND_HINT_RET_OFFSET sp_offset=8 + UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset .endm -#else /* !__ASSEMBLY__ */ - -#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ - "987: \n\t" \ - ".pushsection .discard.unwind_hints\n\t" \ - /* struct unwind_hint */ \ - ".long 987b - .\n\t" \ - ".short " __stringify(sp_offset) "\n\t" \ - ".byte " __stringify(sp_reg) "\n\t" \ - ".byte " __stringify(type) "\n\t" \ - ".byte " __stringify(end) "\n\t" \ - ".balign 4 \n\t" \ - ".popsection\n\t" - -#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0) - -#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 389174eaec79..2fcc3ac12e76 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -123,12 +123,6 @@ enum uv_memprotect { UV_MEMPROT_ALLOW_RW }; -/* - * bios calls have 6 parameters - */ -extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); - extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, @@ -146,7 +140,6 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; -#define uv_partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 45ea95ce79b4..ae587ce544f4 100644 --- a/arch/x86/include/asm/uv/uv.h +++ 
b/arch/x86/include/asm/uv/uv.h @@ -31,7 +31,6 @@ static inline bool is_early_uv_system(void) } extern int is_uv_system(void); extern int is_uv_hubbed(int uvtype); -extern int is_uv_hubless(int uvtype); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -44,7 +43,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubbed(int uv) { return 0; } -static inline int is_uv_hubless(int uv) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 950cd1395d5d..60ca0afdeaf9 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -219,20 +219,6 @@ static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; } -#define UV_HUB_INFO_VERSION 0x7150 -extern int uv_hub_info_version(void); -static inline int uv_hub_info_check(int version) -{ - if (uv_hub_info_version() == version) - return 0; - - pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n", - uv_hub_info_version(), version); - - BUG(); /* Catastrophic - cannot continue on unknown UV system */ -} -#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION) - /* * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in @@ -244,51 +230,32 @@ static inline int uv_hub_info_check(int version) #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ -/* WARNING: UVx_HUB_IS_SUPPORTED defines are deprecated and will be removed */ static inline int is_uv1_hub(void) { -#ifdef UV1_HUB_IS_SUPPORTED return is_uv_hubbed(uv(1)); -#else - return 0; -#endif } static inline int is_uv2_hub(void) { -#ifdef UV2_HUB_IS_SUPPORTED return is_uv_hubbed(uv(2)); -#else - return 0; -#endif } static inline int is_uv3_hub(void) { -#ifdef UV3_HUB_IS_SUPPORTED return is_uv_hubbed(uv(3)); -#else - return 0; -#endif } /* First test "is UV4A", then "is UV4" */ static inline int is_uv4a_hub(void) { -#ifdef UV4A_HUB_IS_SUPPORTED if (is_uv_hubbed(uv(4))) return (uv_hub_info->hub_revision == UV4A_HUB_REVISION_BASE); -#endif return 0; } static inline int is_uv4_hub(void) { -#ifdef UV4_HUB_IS_SUPPORTED return is_uv_hubbed(uv(4)); -#else - return 0; -#endif } static inline int is_uvx_hub(void) @@ -692,7 +659,6 @@ static inline int uv_cpu_blade_processor_id(int cpu) { return uv_cpu_info_per(cpu)->blade_cpu_id; } -#define _uv_cpu_blade_processor_id 1 /* indicate function available */ /* Blade number to Node number (UV1..UV4 is 1:1) */ static inline int uv_blade_to_node(int blade) @@ -856,26 +822,6 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) } extern unsigned int uv_apicid_hibits; -static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) -{ - apicid |= uv_apicid_hibits; - return (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | - (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); -} - -static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) -{ - unsigned long val; - unsigned long dmode = dest_Fixed; - - if (vector == NMI_VECTOR) - dmode = dest_NMI; - - val = uv_hub_ipi_value(apicid, vector, dmode); 
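The uv_hub_ipi_value()/uv_hub_send_ipi() pair being deleted in this hunk is folded into uv_send_IPI_one() in the x2apic_uv_x.c hunk further below. A minimal sketch of the word it composes, using the UVH_IPI_INT_* shift constants from the removed helper (standalone illustrative function, not a kernel API)::

    static u64 uv_ipi_word_sketch(unsigned int apicid, int vector,
                                  unsigned long dmode)
    {
            /* Field layout taken from the removed helper above. */
            return (1UL << UVH_IPI_INT_SEND_SHFT) |
                   ((u64)apicid << UVH_IPI_INT_APIC_ID_SHFT) |
                   (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
                   ((u64)vector << UVH_IPI_INT_VECTOR_SHFT);
    }

The composed value is then written to the per-pnode UVH_IPI_INT MMR with uv_write_global_mmr64(), as the final line of the removed helper shows.
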
- uv_write_global_mmr64(pnode, UVH_IPI_INT, val); -} /* * Get the minimum revision number of the hub chips within the partition. diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 62c79e26a59a..9ee5ed6e8b34 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -99,13 +99,6 @@ #define UV3_HUB_PART_NUMBER_X 0x4321 #define UV4_HUB_PART_NUMBER 0x99a1 -/* Compat: Indicate which UV Hubs are supported. */ -#define UV1_HUB_IS_SUPPORTED 1 -#define UV2_HUB_IS_SUPPORTED 1 -#define UV3_HUB_IS_SUPPORTED 1 -#define UV4_HUB_IS_SUPPORTED 1 -#define UV4A_HUB_IS_SUPPORTED 1 - /* Error function to catch undefined references */ extern unsigned long uv_undefined(char *str); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ba89cabe5fcf..2a7c3afa62e2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,9 +102,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -endif +obj-$(CONFIG_X86_32) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index fe698f96617c..263eeaddb0aa 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -345,56 +345,3 @@ out_noapbt: apb_timer_block_enabled = 0; panic("failed to enable APB timer\n"); } - -/* called before apb_timer_enable, use early map */ -unsigned long apbt_quick_calibrate(void) -{ - int i, scale; - u64 old, new; - u64 t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - dw_apb_clocksource_start(clocksource_apbt); - - /* check if the timer can count down, otherwise return */ - old = dw_apb_clocksource_read(clocksource_apbt); - i = 10000; - while (--i) { - if (old != dw_apb_clocksource_read(clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq / 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - dw_apb_clocksource_start(clocksource_apbt); - - old = dw_apb_clocksource_read(clocksource_apbt); - old += loop; - - t1 = rdtsc(); - - do { - new = dw_apb_clocksource_read(clocksource_apbt); - } while (new < old); - - t2 = rdtsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * (apbt_freq / 1000)) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; -failed: - return 0; -} diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e53dda210cd7..4b1d31be50b4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -544,46 +544,20 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -static __init u32 hsx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x3a; /* EP */ - case 0x04: return 0x0f; /* EX */ - } - - return ~0U; -} - -static __init u32 bdx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x00000011; - case 0x03: return 0x0700000e; - case 0x04: return 0x0f00000c; - case 0x05: return 0x0e000003; - } - - return ~0U; -} - -static __init 
u32 skx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x03: return 0x01000136; - case 0x04: return 0x02000014; - } +static const struct x86_cpu_id deadline_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ - if (boot_cpu_data.x86_stepping > 4) - return 0; + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), - return ~0U; -} + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), -static const struct x86_cpu_id deadline_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_X, &skx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), @@ -615,14 +589,7 @@ static __init bool apic_validate_deadline_timer(void) if (!m) return true; - /* - * Function pointers will have the MSB set due to address layout, - * immediate revisions will not. - */ - if ((long)m->driver_data < 0) - rev = ((u32 (*)(void))(m->driver_data))(); - else - rev = (u32)m->driver_data; + rev = (u32)m->driver_data; if (boot_cpu_data.microcode >= rev) return true; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 913c88617848..ce61e3e7d399 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -154,19 +154,6 @@ static inline bool mp_is_legacy_irq(int irq) return irq >= 0 && irq < nr_legacy_irqs(); } -/* - * Initialize all legacy IRQs and all pins on the first IOAPIC - * if we have legacy interrupt controller. Kernel boot option "pirq=" - * may rely on non-legacy pins on the first IOAPIC. 
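The deadline_match[] rework above replaces per-model callback functions (whose pointers were stashed in driver_data) with one stepping-qualified table entry per revision, so driver_data can always hold the minimum microcode revision directly. A hedged sketch of how such a table is consumed; the table and helper names are illustrative, the revision check mirrors apic_validate_deadline_timer()::

    static const struct x86_cpu_id my_deadline_quirks[] __initconst = {
            /* SKYLAKE_X stepping 3 needs microcode >= 0x01000136 */
            X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X,
                                                 X86_STEPPINGS(0x3, 0x3),
                                                 0x01000136),
            {}
    };

    static bool __init microcode_new_enough(void)
    {
            const struct x86_cpu_id *m = x86_match_cpu(my_deadline_quirks);

            if (!m)         /* CPU not listed: nothing to enforce */
                    return true;

            return boot_cpu_data.microcode >= (u32)m->driver_data;
    }
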
- */ -static inline int mp_init_irq_at_boot(int ioapic, int irq) -{ - if (!nr_legacy_irqs()) - return 0; - - return ioapic == 0 || mp_is_legacy_irq(irq); -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ad53b2abc859..69e70ed0f5e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -30,8 +30,6 @@ static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; /* Unpack OEM/TABLE ID's to be NULL terminated strings */ @@ -48,11 +46,9 @@ static struct { unsigned int gnode_shift; } uv_cpuid; -int uv_min_hub_revision_id; -EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static int uv_min_hub_revision_id; unsigned int uv_apicid_hibits; -EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; @@ -85,20 +81,7 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { - if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* Base offset bits */ - u64 eu = end & gru_dist_umask; - u64 el = end & gru_dist_lmask; - - /* Must reside completely within a single GRU range: */ - return (sl == gru_dist_base && el == gru_dist_base && - su >= gru_first_node_paddr && - su <= gru_last_node_paddr && - eu == su); - } else { - return start >= gru_start_paddr && end <= gru_end_paddr; - } + return start >= gru_start_paddr && end <= gru_end_paddr; } static bool uv_is_untracked_pat_range(u64 start, u64 end) @@ -385,11 +368,10 @@ int is_uv_hubbed(int uvtype) } EXPORT_SYMBOL_GPL(is_uv_hubbed); -int is_uv_hubless(int uvtype) +static int is_uv_hubless(int uvtype) { return (uv_hubless_system & uvtype); } -EXPORT_SYMBOL_GPL(is_uv_hubless); void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); @@ -417,12 +399,6 @@ static __initdata struct uv_gam_range_s *_gr_table; #define SOCK_EMPTY ((unsigned short)~0) -extern int uv_hub_info_version(void) -{ - return UV_HUB_INFO_VERSION; -} -EXPORT_SYMBOL(uv_hub_info_version); - /* Default UV memory block size is 2GB */ static unsigned long mem_block_size __initdata = (2UL << 30); @@ -590,12 +566,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) static void uv_send_IPI_one(int cpu, int vector) { - unsigned long apicid; - int pnode; + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; + else + dmode = dest_Fixed; + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); - apicid = per_cpu(x86_cpu_to_apicid, cpu); - pnode = uv_apicid_to_pnode(apicid); - uv_hub_send_ipi(pnode, apicid, vector); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(const struct cpumask *mask, int vector) @@ -797,42 +782,6 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift init_extra_mapping_wb(paddr, bytes); } -static __init void 
map_gru_distributed(unsigned long c) -{ - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - u64 paddr; - unsigned long bytes; - int nid; - - gru.v = c; - - /* Only base bits 42:28 relevant in dist mode */ - gru_dist_base = gru.v & 0x000007fff0000000UL; - if (!gru_dist_base) { - pr_info("UV: Map GRU_DIST base address NULL\n"); - return; - } - - bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); - gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); - gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ - - for_each_online_node(nid) { - paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | - gru_dist_base; - init_extra_mapping_wb(paddr, bytes); - gru_first_node_paddr = min(paddr, gru_first_node_paddr); - gru_last_node_paddr = max(paddr, gru_last_node_paddr); - } - - /* Save upper (63:M) bits of address only for is_GRU_range */ - gru_first_node_paddr &= gru_dist_umask; - gru_last_node_paddr &= gru_dist_umask; - - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); -} - static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; @@ -846,12 +795,6 @@ static __init void map_gru_high(int max_pnode) return; } - /* Only UV3 has distributed GRU mode */ - if (is_uv3_hub() && gru.s3.mode) { - map_gru_distributed(gru.v); - return; - } - base = (gru.v & mask) >> shift; map_high("GRU", base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)base << shift); diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index e1efe44ebefc..83d9cad4e68b 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/audit.h> #include <asm/unistd.h> +#include <asm/audit.h> static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> @@ -41,7 +42,6 @@ int audit_classify_arch(int arch) int audit_classify_syscall(int abi, unsigned syscall) { #ifdef CONFIG_IA32_EMULATION - extern int ia32_classify_syscall(unsigned); if (abi == AUDIT_ARCH_I386) return ia32_classify_syscall(syscall); #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c36e89930965..d4806eac9325 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1145,8 +1145,7 @@ static const int amd_erratum_383[] = /* #1054: Instructions Retired Performance Counter May Be Inaccurate */ static const int amd_erratum_1054[] = - AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index d3482eb43ff3..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if 
(m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 54165f3569e8..e9265e2f28c9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,8 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/task_work.h> +#include <linux/hardirq.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -1086,23 +1088,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1204,6 +1189,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1222,7 +1230,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1259,7 +1267,7 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) if (__mc_check_crashing_cpu(cpu)) return; - ist_enter(regs); + nmi_enter(); this_cpu_inc(mce_exception_count); @@ -1352,23 +1360,24 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); - local_irq_enable(); - - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - local_irq_disable(); - ist_end_non_atomic(); + /* If this triggers there is no way to recover. Die hard. 
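In the mce/core.c hunks around here, the user-mode recovery path stops calling memory_failure() from exception context: do_machine_check() becomes NMI-like (nmi_enter()), so the kill decision is queued as task work and kill_me_maybe()/kill_me_now() run once the task heads back toward user mode. A hedged sketch of the queueing pattern, using the mce_* task fields and task_work_add() call visible in the hunk below; the helper name is illustrative::

    static void queue_deferred_mce_kill(struct task_struct *t,
                                        u64 addr, u64 mcgstatus, bool kill)
    {
            t->mce_addr = addr;
            t->mce_status = mcgstatus;
            t->mce_kill_me.func = kill ? kill_me_now : kill_me_maybe;
            task_work_add(t, &t->mce_kill_me, true);
    }
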
*/ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, &current->mce_kill_me, true); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } out_ist: - ist_exit(regs); + nmi_exit(); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 4ae6df556526..5ee94aa1b766 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/smp.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -24,7 +25,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; - ist_enter(regs); + nmi_enter(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +40,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..b3938c195365 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -18,12 +19,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - ist_enter(regs); + nmi_enter(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9556930cd8c1..a5ee607a3b89 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 8e3a8fedfa4d..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,6 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); struct doublefault_stack *ss = &cea->doublefault_stack; @@ -103,9 +102,6 @@ static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) info->next_sp = (unsigned
long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); return true; -#else - return false; -#endif } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..4d13c57f370a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p) return -EINVAL; if (!strncmp(p, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* - * If we are doing a crash dump, we still need to know - * the real memory size before the original memory map is - * reset. - */ - saved_max_pfn = e820__end_of_ram_pfn(); -#endif e820_table->nr_entries = 0; userdef = 1; return 0; } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b33904251a9..93fbdff2974f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -15,12 +15,9 @@ #include <xen/hvc-console.h> #include <asm/pci-direct.h> #include <asm/fixmap.h> -#include <asm/intel-mid.h> #include <asm/pgtable.h> #include <linux/usb/ehci_def.h> #include <linux/usb/xhci-dbgp.h> -#include <linux/efi.h> -#include <asm/efi.h> #include <asm/pci_x86.h> /* Simple VGA output */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12c70840980e..06c818967bb6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -291,15 +291,13 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); + copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -307,9 +305,6 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); - - fpregs_mark_activate(); - fpregs_unlock(); } /* @@ -318,18 +313,40 @@ static inline void copy_init_fpstate_to_fpregs(void) * Called by sys_execve(), by the signal handler code and by various * error paths. */ -void fpu__clear(struct fpu *fpu) +static void fpu__clear(struct fpu *fpu, bool user_only) { - WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != &current->thread.fpu); - fpu__drop(fpu); + if (!static_cpu_has(X86_FEATURE_FPU)) { + fpu__drop(fpu); + fpu__initialize(fpu); + return; + } - /* - * Make sure fpstate is cleared and initialized.
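The remainder of this fpu/core.c hunk splits the old fpu__clear() into two exported entry points. A hedged summary of the intended division of labour, matching the process.c and signal.c hunks later in this diff; the wrapper function is illustrative only::

    static void fpu_clear_callers_sketch(void)
    {
            /* exec()/flush_thread(): wipe user *and* supervisor xstates */
            fpu__clear_all(&current->thread.fpu);

            /* signal delivery / sigreturn failure: reset only the
             * user-visible xstates, supervisor state must survive */
            fpu__clear_user_states(&current->thread.fpu);
    }
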
- */ - fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) - copy_init_fpstate_to_fpregs(); + fpregs_lock(); + + if (user_only) { + if (!fpregs_state_valid(fpu, smp_processor_id()) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); + copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + } else { + copy_init_fpstate_to_fpregs(xfeatures_mask_all); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} + +void fpu__clear_user_states(struct fpu *fpu) +{ + fpu__clear(fpu, true); +} + +void fpu__clear_all(struct fpu *fpu) +{ + fpu__clear(fpu, false); } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6ce7e0a23268..61ddc3a5e5c2 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -224,7 +224,8 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ u64 __init fpu__get_supported_xfeatures_mask(void) { - return XCNTXT_MASK; + return XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; } /* Legacy code to initialize eager fpu mode. */ diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d652b939ccfb..bd1d0649f8ce 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -139,7 +139,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); if (!ret) - ret = validate_xstate_header(&xsave->header); + ret = validate_user_xstate_header(&xsave->header); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 400a05e1c1c5..9393a445d73c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -211,9 +211,9 @@ retry: } static inline void -sanitize_restored_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) +sanitize_restored_user_xstate(union fpregs_state *state, + struct user_i387_ia32_struct *ia32_env, + u64 user_xfeatures, int fx_only) { struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; @@ -226,13 +226,22 @@ sanitize_restored_xstate(union fpregs_state *state, */ /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. + * 'user_xfeatures' might have bits clear which are + * set in header->xfeatures. This represents features that + * were in init state prior to a signal delivery, and need + * to be reset back to the init state. Clear any user + * feature bits which are set in the kernel buffer to get + * them back to the init state. + * + * Supervisor state is unchanged by input from userspace. + * Ensure supervisor state bits stay set and supervisor + * state is not modified. 
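The xfeatures_mask_user()/xfeatures_mask_supervisor() helpers used throughout these hunks are, presumably, thin filters over the new xfeatures_mask_all built from the two *_SUPPORTED masks that fpu__get_supported_xfeatures_mask() now returns. A sketch of what such accessors would look like (an assumption; their real definitions are not part of this excerpt)::

    static inline u64 xfeatures_mask_supervisor(void)
    {
            return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
    }

    static inline u64 xfeatures_mask_user(void)
    {
            return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
    }
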
*/ if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= xfeatures; + header->xfeatures &= user_xfeatures | + xfeatures_mask_supervisor(); } if (use_fxsr()) { @@ -252,16 +261,24 @@ sanitize_restored_xstate(union fpregs_state *state, */ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { + u64 init_bv; + int r; + if (use_xsave()) { if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + + r = copy_user_to_fxregs(buf); + if (!r) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return r; } else { - u64 init_bv = xfeatures_mask & ~xbv; - if (unlikely(init_bv)) + init_bv = xfeatures_mask_user() & ~xbv; + + r = copy_user_to_xregs(buf, xbv); + if (!r && unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); + return r; } } else if (use_fxsr()) { return copy_user_to_fxregs(buf); @@ -277,7 +294,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - u64 xfeatures = 0; + u64 user_xfeatures = 0; int fx_only = 0; int ret = 0; @@ -285,7 +302,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { - fpu__clear(fpu); + fpu__clear_user_states(fpu); return 0; } @@ -310,32 +327,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; + user_xfeatures = fx_sw_user.xfeatures; } } - /* - * The current state of the FPU registers does not matter. By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered - * to be loaded again on return to userland (overriding last_cpu avoids - * the optimisation). - */ - set_thread_flag(TIF_NEED_FPU_LOAD); - __fpu_invalidate_fpregs_state(fpu); - if ((unsigned long)buf_fx % 64) fx_only = 1; - /* - * For 32-bit frames with fxstate, copy the fxstate so it can be - * reconstructed later. - */ - if (ia32_fxstate) { - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - goto err_out; - envp = &env; - } else { + + if (!ia32_fxstate) { /* * Attempt to restore the FPU registers directly from user * memory. For that to succeed, the user access cannot cause @@ -345,20 +344,65 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpregs_lock(); pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { + + /* + * Restore supervisor states: previous context switch + * etc has done XSAVES and saved the supervisor states + * in the kernel buffer from which they can be restored + * now. + * + * We cannot do a single XRSTORS here - which would + * be nice - because the rest of the FPU registers are + * being restored from a user buffer directly. The + * single XRSTORS happens below, when the user buffer + * has been copied to the kernel one. 
+ */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); fpregs_mark_activate(); fpregs_unlock(); return 0; } - fpregs_deactivate(fpu); fpregs_unlock(); + } else { + /* + * For 32-bit frames with fxstate, copy the fxstate so it can + * be reconstructed later. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; } + /* + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + fpregs_lock(); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + + /* + * Supervisor states are not modified by user space input. Save + * current supervisor states first and invalidate the FPU regs. + */ + if (xfeatures_mask_supervisor()) + copy_supervisor_to_kernel(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + __fpu_invalidate_fpregs_state(fpu); + fpregs_unlock(); if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -366,17 +410,24 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + ret = validate_user_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); + + /* + * Restore previously saved supervisor xstates along with + * copied-in user xstates. 
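Taken together, the comments above and below describe a three-step sequence for the buffered sigreturn path. A hedged straight-line condensation of the logic in this hunk; the function wrapper is illustrative only::

    static int restore_sig_slow_path_sketch(struct fpu *fpu,
                                            u64 user_xfeatures)
    {
            /* 1) Park live supervisor state in the kernel buffer. */
            if (!test_thread_flag(TIF_NEED_FPU_LOAD) &&
                xfeatures_mask_supervisor())
                    copy_supervisor_to_kernel(&fpu->state.xsave);

            /* 2) Copy and sanitize the user frame into that buffer
             *    (copy_user_to_xstate() / __copy_from_user() ...). */

            /* 3) One XRSTORS reloads user + supervisor together. */
            return copy_kernel_to_xregs_err(&fpu->state.xsave,
                            user_xfeatures | xfeatures_mask_supervisor());
    }
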
+ */ + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); @@ -385,11 +436,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) goto err_out; } - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + u64 init_bv; + + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } @@ -410,7 +464,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) err_out: if (ret) - fpu__clear(fpu); + fpu__clear_user_states(fpu); return ret; } @@ -465,7 +519,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xfeatures = xfeatures_mask_user(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6a54e83d5589..bda2e5eaca0e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -54,13 +54,15 @@ static short xsave_cpuid_features[] __initdata = { }; /* - * Mask of xstate features supported by the CPU and the kernel: + * This represents the full set of bits that should ever be set in a kernel + * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask __read_mostly; +u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -76,7 +78,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -150,7 +152,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. */ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) + if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) return; /* @@ -177,7 +179,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * in a special way already: */ feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; /* * Update all the remaining memory layouts according to their @@ -205,30 +207,39 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + u64 unsup_bits; + + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. 
+ * Unsupported supervisor xstates should not be found in + * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", + unsup_bits); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return xfeatures_mask_all & BIT_ULL(xfeature); } /* @@ -383,6 +394,33 @@ static void __init setup_xstate_comp_offsets(void) } /* + * Setup offsets of a supervisor-state-only XSAVES buffer: + * + * The offsets stored in xstate_comp_offsets[] only work for one specific + * value of the Requested Feature BitMap (RFBM). In cases where a different + * RFBM value is used, a different set of offsets is required. This set of + * offsets is for when RFBM=xfeatures_mask_supervisor(). + */ +static void __init setup_supervisor_only_offsets(void) +{ + unsigned int next_offset; + int i; + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_supervisor_only_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + +/* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) @@ -415,7 +453,7 @@ static void __init setup_init_fpu_buf(void) if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; + xfeatures_mask_all; /* * Init all the features state with header.xfeatures being 0x0 @@ -438,7 +476,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) * format. Checking a supervisor state's uncompacted offset is * an error. 
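The rewritten fpu__init_cpu_xstate() in this hunk programs the two enable registers from disjoint halves of xfeatures_mask_all. A worked illustration with invented bit values (bit 8 standing in for a supervisor feature)::

    /* Illustration only: xfeatures_mask_all = 0x107 (FP|SSE|AVX + bit 8) */
    xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());   /* 0x007 */
    wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());          /* 0x100 */

XSAVES/XRSTORS then operate on the union XCR0 | IA32_XSS, while the plain XSAVE variants see only the XCR0 half.
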
*/ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { + if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); return -1; } @@ -472,10 +510,10 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~xfeatures_mask_user()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -610,15 +648,12 @@ static void do_extra_xstate_size_checks(void) /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. - * - * Note that we do not currently set any bits on IA32_XSS so - * 'XCR0 | IA32_XSS == XCR0' for now. */ static unsigned int __init get_xsaves_size(void) { @@ -700,7 +735,7 @@ static int __init init_xstate_size(void) */ static void fpu__init_disable_system_xstate(void) { - xfeatures_mask = 0; + xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); } @@ -735,16 +770,26 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + xfeatures_mask_all = eax + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + xfeatures_mask_all |= ecx + ((u64)edx << 32); + + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). 
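The enumeration in the fpu__init_system_xstate() hunk above pulls the two halves of the mask from separate CPUID sub-leaves. A hedged sketch of the combined read, mirroring that hunk::

    static u64 enumerate_xfeatures_sketch(void)
    {
            u32 eax, ebx, ecx, edx;
            u64 mask;

            /* Sub-leaf 0: user xstates (candidate XCR0 bits) */
            cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
            mask = eax + ((u64)edx << 32);

            /* Sub-leaf 1: supervisor xstates (candidate IA32_XSS bits) */
            cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
            mask |= ecx + ((u64)edx << 32);

            return mask;
    }
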
*/ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + xfeatures_mask_all); goto out_disable; } @@ -753,10 +798,10 @@ void __init fpu__init_system_xstate(void) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -768,15 +813,16 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); + setup_supervisor_only_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, + xfeatures_mask_all, fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; @@ -795,7 +841,14 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } /* @@ -840,10 +893,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that xfeatures_mask is - * what we write to the XCR0 register. + * have not enabled. */ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -1010,7 +1062,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= xfeatures_mask_user(); if (header.xfeatures & XFEATURE_MASK_FP) copy_part(0, off_mxcsr, @@ -1090,7 +1142,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: @@ -1157,7 +1209,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1183,7 +1235,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * The state that came in from userspace was user-state only. 
* Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1211,7 +1263,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1239,7 +1291,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1249,6 +1301,61 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } +/* + * Save only supervisor states to the kernel buffer. This blows away all + * old states, and is intended to be used only in __fpu__restore_sig(), where + * user states are restored from the user buffer. + */ +void copy_supervisor_to_kernel(struct xregs_state *xstate) +{ + struct xstate_header *header; + u64 max_bit, min_bit; + u32 lmask, hmask; + int err, i; + + if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (!xfeatures_mask_supervisor()) + return; + + max_bit = __fls(xfeatures_mask_supervisor()); + min_bit = __ffs(xfeatures_mask_supervisor()); + + lmask = xfeatures_mask_supervisor(); + hmask = xfeatures_mask_supervisor() >> 32; + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + if (WARN_ON_FPU(err)) + return; + + /* + * At this point, the buffer has only supervisor states and must be + * converted back to normal kernel format. + */ + header = &xstate->header; + header->xcomp_bv |= xfeatures_mask_all; + + /* + * This only moves states up in the buffer. Start with + * the last state and move backwards so that states are + * not overwritten until after they are moved. Note: + * memmove() allows overlapping src/dst buffers. 
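In copy_supervisor_to_kernel() below, the move loop relocates each saved supervisor state from its offset in the supervisor-only XSAVES layout to its offset in the full-RFBM compacted layout. A worked illustration with invented sizes and offsets::

    static void relocate_one_feature_sketch(u8 *xbuf)
    {
            /*
             * Illustration only: a 128-byte supervisor feature that
             * lands at offset 576 when RFBM contains only supervisor
             * bits, but belongs at offset 960 once the enabled user
             * states are counted in. Overlap is fine for memmove().
             */
            memmove(xbuf + 960,     /* xstate_comp_offsets[i] */
                    xbuf + 576,     /* xstate_supervisor_only_offsets[i] */
                    128);           /* xstate_sizes[i] */
    }
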
+ */ + for (i = max_bit; i >= min_bit; i--) { + u8 *xbuf = (u8 *)xstate; + + if (!((header->xfeatures >> i) & 1)) + continue; + + /* Move xfeature 'i' into its normal location */ + memmove(xbuf + xstate_comp_offsets[i], + xbuf + xstate_supervisor_only_offsets[i], + xstate_sizes[i]); + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b0e641793be4..c84d28e90a58 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -282,7 +282,8 @@ static inline void tramp_free(void *tramp) { } /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -extern void ftrace_epilogue(void); +extern void ftrace_regs_caller_ret(void); +extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -334,7 +335,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; op_offset = (unsigned long)ftrace_caller_op_ptr; call_offset = (unsigned long)ftrace_call; } @@ -366,6 +367,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) if (WARN_ON(ret < 0)) goto fail; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); + ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); + if (WARN_ON(ret < 0)) + goto fail; + } + /* * The address of the ftrace_ops that is used for this trampoline * is stored at the end of the trampoline. This will be used to @@ -433,7 +441,7 @@ void set_ftrace_ops_ro(void) end_offset = (unsigned long)ftrace_regs_caller_end; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; } size = end_offset - start_offset; size = size + RET_SIZE + sizeof(void *); diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e8a9f8370112..e405fe1a8bf4 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -189,5 +189,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - JMP_NOSPEC %ecx + JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 369e61faacfe..aa5d28aeb31e 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -23,7 +23,7 @@ #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ -#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) /* * gcc -pg option adds a call to 'mcount' in most functions. @@ -77,7 +77,7 @@ /* * We add enough stack to save all regs. */ - subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rcx, RCX(%rsp) movq %rdx, RDX(%rsp) @@ -157,8 +157,12 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) * think twice before adding any new code or changing the * layout here. 
*/ -SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + jmp ftrace_epilogue +SYM_FUNC_END(ftrace_caller); + +SYM_FUNC_START(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub @@ -170,14 +174,12 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -SYM_FUNC_END(ftrace_caller) +SYM_FUNC_END(ftrace_epilogue) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq - UNWIND_HINT_SAVE - /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ @@ -233,10 +235,13 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq ORIG_RAX(%rsp), %rax movq %rax, MCOUNT_REG_SIZE-8(%rsp) - /* If ORIG_RAX is anything but zero, make this a call to that */ + /* + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). + */ movq ORIG_RAX(%rsp), %rax - cmpq $0, %rax - je 1f + testq %rax, %rax + jz 1f /* Swap the flags with orig_rax */ movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -244,20 +249,14 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq %rax, MCOUNT_REG_SIZE(%rsp) restore_mcount_regs 8 + /* Restore flags */ + popfq - jmp 2f +SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL); + UNWIND_HINT_RET_OFFSET + jmp ftrace_epilogue 1: restore_mcount_regs - - -2: - /* - * The stack layout is nondetermistic here, depending on which path was - * taken. This confuses objtool and ORC, rightfully so. For now, - * pretend the stack always looks like the non-direct case. - */ - UNWIND_HINT_RESTORE - /* Restore flags */ popfq @@ -268,7 +267,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * to the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) - jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) @@ -303,7 +301,7 @@ trace: * function tracing is enabled. 
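In the surrounding ftrace hunks, the JMP_NOSPEC/CALL_NOSPEC operands lose their "%" prefix (JMP_NOSPEC ecx, CALL_NOSPEC r8). The bare name is needed because, with retpolines enabled, the macro pastes the register into a per-register thunk symbol. A hedged preprocessor illustration; the macro names are invented::

    #define STR_(x)           #x
    #define STR(x)            STR_(x)
    #define THUNK_SKETCH(reg) "jmp __x86_indirect_thunk_" STR(reg)

    /* THUNK_SKETCH(ecx) -> "jmp __x86_indirect_thunk_ecx";
     * a "%ecx" operand could not form a valid symbol name. */
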
*/ movq ftrace_trace_function, %r8 - CALL_NOSPEC %r8 + CALL_NOSPEC r8 restore_mcount_regs jmp fgraph_trace @@ -340,6 +338,6 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - JMP_NOSPEC %rdi + JMP_NOSPEC rdi SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 6407ea21fa1b..bdcc5146de96 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -25,10 +25,6 @@ #include <linux/atomic.h> #include <linux/sched/clock.h> -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35638f1c5791..ce6cd220f722 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -191,7 +191,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu__clear_all(&tsk->thread.fpu); } void disable_TSC(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4b3fa6cd3106..a3767e74c758 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -237,6 +237,9 @@ static u64 __init get_ramdisk_image(void) ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + if (ramdisk_image == 0) + ramdisk_image = phys_initrd_start; + return ramdisk_image; } static u64 __init get_ramdisk_size(void) @@ -245,6 +248,9 @@ static u64 __init get_ramdisk_size(void) ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + if (ramdisk_size == 0) + ramdisk_size = phys_initrd_size; + return ramdisk_size; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..399f97abee02 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #ifdef CONFIG_X86_64 +#include <linux/compat.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> #endif /* CONFIG_X86_64 */ @@ -511,6 +512,31 @@ Efault: } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + static int x32_setup_rt_frame(struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) @@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } @@ -732,7 +758,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. 
*/ - fpu__clear(fpu); + fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2f24c334a938..2467f3dd35d3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1384,12 +1384,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) speculative_store_bypass_ht_init(); } -void arch_enable_nonboot_cpus_begin(void) +void arch_thaw_secondary_cpus_begin(void) { set_mtrr_aps_delayed_init(); } -void arch_enable_nonboot_cpus_end(void) +void arch_thaw_secondary_cpus_end(void) { mtrr_aps_init(); } @@ -1857,24 +1857,25 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) #include <asm/cpu_device_id.h> #include <asm/intel-family.h> -#define ICPU(model) \ - {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_XEON_PHI_KNL), - ICPU(INTEL_FAM6_XEON_PHI_KNM), + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_SKYLAKE_X), + X86_MATCH(SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_ATOM_GOLDMONT), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), {} }; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b89f6ac6a0c0..b2942b2dbfcf 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -35,8 +35,7 @@ #include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. */ -struct tboot *tboot __read_mostly; -EXPORT_SYMBOL(tboot); +static struct tboot *tboot __read_mostly; /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -46,6 +45,11 @@ EXPORT_SYMBOL(tboot); static u8 tboot_uuid[16] __initdata = TBOOT_UUID; +bool tboot_enabled(void) +{ + return tboot != NULL; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..428186d9de46 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,12 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <linux/hardirq.h> +#include <linux/atomic.h> + #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> -#include <linux/atomic.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> @@ -82,78 +84,6 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -/* - * In IST context, we explicitly disable preemption. This serves two - * purposes: it makes it much less likely that we would accidentally - * schedule in IST context and it will force a warning if we somehow - * manage to schedule by accident. - */ -void ist_enter(struct pt_regs *regs) -{ - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { - /* - * We might have interrupted pretty much anything. In - * fact, if we're a machine check, we can even interrupt - * NMI processing. We don't want in_nmi() to return true, - * but we need to notify RCU. 
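The ICPU() to X86_MATCH() conversion in smpboot.c above switches these tables to the generic X86_MATCH_VENDOR_FAM_MODEL_FEATURE() initializer. For context, such a table is normally consumed through x86_match_cpu(); a hedged sketch, where the table entry mirrors the diff and the wrapper function is illustrative::

    #include <asm/cpu_device_id.h>
    #include <asm/intel-family.h>

    static const struct x86_cpu_id skx_ids[] = {
            X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_SKYLAKE_X,
                                               X86_FEATURE_APERFMPERF, NULL),
            {}
    };

    static bool cpu_is_skx_with_aperfmperf(void)
    {
            /* x86_match_cpu() returns the matching entry or NULL */
            return x86_match_cpu(skx_ids) != NULL;
    }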
- */ - rcu_nmi_enter(); - } - - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); -} -NOKPROBE_SYMBOL(ist_enter); - -void ist_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); - - if (!user_mode(regs)) - rcu_nmi_exit(); -} - -/** - * ist_begin_non_atomic() - begin a non-atomic section in an IST exception - * @regs: regs passed to the IST exception handler - * - * IST exception handlers normally cannot schedule. As a special - * exception, if the exception interrupted userspace code (i.e. - * user_mode(regs) would return true) and the exception was not - * a double fault, it can be safe to schedule. ist_begin_non_atomic() - * begins a non-atomic section within an ist_enter()/ist_exit() region. - * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call ist_end_non_atomic() - * before ist_exit(). - */ -void ist_begin_non_atomic(struct pt_regs *regs) -{ - BUG_ON(!user_mode(regs)); - - /* - * Sanity check: we need to be on the normal thread stack. This - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. - */ - BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); -} - -/** - * ist_end_non_atomic() - begin a non-atomic section in an IST exception - * - * Ends a non-atomic section started with ist_begin_non_atomic(). - */ -void ist_end_non_atomic(void) -{ - preempt_disable(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; @@ -326,7 +256,6 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * @@ -363,7 +292,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * - * No need for ist_enter here because we don't use RCU. + * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && @@ -398,7 +327,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign } #endif - ist_enter(regs); + nmi_enter(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -450,7 +379,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign die("double fault", regs, error_code); panic("Machine halted."); } -#endif dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { @@ -592,19 +520,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Unlike any other non-IST entry, we can be called from a kprobe in - * non-CONTEXT_KERNEL kernel mode or even during context tracking - * state changes. Make sure that we wake up RCU even if we're coming - * from kernel code. - * - * This means that we can't schedule even if we came from a - * preemptible kernel context. That's okay. + * Unlike any other non-IST entry, we can be called from pretty much + * any location in the kernel through kprobes -- text_poke() will most + * likely be handled by poke_int3_handler() above. This means this + * handler is effectively NMI-like. 
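Every handler touched in traps.c converges on the same NMI-like shape; a condensed sketch of the do_int3-style entry/exit pairing, with a hypothetical trap and body::

    #include <linux/hardirq.h>
    #include <linux/ptrace.h>
    #include <asm/traps.h>

    dotraplinkage void do_example_trap(struct pt_regs *regs, long error_code)
    {
            /* wake RCU and mark in_nmi() unless the normal entry
             * code already did the work for a user-mode trap */
            if (!user_mode(regs))
                    nmi_enter();

            /* ... handle the trap; no scheduling allowed in here ... */

            if (!user_mode(regs))
                    nmi_exit();
    }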
*/ - if (!user_mode(regs)) { - rcu_nmi_enter(); - preempt_disable(); - } - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (!user_mode(regs)) + nmi_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, @@ -626,10 +548,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) cond_local_irq_disable(regs); exit: - if (!user_mode(regs)) { - preempt_enable_no_resched(); - rcu_nmi_exit(); - } + if (!user_mode(regs)) + nmi_exit(); } NOKPROBE_SYMBOL(do_int3); @@ -733,7 +653,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; - ist_enter(regs); + nmi_enter(); get_debugreg(dr6, 6); /* @@ -826,7 +746,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - ist_exit(regs); + nmi_exit(); } NOKPROBE_SYMBOL(do_debug); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8071952e9cf2..fd59fee84631 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3586,7 +3586,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virt/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.rst to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4742e8fa7ee7..d1d768912368 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -153,7 +153,7 @@ SYM_FUNC_START(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -436,7 +436,7 @@ SYM_FUNC_START(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index c66c8b00f236..ee63d7576fd2 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -10,7 +10,7 @@ #include <asm/smap.h> /** - * csum_partial_copy_from_user - Copy and checksum from user space. + * csum_and_copy_from_user - Copy and checksum from user space. * @src: source address (user space) * @dst: destination address * @len: number of bytes to be copied. @@ -21,13 +21,13 @@ * src and dst are best aligned to 64bits. 
*/ __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp) { might_sleep(); *errp = 0; - if (!likely(access_ok(src, len))) + if (!user_access_begin(src, len)) goto out_err; /* @@ -42,8 +42,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, while (((unsigned long)src & 6) && len >= 2) { __u16 val16; - if (__get_user(val16, (const __u16 __user *)src)) - goto out_err; + unsafe_get_user(val16, (const __u16 __user *)src, out); *(__u16 *)dst = val16; isum = (__force __wsum)add32_with_carry( @@ -53,25 +52,26 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } - stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); - clac(); + user_access_end(); if (unlikely(*errp)) goto out_err; return isum; +out: + user_access_end(); out_err: *errp = -EFAULT; memset(dst, 0, len); return isum; } -EXPORT_SYMBOL(csum_partial_copy_from_user); +EXPORT_SYMBOL(csum_and_copy_from_user); /** - * csum_partial_copy_to_user - Copy and checksum to user space. + * csum_and_copy_to_user - Copy and checksum to user space. * @src: source address * @dst: destination address (user space) * @len: number of bytes to be copied. @@ -82,14 +82,14 @@ EXPORT_SYMBOL(csum_partial_copy_from_user); * src and dst are best aligned to 64bits. */ __wsum -csum_partial_copy_to_user(const void *src, void __user *dst, +csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { __wsum ret; might_sleep(); - if (unlikely(!access_ok(dst, len))) { + if (!user_access_begin(dst, len)) { *errp = -EFAULT; return 0; } @@ -100,9 +100,7 @@ csum_partial_copy_to_user(const void *src, void __user *dst, isum = (__force __wsum)add32_with_carry( (__force unsigned)isum, val16); - *errp = __put_user(val16, (__u16 __user *)dst); - if (*errp) - return isum; + unsafe_put_user(val16, (__u16 __user *)dst, out); src += 2; dst += 2; len -= 2; @@ -110,13 +108,16 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - stac(); ret = csum_partial_copy_generic(src, (void __force *)dst, len, isum, NULL, errp); - clac(); + user_access_end(); return ret; +out: + user_access_end(); + *errp = -EFAULT; + return isum; } -EXPORT_SYMBOL(csum_partial_copy_to_user); +EXPORT_SYMBOL(csum_and_copy_to_user); /** * csum_partial_copy_nocheck - Copy and checksum. diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 363ec132df7e..b4c43a9b1483 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,15 +7,31 @@ #include <asm/alternative-asm.h> #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> +#include <asm/frame.h> .macro THUNK reg .section .text.__x86.indirect_thunk + .align 32 SYM_FUNC_START(__x86_indirect_thunk_\reg) - CFI_STARTPROC - JMP_NOSPEC %\reg - CFI_ENDPROC + JMP_NOSPEC \reg SYM_FUNC_END(__x86_indirect_thunk_\reg) + +SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop_\@ +.Lspec_trap_\@: + UNWIND_HINT_EMPTY + pause + lfence + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov %\reg, (%_ASM_SP) + UNWIND_HINT_RET_OFFSET + ret +SYM_FUNC_END(__x86_retpoline_\reg) + .endm /* @@ -24,25 +40,24 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) * only see one instance of "__x86_indirect_thunk_\reg" rather * than one per register with the correct names. So we do it * the simple and nasty way... 
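The rewritten csum wrappers above are an instance of the user_access_begin()/unsafe_*()/user_access_end() idiom that supersedes open-coded access_ok() plus stac()/clac(). The same shape in miniature, with an illustrative function name::

    #include <linux/types.h>
    #include <linux/uaccess.h>

    /* read one u16 from userspace with the unsafe-uaccess idiom */
    static int read_user_u16(const u16 __user *src, u16 *val)
    {
            u16 tmp;

            if (!user_access_begin(src, sizeof(tmp)))
                    return -EFAULT;
            unsafe_get_user(tmp, src, Efault);
            user_access_end();

            *val = tmp;
            return 0;

    Efault:
            user_access_end();
            return -EFAULT;
    }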
+ * + * Worse, you can only have a single EXPORT_SYMBOL per line, + * and CPP can't insert newlines, so we have to repeat everything + * at least twice. */ -#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) - -GENERATE_THUNK(_ASM_AX) -GENERATE_THUNK(_ASM_BX) -GENERATE_THUNK(_ASM_CX) -GENERATE_THUNK(_ASM_DX) -GENERATE_THUNK(_ASM_SI) -GENERATE_THUNK(_ASM_DI) -GENERATE_THUNK(_ASM_BP) -#ifdef CONFIG_64BIT -GENERATE_THUNK(r8) -GENERATE_THUNK(r9) -GENERATE_THUNK(r10) -GENERATE_THUNK(r11) -GENERATE_THUNK(r12) -GENERATE_THUNK(r13) -GENERATE_THUNK(r14) -GENERATE_THUNK(r15) -#endif + +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg) + +#undef GEN +#define GEN(reg) THUNK reg +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) EXPORT_THUNK(reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) EXPORT_RETPOLINE(reg) +#include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 56f9189bbadb..5199d8a1daf1 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -17,7 +17,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); #endif @@ -114,12 +114,10 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) #else static inline void percpu_setup_exception_stacks(unsigned int cpu) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(cpu); cea_map_percpu_pages(&cea->doublefault_stack, &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); -#endif } #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1bba16c5742b..a573a3e63f02 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -121,8 +121,6 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; - printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", - pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index cb91eccc4960..c90c20904a60 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -18,7 +18,9 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/compat.h> +#include <linux/elf-randomize.h> #include <asm/elf.h> +#include <asm/io.h> #include "physaddr.h" diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f2bd3d61e16b..104544359d69 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -27,40 +27,6 @@ #include "numa_internal.h" -#ifdef CONFIG_DISCONTIGMEM -/* - * 4) physnode_map - the mapping between a pfn and owning node - * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 64Mb break (each element of the array will - * represent 64Mb of memory and will be marked by the node id. 
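The GEN-for-each-reg.h construction above is an X-macro: the header contains nothing but one GEN(reg) invocation per register, each on its own source line, and the consumer re-includes it once per pass with a different GEN() definition. Re-inclusion sidesteps CPP's single-line expansion limit, which matters because EXPORT_SYMBOL() must sit alone on a line. An illustrative miniature, where the header shown is hypothetical::

    /* gen-regs.h would contain only:
     *      GEN(rax)
     *      GEN(rbx)
     *      ...
     */

    #define GEN(reg) void thunk_##reg(void);        /* pass 1: declare */
    #include "gen-regs.h"
    #undef GEN

    #define GEN(reg) void thunk_##reg(void) { }     /* pass 2: define */
    #include "gen-regs.h"
    #undef GEN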
so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * - * physnode_map[0-15] = 0; - * physnode_map[16-31] = 1; - * physnode_map[32- ] = -1; - */ -s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); - -void memory_present(int nid, unsigned long start, unsigned long end) -{ - unsigned long pfn; - - printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", - nid, start, end); - printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); - printk(KERN_DEBUG " "); - start = round_down(start, PAGES_PER_SECTION); - end = round_up(end, PAGES_PER_SECTION); - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { - physnode_map[pfn / PAGES_PER_SECTION] = nid; - printk(KERN_CONT "%lx ", pfn); - } - printk(KERN_CONT "\n"); -} -#endif - extern unsigned long highend_pfn, highstart_pfn; void __init initmem_init(void) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1aae5302501d..e966115d105c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -62,12 +62,12 @@ static unsigned long efi_runtime, efi_nr_tables; unsigned long efi_fw_vendor, efi_config_table; static const efi_config_table_type_t arch_tables[] __initconst = { - {EFI_PROPERTIES_TABLE_GUID, "PROP", &prop_phys}, - {UGA_IO_PROTOCOL_GUID, "UGA", &uga_phys}, + {EFI_PROPERTIES_TABLE_GUID, &prop_phys, "PROP" }, + {UGA_IO_PROTOCOL_GUID, &uga_phys, "UGA" }, #ifdef CONFIG_X86_UV - {UV_SYSTEM_TABLE_GUID, "UVsystab", &uv_systab_phys}, + {UV_SYSTEM_TABLE_GUID, &uv_systab_phys, "UVsystab" }, #endif - {NULL_GUID, NULL, NULL}, + {}, }; static const unsigned long * const efi_tables[] = { diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 15da118f04f0..90380a17ab23 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -21,7 +21,7 @@ SYM_FUNC_START(__efi_call) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx - CALL_NOSPEC %rdi + CALL_NOSPEC rdi leave ret SYM_FUNC_END(__efi_call) diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index c60255da5a6c..4494589a288a 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -45,7 +45,8 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +static s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, + u64 a5) { s64 ret; @@ -57,10 +58,9 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) return ret; } -EXPORT_SYMBOL_GPL(uv_bios_call); -s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) +static s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { unsigned long bios_flags; s64 ret; @@ -77,18 +77,13 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } - long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); long sn_coherency_id; -EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); long system_serial_number; -EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; -EXPORT_SYMBOL_GPL(uv_type); - s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, long *region, long *ssn) @@ -115,7 +110,6 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *ssn = v1; 
return ret; } -EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, @@ -166,7 +160,6 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, (u64)ticks_per_second, 0, 0, 0); } -EXPORT_SYMBOL_GPL(uv_bios_freq_base); /* * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target @@ -185,7 +178,6 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, (u64)decode, (u64)domain, (u64)bus, 0, 0); } -EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); int uv_bios_init(void) { diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 62214731fea5..266773e2fb37 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -21,7 +21,7 @@ static ssize_t partition_id_show(struct kobject *kobj, static ssize_t coherence_id_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id()); + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_coherency_id); } static struct kobj_attribute partition_id_attr = diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index aaff9ed7ff45..fc3b757afb2c 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -307,7 +307,7 @@ int hibernate_resume_nonboot_cpu_disable(void) if (ret) return ret; smp_ops.play_dead = resume_play_dead; - ret = disable_nonboot_cpus(); + ret = freeze_secondary_cpus(0); smp_ops.play_dead = play_dead; return ret; } diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index 2a56cac64687..ff6bba2c8ab6 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -36,26 +36,6 @@ __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial(dst, len, sum); } -/* - * the same as csum_partial, but copies from src while it - * checksums, and handles user-space pointer exceptions correctly, when needed. - * - * here even more important to align src and dst on a 32-bit (or even - * better 64-bit) boundary - */ - -static __inline__ -__wsum csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, int *err_ptr) -{ - if (copy_from_user(dst, src, len)) { - *err_ptr = -EFAULT; - return (__force __wsum)-1; - } - - return csum_partial(dst, len, sum); -} - /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 1abe455d926a..205a9bc981b0 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -29,7 +29,7 @@ static efi_system_table_t efi_systab_xen __initdata = { .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */ .fw_revision = 0, /* Initialized later. */ .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ - .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_in = NULL, /* Not used under Xen. */ .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ .con_out = NULL, /* Not used under Xen. */ .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. 
*/ diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h index 8b687176ad72..d8292cc9ebdf 100644 --- a/arch/xtensa/include/asm/checksum.h +++ b/arch/xtensa/include/asm/checksum.h @@ -44,8 +44,6 @@ asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst, /* * Note: when you get a NULL pointer exception here this means someone * passed in an incorrect kernel address to one of these functions. - * - * If you use these functions directly please don't forget the access_ok(). */ static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, @@ -54,12 +52,17 @@ __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); } +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static inline -__wsum csum_partial_copy_from_user(const void __user *src, void *dst, +__wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) { - return csum_partial_copy_generic((__force const void *)src, dst, + if (access_ok(dst, len)) + return csum_partial_copy_generic((__force const void *)src, dst, len, sum, err_ptr, NULL); + if (len) + *err_ptr = -EFAULT; + return sum; } /* diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 85a9ab1bc04d..69d0d73876b3 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -408,3 +408,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 01962c63a711..f2d0e5915dab 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -394,7 +394,7 @@ static int __init gtdt_sbsa_gwdt_init(void) */ ret = acpi_gtdt_init(table, &timer_count); if (ret || !timer_count) - return ret; + goto out_put_gtdt; for_each_platform_timer(platform_timer) { if (is_non_secure_watchdog(platform_timer)) { @@ -408,6 +408,8 @@ static int __init gtdt_sbsa_gwdt_init(void) if (gwdt_count) pr_info("found %d SBSA generic Watchdog(s).\n", gwdt_count); +out_put_gtdt: + acpi_put_table(table); return ret; } diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 7d04424189df..28a6b387e80e 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -299,61 +299,8 @@ out: return status; } -struct iort_workaround_oem_info { - char oem_id[ACPI_OEM_ID_SIZE + 1]; - char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; - u32 oem_revision; -}; - -static bool apply_id_count_workaround; - -static struct iort_workaround_oem_info wa_info[] __initdata = { - { - .oem_id = "HISI ", - .oem_table_id = "HIP07 ", - .oem_revision = 0, - }, { - .oem_id = "HISI ", - .oem_table_id = "HIP08 ", - .oem_revision = 0, - } -}; - -static void __init -iort_check_id_count_workaround(struct acpi_table_header *tbl) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(wa_info); i++) { - if (!memcmp(wa_info[i].oem_id, tbl->oem_id, ACPI_OEM_ID_SIZE) && - !memcmp(wa_info[i].oem_table_id, tbl->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) && - wa_info[i].oem_revision == tbl->oem_revision) { - apply_id_count_workaround = true; - pr_warn(FW_BUG "ID count for ID mapping entry is wrong, applying workaround\n"); - break; - } - } -} - -static inline u32 iort_get_map_max(struct acpi_iort_id_mapping *map) -{ - u32 map_max = map->input_base + map->id_count; - - /* - * The IORT specification revision D (Section 3, table 4, page 9) says - * Number of 
IDs = The number of IDs in the range minus one, but the - * IORT code ignored the "minus one", and some firmware did that too, - * so apply a workaround here to keep compatible with both the spec - * compliant and non-spec compliant firmwares. - */ - if (apply_id_count_workaround) - map_max--; - - return map_max; -} - static int iort_id_map(struct acpi_iort_id_mapping *map, u8 type, u32 rid_in, - u32 *rid_out) + u32 *rid_out, bool check_overlap) { /* Single mapping does not care for input id */ if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) { @@ -368,10 +315,37 @@ static int iort_id_map(struct acpi_iort_id_mapping *map, u8 type, u32 rid_in, return -ENXIO; } - if (rid_in < map->input_base || rid_in > iort_get_map_max(map)) + if (rid_in < map->input_base || + (rid_in > map->input_base + map->id_count)) return -ENXIO; + if (check_overlap) { + /* + * We already found a mapping for this input ID at the end of + * another region. If it coincides with the start of this + * region, we assume the prior match was due to the off-by-1 + * issue mentioned below, and allow it to be superseded. + * Otherwise, things are *really* broken, and we just disregard + * duplicate matches entirely to retain compatibility. + */ + pr_err(FW_BUG "[map %p] conflicting mapping for input ID 0x%x\n", + map, rid_in); + if (rid_in != map->input_base) + return -ENXIO; + + pr_err(FW_BUG "applying workaround.\n"); + } + *rid_out = map->output_base + (rid_in - map->input_base); + + /* + * Due to confusion regarding the meaning of the id_count field (which + * carries the number of IDs *minus 1*), we may have to disregard this + * match if it is at the end of the range, and overlaps with the start + * of another one. + */ + if (map->id_count > 0 && rid_in == map->input_base + map->id_count) + return -EAGAIN; return 0; } @@ -414,6 +388,7 @@ static struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node, static int iort_get_id_mapping_index(struct acpi_iort_node *node) { struct acpi_iort_smmu_v3 *smmu; + struct acpi_iort_pmcg *pmcg; switch (node->type) { case ACPI_IORT_NODE_SMMU_V3: @@ -441,6 +416,10 @@ static int iort_get_id_mapping_index(struct acpi_iort_node *node) return smmu->id_mapping_index; case ACPI_IORT_NODE_PMCG: + pmcg = (struct acpi_iort_pmcg *)node->node_data; + if (pmcg->overflow_gsiv || node->mapping_count == 0) + return -EINVAL; + return 0; default: return -EINVAL; @@ -456,7 +435,8 @@ static struct acpi_iort_node *iort_node_map_id(struct acpi_iort_node *node, /* Parse the ID mapping tree to find specified node type */ while (node) { struct acpi_iort_id_mapping *map; - int i, index; + int i, index, rc = 0; + u32 out_ref = 0, map_id = id; if (IORT_TYPE_MASK(node->type) & type_mask) { if (id_out) @@ -490,15 +470,18 @@ static struct acpi_iort_node *iort_node_map_id(struct acpi_iort_node *node, if (i == index) continue; - if (!iort_id_map(map, node->type, id, &id)) + rc = iort_id_map(map, node->type, map_id, &id, out_ref); + if (!rc) break; + if (rc == -EAGAIN) + out_ref = map->output_reference; } - if (i == node->mapping_count) + if (i == node->mapping_count && !out_ref) goto fail_map; node = ACPI_ADD_PTR(struct acpi_iort_node, iort_table, - map->output_reference); + rc ? 
out_ref : map->output_reference); } fail_map: @@ -789,15 +772,6 @@ void acpi_configure_pmsi_domain(struct device *dev) dev_set_msi_domain(dev, msi_domain); } -static int __maybe_unused __get_pci_rid(struct pci_dev *pdev, u16 alias, - void *data) -{ - u32 *rid = data; - - *rid = alias; - return 0; -} - #ifdef CONFIG_IOMMU_API static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev) { @@ -1148,13 +1122,10 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) else size = 1ULL << 32; - if (dev_is_pci(dev)) { - ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size); - if (ret == -ENODEV) - ret = rc_dma_get_range(dev, &size); - } else { - ret = nc_dma_get_range(dev, &size); - } + ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size); + if (ret == -ENODEV) + ret = dev_is_pci(dev) ? rc_dma_get_range(dev, &size) + : nc_dma_get_range(dev, &size); if (!ret) { /* @@ -1692,6 +1663,10 @@ void __init acpi_iort_init(void) { acpi_status status; + /* iort_table will be used at runtime after the iort init, + * so we don't need to call acpi_put_table() to release + * the IORT table mapping. + */ status = acpi_get_table(ACPI_SIG_IORT, 0, &iort_table); if (ACPI_FAILURE(status)) { if (status != AE_NOT_FOUND) { @@ -1703,6 +1678,5 @@ void __init acpi_iort_init(void) return; } - iort_check_id_count_workaround(iort_table); iort_init_platform_devices(); } diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index beca5f91bb4c..69361ec43db5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5209,7 +5209,7 @@ void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp) * sata_link_init_spd - Initialize link->sata_spd_limit * @link: Link to configure sata_spd_limit for * - * Initialize @link->[hw_]sata_spd_limit to the currently + * Initialize ``link->[hw_]sata_spd_limit`` to the currently * configured value. * * LOCKING: diff --git a/drivers/base/core.c b/drivers/base/core.c index 0cad34f1eede..57562e0f6308 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1393,7 +1393,7 @@ static void device_release(struct kobject *kobj) else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); else - WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n", + WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. 
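The end-of-range disambiguation added to iort_id_map() above is easiest to see with numbers. Per the IORT spec, id_count holds the number of IDs minus one, so {input_base = 0x0, id_count = 0x100} spans inputs 0x0..0x100; firmware that wrongly stored the plain count makes input 0x100 collide with a following region whose input_base is 0x100. A toy model of the resulting three-way decision, with illustrative values and names::

    #include <linux/types.h>

    /* 0: no match, 1: firm match, 2: provisional end-of-range match */
    static int id_map_match(u32 input_base, u32 id_count, u32 rid)
    {
            if (rid < input_base || rid > input_base + id_count)
                    return 0;
            if (id_count > 0 && rid == input_base + id_count)
                    return 2;       /* may be the off-by-one: a region
                                     * starting here may supersede it */
            return 1;
    }

In the real code the provisional case is the -EAGAIN return, and the caller holds on to output_reference in case no better match turns up.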
See Documentation/core-api/kobject.rst.\n", dev_name(dev)); kfree(p); } diff --git a/drivers/base/node.c b/drivers/base/node.c index 6012574913f7..5b02f69769e8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -415,6 +415,9 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d AnonPages: %8lu kB\n" "Node %d Shmem: %8lu kB\n" "Node %d KernelStack: %8lu kB\n" +#ifdef CONFIG_SHADOW_CALL_STACK + "Node %d ShadowCallStack:%8lu kB\n" +#endif "Node %d PageTables: %8lu kB\n" "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" @@ -438,6 +441,9 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_ANON_MAPPED)), nid, K(i.sharedram), nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB), +#endif nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b27d0f6c18c9..615c6b06b427 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -147,7 +147,8 @@ EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname); * request_irq() APIs. This is the same as platform_get_irq(), except that it * does not print an error message if an IRQ can not be obtained. * - * Example: + * For example:: + * * int irq = platform_get_irq_optional(pdev, 0); * if (irq < 0) * return irq; @@ -226,7 +227,8 @@ EXPORT_SYMBOL_GPL(platform_get_irq_optional); * IRQ fails. Device drivers should check the return value for errors so as to * not pass a negative integer value to the request_irq() APIs. * - * Example: + * For example:: + * * int irq = platform_get_irq(pdev, 0); * if (irq < 0) * return irq; diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 1a8564a79d8d..5ee8e3fae551 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -37,19 +37,16 @@ static void zcomp_strm_free(struct zcomp_strm *zstrm) if (!IS_ERR_OR_NULL(zstrm->tfm)) crypto_free_comp(zstrm->tfm); free_pages((unsigned long)zstrm->buffer, 1); - kfree(zstrm); + zstrm->tfm = NULL; + zstrm->buffer = NULL; } /* - * allocate new zcomp_strm structure with ->tfm initialized by - * backend, return NULL on error + * Initialize zcomp_strm structure with ->tfm initialized by backend, and + * ->buffer. Return a negative value on error. */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); - if (!zstrm) - return NULL; - zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); /* * allocate 2 pages. 
1 for compressed data, plus 1 extra for the @@ -58,9 +55,9 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { zcomp_strm_free(zstrm); - zstrm = NULL; + return -ENOMEM; } - return zstrm; + return 0; } bool zcomp_available_algorithm(const char *comp) @@ -113,12 +110,13 @@ ssize_t zcomp_available_show(const char *comp, char *buf) struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) { - return *get_cpu_ptr(comp->stream); + local_lock(&comp->stream->lock); + return this_cpu_ptr(comp->stream); } void zcomp_stream_put(struct zcomp *comp) { - put_cpu_ptr(comp->stream); + local_unlock(&comp->stream->lock); } int zcomp_compress(struct zcomp_strm *zstrm, @@ -159,17 +157,15 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) { struct zcomp *comp = hlist_entry(node, struct zcomp, node); struct zcomp_strm *zstrm; + int ret; - if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) - return 0; + zstrm = per_cpu_ptr(comp->stream, cpu); + local_lock_init(&zstrm->lock); - zstrm = zcomp_strm_alloc(comp); - if (IS_ERR_OR_NULL(zstrm)) { + ret = zcomp_strm_init(zstrm, comp); + if (ret) pr_err("Can't allocate a compression stream\n"); - return -ENOMEM; - } - *per_cpu_ptr(comp->stream, cpu) = zstrm; - return 0; + return ret; } int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) @@ -177,10 +173,8 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) struct zcomp *comp = hlist_entry(node, struct zcomp, node); struct zcomp_strm *zstrm; - zstrm = *per_cpu_ptr(comp->stream, cpu); - if (!IS_ERR_OR_NULL(zstrm)) - zcomp_strm_free(zstrm); - *per_cpu_ptr(comp->stream, cpu) = NULL; + zstrm = per_cpu_ptr(comp->stream, cpu); + zcomp_strm_free(zstrm); return 0; } @@ -188,7 +182,7 @@ static int zcomp_init(struct zcomp *comp) { int ret; - comp->stream = alloc_percpu(struct zcomp_strm *); + comp->stream = alloc_percpu(struct zcomp_strm); if (!comp->stream) return -ENOMEM; diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 1806475b919d..40f6420f4b2e 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -5,8 +5,11 @@ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ +#include <linux/local_lock.h> struct zcomp_strm { + /* The members ->buffer and ->tfm are protected by ->lock. */ + local_lock_t lock; /* compression/decompression buffer */ void *buffer; struct crypto_comp *tfm; @@ -14,7 +17,7 @@ struct zcomp_strm { /* dynamic per-device compression frontend */ struct zcomp { - struct zcomp_strm * __percpu *stream; + struct zcomp_strm __percpu *stream; const char *name; struct hlist_node node; }; diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig index 7dc2c3ec4051..07847d9a459a 100644 --- a/drivers/char/ipmi/Kconfig +++ b/drivers/char/ipmi/Kconfig @@ -14,7 +14,7 @@ menuconfig IPMI_HANDLER IPMI is a standard for managing sensors (temperature, voltage, etc.) in a system. - See <file:Documentation/IPMI.txt> for more details on the driver. + See <file:Documentation/driver-api/ipmi.rst> for more details on the driver. If unsure, say N. 
diff --git a/drivers/char/ipmi/ipmi_si_hotmod.c b/drivers/char/ipmi/ipmi_si_hotmod.c index 42a925f8cf69..4fbb4e18bae2 100644 --- a/drivers/char/ipmi/ipmi_si_hotmod.c +++ b/drivers/char/ipmi/ipmi_si_hotmod.c @@ -18,7 +18,7 @@ static int hotmod_handler(const char *val, const struct kernel_param *kp); module_param_call(hotmod, hotmod_handler, NULL, NULL, 0200); MODULE_PARM_DESC(hotmod, "Add and remove interfaces. See" - " Documentation/IPMI.txt in the kernel sources for the" + " Documentation/driver-api/ipmi.rst in the kernel sources for the" " gory details."); /* diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index c7cc8538b84a..77b8d551ae7f 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -968,7 +968,7 @@ static inline bool ipmi_thread_busy_wait(enum si_sm_result smi_result, * that are not BT and do not have interrupts. It starts spinning * when an operation is complete or until max_busy tells it to stop * (if that is enabled). See the paragraph on kimid_max_busy_us in - * Documentation/IPMI.txt for details. + * Documentation/driver-api/ipmi.rst for details. */ static int ipmi_thread(void *data) { diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c index 4667844eee69..8206412d25ba 100644 --- a/drivers/char/nvram.c +++ b/drivers/char/nvram.c @@ -232,8 +232,6 @@ static ssize_t nvram_misc_read(struct file *file, char __user *buf, ssize_t ret; - if (!access_ok(buf, count)) - return -EFAULT; if (*ppos >= nvram_size) return 0; @@ -264,8 +262,6 @@ static ssize_t nvram_misc_write(struct file *file, const char __user *buf, char *tmp; ssize_t ret; - if (!access_ok(buf, count)) - return -EFAULT; if (*ppos >= nvram_size) return 0; diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c index 4edb4174a1e2..89681f07bc78 100644 --- a/drivers/char/pcmcia/cm4000_cs.c +++ b/drivers/char/pcmcia/cm4000_cs.c @@ -1404,7 +1404,6 @@ static long cmm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int iobase = dev->p_dev->resource[0]->start; struct inode *inode = file_inode(filp); struct pcmcia_device *link; - int size; int rc; void __user *argp = (void __user *)arg; #ifdef CM4000_DEBUG @@ -1441,19 +1440,6 @@ static long cmm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) DEBUGP(4, dev, "iocnr mismatch\n"); goto out; } - size = _IOC_SIZE(cmd); - rc = -EFAULT; - DEBUGP(4, dev, "iocdir=%.4x iocr=%.4x iocw=%.4x iocsize=%d cmd=%.4x\n", - _IOC_DIR(cmd), _IOC_READ, _IOC_WRITE, size, cmd); - - if (_IOC_DIR(cmd) & _IOC_READ) { - if (!access_ok(argp, size)) - goto out; - } - if (_IOC_DIR(cmd) & _IOC_WRITE) { - if (!access_ok(argp, size)) - goto out; - } rc = 0; switch (cmd) { diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index d58ce664da84..646ad385e490 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -18,6 +18,7 @@ #include <linux/pid_namespace.h> #include <linux/cn_proc.h> +#include <linux/local_lock.h> /* * Size of a cn_msg followed by a proc_event structure. 
Since the @@ -38,25 +39,31 @@ static inline struct cn_msg *buffer_to_cn_msg(__u8 *buffer) static atomic_t proc_event_num_listeners = ATOMIC_INIT(0); static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC }; -/* proc_event_counts is used as the sequence number of the netlink message */ -static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 }; +/* local_event.count is used as the sequence number of the netlink message */ +struct local_event { + local_lock_t lock; + __u32 count; +}; +static DEFINE_PER_CPU(struct local_event, local_event) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static inline void send_msg(struct cn_msg *msg) { - preempt_disable(); + local_lock(&local_event.lock); - msg->seq = __this_cpu_inc_return(proc_event_counts) - 1; + msg->seq = __this_cpu_inc_return(local_event.count) - 1; ((struct proc_event *)msg->data)->cpu = smp_processor_id(); /* - * Preemption remains disabled during send to ensure the messages are - * ordered according to their sequence numbers. + * local_lock() disables preemption during send to ensure the messages + * are ordered according to their sequence numbers. * * If cn_netlink_send() fails, the data is not sent. */ cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT); - preempt_enable(); + local_unlock(&local_event.lock); } void proc_fork_connector(struct task_struct *task) diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c index a5fd8975f3d3..a6abb701bfc6 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c @@ -8,7 +8,7 @@ * This file add support for AES cipher with 128,192,256 bits keysize in * CBC and ECB mode. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include <linux/crypto.h> diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 3e4e4bbda34c..b957061424a1 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -7,7 +7,7 @@ * * Core file which registers crypto algorithms supported by the CryptoEngine. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include <linux/clk.h> #include <linux/crypto.h> diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c index 84d52fc3a2da..c89cb2ee2496 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c @@ -8,7 +8,7 @@ * This file add support for AES cipher with 128,192,256 bits keysize in * CBC and ECB mode. 
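The zcomp and cn_proc conversions above replace bare preemption-disabling per-CPU access (get_cpu_ptr()/preempt_disable()) with a local_lock_t embedded in the per-CPU data, which names what the critical section protects and stays correct on PREEMPT_RT. The shared pattern, condensed into an illustrative struct::

    #include <linux/local_lock.h>
    #include <linux/percpu.h>

    struct pcpu_stream {
            local_lock_t lock;      /* protects ->buf on this CPU */
            void *buf;
    };

    static DEFINE_PER_CPU(struct pcpu_stream, pcpu_stream) = {
            .lock = INIT_LOCAL_LOCK(lock),
    };

    static struct pcpu_stream *stream_get(void)
    {
            local_lock(&pcpu_stream.lock);
            return this_cpu_ptr(&pcpu_stream);
    }

    static void stream_put(void)
    {
            local_unlock(&pcpu_stream.lock);
    }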
* - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include <linux/crypto.h> diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c index a1fb2fbdbe7b..5d9d0fedcb06 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c @@ -7,7 +7,7 @@ * * Core file which registers crypto algorithms supported by the SecuritySystem * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include <linux/clk.h> #include <linux/crypto.h> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 439cd737076e..a2426334be61 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -394,8 +394,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable) goto cmd; /* allocate a physically contiguous buffer to store the CSR blob */ - if (!access_ok(input.address, input.length) || - input.length > SEV_FW_BLOB_MAX_SIZE) { + if (input.length > SEV_FW_BLOB_MAX_SIZE) { ret = -EFAULT; goto e_free; } @@ -632,12 +631,6 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp) if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) return -EFAULT; - /* Check if we have write access to the userspace buffer */ - if (input.address && - input.length && - !access_ok(input.address, input.length)) - return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -753,15 +746,13 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable) goto cmd; /* Allocate a physically contiguous buffer to store the PDH blob. */ - if ((input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) || - !access_ok(input.pdh_cert_address, input.pdh_cert_len)) { + if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) { ret = -EFAULT; goto e_free; } /* Allocate a physically contiguous buffer to store the cert chain blob. */ - if ((input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) || - !access_ok(input.cert_chain_address, input.cert_chain_len)) { + if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) { ret = -EFAULT; goto e_free; } diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 6e291d8f3a27..c7ea4f2d5ca6 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1081,8 +1081,6 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) return -EINVAL; p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets); - if (!access_ok(p, a->size)) - return -EFAULT; end = (void __user *)p + a->size; count = 0; @@ -1120,7 +1118,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) &p->header[transmit_header_bytes / 4]; if (next > end) return -EINVAL; - if (__copy_from_user + if (copy_from_user (u.packet.header, p->header, transmit_header_bytes)) return -EFAULT; if (u.packet.skip && ctx->type == FW_ISO_CONTEXT_TRANSMIT && diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 8007d4aa76dc..4843e94713a4 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -295,15 +295,13 @@ config TURRIS_MOX_RWTM other manufacturing data and also utilize the Entropy Bit Generator for hardware random number generation. 
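The access_ok() deletions in sev-dev and firewire here (and in nvram and cm4000 earlier) all rest on the same fact: copy_from_user()/copy_to_user(), unlike their double-underscore variants, perform the address-range check themselves, so a separate access_ok() is redundant. The resulting shape, with illustrative names::

    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct my_req { u64 addr; u32 len; };

    static int fetch_request(void __user *argp, struct my_req *req)
    {
            /* a non-zero return covers what access_ok() used to check */
            if (copy_from_user(req, argp, sizeof(*req)))
                    return -EFAULT;
            return 0;
    }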
-config HAVE_ARM_SMCCC - bool - -source "drivers/firmware/psci/Kconfig" source "drivers/firmware/broadcom/Kconfig" source "drivers/firmware/google/Kconfig" source "drivers/firmware/efi/Kconfig" source "drivers/firmware/imx/Kconfig" source "drivers/firmware/meson/Kconfig" +source "drivers/firmware/psci/Kconfig" +source "drivers/firmware/smccc/Kconfig" source "drivers/firmware/tegra/Kconfig" source "drivers/firmware/xilinx/Kconfig" diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index e9fb838af4df..99510be9f5ed 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -23,12 +23,13 @@ obj-$(CONFIG_TRUSTED_FOUNDATIONS) += trusted_foundations.o obj-$(CONFIG_TURRIS_MOX_RWTM) += turris-mox-rwtm.o obj-$(CONFIG_ARM_SCMI_PROTOCOL) += arm_scmi/ -obj-y += psci/ obj-y += broadcom/ obj-y += meson/ obj-$(CONFIG_GOOGLE_FIRMWARE) += google/ obj-$(CONFIG_EFI) += efi/ obj-$(CONFIG_UEFI_CPER) += efi/ obj-y += imx/ +obj-y += psci/ +obj-y += smccc/ obj-y += tegra/ obj-y += xilinx/ diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 334c8be0c11f..e7e36aab2386 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -429,7 +429,6 @@ int sdei_event_enable(u32 event_num) return err; } -EXPORT_SYMBOL(sdei_event_enable); static int sdei_api_event_disable(u32 event_num) { @@ -471,7 +470,6 @@ int sdei_event_disable(u32 event_num) return err; } -EXPORT_SYMBOL(sdei_event_disable); static int sdei_api_event_unregister(u32 event_num) { @@ -533,7 +531,6 @@ int sdei_event_unregister(u32 event_num) return err; } -EXPORT_SYMBOL(sdei_event_unregister); /* * unregister events, but don't destroy them as they are re-registered by @@ -643,7 +640,6 @@ int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) return err; } -EXPORT_SYMBOL(sdei_event_register); static int sdei_reregister_event_llocked(struct sdei_event *event) { @@ -1079,26 +1075,9 @@ static struct platform_driver sdei_driver = { .probe = sdei_probe, }; -static bool __init sdei_present_dt(void) -{ - struct device_node *np, *fw_np; - - fw_np = of_find_node_by_name(NULL, "firmware"); - if (!fw_np) - return false; - - np = of_find_matching_node(fw_np, sdei_of_match); - if (!np) - return false; - of_node_put(np); - - return true; -} - static bool __init sdei_present_acpi(void) { acpi_status status; - struct platform_device *pdev; struct acpi_table_header *sdei_table_header; if (acpi_disabled) @@ -1113,20 +1092,26 @@ static bool __init sdei_present_acpi(void) if (ACPI_FAILURE(status)) return false; - pdev = platform_device_register_simple(sdei_driver.driver.name, 0, NULL, - 0); - if (IS_ERR(pdev)) - return false; + acpi_put_table(sdei_table_header); return true; } static int __init sdei_init(void) { - if (sdei_present_dt() || sdei_present_acpi()) - platform_driver_register(&sdei_driver); + int ret = platform_driver_register(&sdei_driver); - return 0; + if (!ret && sdei_present_acpi()) { + struct platform_device *pdev; + + pdev = platform_device_register_simple(sdei_driver.driver.name, + 0, NULL, 0); + if (IS_ERR(pdev)) + pr_info("Failed to register ACPI:SDEI platform device %ld\n", + PTR_ERR(pdev)); + } + + return ret; } /* @@ -1143,6 +1128,14 @@ int sdei_event_handler(struct pt_regs *regs, mm_segment_t orig_addr_limit; u32 event_num = arg->event_num; + /* + * Save restore 'fs'. + * The architecture's entry code save/restores 'fs' when taking an + * exception from the kernel. 
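The addr_limit comment being introduced in arm_sdei.c describes a manual save/restore, needed because SDEI events do not arrive through the regular exception entry code. A hedged sketch of the pattern in miniature, with an illustrative handler::

    #include <linux/uaccess.h>

    static void handle_firmware_event(void)
    {
            mm_segment_t old_fs = get_fs();

            set_fs(USER_DS);    /* don't inherit a widened addr_limit */

            /* ... run the event handler; uaccess stays user-confined ... */

            set_fs(old_fs);
    }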
This ensures addr_limit isn't inherited + * if you interrupted something that allowed the uaccess routines to + * access kernel memory. + * Do the same here because this doesn't come via the same entry code. + */ orig_addr_limit = get_fs(); set_fs(USER_DS); diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 613828d3f106..6b38f9e5d203 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -106,12 +106,12 @@ config EFI_PARAMS_FROM_FDT config EFI_RUNTIME_WRAPPERS bool -config EFI_ARMSTUB +config EFI_GENERIC_STUB bool config EFI_ARMSTUB_DTB_LOADER bool "Enable the DTB loader" - depends on EFI_ARMSTUB + depends on EFI_GENERIC_STUB default y help Select this config option to add support for the dtb= command @@ -124,6 +124,17 @@ config EFI_ARMSTUB_DTB_LOADER functionality for bootloaders that do not have such support this option is necessary. +config EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER + bool "Enable the command line initrd loader" if !X86 + depends on EFI_STUB && (EFI_GENERIC_STUB || X86) + default y + help + Select this config option to add support for the initrd= command + line parameter, allowing an initrd that resides on the same volume + as the kernel image to be loaded into memory. + + This method is deprecated. + config EFI_BOOTLOADER_CONTROL tristate "EFI Bootloader Control" depends on EFI_VARS diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 9e5e62f5f94d..c697e70ca7e7 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -54,8 +54,8 @@ static phys_addr_t __init efi_to_phys(unsigned long addr) static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; static const efi_config_table_type_t arch_tables[] __initconst = { - {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table}, - {NULL_GUID, NULL, NULL} + {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, &screen_info_table}, + {} }; static void __init init_screen_info(void) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 4e3055238f31..9357d6b6e87c 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -499,21 +499,21 @@ void __init efi_mem_reserve(phys_addr_t addr, u64 size) } static const efi_config_table_type_t common_tables[] __initconst = { - {ACPI_20_TABLE_GUID, "ACPI 2.0", &efi.acpi20}, - {ACPI_TABLE_GUID, "ACPI", &efi.acpi}, - {SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios}, - {SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3}, - {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, - {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi_mem_attr_table}, - {LINUX_EFI_RANDOM_SEED_TABLE_GUID, "RNG", &efi_rng_seed}, - {LINUX_EFI_TPM_EVENT_LOG_GUID, "TPMEventLog", &efi.tpm_log}, - {LINUX_EFI_TPM_FINAL_LOG_GUID, "TPMFinalLog", &efi.tpm_final_log}, - {LINUX_EFI_MEMRESERVE_TABLE_GUID, "MEMRESERVE", &mem_reserve}, - {EFI_RT_PROPERTIES_TABLE_GUID, "RTPROP", &rt_prop}, + {ACPI_20_TABLE_GUID, &efi.acpi20, "ACPI 2.0" }, + {ACPI_TABLE_GUID, &efi.acpi, "ACPI" }, + {SMBIOS_TABLE_GUID, &efi.smbios, "SMBIOS" }, + {SMBIOS3_TABLE_GUID, &efi.smbios3, "SMBIOS 3.0" }, + {EFI_SYSTEM_RESOURCE_TABLE_GUID, &efi.esrt, "ESRT" }, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, &efi_mem_attr_table, "MEMATTR" }, + {LINUX_EFI_RANDOM_SEED_TABLE_GUID, &efi_rng_seed, "RNG" }, + {LINUX_EFI_TPM_EVENT_LOG_GUID, &efi.tpm_log, "TPMEventLog" }, + {LINUX_EFI_TPM_FINAL_LOG_GUID, &efi.tpm_final_log, "TPMFinalLog" }, + {LINUX_EFI_MEMRESERVE_TABLE_GUID, &mem_reserve, "MEMRESERVE" }, + {EFI_RT_PROPERTIES_TABLE_GUID, 
&rt_prop, "RTPROP" }, #ifdef CONFIG_EFI_RCI2_TABLE - {DELLEMC_EFI_RCI2_TABLE_GUID, NULL, &rci2_table_phys}, + {DELLEMC_EFI_RCI2_TABLE_GUID, &rci2_table_phys }, #endif - {NULL_GUID, NULL, NULL}, + {}, }; static __init int match_config_table(const efi_guid_t *guid, @@ -522,15 +522,13 @@ static __init int match_config_table(const efi_guid_t *guid, { int i; - if (table_types) { - for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { - if (!efi_guidcmp(*guid, table_types[i].guid)) { - *(table_types[i].ptr) = table; - if (table_types[i].name) - pr_cont(" %s=0x%lx ", - table_types[i].name, table); - return 1; - } + for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { + if (!efi_guidcmp(*guid, table_types[i].guid)) { + *(table_types[i].ptr) = table; + if (table_types[i].name[0]) + pr_cont("%s=0x%lx ", + table_types[i].name, table); + return 1; } } @@ -567,7 +565,7 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables, table = tbl32[i].table; } - if (!match_config_table(guid, table, common_tables)) + if (!match_config_table(guid, table, common_tables) && arch_tables) match_config_table(guid, table, arch_tables); } pr_cont("\n"); diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 78ad1ba8c987..26528a46d99e 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -522,8 +522,10 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var) ret = kobject_init_and_add(&new_var->kobj, &efivar_ktype, NULL, "%s", short_name); kfree(short_name); - if (ret) + if (ret) { + kobject_put(&new_var->kobj); return ret; + } kobject_uevent(&new_var->kobj, KOBJ_ADD); if (efivar_entry_add(new_var, &efivar_sysfs_list)) { diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 094eabdecfe6..cce4a7436052 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -7,7 +7,7 @@ # cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -O2 \ +cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ \ -fPIC -fno-strict-aliasing -mno-red-zone \ -mno-mmx -mno-sse -fshort-wchar \ -Wno-pointer-sign \ @@ -23,15 +23,19 @@ cflags-$(CONFIG_ARM) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \ -fno-builtin -fpic \ $(call cc-option,-mno-single-pic-base) -cflags-$(CONFIG_EFI_ARMSTUB) += -I$(srctree)/scripts/dtc/libfdt +cflags-$(CONFIG_EFI_GENERIC_STUB) += -I$(srctree)/scripts/dtc/libfdt -KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ +KBUILD_CFLAGS := $(cflags-y) -Os -DDISABLE_BRANCH_PROFILING \ -include $(srctree)/drivers/firmware/efi/libstub/hidden.h \ -D__NO_FORTIFY \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) \ + $(call cc-option,-fno-addrsig) \ -D__DISABLE_EXPORTS +# remove SCS flags from all objects in this directory +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) + GCOV_PROFILE := n KASAN_SANITIZE := n UBSAN_SANITIZE := n @@ -42,16 +46,17 @@ KCOV_INSTRUMENT := n lib-y := efi-stub-helper.o gop.o secureboot.o tpm.o \ file.o mem.o random.o randomalloc.o pci.o \ - skip_spaces.o lib-cmdline.o lib-ctype.o + skip_spaces.o lib-cmdline.o lib-ctype.o \ + alignedmem.o relocate.o vsprintf.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 -arm-deps-y := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c +efi-deps-y := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c $(obj)/lib-%.o: 
$(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o string.o \ - $(patsubst %.c,lib-%.o,$(arm-deps-y)) +lib-$(CONFIG_EFI_GENERIC_STUB) += efi-stub.o fdt.o string.o \ + $(patsubst %.c,lib-%.o,$(efi-deps-y)) lib-$(CONFIG_ARM) += arm32-stub.o lib-$(CONFIG_ARM64) += arm64-stub.o @@ -60,6 +65,25 @@ CFLAGS_arm32-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # +# For x86, bootloaders like systemd-boot or grub-efi do not zero-initialize the +# .bss section, so the .bss section of the EFI stub needs to be included in the +# .data section of the compressed kernel to ensure initialization. Rename the +# .bss section here so it's easy to pick out in the linker script. +# +STUBCOPY_FLAGS-$(CONFIG_X86) += --rename-section .bss=.bss.efistub,load,alloc +STUBCOPY_RELOC-$(CONFIG_X86_32) := R_386_32 +STUBCOPY_RELOC-$(CONFIG_X86_64) := R_X86_64_64 + +# +# ARM discards the .data section because it disallows r/w data in the +# decompressor. So move our .data to .data.efistub and .bss to .bss.efistub, +# which are preserved explicitly by the decompressor linker script. +# +STUBCOPY_FLAGS-$(CONFIG_ARM) += --rename-section .data=.data.efistub \ + --rename-section .bss=.bss.efistub,load,alloc +STUBCOPY_RELOC-$(CONFIG_ARM) := R_ARM_ABS + +# # arm64 puts the stub in the kernel proper, which will unnecessarily retain all # code indefinitely unless it is annotated as __init/__initdata/__initconst etc. # So let's apply the __init annotations at the section level, by prefixing @@ -73,8 +97,8 @@ CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # a verification pass to see if any absolute relocations exist in any of the # object files. # -extra-$(CONFIG_EFI_ARMSTUB) := $(lib-y) -lib-$(CONFIG_EFI_ARMSTUB) := $(patsubst %.o,%.stub.o,$(lib-y)) +extra-y := $(lib-y) +lib-y := $(patsubst %.o,%.stub.o,$(lib-y)) STUBCOPY_FLAGS-$(CONFIG_ARM64) += --prefix-alloc-sections=.init \ --prefix-symbols=__efistub_ @@ -97,11 +121,3 @@ quiet_cmd_stubcopy = STUBCPY $@ /bin/false; \ fi; \ $(OBJCOPY) $(STUBCOPY_FLAGS-y) $< $@ - -# -# ARM discards the .data section because it disallows r/w data in the -# decompressor. So move our .data to .data.efistub, which is preserved -# explicitly by the decompressor linker script. -# -STUBCOPY_FLAGS-$(CONFIG_ARM) += --rename-section .data=.data.efistub -STUBCOPY_RELOC-$(CONFIG_ARM) := R_ARM_ABS diff --git a/drivers/firmware/efi/libstub/alignedmem.c b/drivers/firmware/efi/libstub/alignedmem.c new file mode 100644 index 000000000000..cc89c4d6196f --- /dev/null +++ b/drivers/firmware/efi/libstub/alignedmem.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/efi.h> +#include <asm/efi.h> + +#include "efistub.h" + +/** + * efi_allocate_pages_aligned() - Allocate memory pages + * @size: minimum number of bytes to allocate + * @addr: On return the address of the first allocated page. The first + * allocated page has alignment EFI_ALLOC_ALIGN which is an + * architecture dependent multiple of the page size. + * @max: the address that the last allocated memory page shall not + * exceed + * @align: minimum alignment of the base of the allocation + * + * Allocate pages as EFI_LOADER_DATA. The allocated pages are aligned according + * to @align, which should be >= EFI_ALLOC_ALIGN. The last allocated page will + * not exceed the address given by @max. 
+ * + * Return: status code + */ +efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, + unsigned long max, unsigned long align) +{ + efi_physical_addr_t alloc_addr; + efi_status_t status; + int slack; + + if (align < EFI_ALLOC_ALIGN) + align = EFI_ALLOC_ALIGN; + + alloc_addr = ALIGN_DOWN(max + 1, align) - 1; + size = round_up(size, EFI_ALLOC_ALIGN); + slack = align / EFI_PAGE_SIZE - 1; + + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_MAX_ADDRESS, + EFI_LOADER_DATA, size / EFI_PAGE_SIZE + slack, + &alloc_addr); + if (status != EFI_SUCCESS) + return status; + + *addr = ALIGN((unsigned long)alloc_addr, align); + + if (slack > 0) { + int l = (alloc_addr % align) / EFI_PAGE_SIZE; + + if (l) { + efi_bs_call(free_pages, alloc_addr, slack - l + 1); + slack = l - 1; + } + if (slack) + efi_bs_call(free_pages, *addr + size, slack); + } + return EFI_SUCCESS; +} diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 7826553af2ba..40243f524556 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -18,7 +18,7 @@ efi_status_t check_platform_features(void) /* LPAE kernels need compatible hardware */ block = cpuid_feature_extract(CPUID_EXT_MMFR0, 0); if (block < 5) { - pr_efi_err("This LPAE kernel is not supported by your CPU\n"); + efi_err("This LPAE kernel is not supported by your CPU\n"); return EFI_UNSUPPORTED; } return EFI_SUCCESS; @@ -120,7 +120,7 @@ static efi_status_t reserve_kernel_base(unsigned long dram_base, */ status = efi_get_memory_map(&map); if (status != EFI_SUCCESS) { - pr_efi_err("reserve_kernel_base(): Unable to retrieve memory map.\n"); + efi_err("reserve_kernel_base(): Unable to retrieve memory map.\n"); return status; } @@ -162,7 +162,7 @@ static efi_status_t reserve_kernel_base(unsigned long dram_base, (end - start) / EFI_PAGE_SIZE, &start); if (status != EFI_SUCCESS) { - pr_efi_err("reserve_kernel_base(): alloc failed.\n"); + efi_err("reserve_kernel_base(): alloc failed.\n"); goto out; } break; @@ -199,14 +199,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long kernel_base; efi_status_t status; - /* - * Verify that the DRAM base address is compatible with the ARM - * boot protocol, which determines the base of DRAM by masking - * off the low 27 bits of the address at which the zImage is - * loaded. These assumptions are made by the decompressor, - * before any memory map is available. - */ - kernel_base = round_up(dram_base, SZ_128M); + /* use a 16 MiB aligned base for the decompressed kernel */ + kernel_base = round_up(dram_base, SZ_16M) + TEXT_OFFSET; /* * Note that some platforms (notably, the Raspberry Pi 2) put @@ -215,41 +209,14 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, * base of the kernel image is only partially used at the moment. * (Up to 5 pages are used for the swapper page tables) */ - kernel_base += TEXT_OFFSET - 5 * PAGE_SIZE; - - status = reserve_kernel_base(kernel_base, reserve_addr, reserve_size); - if (status != EFI_SUCCESS) { - pr_efi_err("Unable to allocate memory for uncompressed kernel.\n"); - return status; - } - - /* - * Relocate the zImage, so that it appears in the lowest 128 MB - * memory window. 
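The trimming arithmetic in efi_allocate_pages_aligned() above is easy to get wrong, so here is a host-side sketch of it that compiles and runs as-is. The addresses, the 2 MiB alignment, and the 4 KiB EFI page size are assumptions made for the example, not values taken from this patch:

#include <stdio.h>

#define EFI_PAGE_SIZE   4096UL
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long align = 0x200000;          /* assumed: 2 MiB, e.g. arm64 min_kimg_align */
        unsigned long size = 16 * EFI_PAGE_SIZE; /* page-rounded request */
        unsigned long slack = align / EFI_PAGE_SIZE - 1;
        /* pretend the firmware placed the over-sized allocation here */
        unsigned long alloc_addr = 0x40003000;
        unsigned long addr = ALIGN_UP(alloc_addr, align);
        unsigned long l = (alloc_addr % align) / EFI_PAGE_SIZE;

        printf("asked for %lu pages, got %lu at %#lx, aligned base %#lx\n",
               size / EFI_PAGE_SIZE, size / EFI_PAGE_SIZE + slack,
               alloc_addr, addr);
        if (l) {
                /* pages in front of the aligned base go back to the firmware */
                printf("free %lu page(s) at %#lx\n", slack - l + 1, alloc_addr);
                slack = l - 1;
        }
        if (slack)
                /* whatever is left comes off the tail of the allocation */
                printf("free %lu page(s) at %#lx\n", slack, addr + size);
        return 0;
}

With a firmware allocation at 0x40003000, the sketch hands back 509 pages in front of the 2 MiB boundary and 2 pages off the tail, which is exactly the slack accounting the function above performs.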
- */ - *image_addr = (unsigned long)image->image_base; - *image_size = image->image_size; - status = efi_relocate_kernel(image_addr, *image_size, *image_size, - kernel_base + MAX_UNCOMP_KERNEL_SIZE, 0, 0); + status = reserve_kernel_base(kernel_base - 5 * PAGE_SIZE, reserve_addr, + reserve_size); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to relocate kernel.\n"); - efi_free(*reserve_size, *reserve_addr); - *reserve_size = 0; + efi_err("Unable to allocate memory for uncompressed kernel.\n"); return status; } - /* - * Check to see if we were able to allocate memory low enough - * in memory. The kernel determines the base of DRAM from the - * address at which the zImage is loaded. - */ - if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) { - pr_efi_err("Failed to relocate kernel, no low memory available.\n"); - efi_free(*reserve_size, *reserve_addr); - *reserve_size = 0; - efi_free(*image_size, *image_addr); - *image_size = 0; - return EFI_LOAD_ERROR; - } + *image_addr = kernel_base; + *image_size = 0; return EFI_SUCCESS; } diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index fc9f8ab533a7..7f6a57dec513 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -26,14 +26,23 @@ efi_status_t check_platform_features(void) tg = (read_cpuid(ID_AA64MMFR0_EL1) >> ID_AA64MMFR0_TGRAN_SHIFT) & 0xf; if (tg != ID_AA64MMFR0_TGRAN_SUPPORTED) { if (IS_ENABLED(CONFIG_ARM64_64K_PAGES)) - pr_efi_err("This 64 KB granular kernel is not supported by your CPU\n"); + efi_err("This 64 KB granular kernel is not supported by your CPU\n"); else - pr_efi_err("This 16 KB granular kernel is not supported by your CPU\n"); + efi_err("This 16 KB granular kernel is not supported by your CPU\n"); return EFI_UNSUPPORTED; } return EFI_SUCCESS; } +/* + * Relocatable kernels can fix up the misalignment with respect to + * MIN_KIMG_ALIGN, so they only require a minimum alignment of EFI_KIMG_ALIGN + * (which accounts for the alignment of statically allocated objects such as + * the swapper stack.) + */ +static const u64 min_kimg_align = IS_ENABLED(CONFIG_RELOCATABLE) ? EFI_KIMG_ALIGN + : MIN_KIMG_ALIGN; + efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, @@ -43,106 +52,63 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, { efi_status_t status; unsigned long kernel_size, kernel_memsize = 0; - unsigned long preferred_offset; - u64 phys_seed = 0; + u32 phys_seed = 0; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - if (!nokaslr()) { + if (!efi_nokaslr) { status = efi_get_random_bytes(sizeof(phys_seed), (u8 *)&phys_seed); if (status == EFI_NOT_FOUND) { - pr_efi("EFI_RNG_PROTOCOL unavailable, no randomness supplied\n"); + efi_info("EFI_RNG_PROTOCOL unavailable, no randomness supplied\n"); } else if (status != EFI_SUCCESS) { - pr_efi_err("efi_get_random_bytes() failed\n"); + efi_err("efi_get_random_bytes() failed\n"); return status; } } else { - pr_efi("KASLR disabled on kernel command line\n"); + efi_info("KASLR disabled on kernel command line\n"); } } - /* - * The preferred offset of the kernel Image is TEXT_OFFSET bytes beyond - * a 2 MB aligned base, which itself may be lower than dram_base, as - * long as the resulting offset equals or exceeds it. 
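The new arm32 placement above boils down to a few lines of address arithmetic. A runnable host sketch, using a hypothetical DRAM base and the usual arm32 TEXT_OFFSET of 0x8000 (both assumptions, not values from this patch):

#include <stdio.h>

#define SZ_16M          0x01000000UL
#define PAGE_SIZE       0x1000UL
#define ROUND_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long dram_base = 0x80300000;   /* hypothetical, not 16 MiB aligned */
        unsigned long text_offset = 0x8000;     /* the usual arm32 TEXT_OFFSET */

        /* 16 MiB aligned base plus TEXT_OFFSET, as in handle_kernel_image() */
        unsigned long kernel_base = ROUND_UP(dram_base, SZ_16M) + text_offset;

        /* the reservation starts 5 pages lower, for the swapper page tables */
        printf("kernel at %#lx, reserve from %#lx\n",
               kernel_base, kernel_base - 5 * PAGE_SIZE);
        return 0;
}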
- */ - preferred_offset = round_down(dram_base, MIN_KIMG_ALIGN) + TEXT_OFFSET; - if (preferred_offset < dram_base) - preferred_offset += MIN_KIMG_ALIGN; + if (image->image_base != _text) + efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n"); kernel_size = _edata - _text; kernel_memsize = kernel_size + (_end - _edata); + *reserve_size = kernel_memsize + TEXT_OFFSET % min_kimg_align; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* - * Produce a displacement in the interval [0, MIN_KIMG_ALIGN) - * that doesn't violate this kernel's de-facto alignment - * constraints. - */ - u32 mask = (MIN_KIMG_ALIGN - 1) & ~(EFI_KIMG_ALIGN - 1); - u32 offset = (phys_seed >> 32) & mask; - - /* - * With CONFIG_RANDOMIZE_TEXT_OFFSET=y, TEXT_OFFSET may not - * be a multiple of EFI_KIMG_ALIGN, and we must ensure that - * we preserve the misalignment of 'offset' relative to - * EFI_KIMG_ALIGN so that statically allocated objects whose - * alignment exceeds PAGE_SIZE appear correctly aligned in - * memory. - */ - offset |= TEXT_OFFSET % EFI_KIMG_ALIGN; - - /* * If KASLR is enabled, and we have some randomness available, * locate the kernel at a randomized offset in physical memory. */ - *reserve_size = kernel_memsize + offset; - status = efi_random_alloc(*reserve_size, - MIN_KIMG_ALIGN, reserve_addr, - (u32)phys_seed); - - *image_addr = *reserve_addr + offset; + status = efi_random_alloc(*reserve_size, min_kimg_align, + reserve_addr, phys_seed); } else { - /* - * Else, try a straight allocation at the preferred offset. - * This will work around the issue where, if dram_base == 0x0, - * efi_low_alloc() refuses to allocate at 0x0 (to prevent the - * address of the allocation to be mistaken for a FAIL return - * value or a NULL pointer). It will also ensure that, on - * platforms where the [dram_base, dram_base + TEXT_OFFSET) - * interval is partially occupied by the firmware (like on APM - * Mustang), we can still place the kernel at the address - * 'dram_base + TEXT_OFFSET'. - */ - *image_addr = (unsigned long)_text; - if (*image_addr == preferred_offset) - return EFI_SUCCESS; - - *image_addr = *reserve_addr = preferred_offset; - *reserve_size = round_up(kernel_memsize, EFI_ALLOC_ALIGN); - - status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, - *reserve_size / EFI_PAGE_SIZE, - (efi_physical_addr_t *)reserve_addr); + status = EFI_OUT_OF_RESOURCES; } if (status != EFI_SUCCESS) { - *reserve_size = kernel_memsize + TEXT_OFFSET; - status = efi_low_alloc(*reserve_size, - MIN_KIMG_ALIGN, reserve_addr); + if (IS_ALIGNED((u64)_text - TEXT_OFFSET, min_kimg_align)) { + /* + * Just execute from wherever we were loaded by the + * UEFI PE/COFF loader if the alignment is suitable. 
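The in-place check above is the interesting part of the new arm64 flow: relocation is skipped entirely when the firmware's load address already satisfies min_kimg_align. A host-side sketch with made-up load addresses (the 2 MiB MIN_KIMG_ALIGN and the TEXT_OFFSET of 0x80000 are assumptions for the example):

#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

int main(void)
{
        unsigned long long min_kimg_align = 0x200000; /* 2 MiB MIN_KIMG_ALIGN */
        unsigned long long text_offset = 0x80000;     /* assumed TEXT_OFFSET */
        /* hypothetical addresses of _text as placed by the PE/COFF loader */
        unsigned long long candidates[] = { 0x40280000, 0x40283000 };
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long long text = candidates[i];

                if (IS_ALIGNED(text - text_offset, min_kimg_align))
                        printf("_text=%#llx: run in place, reserve nothing\n", text);
                else
                        printf("_text=%#llx: relocate, keeping offset %#llx into the aligned block\n",
                               text, text_offset % min_kimg_align);
        }
        return 0;
}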
+ */ + *image_addr = (u64)_text; + *reserve_size = 0; + return EFI_SUCCESS; + } + + status = efi_allocate_pages_aligned(*reserve_size, reserve_addr, + ULONG_MAX, min_kimg_align); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to relocate kernel\n"); + efi_err("Failed to relocate kernel\n"); *reserve_size = 0; return status; } - *image_addr = *reserve_addr + TEXT_OFFSET; } - if (image->image_base != _text) - pr_efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n"); - + *image_addr = *reserve_addr + TEXT_OFFSET % min_kimg_align; memcpy((void *)*image_addr, _text, kernel_size); return EFI_SUCCESS; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 9f34c7242939..89f075275300 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -7,60 +7,151 @@ * Copyright 2011 Intel Corporation; author Matt Fleming */ +#include <stdarg.h> + +#include <linux/ctype.h> #include <linux/efi.h> +#include <linux/kernel.h> +#include <linux/printk.h> /* For CONSOLE_LOGLEVEL_* */ #include <asm/efi.h> +#include <asm/setup.h> #include "efistub.h" -static bool __efistub_global efi_nochunk; -static bool __efistub_global efi_nokaslr; -static bool __efistub_global efi_noinitrd; -static bool __efistub_global efi_quiet; -static bool __efistub_global efi_novamap; -static bool __efistub_global efi_nosoftreserve; -static bool __efistub_global efi_disable_pci_dma = - IS_ENABLED(CONFIG_EFI_DISABLE_PCI_DMA); +bool efi_nochunk; +bool efi_nokaslr; +bool efi_noinitrd; +int efi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; +bool efi_novamap; -bool __pure nochunk(void) -{ - return efi_nochunk; -} -bool __pure nokaslr(void) -{ - return efi_nokaslr; -} -bool __pure noinitrd(void) +static bool efi_nosoftreserve; +static bool efi_disable_pci_dma = IS_ENABLED(CONFIG_EFI_DISABLE_PCI_DMA); + +bool __pure __efi_soft_reserve_enabled(void) { - return efi_noinitrd; + return !efi_nosoftreserve; } -bool __pure is_quiet(void) + +void efi_char16_puts(efi_char16_t *str) { - return efi_quiet; + efi_call_proto(efi_table_attr(efi_system_table, con_out), + output_string, str); } -bool __pure novamap(void) + +static +u32 utf8_to_utf32(const u8 **s8) { - return efi_novamap; + u32 c32; + u8 c0, cx; + size_t clen, i; + + c0 = cx = *(*s8)++; + /* + * The position of the most-significant 0 bit gives us the length of + * a multi-octet encoding. + */ + for (clen = 0; cx & 0x80; ++clen) + cx <<= 1; + /* + * If the 0 bit is in position 8, this is a valid single-octet + * encoding. If the 0 bit is in position 7 or positions 1-3, the + * encoding is invalid. + * In either case, we just return the first octet. + */ + if (clen < 2 || clen > 4) + return c0; + /* Get the bits from the first octet. */ + c32 = cx >> clen--; + for (i = 0; i < clen; ++i) { + /* Trailing octets must have 10 in most significant bits. */ + cx = (*s8)[i] ^ 0x80; + if (cx & 0xc0) + return c0; + c32 = (c32 << 6) | cx; + } + /* + * Check for validity: + * - The character must be in the Unicode range. + * - It must not be a surrogate. + * - It must be encoded using the correct number of octets. 
+ */ + if (c32 > 0x10ffff || + (c32 & 0xf800) == 0xd800 || + clen != (c32 >= 0x80) + (c32 >= 0x800) + (c32 >= 0x10000)) + return c0; + *s8 += clen; + return c32; } -bool __pure __efi_soft_reserve_enabled(void) + +void efi_puts(const char *str) { - return !efi_nosoftreserve; + efi_char16_t buf[128]; + size_t pos = 0, lim = ARRAY_SIZE(buf); + const u8 *s8 = (const u8 *)str; + u32 c32; + + while (*s8) { + if (*s8 == '\n') + buf[pos++] = L'\r'; + c32 = utf8_to_utf32(&s8); + if (c32 < 0x10000) { + /* Characters in plane 0 use a single word. */ + buf[pos++] = c32; + } else { + /* + * Characters in other planes encode into a surrogate + * pair. + */ + buf[pos++] = (0xd800 - (0x10000 >> 10)) + (c32 >> 10); + buf[pos++] = 0xdc00 + (c32 & 0x3ff); + } + if (*s8 == '\0' || pos >= lim - 2) { + buf[pos] = L'\0'; + efi_char16_puts(buf); + pos = 0; + } + } } -void efi_printk(char *str) +int efi_printk(const char *fmt, ...) { - char *s8; + char printf_buf[256]; + va_list args; + int printed; + int loglevel = printk_get_level(fmt); + + switch (loglevel) { + case '0' ... '9': + loglevel -= '0'; + break; + default: + /* + * Use loglevel -1 for cases where we just want to print to + * the screen. + */ + loglevel = -1; + break; + } - for (s8 = str; *s8; s8++) { - efi_char16_t ch[2] = { 0 }; + if (loglevel >= efi_loglevel) + return 0; - ch[0] = *s8; - if (*s8 == '\n') { - efi_char16_t nl[2] = { '\r', 0 }; - efi_char16_printk(nl); - } + if (loglevel >= 0) + efi_puts("EFI stub: "); + + fmt = printk_skip_level(fmt); + + va_start(args, fmt); + printed = vsnprintf(printf_buf, sizeof(printf_buf), fmt, args); + va_end(args); - efi_char16_printk(ch); + efi_puts(printf_buf); + if (printed >= sizeof(printf_buf)) { + efi_puts("[Message truncated]\n"); + return -1; } + + return printed; } /* @@ -91,7 +182,7 @@ efi_status_t efi_parse_options(char const *cmdline) if (!strcmp(param, "nokaslr")) { efi_nokaslr = true; } else if (!strcmp(param, "quiet")) { - efi_quiet = true; + efi_loglevel = CONSOLE_LOGLEVEL_QUIET; } else if (!strcmp(param, "noinitrd")) { efi_noinitrd = true; } else if (!strcmp(param, "efi") && val) { @@ -105,6 +196,11 @@ efi_status_t efi_parse_options(char const *cmdline) efi_disable_pci_dma = true; if (parse_option_str(val, "no_disable_early_pci_dma")) efi_disable_pci_dma = false; + if (parse_option_str(val, "debug")) + efi_loglevel = CONSOLE_LOGLEVEL_DEBUG; + } else if (!strcmp(param, "video") && + val && strstarts(val, "efifb:")) { + efi_parse_option_graphics(val + strlen("efifb:")); } } efi_bs_call(free_pool, buf); @@ -112,97 +208,79 @@ efi_status_t efi_parse_options(char const *cmdline) } /* - * Get the number of UTF-8 bytes corresponding to an UTF-16 character. - * This overestimates for surrogates, but that is okay. - */ -static int efi_utf8_bytes(u16 c) -{ - return 1 + (c >= 0x80) + (c >= 0x800); -} - -/* - * Convert an UTF-16 string, not necessarily null terminated, to UTF-8. 
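Because utf8_to_utf32() above feeds console output, its failure mode matters: any malformed sequence falls back to the raw first octet rather than being dropped. The function is self-contained enough to exercise on the host; the harness below re-creates the same algorithm with host-friendly types and decodes a valid 2-byte sequence, an overlong encoding, and a 4-byte emoji (the test inputs are my own):

#include <stdio.h>

typedef unsigned char u8;
typedef unsigned int u32;

/* same algorithm as the stub's utf8_to_utf32(), host-friendly types */
static u32 utf8_to_utf32(const u8 **s8)
{
        u32 c32;
        u8 c0, cx;
        unsigned int clen, i;

        c0 = cx = *(*s8)++;
        for (clen = 0; cx & 0x80; ++clen)       /* count the leading 1-bits */
                cx <<= 1;
        if (clen < 2 || clen > 4)               /* lone octet or invalid prefix */
                return c0;
        c32 = cx >> clen--;                     /* payload bits of the lead octet */
        for (i = 0; i < clen; ++i) {
                cx = (*s8)[i] ^ 0x80;           /* trailing octets must be 10xxxxxx */
                if (cx & 0xc0)
                        return c0;
                c32 = (c32 << 6) | cx;
        }
        if (c32 > 0x10ffff ||                   /* beyond Unicode */
            (c32 & 0xf800) == 0xd800 ||         /* surrogate code point */
            clen != (c32 >= 0x80) + (c32 >= 0x800) + (c32 >= 0x10000))
                return c0;                      /* overlong encoding */
        *s8 += clen;
        return c32;
}

int main(void)
{
        /* "é", an overlong NUL (0xC0 0x80), then U+1F600 as 4 octets */
        const u8 in[] = { 0xc3, 0xa9, 0xc0, 0x80, 0xf0, 0x9f, 0x98, 0x80, 0 };
        const u8 *p = in;

        while (*p)
                printf("U+%04X\n", utf8_to_utf32(&p));
        return 0;
}

The overlong NUL (0xC0 0x80) comes back as the two code points U+00C0 and U+0080 instead of U+0000, which is the conservative behaviour the comments above describe.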
- */ -static u8 *efi_utf16_to_utf8(u8 *dst, const u16 *src, int n) -{ - unsigned int c; - - while (n--) { - c = *src++; - if (n && c >= 0xd800 && c <= 0xdbff && - *src >= 0xdc00 && *src <= 0xdfff) { - c = 0x10000 + ((c & 0x3ff) << 10) + (*src & 0x3ff); - src++; - n--; - } - if (c >= 0xd800 && c <= 0xdfff) - c = 0xfffd; /* Unmatched surrogate */ - if (c < 0x80) { - *dst++ = c; - continue; - } - if (c < 0x800) { - *dst++ = 0xc0 + (c >> 6); - goto t1; - } - if (c < 0x10000) { - *dst++ = 0xe0 + (c >> 12); - goto t2; - } - *dst++ = 0xf0 + (c >> 18); - *dst++ = 0x80 + ((c >> 12) & 0x3f); - t2: - *dst++ = 0x80 + ((c >> 6) & 0x3f); - t1: - *dst++ = 0x80 + (c & 0x3f); - } - - return dst; -} - -/* * Convert the Unicode UEFI command line to ASCII to pass to the kernel. * Size of memory allocated is returned in *cmd_line_len. * Returns NULL on error. */ -char *efi_convert_cmdline(efi_loaded_image_t *image, - int *cmd_line_len, unsigned long max_addr) +char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len) { const u16 *s2; - u8 *s1 = NULL; unsigned long cmdline_addr = 0; - int load_options_chars = efi_table_attr(image, load_options_size) / 2; + int options_chars = efi_table_attr(image, load_options_size) / 2; const u16 *options = efi_table_attr(image, load_options); - int options_bytes = 0; /* UTF-8 bytes */ - int options_chars = 0; /* UTF-16 chars */ + int options_bytes = 0, safe_options_bytes = 0; /* UTF-8 bytes */ + bool in_quote = false; efi_status_t status; - u16 zero = 0; if (options) { s2 = options; - while (*s2 && *s2 != '\n' - && options_chars < load_options_chars) { - options_bytes += efi_utf8_bytes(*s2++); - options_chars++; + while (options_bytes < COMMAND_LINE_SIZE && options_chars--) { + u16 c = *s2++; + + if (c < 0x80) { + if (c == L'\0' || c == L'\n') + break; + if (c == L'"') + in_quote = !in_quote; + else if (!in_quote && isspace((char)c)) + safe_options_bytes = options_bytes; + + options_bytes++; + continue; + } + + /* + * Get the number of UTF-8 bytes corresponding to a + * UTF-16 character. + * The first part handles everything in the BMP. + */ + options_bytes += 2 + (c >= 0x800); + /* + * Add one more byte for valid surrogate pairs. Invalid + * surrogates will be replaced with 0xfffd and take up + * only 3 bytes. + */ + if ((c & 0xfc00) == 0xd800) { + /* + * If the very last word is a high surrogate, + * we must ignore it since we can't access the + * low surrogate.
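The byte accounting in the loop above is subtle: BMP characters cost one to three UTF-8 bytes, a valid surrogate pair costs four, and a lone surrogate is priced at the three bytes its U+FFFD replacement will occupy. A host-side sketch of just that accounting (the sample string and typedefs are assumptions; quote tracking and the COMMAND_LINE_SIZE fallback to the last unquoted space are left out):

#include <stdio.h>

typedef unsigned short u16;

/* UTF-8 cost of one UTF-16 unit; surrogate pairs are handled by the caller */
static int utf8_bytes(u16 c)
{
        return c < 0x80 ? 1 : 2 + (c >= 0x800);
}

int main(void)
{
        /* L"café " followed by U+1F600 as the pair 0xD83D 0xDE00 */
        const u16 opts[] = { 'c', 'a', 'f', 0x00e9, ' ', 0xd83d, 0xde00, 0 };
        int bytes = 0;
        int i;

        for (i = 0; opts[i]; i++) {
                u16 c = opts[i];

                if ((c & 0xfc00) == 0xd800 && (opts[i + 1] & 0xfc00) == 0xdc00) {
                        bytes += 4;     /* a valid pair becomes one 4-byte sequence */
                        i++;
                        continue;
                }
                bytes += utf8_bytes(c); /* lone surrogates cost 3, like U+FFFD */
        }
        printf("%d UTF-8 bytes plus 1 for the NUL\n", bytes);   /* 3+2+1+4 = 10 */
        return 0;
}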
+ */ + if (!options_chars) { + options_bytes -= 3; + } else if ((*s2 & 0xfc00) == 0xdc00) { + options_bytes++; + options_chars--; + s2++; + } + } + } + if (options_bytes >= COMMAND_LINE_SIZE) { + options_bytes = safe_options_bytes; + efi_err("Command line is too long: truncated to %d bytes\n", + options_bytes); } - } - - if (!options_chars) { - /* No command line options, so return empty string*/ - options = &zero; } options_bytes++; /* NUL termination */ - status = efi_allocate_pages(options_bytes, &cmdline_addr, max_addr); + status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, options_bytes, + (void **)&cmdline_addr); if (status != EFI_SUCCESS) return NULL; - s1 = (u8 *)cmdline_addr; - s2 = (const u16 *)options; - - s1 = efi_utf16_to_utf8(s1, s2, options_chars); - *s1 = '\0'; + snprintf((char *)cmdline_addr, options_bytes, "%.*ls", + options_bytes - 1, options); *cmd_line_len = options_bytes; return (char *)cmdline_addr; @@ -285,8 +363,8 @@ fail: void *get_efi_config_table(efi_guid_t guid) { - unsigned long tables = efi_table_attr(efi_system_table(), tables); - int nr_tables = efi_table_attr(efi_system_table(), nr_tables); + unsigned long tables = efi_table_attr(efi_system_table, tables); + int nr_tables = efi_table_attr(efi_system_table, nr_tables); int i; for (i = 0; i < nr_tables; i++) { @@ -301,12 +379,6 @@ void *get_efi_config_table(efi_guid_t guid) return NULL; } -void efi_char16_printk(efi_char16_t *str) -{ - efi_call_proto(efi_table_attr(efi_system_table(), con_out), - output_string, str); -} - /* * The LINUX_EFI_INITRD_MEDIA_GUID vendor media device path below provides a way * for the firmware or bootloader to expose the initrd data directly to the stub @@ -348,6 +420,7 @@ static const struct { * %EFI_OUT_OF_RESOURCES if memory allocation failed * %EFI_LOAD_ERROR in all other cases */ +static efi_status_t efi_load_initrd_dev_path(unsigned long *load_addr, unsigned long *load_size, unsigned long max) @@ -360,9 +433,6 @@ efi_status_t efi_load_initrd_dev_path(unsigned long *load_addr, efi_handle_t handle; efi_status_t status; - if (!load_addr || !load_size) - return EFI_INVALID_PARAMETER; - dp = (efi_device_path_protocol_t *)&initrd_dev_path; status = efi_bs_call(locate_device_path, &lf2_proto_guid, &dp, &handle); if (status != EFI_SUCCESS) @@ -392,3 +462,80 @@ efi_status_t efi_load_initrd_dev_path(unsigned long *load_addr, *load_size = initrd_size; return EFI_SUCCESS; } + +static +efi_status_t efi_load_initrd_cmdline(efi_loaded_image_t *image, + unsigned long *load_addr, + unsigned long *load_size, + unsigned long soft_limit, + unsigned long hard_limit) +{ + if (!IS_ENABLED(CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER) || + (IS_ENABLED(CONFIG_X86) && (!efi_is_native() || image == NULL))) { + *load_addr = *load_size = 0; + return EFI_SUCCESS; + } + + return handle_cmdline_files(image, L"initrd=", sizeof(L"initrd=") - 2, + soft_limit, hard_limit, + load_addr, load_size); +} + +efi_status_t efi_load_initrd(efi_loaded_image_t *image, + unsigned long *load_addr, + unsigned long *load_size, + unsigned long soft_limit, + unsigned long hard_limit) +{ + efi_status_t status; + + if (!load_addr || !load_size) + return EFI_INVALID_PARAMETER; + + status = efi_load_initrd_dev_path(load_addr, load_size, hard_limit); + if (status == EFI_SUCCESS) { + efi_info("Loaded initrd from LINUX_EFI_INITRD_MEDIA_GUID device path\n"); + } else if (status == EFI_NOT_FOUND) { + status = efi_load_initrd_cmdline(image, load_addr, load_size, + soft_limit, hard_limit); + if (status == EFI_SUCCESS && *load_size > 
0) + efi_info("Loaded initrd from command line option\n"); + } + + return status; +} + +efi_status_t efi_wait_for_key(unsigned long usec, efi_input_key_t *key) +{ + efi_event_t events[2], timer; + unsigned long index; + efi_simple_text_input_protocol_t *con_in; + efi_status_t status; + + con_in = efi_table_attr(efi_system_table, con_in); + if (!con_in) + return EFI_UNSUPPORTED; + efi_set_event_at(events, 0, efi_table_attr(con_in, wait_for_key)); + + status = efi_bs_call(create_event, EFI_EVT_TIMER, 0, NULL, NULL, &timer); + if (status != EFI_SUCCESS) + return status; + + status = efi_bs_call(set_timer, timer, EfiTimerRelative, + EFI_100NSEC_PER_USEC * usec); + if (status != EFI_SUCCESS) + return status; + efi_set_event_at(events, 1, timer); + + status = efi_bs_call(wait_for_event, 2, events, &index); + if (status == EFI_SUCCESS) { + if (index == 0) + status = efi_call_proto(con_in, read_keystroke, key); + else + status = EFI_TIMEOUT; + } + + efi_bs_call(close_event, timer); + + return status; +} diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 48161b1dd098..e97370bdfdb0 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -36,14 +36,9 @@ #endif static u64 virtmap_base = EFI_RT_VIRTUAL_BASE; -static bool __efistub_global flat_va_mapping; +static bool flat_va_mapping; -static efi_system_table_t *__efistub_global sys_table; - -__pure efi_system_table_t *efi_system_table(void) -{ - return sys_table; -} +const efi_system_table_t *efi_system_table; static struct screen_info *setup_graphics(void) { @@ -69,7 +64,7 @@ static struct screen_info *setup_graphics(void) return si; } -void install_memreserve_table(void) +static void install_memreserve_table(void) { struct linux_efi_memreserve *rsv; efi_guid_t memreserve_table_guid = LINUX_EFI_MEMRESERVE_TABLE_GUID; @@ -78,7 +73,7 @@ void install_memreserve_table(void) status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, sizeof(*rsv), (void **)&rsv); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to allocate memreserve entry!\n"); + efi_err("Failed to allocate memreserve entry!\n"); return; } @@ -89,7 +84,7 @@ void install_memreserve_table(void) status = efi_bs_call(install_configuration_table, &memreserve_table_guid, rsv); if (status != EFI_SUCCESS) - pr_efi_err("Failed to install memreserve config table!\n"); + efi_err("Failed to install memreserve config table!\n"); } static unsigned long get_dram_base(void) @@ -149,7 +144,8 @@ asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, * for both architectures, with the arch-specific code provided in the * handle_kernel_image() function. */ -efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) +efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, + efi_system_table_t *sys_table_arg) { efi_loaded_image_t *image; efi_status_t status; @@ -171,10 +167,10 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) efi_properties_table_t *prop_tbl; unsigned long max_addr; - sys_table = sys_table_arg; + efi_system_table = sys_table_arg; /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { + if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { status = EFI_INVALID_PARAMETER; goto fail; } @@ -188,16 +184,16 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) * information about the running image, such as size and the command * line.
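Every pr_efi()/pr_efi_err() conversion in this file now funnels through the level filter that efi_printk() gained earlier in this series. A compilable host model of that filter, using the kernel's KERN_* encoding of SOH plus a digit and its usual console loglevel values (7 default, 4 quiet, 10 debug); the function and variable names are mine:

#include <stdio.h>

#define KERN_SOH        "\001"
#define KERN_ERR        KERN_SOH "3"
#define KERN_INFO       KERN_SOH "6"
#define KERN_DEBUG      KERN_SOH "7"

static int efi_loglevel = 7;            /* CONSOLE_LOGLEVEL_DEFAULT */

static void model_efi_printk(const char *msg)
{
        int level = -1;                 /* unmarked messages always print */

        if (msg[0] == '\001' && msg[1] >= '0' && msg[1] <= '9') {
                level = msg[1] - '0';
                msg += 2;               /* printk_skip_level() equivalent */
        }
        if (level >= efi_loglevel)      /* same test as efi_printk() */
                return;
        if (level >= 0)
                printf("EFI stub: ");
        printf("%s", msg);
}

int main(void)
{
        model_efi_printk(KERN_INFO "Booting Linux Kernel...\n");
        model_efi_printk(KERN_DEBUG "dropped at the default loglevel\n");
        efi_loglevel = 10;              /* efi=debug: CONSOLE_LOGLEVEL_DEBUG */
        model_efi_printk(KERN_DEBUG "now visible\n");
        efi_loglevel = 4;               /* quiet: CONSOLE_LOGLEVEL_QUIET */
        model_efi_printk(KERN_ERR "ERROR: errors survive quiet\n");
        model_efi_printk(KERN_INFO "but info does not\n");
        return 0;
}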
*/ - status = sys_table->boottime->handle_protocol(handle, + status = efi_system_table->boottime->handle_protocol(handle, &loaded_image_proto, (void *)&image); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to get loaded image protocol\n"); + efi_err("Failed to get loaded image protocol\n"); goto fail; } dram_base = get_dram_base(); if (dram_base == EFI_ERROR) { - pr_efi_err("Failed to find DRAM base\n"); + efi_err("Failed to find DRAM base\n"); status = EFI_LOAD_ERROR; goto fail; } @@ -207,22 +203,32 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) * protocol. We are going to copy the command line into the * device tree, so this can be allocated anywhere. */ - cmdline_ptr = efi_convert_cmdline(image, &cmdline_size, ULONG_MAX); + cmdline_ptr = efi_convert_cmdline(image, &cmdline_size); if (!cmdline_ptr) { - pr_efi_err("getting command line via LOADED_IMAGE_PROTOCOL\n"); + efi_err("getting command line via LOADED_IMAGE_PROTOCOL\n"); status = EFI_OUT_OF_RESOURCES; goto fail; } if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || IS_ENABLED(CONFIG_CMDLINE_FORCE) || - cmdline_size == 0) - efi_parse_options(CONFIG_CMDLINE); + cmdline_size == 0) { + status = efi_parse_options(CONFIG_CMDLINE); + if (status != EFI_SUCCESS) { + efi_err("Failed to parse options\n"); + goto fail_free_cmdline; + } + } - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0) - efi_parse_options(cmdline_ptr); + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0) { + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) { + efi_err("Failed to parse options\n"); + goto fail_free_cmdline; + } + } - pr_efi("Booting Linux Kernel...\n"); + efi_info("Booting Linux Kernel...\n"); si = setup_graphics(); @@ -231,8 +237,8 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) &reserve_size, dram_base, image); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to relocate kernel\n"); - goto fail_free_cmdline; + efi_err("Failed to relocate kernel\n"); + goto fail_free_screeninfo; } efi_retrieve_tpm2_eventlog(); @@ -250,42 +256,34 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) if (!IS_ENABLED(CONFIG_EFI_ARMSTUB_DTB_LOADER) || secure_boot != efi_secureboot_mode_disabled) { if (strstr(cmdline_ptr, "dtb=")) - pr_efi("Ignoring DTB from command line.\n"); + efi_err("Ignoring DTB from command line.\n"); } else { status = efi_load_dtb(image, &fdt_addr, &fdt_size); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to load device tree!\n"); + efi_err("Failed to load device tree!\n"); goto fail_free_image; } } if (fdt_addr) { - pr_efi("Using DTB from command line\n"); + efi_info("Using DTB from command line\n"); } else { /* Look for a device tree configuration table entry. 
*/ fdt_addr = (uintptr_t)get_fdt(&fdt_size); if (fdt_addr) - pr_efi("Using DTB from configuration table\n"); + efi_info("Using DTB from configuration table\n"); } if (!fdt_addr) - pr_efi("Generating empty DTB\n"); + efi_info("Generating empty DTB\n"); - if (!noinitrd()) { + if (!efi_noinitrd) { max_addr = efi_get_max_initrd_addr(dram_base, image_addr); - status = efi_load_initrd_dev_path(&initrd_addr, &initrd_size, - max_addr); - if (status == EFI_SUCCESS) { - pr_efi("Loaded initrd from LINUX_EFI_INITRD_MEDIA_GUID device path\n"); - } else if (status == EFI_NOT_FOUND) { - status = efi_load_initrd(image, &initrd_addr, &initrd_size, - ULONG_MAX, max_addr); - if (status == EFI_SUCCESS && initrd_size > 0) - pr_efi("Loaded initrd from command line option\n"); - } + status = efi_load_initrd(image, &initrd_addr, &initrd_size, + ULONG_MAX, max_addr); if (status != EFI_SUCCESS) - pr_efi_err("Failed to load initrd!\n"); + efi_err("Failed to load initrd!\n"); } efi_random_get_seed(); @@ -303,7 +301,7 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA); /* hibernation expects the runtime regions to stay in the same place */ - if (!IS_ENABLED(CONFIG_HIBERNATION) && !nokaslr() && !flat_va_mapping) { + if (!IS_ENABLED(CONFIG_HIBERNATION) && !efi_nokaslr && !flat_va_mapping) { /* * Randomize the base of the UEFI runtime services region. * Preserve the 2 MB alignment of the region by taking a @@ -335,7 +333,7 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) /* not reached */ fail_free_initrd: - pr_efi_err("Failed to update FDT and exit boot services\n"); + efi_err("Failed to update FDT and exit boot services\n"); efi_free(initrd_size, initrd_addr); efi_free(fdt_size, fdt_addr); @@ -343,9 +341,10 @@ fail_free_initrd: fail_free_image: efi_free(image_size, image_addr); efi_free(reserve_size, reserve_addr); -fail_free_cmdline: +fail_free_screeninfo: free_screen_info(si); - efi_free(cmdline_size, (unsigned long)cmdline_ptr); +fail_free_cmdline: + efi_bs_call(free_pool, cmdline_ptr); fail: return status; } @@ -376,7 +375,7 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, size = in->num_pages * EFI_PAGE_SIZE; in->virt_addr = in->phys_addr; - if (novamap()) { + if (efi_novamap) { continue; } diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 62943992f02f..bcd8c0a785f0 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -3,6 +3,13 @@ #ifndef _DRIVERS_FIRMWARE_EFI_EFISTUB_H #define _DRIVERS_FIRMWARE_EFI_EFISTUB_H +#include <linux/compiler.h> +#include <linux/efi.h> +#include <linux/kernel.h> +#include <linux/kern_levels.h> +#include <linux/types.h> +#include <asm/efi.h> + /* error code which can't be mistaken for valid address */ #define EFI_ERROR (~0UL) @@ -25,25 +32,33 @@ #define EFI_ALLOC_ALIGN EFI_PAGE_SIZE #endif -#if defined(CONFIG_ARM) || defined(CONFIG_X86) -#define __efistub_global __section(.data) -#else -#define __efistub_global -#endif +extern bool efi_nochunk; +extern bool efi_nokaslr; +extern bool efi_noinitrd; +extern int efi_loglevel; +extern bool efi_novamap; -extern bool __pure nochunk(void); -extern bool __pure nokaslr(void); -extern bool __pure noinitrd(void); -extern bool __pure is_quiet(void); -extern bool __pure novamap(void); +extern const efi_system_table_t *efi_system_table; -extern __pure efi_system_table_t *efi_system_table(void); 
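On native builds the accessors below are plain macro expansions over the global efi_system_table; only mixed-mode x86 defines ARCH_HAS_EFISTUB_WRAPPERS and overrides them with thunking versions. A hand-expanded sketch (the function and its use of a loaded-image pointer are hypothetical, not part of this patch):

/*
 * Hand expansion under the native definitions that follow; the function
 * and the loaded-image pointer are hypothetical, not part of this patch.
 */
static unsigned long example_native_expansion(efi_loaded_image_t *image)
{
        /* efi_table_attr(image, load_options_size) expands to: */
        unsigned long size = image->load_options_size;

        /* efi_bs_call(free_pool, image->load_options) expands to: */
        efi_system_table->boottime->free_pool(image->load_options);

        return size;
}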
+efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, + efi_system_table_t *sys_table_arg); -#define pr_efi(msg) do { \ - if (!is_quiet()) efi_printk("EFI stub: "msg); \ -} while (0) +#ifndef ARCH_HAS_EFISTUB_WRAPPERS -#define pr_efi_err(msg) efi_printk("EFI stub: ERROR: "msg) +#define efi_is_native() (true) +#define efi_bs_call(func, ...) efi_system_table->boottime->func(__VA_ARGS__) +#define efi_rt_call(func, ...) efi_system_table->runtime->func(__VA_ARGS__) +#define efi_table_attr(inst, attr) (inst->attr) +#define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) + +#endif + +#define efi_info(fmt, ...) \ + efi_printk(KERN_INFO fmt, ##__VA_ARGS__) +#define efi_err(fmt, ...) \ + efi_printk(KERN_ERR "ERROR: " fmt, ##__VA_ARGS__) +#define efi_debug(fmt, ...) \ + efi_printk(KERN_DEBUG "DEBUG: " fmt, ##__VA_ARGS__) /* Helper macros for the usual case of using simple C variables: */ #ifndef fdt_setprop_inplace_var @@ -77,6 +92,13 @@ extern __pure efi_system_table_t *efi_system_table(void); ((handle = efi_get_handle_at((array), i)) || true); \ i++) +static inline +void efi_set_u64_split(u64 data, u32 *lo, u32 *hi) +{ + *lo = lower_32_bits(data); + *hi = upper_32_bits(data); +} + /* * Allocation types for calls to boottime->allocate_pages. */ @@ -93,6 +115,16 @@ extern __pure efi_system_table_t *efi_system_table(void); #define EFI_LOCATE_BY_PROTOCOL 2 /* + * boottime->stall takes the time period in microseconds + */ +#define EFI_USEC_PER_SEC 1000000 + +/* + * boottime->set_timer takes the time in 100ns units + */ +#define EFI_100NSEC_PER_USEC ((u64)10) + +/* * An efi_boot_memmap is used by efi_get_memory_map() to return the * EFI memory map in a dynamically allocated buffer. * @@ -116,6 +148,39 @@ struct efi_boot_memmap { typedef struct efi_generic_dev_path efi_device_path_protocol_t; +typedef void *efi_event_t; +/* Note that notifications won't work in mixed mode */ +typedef void (__efiapi *efi_event_notify_t)(efi_event_t, void *); + +#define EFI_EVT_TIMER 0x80000000U +#define EFI_EVT_RUNTIME 0x40000000U +#define EFI_EVT_NOTIFY_WAIT 0x00000100U +#define EFI_EVT_NOTIFY_SIGNAL 0x00000200U + +/* + * boottime->wait_for_event takes an array of events as input. + * Provide a helper to set it up correctly for mixed mode. 
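The helper defined below exists because in mixed mode a 64-bit kernel talks to 32-bit firmware, so an array of native 8-byte efi_event_t pointers would be read by the firmware as the wrong slots. A host demonstration of the packing (names and addresses invented for the example):

#include <stdio.h>

typedef void *efi_event_t;

/* pack an event natively, or into the u32 slots 32-bit firmware reads */
static void set_event_at(efi_event_t *events, unsigned int idx,
                         efi_event_t ev, int native)
{
        if (native)
                events[idx] = ev;
        else
                ((unsigned int *)events)[idx] = (unsigned int)(unsigned long)ev;
}

int main(void)
{
        efi_event_t events[2];

        set_event_at(events, 0, (efi_event_t)0x1000, 0);
        set_event_at(events, 1, (efi_event_t)0x2000, 0);
        /* 32-bit firmware sees two consecutive u32 slots at the array base */
        printf("%#x %#x\n", ((unsigned int *)events)[0],
               ((unsigned int *)events)[1]);
        return 0;
}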
+ */ +static inline +void efi_set_event_at(efi_event_t *events, size_t idx, efi_event_t event) +{ + if (efi_is_native()) + events[idx] = event; + else + ((u32 *)events)[idx] = (u32)(unsigned long)event; +} + +#define EFI_TPL_APPLICATION 4 +#define EFI_TPL_CALLBACK 8 +#define EFI_TPL_NOTIFY 16 +#define EFI_TPL_HIGH_LEVEL 31 + +typedef enum { + EfiTimerCancel, + EfiTimerPeriodic, + EfiTimerRelative +} EFI_TIMER_DELAY; + /* * EFI Boot Services table */ @@ -134,11 +199,16 @@ union efi_boot_services { efi_status_t (__efiapi *allocate_pool)(int, unsigned long, void **); efi_status_t (__efiapi *free_pool)(void *); - void *create_event; - void *set_timer; - void *wait_for_event; + efi_status_t (__efiapi *create_event)(u32, unsigned long, + efi_event_notify_t, void *, + efi_event_t *); + efi_status_t (__efiapi *set_timer)(efi_event_t, + EFI_TIMER_DELAY, u64); + efi_status_t (__efiapi *wait_for_event)(unsigned long, + efi_event_t *, + unsigned long *); void *signal_event; - void *close_event; + efi_status_t (__efiapi *close_event)(efi_event_t); void *check_event; void *install_protocol_interface; void *reinstall_protocol_interface; @@ -165,7 +235,7 @@ union efi_boot_services { efi_status_t (__efiapi *exit_boot_services)(efi_handle_t, unsigned long); void *get_next_monotonic_count; - void *stall; + efi_status_t (__efiapi *stall)(unsigned long); void *set_watchdog_timer; void *connect_controller; efi_status_t (__efiapi *disconnect_controller)(efi_handle_t, @@ -250,6 +320,27 @@ union efi_uga_draw_protocol { } mixed_mode; }; +typedef struct { + u16 scan_code; + efi_char16_t unicode_char; +} efi_input_key_t; + +union efi_simple_text_input_protocol { + struct { + void *reset; + efi_status_t (__efiapi *read_keystroke)(efi_simple_text_input_protocol_t *, + efi_input_key_t *); + efi_event_t wait_for_key; + }; + struct { + u32 reset; + u32 read_keystroke; + u32 wait_for_key; + } mixed_mode; +}; + +efi_status_t efi_wait_for_key(unsigned long usec, efi_input_key_t *key); + union efi_simple_text_output_protocol { struct { void *reset; @@ -311,8 +402,10 @@ typedef union efi_graphics_output_protocol efi_graphics_output_protocol_t; union efi_graphics_output_protocol { struct { - void *query_mode; - void *set_mode; + efi_status_t (__efiapi *query_mode)(efi_graphics_output_protocol_t *, + u32, unsigned long *, + efi_graphics_output_mode_info_t **); + efi_status_t (__efiapi *set_mode) (efi_graphics_output_protocol_t *, u32); void *blt; efi_graphics_output_protocol_mode_t *mode; }; @@ -600,8 +693,6 @@ efi_status_t efi_exit_boot_services(void *handle, void *priv, efi_exit_boot_map_processing priv_func); -void efi_char16_printk(efi_char16_t *); - efi_status_t allocate_new_fdt_and_exit_boot(void *handle, unsigned long *new_fdt_addr, unsigned long max_addr, @@ -625,33 +716,24 @@ efi_status_t check_platform_features(void); void *get_efi_config_table(efi_guid_t guid); -void efi_printk(char *str); +/* NOTE: These functions do not print a trailing newline after the string */ +void efi_char16_puts(efi_char16_t *); +void efi_puts(const char *str); + +__printf(1, 2) int efi_printk(char const *fmt, ...); void efi_free(unsigned long size, unsigned long addr); -char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len, - unsigned long max_addr); +char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len); efi_status_t efi_get_memory_map(struct efi_boot_memmap *map); -efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, - unsigned long *addr, unsigned long min); - -static inline 
-efi_status_t efi_low_alloc(unsigned long size, unsigned long align, - unsigned long *addr) -{ - /* - * Don't allocate at 0x0. It will confuse code that - * checks pointers against NULL. Skip the first 8 - * bytes so we start at a nice even number. - */ - return efi_low_alloc_above(size, align, addr, 0x8); -} - efi_status_t efi_allocate_pages(unsigned long size, unsigned long *addr, unsigned long max); +efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, + unsigned long max, unsigned long align); + efi_status_t efi_relocate_kernel(unsigned long *image_addr, unsigned long image_size, unsigned long alloc_size, @@ -661,12 +743,27 @@ efi_status_t efi_relocate_kernel(unsigned long *image_addr, efi_status_t efi_parse_options(char const *cmdline); +void efi_parse_option_graphics(char *option); + efi_status_t efi_setup_gop(struct screen_info *si, efi_guid_t *proto, unsigned long size); -efi_status_t efi_load_dtb(efi_loaded_image_t *image, - unsigned long *load_addr, - unsigned long *load_size); +efi_status_t handle_cmdline_files(efi_loaded_image_t *image, + const efi_char16_t *optstr, + int optstr_size, + unsigned long soft_limit, + unsigned long hard_limit, + unsigned long *load_addr, + unsigned long *load_size); + + +static inline efi_status_t efi_load_dtb(efi_loaded_image_t *image, + unsigned long *load_addr, + unsigned long *load_size) +{ + return handle_cmdline_files(image, L"dtb=", sizeof(L"dtb=") - 2, + ULONG_MAX, ULONG_MAX, load_addr, load_size); +} efi_status_t efi_load_initrd(efi_loaded_image_t *image, unsigned long *load_addr, @@ -674,8 +771,4 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image, unsigned long soft_limit, unsigned long hard_limit); -efi_status_t efi_load_initrd_dev_path(unsigned long *load_addr, - unsigned long *load_size, - unsigned long max); - #endif diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 46cffac7a5f1..11ecf3c4640e 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -39,7 +39,7 @@ static efi_status_t update_fdt(void *orig_fdt, unsigned long orig_fdt_size, /* Do some checks on provided FDT, if it exists: */ if (orig_fdt) { if (fdt_check_header(orig_fdt)) { - pr_efi_err("Device Tree header not valid!\n"); + efi_err("Device Tree header not valid!\n"); return EFI_LOAD_ERROR; } /* @@ -47,7 +47,7 @@ static efi_status_t update_fdt(void *orig_fdt, unsigned long orig_fdt_size, * configuration table: */ if (orig_fdt_size && fdt_totalsize(orig_fdt) > orig_fdt_size) { - pr_efi_err("Truncated device tree! foo!\n"); + efi_err("Truncated device tree! foo!\n"); return EFI_LOAD_ERROR; } } @@ -110,7 +110,7 @@ static efi_status_t update_fdt(void *orig_fdt, unsigned long orig_fdt_size, /* Add FDT entries for EFI runtime services in chosen node. 
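update_fdt() here manipulates the /chosen node with ordinary libfdt calls, the same library the stub pulls in via efi-deps-y. A minimal host-side sketch of the pattern, compiled against libfdt (link with -lfdt); the buffer size and property value are placeholders:

#include <stdio.h>
#include <stdint.h>
#include <libfdt.h>

int main(void)
{
        char buf[1024];
        uint64_t systab = 0x87654321;   /* placeholder, not a real address */
        int node;

        if (fdt_create_empty_tree(buf, sizeof(buf)))
                return 1;
        node = fdt_add_subnode(buf, 0, "chosen");
        if (node < 0)
                return 1;
        /* fdt_setprop_u64() swaps to big-endian itself, matching the
           cpu_to_fdt64() + fdt_setprop_var() pair used in update_fdt() */
        if (fdt_setprop_u64(buf, node, "linux,uefi-system-table", systab))
                return 1;
        printf("totalsize now %u bytes\n", (unsigned)fdt_totalsize(buf));
        return 0;
}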
*/ node = fdt_subnode_offset(fdt, 0, "chosen"); - fdt_val64 = cpu_to_fdt64((u64)(unsigned long)efi_system_table()); + fdt_val64 = cpu_to_fdt64((u64)(unsigned long)efi_system_table); status = fdt_setprop_var(fdt, node, "linux,uefi-system-table", fdt_val64); if (status) @@ -270,16 +270,16 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, */ status = efi_get_memory_map(&map); if (status != EFI_SUCCESS) { - pr_efi_err("Unable to retrieve UEFI memory map.\n"); + efi_err("Unable to retrieve UEFI memory map.\n"); return status; } - pr_efi("Exiting boot services and installing virtual address map...\n"); + efi_info("Exiting boot services and installing virtual address map...\n"); map.map = &memory_map; status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, max_addr); if (status != EFI_SUCCESS) { - pr_efi_err("Unable to allocate memory for new device tree.\n"); + efi_err("Unable to allocate memory for new device tree.\n"); goto fail; } @@ -296,7 +296,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, initrd_addr, initrd_size); if (status != EFI_SUCCESS) { - pr_efi_err("Unable to construct new device tree.\n"); + efi_err("Unable to construct new device tree.\n"); goto fail_free_new_fdt; } @@ -310,11 +310,11 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, if (status == EFI_SUCCESS) { efi_set_virtual_address_map_t *svam; - if (novamap()) + if (efi_novamap) return EFI_SUCCESS; /* Install the new virtual address map */ - svam = efi_system_table()->runtime->set_virtual_address_map; + svam = efi_system_table->runtime->set_virtual_address_map; status = svam(runtime_entry_count * desc_size, desc_size, desc_ver, runtime_map); @@ -342,13 +342,13 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, return EFI_SUCCESS; } - pr_efi_err("Exit boot services failed.\n"); + efi_err("Exit boot services failed.\n"); fail_free_new_fdt: efi_free(MAX_FDT_SIZE, *new_fdt_addr); fail: - efi_system_table()->boottime->free_pool(runtime_map); + efi_system_table->boottime->free_pool(runtime_map); return EFI_LOAD_ERROR; } @@ -363,7 +363,7 @@ void *get_fdt(unsigned long *fdt_size) return NULL; if (fdt_check_header(fdt) != 0) { - pr_efi_err("Invalid header detected on UEFI supplied FDT, ignoring ...\n"); + efi_err("Invalid header detected on UEFI supplied FDT, ignoring ...\n"); return NULL; } *fdt_size = fdt_totalsize(fdt); diff --git a/drivers/firmware/efi/libstub/file.c b/drivers/firmware/efi/libstub/file.c index ea66b1f16a79..2005e33b33d5 100644 --- a/drivers/firmware/efi/libstub/file.c +++ b/drivers/firmware/efi/libstub/file.c @@ -46,16 +46,14 @@ static efi_status_t efi_open_file(efi_file_protocol_t *volume, status = volume->open(volume, &fh, fi->filename, EFI_FILE_MODE_READ, 0); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to open file: "); - efi_char16_printk(fi->filename); - efi_printk("\n"); + efi_err("Failed to open file: %ls\n", fi->filename); return status; } info_sz = sizeof(struct finfo); status = fh->get_info(fh, &info_guid, &info_sz, fi); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to get file info\n"); + efi_err("Failed to get file info\n"); fh->close(fh); return status; } @@ -75,13 +73,13 @@ static efi_status_t efi_open_volume(efi_loaded_image_t *image, status = efi_bs_call(handle_protocol, image->device_handle, &fs_proto, (void **)&io); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to handle fs_proto\n"); + efi_err("Failed to handle fs_proto\n"); return status; } status = io->open_volume(io, fh); if (status != EFI_SUCCESS) - pr_efi_err("Failed to open 
volume\n"); + efi_err("Failed to open volume\n"); return status; } @@ -121,13 +119,13 @@ static int find_file_option(const efi_char16_t *cmdline, int cmdline_len, * We only support loading a file from the same filesystem as * the kernel image. */ -static efi_status_t handle_cmdline_files(efi_loaded_image_t *image, - const efi_char16_t *optstr, - int optstr_size, - unsigned long soft_limit, - unsigned long hard_limit, - unsigned long *load_addr, - unsigned long *load_size) +efi_status_t handle_cmdline_files(efi_loaded_image_t *image, + const efi_char16_t *optstr, + int optstr_size, + unsigned long soft_limit, + unsigned long hard_limit, + unsigned long *load_addr, + unsigned long *load_size) { const efi_char16_t *cmdline = image->load_options; int cmdline_len = image->load_options_size / 2; @@ -142,7 +140,7 @@ static efi_status_t handle_cmdline_files(efi_loaded_image_t *image, if (!load_addr || !load_size) return EFI_INVALID_PARAMETER; - if (IS_ENABLED(CONFIG_X86) && !nochunk()) + if (IS_ENABLED(CONFIG_X86) && !efi_nochunk) efi_chunk_size = EFI_READ_CHUNK_SIZE; alloc_addr = alloc_size = 0; @@ -191,7 +189,7 @@ static efi_status_t handle_cmdline_files(efi_loaded_image_t *image, &alloc_addr, hard_limit); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to allocate memory for files\n"); + efi_err("Failed to allocate memory for files\n"); goto err_close_file; } @@ -215,7 +213,7 @@ static efi_status_t handle_cmdline_files(efi_loaded_image_t *image, status = file->read(file, &chunksize, addr); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to read file\n"); + efi_err("Failed to read file\n"); goto err_close_file; } addr += chunksize; @@ -239,21 +237,3 @@ err_close_volume: efi_free(alloc_size, alloc_addr); return status; } - -efi_status_t efi_load_dtb(efi_loaded_image_t *image, - unsigned long *load_addr, - unsigned long *load_size) -{ - return handle_cmdline_files(image, L"dtb=", sizeof(L"dtb=") - 2, - ULONG_MAX, ULONG_MAX, load_addr, load_size); -} - -efi_status_t efi_load_initrd(efi_loaded_image_t *image, - unsigned long *load_addr, - unsigned long *load_size, - unsigned long soft_limit, - unsigned long hard_limit) -{ - return handle_cmdline_files(image, L"initrd=", sizeof(L"initrd=") - 2, - soft_limit, hard_limit, load_addr, load_size); -} diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c index 55e6b3f286fe..ea5da307d542 100644 --- a/drivers/firmware/efi/libstub/gop.c +++ b/drivers/firmware/efi/libstub/gop.c @@ -5,169 +5,546 @@ * * ----------------------------------------------------------------------- */ +#include <linux/bitops.h> +#include <linux/ctype.h> #include <linux/efi.h> #include <linux/screen_info.h> +#include <linux/string.h> #include <asm/efi.h> #include <asm/setup.h> #include "efistub.h" -static void find_bits(unsigned long mask, u8 *pos, u8 *size) +enum efi_cmdline_option { + EFI_CMDLINE_NONE, + EFI_CMDLINE_MODE_NUM, + EFI_CMDLINE_RES, + EFI_CMDLINE_AUTO, + EFI_CMDLINE_LIST +}; + +static struct { + enum efi_cmdline_option option; + union { + u32 mode; + struct { + u32 width, height; + int format; + u8 depth; + } res; + }; +} cmdline = { .option = EFI_CMDLINE_NONE }; + +static bool parse_modenum(char *option, char **next) +{ + u32 m; + + if (!strstarts(option, "mode=")) + return false; + option += strlen("mode="); + m = simple_strtoull(option, &option, 0); + if (*option && *option++ != ',') + return false; + cmdline.option = EFI_CMDLINE_MODE_NUM; + cmdline.mode = m; + + *next = option; + return true; +} + +static bool parse_res(char *option, 
char **next) +{ + u32 w, h, d = 0; + int pf = -1; + + if (!isdigit(*option)) + return false; + w = simple_strtoull(option, &option, 10); + if (*option++ != 'x' || !isdigit(*option)) + return false; + h = simple_strtoull(option, &option, 10); + if (*option == '-') { + option++; + if (strstarts(option, "rgb")) { + option += strlen("rgb"); + pf = PIXEL_RGB_RESERVED_8BIT_PER_COLOR; + } else if (strstarts(option, "bgr")) { + option += strlen("bgr"); + pf = PIXEL_BGR_RESERVED_8BIT_PER_COLOR; + } else if (isdigit(*option)) + d = simple_strtoull(option, &option, 10); + else + return false; + } + if (*option && *option++ != ',') + return false; + cmdline.option = EFI_CMDLINE_RES; + cmdline.res.width = w; + cmdline.res.height = h; + cmdline.res.format = pf; + cmdline.res.depth = d; + + *next = option; + return true; +} + +static bool parse_auto(char *option, char **next) +{ + if (!strstarts(option, "auto")) + return false; + option += strlen("auto"); + if (*option && *option++ != ',') + return false; + cmdline.option = EFI_CMDLINE_AUTO; + + *next = option; + return true; +} + +static bool parse_list(char *option, char **next) { - u8 first, len; + if (!strstarts(option, "list")) + return false; + option += strlen("list"); + if (*option && *option++ != ',') + return false; + cmdline.option = EFI_CMDLINE_LIST; + + *next = option; + return true; +} + +void efi_parse_option_graphics(char *option) +{ + while (*option) { + if (parse_modenum(option, &option)) + continue; + if (parse_res(option, &option)) + continue; + if (parse_auto(option, &option)) + continue; + if (parse_list(option, &option)) + continue; + + while (*option && *option++ != ',') + ; + } +} + +static u32 choose_mode_modenum(efi_graphics_output_protocol_t *gop) +{ + efi_status_t status; + + efi_graphics_output_protocol_mode_t *mode; + efi_graphics_output_mode_info_t *info; + unsigned long info_size; + + u32 max_mode, cur_mode; + int pf; + + mode = efi_table_attr(gop, mode); + + cur_mode = efi_table_attr(mode, mode); + if (cmdline.mode == cur_mode) + return cur_mode; + + max_mode = efi_table_attr(mode, max_mode); + if (cmdline.mode >= max_mode) { + efi_err("Requested mode is invalid\n"); + return cur_mode; + } + + status = efi_call_proto(gop, query_mode, cmdline.mode, + &info_size, &info); + if (status != EFI_SUCCESS) { + efi_err("Couldn't get mode information\n"); + return cur_mode; + } + + pf = info->pixel_format; + + efi_bs_call(free_pool, info); + + if (pf == PIXEL_BLT_ONLY || pf >= PIXEL_FORMAT_MAX) { + efi_err("Invalid PixelFormat\n"); + return cur_mode; + } + + return cmdline.mode; +} + +static u8 pixel_bpp(int pixel_format, efi_pixel_bitmask_t pixel_info) +{ + if (pixel_format == PIXEL_BIT_MASK) { + u32 mask = pixel_info.red_mask | pixel_info.green_mask | + pixel_info.blue_mask | pixel_info.reserved_mask; + if (!mask) + return 0; + return __fls(mask) - __ffs(mask) + 1; + } else + return 32; +} + +static u32 choose_mode_res(efi_graphics_output_protocol_t *gop) +{ + efi_status_t status; + + efi_graphics_output_protocol_mode_t *mode; + efi_graphics_output_mode_info_t *info; + unsigned long info_size; + + u32 max_mode, cur_mode; + int pf; + efi_pixel_bitmask_t pi; + u32 m, w, h; + + mode = efi_table_attr(gop, mode); + + cur_mode = efi_table_attr(mode, mode); + info = efi_table_attr(mode, info); + pf = info->pixel_format; + pi = info->pixel_information; + w = info->horizontal_resolution; + h = info->vertical_resolution; + + if (w == cmdline.res.width && h == cmdline.res.height && + (cmdline.res.format < 0 || cmdline.res.format == pf) && + 
(!cmdline.res.depth || cmdline.res.depth == pixel_bpp(pf, pi))) + return cur_mode; + + max_mode = efi_table_attr(mode, max_mode); + + for (m = 0; m < max_mode; m++) { + if (m == cur_mode) + continue; + + status = efi_call_proto(gop, query_mode, m, + &info_size, &info); + if (status != EFI_SUCCESS) + continue; + + pf = info->pixel_format; + pi = info->pixel_information; + w = info->horizontal_resolution; + h = info->vertical_resolution; + + efi_bs_call(free_pool, info); + + if (pf == PIXEL_BLT_ONLY || pf >= PIXEL_FORMAT_MAX) + continue; + if (w == cmdline.res.width && h == cmdline.res.height && + (cmdline.res.format < 0 || cmdline.res.format == pf) && + (!cmdline.res.depth || cmdline.res.depth == pixel_bpp(pf, pi))) + return m; + } + + efi_err("Couldn't find requested mode\n"); + + return cur_mode; +} + +static u32 choose_mode_auto(efi_graphics_output_protocol_t *gop) +{ + efi_status_t status; + + efi_graphics_output_protocol_mode_t *mode; + efi_graphics_output_mode_info_t *info; + unsigned long info_size; + + u32 max_mode, cur_mode, best_mode, area; + u8 depth; + int pf; + efi_pixel_bitmask_t pi; + u32 m, w, h, a; + u8 d; + + mode = efi_table_attr(gop, mode); + + cur_mode = efi_table_attr(mode, mode); + max_mode = efi_table_attr(mode, max_mode); - first = 0; - len = 0; + info = efi_table_attr(mode, info); - if (mask) { - while (!(mask & 0x1)) { - mask = mask >> 1; - first++; + pf = info->pixel_format; + pi = info->pixel_information; + w = info->horizontal_resolution; + h = info->vertical_resolution; + + best_mode = cur_mode; + area = w * h; + depth = pixel_bpp(pf, pi); + + for (m = 0; m < max_mode; m++) { + if (m == cur_mode) + continue; + + status = efi_call_proto(gop, query_mode, m, + &info_size, &info); + if (status != EFI_SUCCESS) + continue; + + pf = info->pixel_format; + pi = info->pixel_information; + w = info->horizontal_resolution; + h = info->vertical_resolution; + + efi_bs_call(free_pool, info); + + if (pf == PIXEL_BLT_ONLY || pf >= PIXEL_FORMAT_MAX) + continue; + a = w * h; + if (a < area) + continue; + d = pixel_bpp(pf, pi); + if (a > area || d > depth) { + best_mode = m; + area = a; + depth = d; } + } + + return best_mode; +} + +static u32 choose_mode_list(efi_graphics_output_protocol_t *gop) +{ + efi_status_t status; + + efi_graphics_output_protocol_mode_t *mode; + efi_graphics_output_mode_info_t *info; + unsigned long info_size; + + u32 max_mode, cur_mode; + int pf; + efi_pixel_bitmask_t pi; + u32 m, w, h; + u8 d; + const char *dstr; + bool valid; + efi_input_key_t key; - while (mask & 0x1) { - mask = mask >> 1; - len++; + mode = efi_table_attr(gop, mode); + + cur_mode = efi_table_attr(mode, mode); + max_mode = efi_table_attr(mode, max_mode); + + efi_printk("Available graphics modes are 0-%u\n", max_mode-1); + efi_puts(" * = current mode\n" + " - = unusable mode\n"); + for (m = 0; m < max_mode; m++) { + status = efi_call_proto(gop, query_mode, m, + &info_size, &info); + if (status != EFI_SUCCESS) + continue; + + pf = info->pixel_format; + pi = info->pixel_information; + w = info->horizontal_resolution; + h = info->vertical_resolution; + + efi_bs_call(free_pool, info); + + valid = !(pf == PIXEL_BLT_ONLY || pf >= PIXEL_FORMAT_MAX); + d = 0; + switch (pf) { + case PIXEL_RGB_RESERVED_8BIT_PER_COLOR: + dstr = "rgb"; + break; + case PIXEL_BGR_RESERVED_8BIT_PER_COLOR: + dstr = "bgr"; + break; + case PIXEL_BIT_MASK: + dstr = ""; + d = pixel_bpp(pf, pi); + break; + case PIXEL_BLT_ONLY: + dstr = "blt"; + break; + default: + dstr = "xxx"; + break; } + + efi_printk("Mode %3u %c%c: 
Resolution %ux%u-%s%.0hhu\n", + m, + m == cur_mode ? '*' : ' ', + !valid ? '-' : ' ', + w, h, dstr, d); + } + + efi_puts("\nPress any key to continue (or wait 10 seconds)\n"); + status = efi_wait_for_key(10 * EFI_USEC_PER_SEC, &key); + if (status != EFI_SUCCESS && status != EFI_TIMEOUT) { + efi_err("Unable to read key, continuing in 10 seconds\n"); + efi_bs_call(stall, 10 * EFI_USEC_PER_SEC); + } + + return cur_mode; +} + +static void set_mode(efi_graphics_output_protocol_t *gop) +{ + efi_graphics_output_protocol_mode_t *mode; + u32 cur_mode, new_mode; + + switch (cmdline.option) { + case EFI_CMDLINE_MODE_NUM: + new_mode = choose_mode_modenum(gop); + break; + case EFI_CMDLINE_RES: + new_mode = choose_mode_res(gop); + break; + case EFI_CMDLINE_AUTO: + new_mode = choose_mode_auto(gop); + break; + case EFI_CMDLINE_LIST: + new_mode = choose_mode_list(gop); + break; + default: + return; + } + + mode = efi_table_attr(gop, mode); + cur_mode = efi_table_attr(mode, mode); + + if (new_mode == cur_mode) + return; + + if (efi_call_proto(gop, set_mode, new_mode) != EFI_SUCCESS) + efi_err("Failed to set requested mode\n"); +} + +static void find_bits(u32 mask, u8 *pos, u8 *size) +{ + if (!mask) { + *pos = *size = 0; + return; } - *pos = first; - *size = len; + /* UEFI spec guarantees that the set bits are contiguous */ + *pos = __ffs(mask); + *size = __fls(mask) - *pos + 1; } static void setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, efi_pixel_bitmask_t pixel_info, int pixel_format) { - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); + if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, + &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, + &si->green_pos, &si->green_size); + find_bits(pixel_info.blue_mask, + &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, + &si->rsvd_pos, &si->rsvd_size); si->lfb_depth = si->red_size + si->green_size + si->blue_size + si->rsvd_size; si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->red_pos = 0; + si->blue_pos = 16; + } else /* PIXEL_BGR_RESERVED_8BIT_PER_COLOR */ { + si->blue_pos = 0; + si->red_pos = 16; + } + + si->green_pos = 8; + si->rsvd_pos = 24; + si->red_size = si->green_size = + si->blue_size = si->rsvd_size = 8; + + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; } } -static efi_status_t setup_gop(struct 
screen_info *si, efi_guid_t *proto, - unsigned long size, void **handles) +static efi_graphics_output_protocol_t * +find_gop(efi_guid_t *proto, unsigned long size, void **handles) { - efi_graphics_output_protocol_t *gop, *first_gop; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - efi_physical_addr_t fb_base; - efi_pixel_bitmask_t pixel_info; - int pixel_format; - efi_status_t status; + efi_graphics_output_protocol_t *first_gop; efi_handle_t h; int i; first_gop = NULL; - gop = NULL; for_each_efi_handle(h, handles, size, i) { + efi_status_t status; + + efi_graphics_output_protocol_t *gop; efi_graphics_output_protocol_mode_t *mode; - efi_graphics_output_mode_info_t *info = NULL; + efi_graphics_output_mode_info_t *info; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; void *dummy = NULL; - efi_physical_addr_t current_fb_base; status = efi_bs_call(handle_protocol, h, proto, (void **)&gop); if (status != EFI_SUCCESS) continue; + mode = efi_table_attr(gop, mode); + info = efi_table_attr(mode, info); + if (info->pixel_format == PIXEL_BLT_ONLY || + info->pixel_format >= PIXEL_FORMAT_MAX) + continue; + + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + * + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ status = efi_bs_call(handle_protocol, h, &conout_proto, &dummy); if (status == EFI_SUCCESS) - conout_found = true; + return gop; - mode = efi_table_attr(gop, mode); - info = efi_table_attr(mode, info); - current_fb_base = efi_table_attr(mode, frame_buffer_base); - - if ((!first_gop || conout_found) && - info->pixel_format != PIXEL_BLT_ONLY) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ + if (!first_gop) first_gop = gop; - if (conout_found) - break; - } } + return first_gop; +} + +static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, + unsigned long size, void **handles) +{ + efi_graphics_output_protocol_t *gop; + efi_graphics_output_protocol_mode_t *mode; + efi_graphics_output_mode_info_t *info; + + gop = find_gop(proto, size, handles); + /* Did we find any GOPs? 
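(find_gop() returns NULL if none of the handles carried a usable GOP)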
*/ - if (!first_gop) + if (!gop) return EFI_NOT_FOUND; + /* Change mode if requested */ + set_mode(gop); + /* EFI framebuffer */ + mode = efi_table_attr(gop, mode); + info = efi_table_attr(mode, info); + si->orig_video_isVGA = VIDEO_TYPE_EFI; - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; + si->lfb_width = info->horizontal_resolution; + si->lfb_height = info->vertical_resolution; - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { + efi_set_u64_split(efi_table_attr(mode, frame_buffer_base), + &si->lfb_base, &si->ext_lfb_base); + if (si->ext_lfb_base) si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } si->pages = 1; - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + setup_pixel_info(si, info->pixels_per_scan_line, + info->pixel_information, info->pixel_format); si->lfb_size = si->lfb_linelength * si->lfb_height; diff --git a/drivers/firmware/efi/libstub/mem.c b/drivers/firmware/efi/libstub/mem.c index 09f4fa01914e..feef8d4be113 100644 --- a/drivers/firmware/efi/libstub/mem.c +++ b/drivers/firmware/efi/libstub/mem.c @@ -91,120 +91,23 @@ fail: efi_status_t efi_allocate_pages(unsigned long size, unsigned long *addr, unsigned long max) { - efi_physical_addr_t alloc_addr = ALIGN_DOWN(max + 1, EFI_ALLOC_ALIGN) - 1; - int slack = EFI_ALLOC_ALIGN / EFI_PAGE_SIZE - 1; + efi_physical_addr_t alloc_addr; efi_status_t status; - size = round_up(size, EFI_ALLOC_ALIGN); + if (EFI_ALLOC_ALIGN > EFI_PAGE_SIZE) + return efi_allocate_pages_aligned(size, addr, max, + EFI_ALLOC_ALIGN); + + alloc_addr = ALIGN_DOWN(max + 1, EFI_ALLOC_ALIGN) - 1; status = efi_bs_call(allocate_pages, EFI_ALLOCATE_MAX_ADDRESS, - EFI_LOADER_DATA, size / EFI_PAGE_SIZE + slack, + EFI_LOADER_DATA, DIV_ROUND_UP(size, EFI_PAGE_SIZE), &alloc_addr); if (status != EFI_SUCCESS) return status; - *addr = ALIGN((unsigned long)alloc_addr, EFI_ALLOC_ALIGN); - - if (slack > 0) { - int l = (alloc_addr % EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; - - if (l) { - efi_bs_call(free_pages, alloc_addr, slack - l + 1); - slack = l - 1; - } - if (slack) - efi_bs_call(free_pages, *addr + size, slack); - } + *addr = alloc_addr; return EFI_SUCCESS; } -/** - * efi_low_alloc_above() - allocate pages at or above given address - * @size: size of the memory area to allocate - * @align: minimum alignment of the allocated memory area. It should - * a power of two. - * @addr: on exit the address of the allocated memory - * @min: minimum address to used for the memory allocation - * - * Allocate at the lowest possible address that is not below @min as - * EFI_LOADER_DATA. The allocated pages are aligned according to @align but at - * least EFI_ALLOC_ALIGN. The first allocated page will not below the address - * given by @min. - * - * Return: status code - */ -efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, - unsigned long *addr, unsigned long min) -{ - unsigned long map_size, desc_size, buff_size; - efi_memory_desc_t *map; - efi_status_t status; - unsigned long nr_pages; - int i; - struct efi_boot_memmap boot_map; - - boot_map.map = &map; - boot_map.map_size = &map_size; - boot_map.desc_size = &desc_size; - boot_map.desc_ver = NULL; - boot_map.key_ptr = NULL; - boot_map.buff_size = &buff_size; - - status = efi_get_memory_map(&boot_map); - if (status != EFI_SUCCESS) - goto fail; - - /* - * Enforce minimum alignment that EFI or Linux requires when - * requesting a specific address.
We are doing page-based (or - * larger) allocations, and both the address and size must meet - * alignment constraints. - */ - if (align < EFI_ALLOC_ALIGN) - align = EFI_ALLOC_ALIGN; - - size = round_up(size, EFI_ALLOC_ALIGN); - nr_pages = size / EFI_PAGE_SIZE; - for (i = 0; i < map_size / desc_size; i++) { - efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map; - u64 start, end; - - desc = efi_early_memdesc_ptr(m, desc_size, i); - - if (desc->type != EFI_CONVENTIONAL_MEMORY) - continue; - - if (efi_soft_reserve_enabled() && - (desc->attribute & EFI_MEMORY_SP)) - continue; - - if (desc->num_pages < nr_pages) - continue; - - start = desc->phys_addr; - end = start + desc->num_pages * EFI_PAGE_SIZE; - - if (start < min) - start = min; - - start = round_up(start, align); - if ((start + size) > end) - continue; - - status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, &start); - if (status == EFI_SUCCESS) { - *addr = start; - break; - } - } - - if (i == map_size / desc_size) - status = EFI_NOT_FOUND; - - efi_bs_call(free_pool, map); -fail: - return status; -} /** * efi_free() - free memory pages @@ -227,81 +130,3 @@ void efi_free(unsigned long size, unsigned long addr) nr_pages = round_up(size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; efi_bs_call(free_pages, addr, nr_pages); } - -/** - * efi_relocate_kernel() - copy memory area - * @image_addr: pointer to address of memory area to copy - * @image_size: size of memory area to copy - * @alloc_size: minimum size of memory to allocate, must be greater or - * equal to image_size - * @preferred_addr: preferred target address - * @alignment: minimum alignment of the allocated memory area. It - * should be a power of two. - * @min_addr: minimum target address - * - * Copy a memory area to a newly allocated memory area aligned according - * to @alignment but at least EFI_ALLOC_ALIGN. If the preferred address - * is not available, the allocated address will not be below @min_addr. - * On exit, @image_addr is updated to the target copy address that was used. - * - * This function is used to copy the Linux kernel verbatim. It does not apply - * any relocation changes. - * - * Return: status code - */ -efi_status_t efi_relocate_kernel(unsigned long *image_addr, - unsigned long image_size, - unsigned long alloc_size, - unsigned long preferred_addr, - unsigned long alignment, - unsigned long min_addr) -{ - unsigned long cur_image_addr; - unsigned long new_addr = 0; - efi_status_t status; - unsigned long nr_pages; - efi_physical_addr_t efi_addr = preferred_addr; - - if (!image_addr || !image_size || !alloc_size) - return EFI_INVALID_PARAMETER; - if (alloc_size < image_size) - return EFI_INVALID_PARAMETER; - - cur_image_addr = *image_addr; - - /* - * The EFI firmware loader could have placed the kernel image - * anywhere in memory, but the kernel has restrictions on the - * max physical address it can run at. Some architectures - * also have a prefered address, so first try to relocate - * to the preferred address. If that fails, allocate as low - * as possible while respecting the required alignment. - */ - nr_pages = round_up(alloc_size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; - status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, &efi_addr); - new_addr = efi_addr; - /* - * If preferred address allocation failed allocate as low as - * possible. 
- */ - if (status != EFI_SUCCESS) { - status = efi_low_alloc_above(alloc_size, alignment, &new_addr, - min_addr); - } - if (status != EFI_SUCCESS) { - pr_efi_err("Failed to allocate usable memory for kernel.\n"); - return status; - } - - /* - * We know source/dest won't overlap since both memory ranges - * have been allocated by UEFI, so we can safely use memcpy. - */ - memcpy((void *)new_addr, (void *)cur_image_addr, image_size); - - /* Return the new address of the relocated image. */ - *image_addr = new_addr; - - return status; -} diff --git a/drivers/firmware/efi/libstub/pci.c b/drivers/firmware/efi/libstub/pci.c index b025e59b94df..99fb25d2bcf5 100644 --- a/drivers/firmware/efi/libstub/pci.c +++ b/drivers/firmware/efi/libstub/pci.c @@ -28,21 +28,21 @@ void efi_pci_disable_bridge_busmaster(void) if (status != EFI_BUFFER_TOO_SMALL) { if (status != EFI_SUCCESS && status != EFI_NOT_FOUND) - pr_efi_err("Failed to locate PCI I/O handles'\n"); + efi_err("Failed to locate PCI I/O handles\n"); return; } status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, pci_handle_size, (void **)&pci_handle); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to allocate memory for 'pci_handle'\n"); + efi_err("Failed to allocate memory for 'pci_handle'\n"); return; } status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL, &pci_proto, NULL, &pci_handle_size, pci_handle); if (status != EFI_SUCCESS) { - pr_efi_err("Failed to locate PCI I/O handles'\n"); + efi_err("Failed to locate PCI I/O handles\n"); goto free_handle; } @@ -69,7 +69,7 @@ void efi_pci_disable_bridge_busmaster(void) * access to the framebuffer. Drivers for true PCIe graphics * controllers that are behind a PCIe root port do not use * DMA to implement the GOP framebuffer anyway [although they - * may use it in their implentation of Gop->Blt()], and so + * may use it in their implementation of Gop->Blt()], and so * disabling DMA in the PCI bridge should not interfere with * normal operation of the device.
*/ @@ -106,7 +106,7 @@ void efi_pci_disable_bridge_busmaster(void) status = efi_call_proto(pci, pci.write, EfiPciIoWidthUint16, PCI_COMMAND, 1, &command); if (status != EFI_SUCCESS) - pr_efi_err("Failed to disable PCI busmastering\n"); + efi_err("Failed to disable PCI busmastering\n"); } free_handle: diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 4578f59e160c..a408df474d83 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -74,6 +74,8 @@ efi_status_t efi_random_alloc(unsigned long size, if (align < EFI_ALLOC_ALIGN) align = EFI_ALLOC_ALIGN; + size = round_up(size, EFI_ALLOC_ALIGN); + /* count the suitable slots in each memory map entry */ for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { efi_memory_desc_t *md = (void *)memory_map + map_offset; @@ -85,7 +87,7 @@ efi_status_t efi_random_alloc(unsigned long size, } /* find a random number between 0 and total_slots */ - target_slot = (total_slots * (u16)random_seed) >> 16; + target_slot = (total_slots * (u64)(random_seed & U32_MAX)) >> 32; /* * target_slot is now a value in the range [0, total_slots), and so @@ -109,7 +111,7 @@ efi_status_t efi_random_alloc(unsigned long size, } target = round_up(md->phys_addr, align) + target_slot * align; - pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + pages = size / EFI_PAGE_SIZE; status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, pages, &target); diff --git a/drivers/firmware/efi/libstub/relocate.c b/drivers/firmware/efi/libstub/relocate.c new file mode 100644 index 000000000000..9b1aaf8b123f --- /dev/null +++ b/drivers/firmware/efi/libstub/relocate.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/efi.h> +#include <asm/efi.h> + +#include "efistub.h" + +/** + * efi_low_alloc_above() - allocate pages at or above given address + * @size: size of the memory area to allocate + * @align: minimum alignment of the allocated memory area. It should + * be a power of two. + * @addr: on exit the address of the allocated memory + * @min: minimum address to be used for the memory allocation + * + * Allocate at the lowest possible address that is not below @min as + * EFI_LOADER_DATA. The allocated pages are aligned according to @align but at + * least EFI_ALLOC_ALIGN. The first allocated page will not be below the address + * given by @min. + * + * Return: status code + */ +static efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, + unsigned long *addr, unsigned long min) +{ + unsigned long map_size, desc_size, buff_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + int i; + struct efi_boot_memmap boot_map; + + boot_map.map = &map; + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; + + status = efi_get_memory_map(&boot_map); + if (status != EFI_SUCCESS) + goto fail; + + /* + * Enforce minimum alignment that EFI or Linux requires when + * requesting a specific address. We are doing page-based (or + * larger) allocations, and both the address and size must meet + * alignment constraints.
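+ * (On some architectures, arm64 for example, EFI_ALLOC_ALIGN is larger + * than the fixed 4 KiB UEFI page size, which is why both the size and + * the start address are rounded up below.)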
+ */ + if (align < EFI_ALLOC_ALIGN) + align = EFI_ALLOC_ALIGN; + + size = round_up(size, EFI_ALLOC_ALIGN); + nr_pages = size / EFI_PAGE_SIZE; + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = efi_early_memdesc_ptr(m, desc_size, i); + + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * EFI_PAGE_SIZE; + + if (start < min) + start = min; + + start = round_up(start, align); + if ((start + size) > end) + continue; + + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, nr_pages, &start); + if (status == EFI_SUCCESS) { + *addr = start; + break; + } + } + + if (i == map_size / desc_size) + status = EFI_NOT_FOUND; + + efi_bs_call(free_pool, map); +fail: + return status; +} + +/** + * efi_relocate_kernel() - copy memory area + * @image_addr: pointer to address of memory area to copy + * @image_size: size of memory area to copy + * @alloc_size: minimum size of memory to allocate, must be greater than or + * equal to image_size + * @preferred_addr: preferred target address + * @alignment: minimum alignment of the allocated memory area. It + * should be a power of two. + * @min_addr: minimum target address + * + * Copy a memory area to a newly allocated memory area aligned according + * to @alignment but at least EFI_ALLOC_ALIGN. If the preferred address + * is not available, the allocated address will not be below @min_addr. + * On exit, @image_addr is updated to the target copy address that was used. + * + * This function is used to copy the Linux kernel verbatim. It does not apply + * any relocation changes. + * + * Return: status code + */ +efi_status_t efi_relocate_kernel(unsigned long *image_addr, + unsigned long image_size, + unsigned long alloc_size, + unsigned long preferred_addr, + unsigned long alignment, + unsigned long min_addr) +{ + unsigned long cur_image_addr; + unsigned long new_addr = 0; + efi_status_t status; + unsigned long nr_pages; + efi_physical_addr_t efi_addr = preferred_addr; + + if (!image_addr || !image_size || !alloc_size) + return EFI_INVALID_PARAMETER; + if (alloc_size < image_size) + return EFI_INVALID_PARAMETER; + + cur_image_addr = *image_addr; + + /* + * The EFI firmware loader could have placed the kernel image + * anywhere in memory, but the kernel has restrictions on the + * max physical address it can run at. Some architectures + * also have a preferred address, so first try to relocate + * to the preferred address. If that fails, allocate as low + * as possible while respecting the required alignment. + */ + nr_pages = round_up(alloc_size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, nr_pages, &efi_addr); + new_addr = efi_addr; + /* + * If the preferred address allocation failed, allocate as low as + * possible. + */ + if (status != EFI_SUCCESS) { + status = efi_low_alloc_above(alloc_size, alignment, &new_addr, + min_addr); + } + if (status != EFI_SUCCESS) { + efi_err("Failed to allocate usable memory for kernel.\n"); + return status; + } + + /* + * We know source/dest won't overlap since both memory ranges + * have been allocated by UEFI, so we can safely use memcpy. + */ + memcpy((void *)new_addr, (void *)cur_image_addr, image_size); + + /* Return the new address of the relocated image.
*/ + *image_addr = new_addr; + + return status; +} diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index a765378ad18c..5efc524b14be 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -67,10 +67,10 @@ enum efi_secureboot_mode efi_get_secureboot(void) return efi_secureboot_mode_disabled; secure_boot_enabled: - pr_efi("UEFI Secure Boot is enabled.\n"); + efi_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; out_efi_err: - pr_efi_err("Could not determine UEFI Secure Boot status.\n"); + efi_err("Could not determine UEFI Secure Boot status.\n"); return efi_secureboot_mode_unknown; } diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c index e9a684637b70..7acbac16eae0 100644 --- a/drivers/firmware/efi/libstub/tpm.c +++ b/drivers/firmware/efi/libstub/tpm.c @@ -119,7 +119,7 @@ void efi_retrieve_tpm2_eventlog(void) sizeof(*log_tbl) + log_size, (void **)&log_tbl); if (status != EFI_SUCCESS) { - efi_printk("Unable to allocate memory for event log\n"); + efi_err("Unable to allocate memory for event log\n"); return; } diff --git a/drivers/firmware/efi/libstub/vsprintf.c b/drivers/firmware/efi/libstub/vsprintf.c new file mode 100644 index 000000000000..e65ef49a54cd --- /dev/null +++ b/drivers/firmware/efi/libstub/vsprintf.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * Oh, it's a waste of space, but oh-so-yummy for debugging. + */ + +#include <stdarg.h> + +#include <linux/compiler.h> +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/string.h> +#include <linux/types.h> + +static +int skip_atoi(const char **s) +{ + int i = 0; + + while (isdigit(**s)) + i = i * 10 + *((*s)++) - '0'; + return i; +} + +/* + * put_dec_full4 handles numbers in the range 0 <= r < 10000. + * The multiplier 0xccd is round(2^15/10), and the approximation + * r/10 == (r * 0xccd) >> 15 is exact for all r < 16389. + */ +static +void put_dec_full4(char *end, unsigned int r) +{ + int i; + + for (i = 0; i < 3; i++) { + unsigned int q = (r * 0xccd) >> 15; + *--end = '0' + (r - q * 10); + r = q; + } + *--end = '0' + r; +} + +/* put_dec is copied from lib/vsprintf.c with small modifications */ + +/* + * Call put_dec_full4 on x % 10000, return x / 10000. + * The approximation x/10000 == (x * 0x346DC5D7) >> 43 + * holds for all x < 1,128,869,999. The largest value this + * helper will ever be asked to convert is 1,125,520,955. + * (second call in the put_dec code, assuming n is all-ones). + */ +static +unsigned int put_dec_helper4(char *end, unsigned int x) +{ + unsigned int q = (x * 0x346DC5D7ULL) >> 43; + + put_dec_full4(end, x - q * 10000); + return q; +} + +/* Based on code by Douglas W. Jones found at + * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> + * (with permission from the author). + * Performs no 64-bit division and hence should be fast on 32-bit machines. 
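+ * + * Worked example: n = 123456789 has d3 = d2 = 0, d1 = 1883 and a low + * word of 52501; the first round therefore converts 5536 * 1883 + 52501 = + * 10476789, the successive rounds emit the digit groups 6789, 2345, + * 0001, 0000, 0000, and the leading zeros are stripped at the end.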
+ */ +static +char *put_dec(char *end, unsigned long long n) +{ + unsigned int d3, d2, d1, q, h; + char *p = end; + + d1 = ((unsigned int)n >> 16); /* implicit "& 0xffff" */ + h = (n >> 32); + d2 = (h ) & 0xffff; + d3 = (h >> 16); /* implicit "& 0xffff" */ + + /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 + = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ + q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((unsigned int)n & 0xffff); + q = put_dec_helper4(p, q); + p -= 4; + + q += 7671 * d3 + 9496 * d2 + 6 * d1; + q = put_dec_helper4(p, q); + p -= 4; + + q += 4749 * d3 + 42 * d2; + q = put_dec_helper4(p, q); + p -= 4; + + q += 281 * d3; + q = put_dec_helper4(p, q); + p -= 4; + + put_dec_full4(p, q); + p -= 4; + + /* strip off the extra 0's we printed */ + while (p < end && *p == '0') + ++p; + + return p; +} + +static +char *number(char *end, unsigned long long num, int base, char locase) +{ + /* + * locase = 0 or 0x20. ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters + */ + + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + + switch (base) { + case 10: + if (num != 0) + end = put_dec(end, num); + break; + case 8: + for (; num != 0; num >>= 3) + *--end = '0' + (num & 07); + break; + case 16: + for (; num != 0; num >>= 4) + *--end = digits[num & 0xf] | locase; + break; + default: + unreachable(); + }; + + return end; +} + +#define ZEROPAD 1 /* pad with zero */ +#define SIGN 2 /* unsigned/signed long */ +#define PLUS 4 /* show plus */ +#define SPACE 8 /* space if plus */ +#define LEFT 16 /* left justified */ +#define SMALL 32 /* Must be 32 == 0x20 */ +#define SPECIAL 64 /* 0x */ +#define WIDE 128 /* UTF-16 string */ + +static +int get_flags(const char **fmt) +{ + int flags = 0; + + do { + switch (**fmt) { + case '-': + flags |= LEFT; + break; + case '+': + flags |= PLUS; + break; + case ' ': + flags |= SPACE; + break; + case '#': + flags |= SPECIAL; + break; + case '0': + flags |= ZEROPAD; + break; + default: + return flags; + } + ++(*fmt); + } while (1); +} + +static +int get_int(const char **fmt, va_list *ap) +{ + if (isdigit(**fmt)) + return skip_atoi(fmt); + if (**fmt == '*') { + ++(*fmt); + /* it's the next argument */ + return va_arg(*ap, int); + } + return 0; +} + +static +unsigned long long get_number(int sign, int qualifier, va_list *ap) +{ + if (sign) { + switch (qualifier) { + case 'L': + return va_arg(*ap, long long); + case 'l': + return va_arg(*ap, long); + case 'h': + return (short)va_arg(*ap, int); + case 'H': + return (signed char)va_arg(*ap, int); + default: + return va_arg(*ap, int); + }; + } else { + switch (qualifier) { + case 'L': + return va_arg(*ap, unsigned long long); + case 'l': + return va_arg(*ap, unsigned long); + case 'h': + return (unsigned short)va_arg(*ap, int); + case 'H': + return (unsigned char)va_arg(*ap, int); + default: + return va_arg(*ap, unsigned int); + } + } +} + +static +char get_sign(long long *num, int flags) +{ + if (!(flags & SIGN)) + return 0; + if (*num < 0) { + *num = -(*num); + return '-'; + } + if (flags & PLUS) + return '+'; + if (flags & SPACE) + return ' '; + return 0; +} + +static +size_t utf16s_utf8nlen(const u16 *s16, size_t maxlen) +{ + size_t len, clen; + + for (len = 0; len < maxlen && *s16; len += clen) { + u16 c0 = *s16++; + + /* First, get the length for a BMP character */ + clen = 1 + (c0 >= 0x80) + (c0 >= 0x800); + if (len + clen > maxlen) + break; + /* + * If this is a high 
surrogate, and we're already at maxlen, we + * can't include the character if it's a valid surrogate pair. + * Avoid accessing one extra word just to check if it's valid + * or not. + */ + if ((c0 & 0xfc00) == 0xd800) { + if (len + clen == maxlen) + break; + if ((*s16 & 0xfc00) == 0xdc00) { + ++s16; + ++clen; + } + } + } + + return len; +} + +static +u32 utf16_to_utf32(const u16 **s16) +{ + u16 c0, c1; + + c0 = *(*s16)++; + /* not a surrogate */ + if ((c0 & 0xf800) != 0xd800) + return c0; + /* invalid: low surrogate instead of high */ + if (c0 & 0x0400) + return 0xfffd; + c1 = **s16; + /* invalid: missing low surrogate */ + if ((c1 & 0xfc00) != 0xdc00) + return 0xfffd; + /* valid surrogate pair */ + ++(*s16); + return (0x10000 - (0xd800 << 10) - 0xdc00) + (c0 << 10) + c1; +} + +#define PUTC(c) \ +do { \ + if (pos < size) \ + buf[pos] = (c); \ + ++pos; \ +} while (0); + +int vsnprintf(char *buf, size_t size, const char *fmt, va_list ap) +{ + /* The maximum space required is to print a 64-bit number in octal */ + char tmp[(sizeof(unsigned long long) * 8 + 2) / 3]; + char *tmp_end = &tmp[ARRAY_SIZE(tmp)]; + long long num; + int base; + const char *s; + size_t len, pos; + char sign; + + int flags; /* flags to number() */ + + int field_width; /* width of output field */ + int precision; /* min. # of digits for integers; max + number of chars from string */ + int qualifier; /* 'h', 'hh', 'l' or 'll' for integer fields */ + + va_list args; + + /* + * We want to pass our input va_list to helper functions by reference, + * but there's an annoying edge case. If va_list was originally passed + * to us by value, we could just pass &ap down to the helpers. This is + * the case on, for example, X86_32. + * However, on X86_64 (and possibly others), va_list is actually a + * size-1 array containing a structure. Our function parameter ap has + * decayed from T[1] to T*, and &ap has type T** rather than T(*)[1], + * which is what will be expected by a function taking a va_list * + * parameter. + * One standard way to solve this mess is by creating a copy in a local + * variable of type va_list and then passing a pointer to that local + * copy instead, which is what we do here. + */ + va_copy(args, ap); + + for (pos = 0; *fmt; ++fmt) { + if (*fmt != '%' || *++fmt == '%') { + PUTC(*fmt); + continue; + } + + /* process flags */ + flags = get_flags(&fmt); + + /* get field width */ + field_width = get_int(&fmt, &args); + if (field_width < 0) { + field_width = -field_width; + flags |= LEFT; + } + + if (flags & LEFT) + flags &= ~ZEROPAD; + + /* get the precision */ + precision = -1; + if (*fmt == '.') { + ++fmt; + precision = get_int(&fmt, &args); + if (precision >= 0) + flags &= ~ZEROPAD; + } + + /* get the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l') { + qualifier = *fmt; + ++fmt; + if (qualifier == *fmt) { + qualifier -= 'a'-'A'; + ++fmt; + } + } + + sign = 0; + + switch (*fmt) { + case 'c': + flags &= LEFT; + s = tmp; + if (qualifier == 'l') { + ((u16 *)tmp)[0] = (u16)va_arg(args, unsigned int); + ((u16 *)tmp)[1] = L'\0'; + precision = INT_MAX; + goto wstring; + } else { + tmp[0] = (unsigned char)va_arg(args, int); + precision = len = 1; + } + goto output; + + case 's': + flags &= LEFT; + if (precision < 0) + precision = INT_MAX; + s = va_arg(args, void *); + if (!s) + s = precision < 6 ?
"" : "(null)"; + else if (qualifier == 'l') { + wstring: + flags |= WIDE; + precision = len = utf16s_utf8nlen((const u16 *)s, precision); + goto output; + } + precision = len = strnlen(s, precision); + goto output; + + /* integer number formats - set up the flags and "break" */ + case 'o': + base = 8; + break; + + case 'p': + if (precision < 0) + precision = 2 * sizeof(void *); + fallthrough; + case 'x': + flags |= SMALL; + fallthrough; + case 'X': + base = 16; + break; + + case 'd': + case 'i': + flags |= SIGN; + fallthrough; + case 'u': + flags &= ~SPECIAL; + base = 10; + break; + + default: + /* + * Bail out if the conversion specifier is invalid. + * There's probably a typo in the format string and the + * remaining specifiers are unlikely to match up with + * the arguments. + */ + goto fail; + } + if (*fmt == 'p') { + num = (unsigned long)va_arg(args, void *); + } else { + num = get_number(flags & SIGN, qualifier, &args); + } + + sign = get_sign(&num, flags); + if (sign) + --field_width; + + s = number(tmp_end, num, base, flags & SMALL); + len = tmp_end - s; + /* default precision is 1 */ + if (precision < 0) + precision = 1; + /* precision is minimum number of digits to print */ + if (precision < len) + precision = len; + if (flags & SPECIAL) { + /* + * For octal, a leading 0 is printed only if necessary, + * i.e. if it's not already there because of the + * precision. + */ + if (base == 8 && precision == len) + ++precision; + /* + * For hexadecimal, the leading 0x is skipped if the + * output is empty, i.e. both the number and the + * precision are 0. + */ + if (base == 16 && precision > 0) + field_width -= 2; + else + flags &= ~SPECIAL; + } + /* + * For zero padding, increase the precision to fill the field + * width. + */ + if ((flags & ZEROPAD) && field_width > precision) + precision = field_width; + +output: + /* Calculate the padding necessary */ + field_width -= precision; + /* Leading padding with ' ' */ + if (!(flags & LEFT)) + while (field_width-- > 0) + PUTC(' '); + /* sign */ + if (sign) + PUTC(sign); + /* 0x/0X for hexadecimal */ + if (flags & SPECIAL) { + PUTC('0'); + PUTC( 'X' | (flags & SMALL)); + } + /* Zero padding and excess precision */ + while (precision-- > len) + PUTC('0'); + /* Actual output */ + if (flags & WIDE) { + const u16 *ws = (const u16 *)s; + + while (len-- > 0) { + u32 c32 = utf16_to_utf32(&ws); + u8 *s8; + size_t clen; + + if (c32 < 0x80) { + PUTC(c32); + continue; + } + + /* Number of trailing octets */ + clen = 1 + (c32 >= 0x800) + (c32 >= 0x10000); + + len -= clen; + s8 = (u8 *)&buf[pos]; + + /* Avoid writing partial character */ + PUTC('\0'); + pos += clen; + if (pos >= size) + continue; + + /* Set high bits of leading octet */ + *s8 = (0xf00 >> 1) >> clen; + /* Write trailing octets in reverse order */ + for (s8 += clen; clen; --clen, c32 >>= 6) + *s8-- = 0x80 | (c32 & 0x3f); + /* Set low bits of leading octet */ + *s8 |= c32; + } + } else { + while (len-- > 0) + PUTC(*s++); + } + /* Trailing padding with ' ' */ + while (field_width-- > 0) + PUTC(' '); + } +fail: + va_end(args); + + if (size) + buf[min(pos, size-1)] = '\0'; + + return pos; +} + +int snprintf(char *buf, size_t size, const char *fmt, ...) 
+{ + va_list args; + int i; + + va_start(args, fmt); + i = vsnprintf(buf, size, fmt, args); + va_end(args); + return i; +} diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index f0339b5d3658..5a48d996ed71 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -20,21 +20,9 @@ /* Maximum physical address for 64-bit kernel with 4-level paging */ #define MAXMEM_X86_64_4LEVEL (1ull << 46) -static efi_system_table_t *sys_table __efistub_global; -extern const bool efi_is64; +const efi_system_table_t *efi_system_table; extern u32 image_offset; - -__pure efi_system_table_t *efi_system_table(void) -{ - return sys_table; -} - -__attribute_const__ bool efi_is_64bit(void) -{ - if (IS_ENABLED(CONFIG_EFI_MIXED)) - return efi_is64; - return IS_ENABLED(CONFIG_X86_64); -} +static efi_loaded_image_t *image = NULL; static efi_status_t preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) @@ -62,7 +50,7 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size, (void **)&rom); if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'rom'\n"); + efi_err("Failed to allocate memory for 'rom'\n"); return status; } @@ -78,7 +66,7 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) PCI_VENDOR_ID, 1, &rom->vendor); if (status != EFI_SUCCESS) { - efi_printk("Failed to read rom->vendor\n"); + efi_err("Failed to read rom->vendor\n"); goto free_struct; } @@ -86,7 +74,7 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) PCI_DEVICE_ID, 1, &rom->devid); if (status != EFI_SUCCESS) { - efi_printk("Failed to read rom->devid\n"); + efi_err("Failed to read rom->devid\n"); goto free_struct; } @@ -131,7 +119,7 @@ static void setup_efi_pci(struct boot_params *params) (void **)&pci_handle); if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'pci_handle'\n"); + efi_err("Failed to allocate memory for 'pci_handle'\n"); return; } @@ -185,7 +173,7 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) return; if (efi_table_attr(p, version) != 0x10000) { - efi_printk("Unsupported properties proto version\n"); + efi_err("Unsupported properties proto version\n"); return; } @@ -198,7 +186,7 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) size + sizeof(struct setup_data), (void **)&new); if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'properties'\n"); + efi_err("Failed to allocate memory for 'properties'\n"); return; } @@ -227,7 +215,7 @@ static const efi_char16_t apple[] = L"Apple"; static void setup_quirks(struct boot_params *boot_params) { efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) - efi_table_attr(efi_system_table(), fw_vendor); + efi_table_attr(efi_system_table, fw_vendor); if (!memcmp(fw_vendor, apple, sizeof(apple))) { if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) @@ -368,7 +356,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, { struct boot_params *boot_params; struct setup_header *hdr; - efi_loaded_image_t *image; void *image_base; efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; int options_size = 0; @@ -377,28 +364,29 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, unsigned long ramdisk_addr; unsigned long ramdisk_size; - sys_table = sys_table_arg; + efi_system_table = sys_table_arg; /* Check if we were booted by the 
EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); status = efi_bs_call(handle_protocol, handle, &proto, (void **)&image); if (status != EFI_SUCCESS) { - efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); + efi_err("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); efi_exit(handle, status); } image_base = efi_table_attr(image, image_base); image_offset = (void *)startup_32 - image_base; - status = efi_allocate_pages(0x4000, (unsigned long *)&boot_params, ULONG_MAX); + status = efi_allocate_pages(sizeof(struct boot_params), + (unsigned long *)&boot_params, ULONG_MAX); if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate lowmem for boot params\n"); + efi_err("Failed to allocate lowmem for boot params\n"); efi_exit(handle, status); } - memset(boot_params, 0x0, 0x4000); + memset(boot_params, 0x0, sizeof(struct boot_params)); hdr = &boot_params->hdr; @@ -416,43 +404,21 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ - cmdline_ptr = efi_convert_cmdline(image, &options_size, ULONG_MAX); + cmdline_ptr = efi_convert_cmdline(image, &options_size); if (!cmdline_ptr) goto fail; - hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; - /* Fill in upper bits of command line address, NOP on 32 bit */ - boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; + efi_set_u64_split((unsigned long)cmdline_ptr, + &hdr->cmd_line_ptr, &boot_params->ext_cmd_line_ptr); hdr->ramdisk_image = 0; hdr->ramdisk_size = 0; - if (efi_is_native()) { - status = efi_parse_options(cmdline_ptr); - if (status != EFI_SUCCESS) - goto fail2; - - if (!noinitrd()) { - status = efi_load_initrd(image, &ramdisk_addr, - &ramdisk_size, - hdr->initrd_addr_max, - ULONG_MAX); - if (status != EFI_SUCCESS) - goto fail2; - hdr->ramdisk_image = ramdisk_addr & 0xffffffff; - hdr->ramdisk_size = ramdisk_size & 0xffffffff; - boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32; - boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; - } - } - - efi_stub_entry(handle, sys_table, boot_params); + efi_stub_entry(handle, sys_table_arg, boot_params); /* not reached */ -fail2: - efi_free(options_size, (unsigned long)cmdline_ptr); fail: - efi_free(0x4000, (unsigned long)boot_params); + efi_free(sizeof(struct boot_params), (unsigned long)boot_params); efi_exit(handle, status); } @@ -645,17 +611,14 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map, : EFI32_LOADER_SIGNATURE; memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - p->efi->efi_systab = (unsigned long)efi_system_table(); + efi_set_u64_split((unsigned long)efi_system_table, + &p->efi->efi_systab, &p->efi->efi_systab_hi); p->efi->efi_memdesc_size = *map->desc_size; p->efi->efi_memdesc_version = *map->desc_ver; - p->efi->efi_memmap = (unsigned long)*map->map; + efi_set_u64_split((unsigned long)*map->map, + &p->efi->efi_memmap, &p->efi->efi_memmap_hi); p->efi->efi_memmap_size = *map->map_size; -#ifdef CONFIG_X86_64 - p->efi->efi_systab_hi = (unsigned long)efi_system_table() >> 32; - p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; -#endif - return EFI_SUCCESS; } @@ -711,12 +674,11 @@ unsigned long efi_main(efi_handle_t handle, unsigned long buffer_start, buffer_end; struct setup_header *hdr = &boot_params->hdr; efi_status_t status; - unsigned long cmdline_paddr; - sys_table = sys_table_arg; + efi_system_table = 
sys_table_arg; /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); /* @@ -759,7 +721,7 @@ unsigned long efi_main(efi_handle_t handle, hdr->kernel_alignment, LOAD_PHYSICAL_ADDR); if (status != EFI_SUCCESS) { - efi_printk("efi_relocate_kernel() failed!\n"); + efi_err("efi_relocate_kernel() failed!\n"); goto fail; } /* @@ -770,35 +732,48 @@ unsigned long efi_main(efi_handle_t handle, image_offset = 0; } - /* - * efi_pe_entry() may have been called before efi_main(), in which - * case this is the second time we parse the cmdline. This is ok, - * parsing the cmdline multiple times does not have side-effects. - */ - cmdline_paddr = ((u64)hdr->cmd_line_ptr | - ((u64)boot_params->ext_cmd_line_ptr << 32)); - efi_parse_options((char *)cmdline_paddr); +#ifdef CONFIG_CMDLINE_BOOL + status = efi_parse_options(CONFIG_CMDLINE); + if (status != EFI_SUCCESS) { + efi_err("Failed to parse options\n"); + goto fail; + } +#endif + if (!IS_ENABLED(CONFIG_CMDLINE_OVERRIDE)) { + unsigned long cmdline_paddr = ((u64)hdr->cmd_line_ptr | + ((u64)boot_params->ext_cmd_line_ptr << 32)); + status = efi_parse_options((char *)cmdline_paddr); + if (status != EFI_SUCCESS) { + efi_err("Failed to parse options\n"); + goto fail; + } + } /* - * At this point, an initrd may already have been loaded, either by - * the bootloader and passed via bootparams, or loaded from a initrd= - * command line option by efi_pe_entry() above. In either case, we - * permit an initrd loaded from the LINUX_EFI_INITRD_MEDIA_GUID device - * path to supersede it. + * At this point, an initrd may already have been loaded by the + * bootloader and passed via bootparams. We permit an initrd loaded + * from the LINUX_EFI_INITRD_MEDIA_GUID device path to supersede it. + * + * If the device path is not present, any command-line initrd= + * arguments will be processed only if image is not NULL, which will be + * the case only if we were loaded via the PE entry point. 
*/ - if (!noinitrd()) { + if (!efi_noinitrd) { unsigned long addr, size; - status = efi_load_initrd_dev_path(&addr, &size, ULONG_MAX); - if (status == EFI_SUCCESS) { - hdr->ramdisk_image = (u32)addr; - hdr->ramdisk_size = (u32)size; - boot_params->ext_ramdisk_image = (u64)addr >> 32; - boot_params->ext_ramdisk_size = (u64)size >> 32; - } else if (status != EFI_NOT_FOUND) { - efi_printk("efi_load_initrd_dev_path() failed!\n"); + status = efi_load_initrd(image, &addr, &size, + hdr->initrd_addr_max, ULONG_MAX); + + if (status != EFI_SUCCESS) { + efi_err("Failed to load initrd!\n"); goto fail; } + if (size > 0) { + efi_set_u64_split(addr, &hdr->ramdisk_image, + &boot_params->ext_ramdisk_image); + efi_set_u64_split(size, &hdr->ramdisk_size, + &boot_params->ext_ramdisk_size); + } } /* @@ -823,13 +798,13 @@ unsigned long efi_main(efi_handle_t handle, status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { - efi_printk("exit_boot() failed!\n"); + efi_err("exit_boot() failed!\n"); goto fail; } return bzimage_addr; fail: - efi_printk("efi_main() failed!\n"); + efi_err("efi_main() failed!\n"); efi_exit(handle, status); } diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c index 7baf48c01e72..ddf9eae396fe 100644 --- a/drivers/firmware/efi/test/efi_test.c +++ b/drivers/firmware/efi/test/efi_test.c @@ -70,9 +70,6 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src, return 0; } - if (!access_ok(src, 1)) - return -EFAULT; - buf = memdup_user(src, len); if (IS_ERR(buf)) { *dst = NULL; @@ -91,9 +88,6 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src, static inline int get_ucs2_strsize_from_user(efi_char16_t __user *src, size_t *len) { - if (!access_ok(src, 1)) - return -EFAULT; - *len = user_ucs2_strsize(src); if (*len == 0) return -EFAULT; @@ -118,9 +112,6 @@ copy_ucs2_from_user(efi_char16_t **dst, efi_char16_t __user *src) { size_t len; - if (!access_ok(src, 1)) - return -EFAULT; - len = user_ucs2_strsize(src); if (len == 0) return -EFAULT; @@ -142,9 +133,6 @@ copy_ucs2_to_user_len(efi_char16_t __user *dst, efi_char16_t *src, size_t len) if (!src) return 0; - if (!access_ok(dst, 1)) - return -EFAULT; - return copy_to_user(dst, src, len); } diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 2937d44b5df4..92013ecc2d9e 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -46,25 +46,14 @@ * require cooperation with a Trusted OS driver. 
*/ static int resident_cpu = -1; +struct psci_operations psci_ops; +static enum arm_smccc_conduit psci_conduit = SMCCC_CONDUIT_NONE; bool psci_tos_resident_on(int cpu) { return cpu == resident_cpu; } -struct psci_operations psci_ops = { - .conduit = SMCCC_CONDUIT_NONE, - .smccc_version = SMCCC_VERSION_1_0, -}; - -enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) -{ - if (psci_ops.smccc_version < SMCCC_VERSION_1_1) - return SMCCC_CONDUIT_NONE; - - return psci_ops.conduit; -} - typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); static psci_fn *invoke_psci_fn; @@ -242,7 +231,7 @@ static void set_conduit(enum arm_smccc_conduit conduit) WARN(1, "Unexpected PSCI conduit %d\n", conduit); } - psci_ops.conduit = conduit; + psci_conduit = conduit; } static int get_set_conduit_method(struct device_node *np) @@ -411,8 +400,8 @@ static void __init psci_init_smccc(void) if (feature != PSCI_RET_NOT_SUPPORTED) { u32 ret; ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0); - if (ret == ARM_SMCCC_VERSION_1_1) { - psci_ops.smccc_version = SMCCC_VERSION_1_1; + if (ret >= ARM_SMCCC_VERSION_1_1) { + arm_smccc_version_init(ret, psci_conduit); ver = ret; } } diff --git a/drivers/firmware/smccc/Kconfig b/drivers/firmware/smccc/Kconfig new file mode 100644 index 000000000000..27b675d76235 --- /dev/null +++ b/drivers/firmware/smccc/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +config HAVE_ARM_SMCCC + bool + help + Include support for the Secure Monitor Call (SMC) and Hypervisor + Call (HVC) instructions on Armv7 and above architectures. + +config HAVE_ARM_SMCCC_DISCOVERY + bool + depends on ARM_PSCI_FW + default y + help + SMCCC v1.0 lacked discoverability and hence PSCI v1.0 was updated + to add an SMCCC discovery mechanism through the PSCI firmware + implementation of PSCI_FEATURES(SMCCC_VERSION), which returns + success on firmware compliant with SMCCC v1.1 and above.
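For context, the conduit discovered above is what gates every SMCCC v1.1 fast call. A minimal caller sketch (illustrative only, not part of this patch; fn_id stands in for some firmware-defined SMCCC function ID):

	#include <linux/arm-smccc.h>
	#include <linux/errno.h>

	/* Issue fn_id over whichever conduit PSCI discovered, if any. */
	static int fw_call(u32 fn_id, struct arm_smccc_res *res)
	{
		switch (arm_smccc_1_1_get_conduit()) {
		case SMCCC_CONDUIT_HVC:
			arm_smccc_1_1_hvc(fn_id, res);
			return 0;
		case SMCCC_CONDUIT_SMC:
			arm_smccc_1_1_smc(fn_id, res);
			return 0;
		default: /* SMCCC_CONDUIT_NONE: pre-v1.1 firmware */
			return -EOPNOTSUPP;
		}
	}

In-tree callers would normally reach for the existing arm_smccc_1_1_invoke() wrapper, which performs the same conduit dispatch internally.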
diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile new file mode 100644 index 000000000000..6f369fe3f0b9 --- /dev/null +++ b/drivers/firmware/smccc/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +# +obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c new file mode 100644 index 000000000000..4e80921ee212 --- /dev/null +++ b/drivers/firmware/smccc/smccc.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Arm Limited + */ + +#define pr_fmt(fmt) "smccc: " fmt + +#include <linux/init.h> +#include <linux/arm-smccc.h> + +static u32 smccc_version = ARM_SMCCC_VERSION_1_0; +static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE; + +void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit) +{ + smccc_version = version; + smccc_conduit = conduit; +} + +enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) +{ + if (smccc_version < ARM_SMCCC_VERSION_1_1) + return SMCCC_CONDUIT_NONE; + + return smccc_conduit; +} + +u32 arm_smccc_get_version(void) +{ + return smccc_version; +} diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index 62f924489db5..d902acb36d14 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -324,10 +324,6 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata, if (user_addr + length < user_addr) return -EINVAL; - if (!access_ok((void __user *)(unsigned long)user_addr, - length)) - return -EINVAL; - region = kzalloc(sizeof(*region), GFP_KERNEL); if (!region) return -ENOMEM; diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c index a233a53db708..1194c0e850e0 100644 --- a/drivers/fpga/dfl-fme-pr.c +++ b/drivers/fpga/dfl-fme-pr.c @@ -97,10 +97,6 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg) return -EINVAL; } - if (!access_ok((void __user *)(unsigned long)port_pr.buffer_address, - port_pr.buffer_size)) - return -EFAULT; - /* * align PR buffer per PR bandwidth, as HW ignores the extra padding * data automatically. diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 43594978958e..fb92be7e8aa7 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -161,7 +161,7 @@ config DRM_LOAD_EDID_FIRMWARE monitor are unable to provide appropriate EDID data. Since this feature is provided as a workaround for broken hardware, the default case is N. Details and instructions how to build your own - EDID data are given in Documentation/driver-api/edid.rst. + EDID data are given in Documentation/admin-guide/edid.rst. 
config DRM_DP_CEC bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support" diff --git a/drivers/gpu/drm/armada/armada_drv.c b/drivers/gpu/drm/armada/armada_drv.c index 197dca3fc84c..5232f81c16a5 100644 --- a/drivers/gpu/drm/armada/armada_drv.c +++ b/drivers/gpu/drm/armada/armada_drv.c @@ -311,7 +311,7 @@ static void __exit armada_drm_exit(void) } module_exit(armada_drm_exit); -MODULE_AUTHOR("Russell King <rmk+kernel@arm.linux.org.uk>"); +MODULE_AUTHOR("Russell King <rmk+kernel@armlinux.org.uk>"); MODULE_DESCRIPTION("Armada DRM Driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:armada-drm"); diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-ahb-audio.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-ahb-audio.c index dd56996fe9c7..d0db1acf11d7 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-ahb-audio.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-ahb-audio.c @@ -630,7 +630,7 @@ static struct platform_driver snd_dw_hdmi_driver = { module_platform_driver(snd_dw_hdmi_driver); -MODULE_AUTHOR("Russell King <rmk+kernel@arm.linux.org.uk>"); +MODULE_AUTHOR("Russell King <rmk+kernel@armlinux.org.uk>"); MODULE_DESCRIPTION("Synopsis Designware HDMI AHB ALSA interface"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("platform:" DRIVER_NAME); diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index 9d89ebf3a749..35b62c5d18b4 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -5499,7 +5499,7 @@ struct drm_dp_aux *drm_dp_mst_dsc_aux_for_port(struct drm_dp_mst_port *port) { struct drm_dp_mst_port *immediate_upstream_port; struct drm_dp_mst_port *fec_port; - struct drm_dp_desc desc = { 0 }; + struct drm_dp_desc desc = { }; u8 endpoint_fec; u8 endpoint_dsc; diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index eb009d3ab48f..6a1f6c802415 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -569,9 +569,6 @@ ssize_t drm_read(struct file *filp, char __user *buffer, struct drm_device *dev = file_priv->minor->dev; ssize_t ret; - if (!access_ok(buffer, count)) - return -EFAULT; - ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 9e41972c4bbc..c2b8d2a953ae 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -741,7 +741,7 @@ static const struct drm_ioctl_desc drm_ioctls[] = { * }; * * Please make sure that you follow all the best practices from - * ``Documentation/ioctl/botching-up-ioctls.rst``. Note that drm_ioctl() + * ``Documentation/process/botching-up-ioctls.rst``. Note that drm_ioctl() * automatically zero-extends structures, hence make sure you can add more stuff * at the end, i.e. don't put a variable sized array there. 
* diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c index a8685b2e1803..7585d8f68fb9 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c @@ -736,7 +736,7 @@ static void __exit etnaviv_exit(void) module_exit(etnaviv_exit); MODULE_AUTHOR("Christian Gmeiner <christian.gmeiner@gmail.com>"); -MODULE_AUTHOR("Russell King <rmk+kernel@arm.linux.org.uk>"); +MODULE_AUTHOR("Russell King <rmk+kernel@armlinux.org.uk>"); MODULE_AUTHOR("Lucas Stach <l.stach@pengutronix.de>"); MODULE_DESCRIPTION("etnaviv DRM Driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index b7440f06c5e2..8a4e9c1cbf6c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -2794,7 +2794,8 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, * And this range already got effectively checked earlier * when we did the "copy_from_user()" above. */ - if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) + if (!user_write_access_begin(user_exec_list, + count * sizeof(*user_exec_list))) goto end; for (i = 0; i < args->buffer_count; i++) { @@ -2808,7 +2809,7 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, end_user); } end_user: - user_access_end(); + user_write_access_end(); end:; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 7ffd7afeb7a5..f80cf6ac20c5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -598,6 +598,14 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + /* + * Using __get_user_pages_fast() with a read-only + * access is questionable. A read-only page may be + * COW-broken, and then this might end up giving + * the wrong side of the COW.. + * + * We may or may not care. + */ if (pvec) /* defer to worker if malloc fails */ pinned = __get_user_pages_fast(obj->userptr.ptr, num_pages, diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index cf2c01f17da8..60da28d412d6 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -3388,10 +3388,10 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf, /* Similar to perf's kernel.perf_paranoid_cpu sysctl option * we check a dev.i915.perf_stream_paranoid sysctl option * to determine if it's ok to access system wide OA counters - * without CAP_SYS_ADMIN privileges. + * without CAP_PERFMON or CAP_SYS_ADMIN privileges. 
*/ if (privileged_op && - i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) { + i915_perf_stream_paranoid && !perfmon_capable()) { DRM_DEBUG("Insufficient privileges to open i915 perf stream\n"); ret = -EACCES; goto err_ctx; @@ -3584,9 +3584,8 @@ static int read_properties_unlocked(struct i915_perf *perf, } else oa_freq_hz = 0; - if (oa_freq_hz > i915_oa_max_sample_rate && - !capable(CAP_SYS_ADMIN)) { - DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n", + if (oa_freq_hz > i915_oa_max_sample_rate && !perfmon_capable()) { + DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without CAP_PERFMON or CAP_SYS_ADMIN privileges\n", i915_oa_max_sample_rate); return -EACCES; } @@ -4007,7 +4006,7 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data, return -EINVAL; } - if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) { + if (i915_perf_stream_paranoid && !perfmon_capable()) { DRM_DEBUG("Insufficient privileges to add i915 OA config\n"); return -EACCES; } @@ -4154,7 +4153,7 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data, return -ENOTSUPP; } - if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) { + if (i915_perf_stream_paranoid && !perfmon_capable()) { DRM_DEBUG("Insufficient privileges to remove i915 OA config\n"); return -EACCES; } diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index 211f5de99a44..9aba2910d83a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -170,7 +170,7 @@ struct dpu_global_state * * Main debugfs documentation is located at, * - * Documentation/filesystems/debugfs.txt + * Documentation/filesystems/debugfs.rst * * @dpu_debugfs_setup_regset32: Initialize data for dpu_debugfs_create_regset32 * @dpu_debugfs_create_regset32: Create 32-bit register dump file diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig index 83e841be1081..02dbb5ca3bcf 100644 --- a/drivers/hwtracing/coresight/Kconfig +++ b/drivers/hwtracing/coresight/Kconfig @@ -107,7 +107,7 @@ config CORESIGHT_CPU_DEBUG can quickly get to know program counter (PC), secure state, exception level, etc. Before use debugging functionality, platform needs to ensure the clock domain and power domain are enabled - properly, please refer Documentation/trace/coresight-cpu-debug.rst + properly, please refer Documentation/trace/coresight/coresight-cpu-debug.rst for detailed description and the example for usage. 
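The capable(CAP_SYS_ADMIN) checks rewritten above in i915_perf, and further down in oprofile's event buffer and the Arm SPE driver, all become perfmon_capable(). This is part of the CAP_PERFMON work: performance monitoring gets a dedicated capability so profiling tools no longer have to be granted full CAP_SYS_ADMIN. The helper stays backward compatible by accepting either capability; at this point it is defined in <linux/capability.h> roughly as:

    /* as introduced alongside CAP_PERFMON; paraphrased, not a new API */
    static inline bool perfmon_capable(void)
    {
            return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
    }

Note that the i915 debug strings are updated to match, so a denied open now points the user at CAP_PERFMON instead of suggesting root.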
config CORESIGHT_CTI diff --git a/drivers/iio/dac/ad5761.c b/drivers/iio/dac/ad5761.c index 4fb42b743f0f..7468fbd11684 100644 --- a/drivers/iio/dac/ad5761.c +++ b/drivers/iio/dac/ad5761.c @@ -3,7 +3,7 @@ * AD5721, AD5721R, AD5761, AD5761R, Voltage Output Digital to Analog Converter * * Copyright 2016 Qtechnology A/S - * 2016 Ricardo Ribalda <ricardo.ribalda@gmail.com> + * 2016 Ricardo Ribalda <ribalda@kernel.org> */ #include <linux/kernel.h> #include <linux/module.h> @@ -423,6 +423,6 @@ static struct spi_driver ad5761_driver = { }; module_spi_driver(ad5761_driver); -MODULE_AUTHOR("Ricardo Ribalda <ricardo.ribalda@gmail.com>"); +MODULE_AUTHOR("Ricardo Ribalda <ribalda@kernel.org>"); MODULE_DESCRIPTION("Analog Devices AD5721, AD5721R, AD5761, AD5761R driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/iio/dac/ti-dac7612.c b/drivers/iio/dac/ti-dac7612.c index c46805144dd4..de0c6573cd97 100644 --- a/drivers/iio/dac/ti-dac7612.c +++ b/drivers/iio/dac/ti-dac7612.c @@ -3,7 +3,7 @@ * DAC7612 Dual, 12-Bit Serial input Digital-to-Analog Converter * * Copyright 2019 Qtechnology A/S - * 2019 Ricardo Ribalda <ricardo@ribalda.com> + * 2019 Ricardo Ribalda <ribalda@kernel.org> * * Licensed under the GPL-2. */ @@ -179,6 +179,6 @@ static struct spi_driver dac7612_driver = { }; module_spi_driver(dac7612_driver); -MODULE_AUTHOR("Ricardo Ribalda <ricardo@ribalda.com>"); +MODULE_AUTHOR("Ricardo Ribalda <ribalda@kernel.org>"); MODULE_DESCRIPTION("Texas Instruments DAC7612 DAC driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 4da03f823474..f81ca20f4b69 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -206,13 +206,6 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) return -EINVAL; } - /* Verify that access is OK for the user buffer */ - if (!access_ok((void __user *)vaddr, - npages * PAGE_SIZE)) { - dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", - (void *)vaddr, npages); - return -EFAULT; - } /* Allocate the array of struct page pointers needed for pinning */ pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); if (!pages) diff --git a/drivers/leds/leds-pca963x.c b/drivers/leds/leds-pca963x.c index 66cdc003b8f4..d288acbc99c7 100644 --- a/drivers/leds/leds-pca963x.c +++ b/drivers/leds/leds-pca963x.c @@ -4,7 +4,7 @@ * Copyright 2013 Qtechnology/AS * * Author: Peter Meerwald <p.meerwald@bct-electronic.com> - * Author: Ricardo Ribalda <ricardo.ribalda@gmail.com> + * Author: Ricardo Ribalda <ribalda@kernel.org> * * Based on leds-pca955x.c * diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 83eb05bf85ff..8450d7c008d0 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2184,8 +2184,6 @@ pmu_read(struct file *file, char __user *buf, if (count < 1 || !pp) return -EINVAL; - if (!access_ok(buf, count)) - return -EFAULT; spin_lock_irqsave(&pp->lock, flags); add_wait_queue(&pp->wait, &wait); diff --git a/drivers/media/cec/cec-notifier.c b/drivers/media/cec/cec-notifier.c index e748cd54b45d..517e0035fc99 100644 --- a/drivers/media/cec/cec-notifier.c +++ b/drivers/media/cec/cec-notifier.c @@ -2,7 +2,7 @@ /* * cec-notifier.c - notify CEC drivers of physical address changes * - * Copyright 2016 Russell King <rmk+kernel@arm.linux.org.uk> + * Copyright 2016 Russell King. * Copyright 2016-2017 Cisco Systems, Inc. and/or its affiliates. All rights reserved. 
*/ diff --git a/drivers/media/i2c/imx214.c b/drivers/media/i2c/imx214.c index 4175d06ffd47..1ef5af9a8c8b 100644 --- a/drivers/media/i2c/imx214.c +++ b/drivers/media/i2c/imx214.c @@ -4,7 +4,7 @@ * * Copyright 2018 Qtechnology A/S * - * Ricardo Ribalda <ricardo.ribalda@gmail.com> + * Ricardo Ribalda <ribalda@kernel.org> */ #include <linux/clk.h> #include <linux/delay.h> @@ -1120,5 +1120,5 @@ static struct i2c_driver imx214_i2c_driver = { module_i2c_driver(imx214_i2c_driver); MODULE_DESCRIPTION("Sony IMX214 Camera driver"); -MODULE_AUTHOR("Ricardo Ribalda <ricardo.ribalda@gmail.com>"); +MODULE_AUTHOR("Ricardo Ribalda <ribalda@kernel.org>"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index 97f0f8b23b5d..8a1e1b95b379 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -980,7 +980,7 @@ static int v4l2_fwnode_reference_parse(struct device *dev, * * THIS EXAMPLE EXISTS MERELY TO DOCUMENT THIS FUNCTION. DO NOT USE IT AS A * REFERENCE IN HOW ACPI TABLES SHOULD BE WRITTEN!! See documentation under - * Documentation/acpi/dsd instead and especially graph.txt, + * Documentation/firmware-guide/acpi/dsd/ instead and especially graph.txt, * data-node-references.txt and leds.txt . * * Scope (\_SB.PCI0.I2C2) diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index ce16d6b99295..2d8328d928d5 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -233,8 +233,6 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, * about the size. */ BUILD_BUG_ON(sizeof(bool) != sizeof(u8)); - if (!access_ok((void __user *)uva, sizeof(u8))) - return VMCI_ERROR_GENERIC; /* * Lock physical page backing a given user VA. diff --git a/drivers/net/phy/swphy.c b/drivers/net/phy/swphy.c index 53c214a22b95..59f1ba4d49bc 100644 --- a/drivers/net/phy/swphy.c +++ b/drivers/net/phy/swphy.c @@ -2,7 +2,7 @@ /* * Software PHY emulation * - * Code taken from fixed_phy.c by Russell King <rmk+kernel@arm.linux.org.uk> + * Code taken from fixed_phy.c by Russell King. * * Author: Vitaly Bordug <vbordug@ru.mvista.com> * Anton Vorontsov <avorontsov@ru.mvista.com> diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 12ea4a4ad607..6c9edc8bbc95 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -113,7 +113,7 @@ static int event_buffer_open(struct inode *inode, struct file *file) { int err = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (test_and_set_bit_lock(0, &buffer_opened)) diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 09ae8a970880..a9261cf48293 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -79,13 +79,6 @@ config FSL_IMX8_DDR_PMU can give information about memory throughput and other related events. -config HISI_PMU - bool "HiSilicon SoC PMU" - depends on ARM64 && ACPI - help - Support for HiSilicon SoC uncore performance monitoring - unit (PMU), such as: L3C, HHA and DDRC. - config QCOM_L2_PMU bool "Qualcomm Technologies L2-cache PMU" depends on ARCH_QCOM && ARM64 && ACPI @@ -129,4 +122,6 @@ config ARM_SPE_PMU Extension, which provides periodic sampling of operations in the CPU pipeline and reports this via the perf AUX interface. 
+source "drivers/perf/hisilicon/Kconfig" + endmenu diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 70968c8c09d7..518d0603e24f 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -690,10 +690,8 @@ static int dsu_pmu_device_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_warn(&pdev->dev, "Failed to find IRQ\n"); + if (irq < 0) return -EINVAL; - } name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_%d", PMUNAME, atomic_inc_return(&pmu_idx)); diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index f01a57e5a5f3..48e28ef93a70 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -814,7 +814,7 @@ static int smmu_pmu_probe(struct platform_device *pdev) if (err) { dev_err(dev, "Error %d registering hotplug, PMU @%pa\n", err, &res_0->start); - return err; + goto out_clear_affinity; } err = perf_pmu_register(&smmu_pmu->pmu, name, -1); @@ -833,6 +833,8 @@ static int smmu_pmu_probe(struct platform_device *pdev) out_unregister: cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node); +out_clear_affinity: + irq_set_affinity_hint(smmu_pmu->irq, NULL); return err; } @@ -842,6 +844,7 @@ static int smmu_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&smmu_pmu->pmu); cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node); + irq_set_affinity_hint(smmu_pmu->irq, NULL); return 0; } diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index b72c04852599..d80f48798bce 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -274,7 +274,7 @@ static u64 arm_spe_event_to_pmscr(struct perf_event *event) if (!attr->exclude_kernel) reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT); - if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && capable(CAP_SYS_ADMIN)) + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable()) reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT); return reg; @@ -700,7 +700,7 @@ static int arm_spe_pmu_event_init(struct perf_event *event) return -EOPNOTSUPP; reg = arm_spe_event_to_pmscr(event); - if (!capable(CAP_SYS_ADMIN) && + if (!perfmon_capable() && (reg & (BIT(SYS_PMSCR_EL1_PA_SHIFT) | BIT(SYS_PMSCR_EL1_CX_SHIFT) | BIT(SYS_PMSCR_EL1_PCT_SHIFT)))) @@ -1133,10 +1133,8 @@ static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu) struct platform_device *pdev = spe_pmu->pdev; int irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(&pdev->dev, "failed to get IRQ (%d)\n", irq); + if (irq < 0) return -ENXIO; - } if (!irq_is_percpu(irq)) { dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq); diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig new file mode 100644 index 000000000000..c5d1b7019fff --- /dev/null +++ b/drivers/perf/hisilicon/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +config HISI_PMU + tristate "HiSilicon SoC PMU drivers" + depends on ARM64 && ACPI + help + Support for HiSilicon SoC L3 Cache performance monitor, Hydra Home + Agent performance monitor and DDR Controller performance monitor. 
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index c3a96ec2bf66..e8377061845f 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o +obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ + hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index 453f1c6a16ca..15713faaa07e 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -394,8 +394,9 @@ static int hisi_ddrc_pmu_probe(struct platform_device *pdev) ret = perf_pmu_register(&ddrc_pmu->pmu, name, -1); if (ret) { dev_err(ddrc_pmu->dev, "DDRC PMU register failed!\n"); - cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, - &ddrc_pmu->node); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, &ddrc_pmu->node); + irq_set_affinity_hint(ddrc_pmu->irq, NULL); } return ret; @@ -406,8 +407,9 @@ static int hisi_ddrc_pmu_remove(struct platform_device *pdev) struct hisi_pmu *ddrc_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&ddrc_pmu->pmu); - cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, - &ddrc_pmu->node); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, + &ddrc_pmu->node); + irq_set_affinity_hint(ddrc_pmu->irq, NULL); return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 6a1dd72d8abb..dcc5600788a9 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -283,7 +283,7 @@ static struct attribute *hisi_hha_pmu_events_attr[] = { HISI_PMU_EVENT_ATTR(rx_wbip, 0x05), HISI_PMU_EVENT_ATTR(rx_wtistash, 0x11), HISI_PMU_EVENT_ATTR(rd_ddr_64b, 0x1c), - HISI_PMU_EVENT_ATTR(wr_dr_64b, 0x1d), + HISI_PMU_EVENT_ATTR(wr_ddr_64b, 0x1d), HISI_PMU_EVENT_ATTR(rd_ddr_128b, 0x1e), HISI_PMU_EVENT_ATTR(wr_ddr_128b, 0x1f), HISI_PMU_EVENT_ATTR(spill_num, 0x20), @@ -406,8 +406,9 @@ static int hisi_hha_pmu_probe(struct platform_device *pdev) ret = perf_pmu_register(&hha_pmu->pmu, name, -1); if (ret) { dev_err(hha_pmu->dev, "HHA PMU register failed!\n"); - cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, - &hha_pmu->node); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, &hha_pmu->node); + irq_set_affinity_hint(hha_pmu->irq, NULL); } return ret; @@ -418,8 +419,9 @@ static int hisi_hha_pmu_remove(struct platform_device *pdev) struct hisi_pmu *hha_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&hha_pmu->pmu); - cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, - &hha_pmu->node); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, + &hha_pmu->node); + irq_set_affinity_hint(hha_pmu->irq, NULL); return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 1151e99b241c..8dd1278bec04 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -396,8 +396,9 @@ static int hisi_l3c_pmu_probe(struct platform_device *pdev) ret = perf_pmu_register(&l3c_pmu->pmu, name, -1); if (ret) { dev_err(l3c_pmu->dev, "L3C PMU register failed!\n"); - 
cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, - &l3c_pmu->node); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, &l3c_pmu->node); + irq_set_affinity_hint(l3c_pmu->irq, NULL); } return ret; @@ -408,8 +409,9 @@ static int hisi_l3c_pmu_remove(struct platform_device *pdev) struct hisi_pmu *l3c_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&l3c_pmu->pmu); - cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, - &l3c_pmu->node); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, + &l3c_pmu->node); + irq_set_affinity_hint(l3c_pmu->irq, NULL); return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 584de8f807cc..97aff877a4e7 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -35,6 +35,7 @@ ssize_t hisi_format_sysfs_show(struct device *dev, return sprintf(buf, "%s\n", (char *)eattr->var); } +EXPORT_SYMBOL_GPL(hisi_format_sysfs_show); /* * PMU event attributes @@ -48,6 +49,7 @@ ssize_t hisi_event_sysfs_show(struct device *dev, return sprintf(page, "config=0x%lx\n", (unsigned long)eattr->var); } +EXPORT_SYMBOL_GPL(hisi_event_sysfs_show); /* * sysfs cpumask attributes. For uncore PMU, we only have a single CPU to show @@ -59,6 +61,7 @@ ssize_t hisi_cpumask_sysfs_show(struct device *dev, return sprintf(buf, "%d\n", hisi_pmu->on_cpu); } +EXPORT_SYMBOL_GPL(hisi_cpumask_sysfs_show); static bool hisi_validate_event_group(struct perf_event *event) { @@ -97,6 +100,7 @@ int hisi_uncore_pmu_counter_valid(struct hisi_pmu *hisi_pmu, int idx) { return idx >= 0 && idx < hisi_pmu->num_counters; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_counter_valid); int hisi_uncore_pmu_get_event_idx(struct perf_event *event) { @@ -113,6 +117,7 @@ int hisi_uncore_pmu_get_event_idx(struct perf_event *event) return idx; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_get_event_idx); static void hisi_uncore_pmu_clear_event_idx(struct hisi_pmu *hisi_pmu, int idx) { @@ -173,6 +178,7 @@ int hisi_uncore_pmu_event_init(struct perf_event *event) return 0; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_event_init); /* * Set the counter to count the event that we're interested in, @@ -220,6 +226,7 @@ void hisi_uncore_pmu_set_event_period(struct perf_event *event) /* Write start value to the hardware event counter */ hisi_pmu->ops->write_counter(hisi_pmu, hwc, val); } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_set_event_period); void hisi_uncore_pmu_event_update(struct perf_event *event) { @@ -240,6 +247,7 @@ void hisi_uncore_pmu_event_update(struct perf_event *event) HISI_MAX_PERIOD(hisi_pmu->counter_bits); local64_add(delta, &event->count); } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_event_update); void hisi_uncore_pmu_start(struct perf_event *event, int flags) { @@ -262,6 +270,7 @@ void hisi_uncore_pmu_start(struct perf_event *event, int flags) hisi_uncore_pmu_enable_event(event); perf_event_update_userpage(event); } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_start); void hisi_uncore_pmu_stop(struct perf_event *event, int flags) { @@ -278,6 +287,7 @@ void hisi_uncore_pmu_stop(struct perf_event *event, int flags) hisi_uncore_pmu_event_update(event); hwc->state |= PERF_HES_UPTODATE; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_stop); int hisi_uncore_pmu_add(struct perf_event *event, int flags) { @@ -300,6 +310,7 @@ int hisi_uncore_pmu_add(struct perf_event *event, int flags) return 0; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_add); void hisi_uncore_pmu_del(struct perf_event *event, int flags) { @@ -311,12 +322,14 
@@ void hisi_uncore_pmu_del(struct perf_event *event, int flags) perf_event_update_userpage(event); hisi_pmu->pmu_events.hw_events[hwc->idx] = NULL; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_del); void hisi_uncore_pmu_read(struct perf_event *event) { /* Read hardware counter and update the perf counter statistics */ hisi_uncore_pmu_event_update(event); } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_read); void hisi_uncore_pmu_enable(struct pmu *pmu) { @@ -329,6 +342,7 @@ void hisi_uncore_pmu_enable(struct pmu *pmu) hisi_pmu->ops->start_counters(hisi_pmu); } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_enable); void hisi_uncore_pmu_disable(struct pmu *pmu) { @@ -336,6 +350,7 @@ void hisi_uncore_pmu_disable(struct pmu *pmu) hisi_pmu->ops->stop_counters(hisi_pmu); } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_disable); /* @@ -414,10 +429,11 @@ int hisi_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) hisi_pmu->on_cpu = cpu; /* Overflow interrupt also should use the same CPU */ - WARN_ON(irq_set_affinity(hisi_pmu->irq, cpumask_of(cpu))); + WARN_ON(irq_set_affinity_hint(hisi_pmu->irq, cpumask_of(cpu))); return 0; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_online_cpu); int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { @@ -446,7 +462,10 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&hisi_pmu->pmu, cpu, target); /* Use this CPU for event counting */ hisi_pmu->on_cpu = target; - WARN_ON(irq_set_affinity(hisi_pmu->irq, cpumask_of(target))); + WARN_ON(irq_set_affinity_hint(hisi_pmu->irq, cpumask_of(target))); return 0; } +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_offline_cpu); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/scsi/esas2r/esas2r_ioctl.c b/drivers/scsi/esas2r/esas2r_ioctl.c index 442c5e70a7b4..cc620f10eabc 100644 --- a/drivers/scsi/esas2r/esas2r_ioctl.c +++ b/drivers/scsi/esas2r/esas2r_ioctl.c @@ -1510,7 +1510,7 @@ ioctl_done: } /* Always copy the buffer back, if only to pick up the status */ - err = __copy_to_user(arg, ioctl, sizeof(struct atto_express_ioctl)); + err = copy_to_user(arg, ioctl, sizeof(struct atto_express_ioctl)); if (err != 0) { esas2r_log(ESAS2R_LOG_WARN, "ioctl_handler copy_to_user didn't copy everything (err %d, cmd %u)", diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 8a6e02aa553f..5a754fb5f854 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -2166,10 +2166,6 @@ lpfc_debugfs_lockstat_write(struct file *file, const char __user *buf, char *pbuf; int i; - /* Protect copy from user */ - if (!access_ok(buf, nbytes)) - return -EFAULT; - memset(mybuf, 0, sizeof(mybuf)); if (copy_from_user(mybuf, buf, nbytes)) @@ -2621,10 +2617,6 @@ lpfc_debugfs_multixripools_write(struct file *file, const char __user *buf, if (nbytes > 64) nbytes = 64; - /* Protect copy from user */ - if (!access_ok(buf, nbytes)) - return -EFAULT; - memset(mybuf, 0, sizeof(mybuf)); if (copy_from_user(mybuf, buf, nbytes)) @@ -2787,10 +2779,6 @@ lpfc_debugfs_scsistat_write(struct file *file, const char __user *buf, char mybuf[6] = {0}; int i; - /* Protect copy from user */ - if (!access_ok(buf, nbytes)) - return -EFAULT; - if (copy_from_user(mybuf, buf, (nbytes >= sizeof(mybuf)) ? 
(sizeof(mybuf) - 1) : nbytes)) return -EFAULT; diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 8f3af87b6bb0..45d04b7b2643 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -211,18 +211,18 @@ static int scsi_ioctl_common(struct scsi_device *sdev, int cmd, void __user *arg } switch (cmd) { - case SCSI_IOCTL_GET_IDLUN: - if (!access_ok(arg, sizeof(struct scsi_idlun))) + case SCSI_IOCTL_GET_IDLUN: { + struct scsi_idlun v = { + .dev_id = (sdev->id & 0xff) + + ((sdev->lun & 0xff) << 8) + + ((sdev->channel & 0xff) << 16) + + ((sdev->host->host_no & 0xff) << 24), + .host_unique_id = sdev->host->unique_id + }; + if (copy_to_user(arg, &v, sizeof(struct scsi_idlun))) return -EFAULT; - - __put_user((sdev->id & 0xff) - + ((sdev->lun & 0xff) << 8) - + ((sdev->channel & 0xff) << 16) - + ((sdev->host->host_no & 0xff) << 24), - &((struct scsi_idlun __user *)arg)->dev_id); - __put_user(sdev->host->unique_id, - &((struct scsi_idlun __user *)arg)->host_unique_id); return 0; + } case SCSI_IOCTL_GET_BUS_NUMBER: return put_user(sdev->host->host_no, (int __user *)arg); case SCSI_IOCTL_PROBE_HOST: diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index 991f49ee4026..b09eac4b6d64 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -423,13 +423,6 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file, struct n_hdlc_buf *rbuf; DECLARE_WAITQUEUE(wait, current); - /* verify user access to buffer */ - if (!access_ok(buf, nr)) { - pr_warn("%s(%d) %s() can't verify user buffer\n", - __FILE__, __LINE__, __func__); - return -EFAULT; - } - add_wait_queue(&tty->read_wait, &wait); for (;;) { diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index 44f28a114c2b..94b6fa6e585e 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -598,8 +598,6 @@ static ssize_t usb_device_read(struct file *file, char __user *buf, return -EINVAL; if (nbytes <= 0) return 0; - if (!access_ok(buf, nbytes)) - return -EFAULT; mutex_lock(&usb_bus_idr_lock); /* print devices for all busses */ diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index d93d94d7ff50..96d4507d988a 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1136,11 +1136,6 @@ static int proc_control(struct usb_dev_state *ps, void __user *arg) ctrl.bRequestType, ctrl.bRequest, ctrl.wValue, ctrl.wIndex, ctrl.wLength); if (ctrl.bRequestType & 0x80) { - if (ctrl.wLength && !access_ok(ctrl.data, - ctrl.wLength)) { - ret = -EINVAL; - goto done; - } pipe = usb_rcvctrlpipe(dev, 0); snoop_urb(dev, NULL, pipe, ctrl.wLength, tmo, SUBMIT, NULL, 0); @@ -1225,10 +1220,6 @@ static int proc_bulk(struct usb_dev_state *ps, void __user *arg) } tmo = bulk.timeout; if (bulk.ep & 0x80) { - if (len1 && !access_ok(bulk.data, len1)) { - ret = -EINVAL; - goto done; - } snoop_urb(dev, NULL, pipe, len1, tmo, SUBMIT, NULL, 0); usb_unlock_device(dev); diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index f3816a5c861e..df671acdd464 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -252,9 +252,6 @@ static ssize_t f_hidg_read(struct file *file, char __user *buffer, if (!count) return 0; - if (!access_ok(buffer, count)) - return -EFAULT; - spin_lock_irqsave(&hidg->read_spinlock, flags); #define READ_COND (!list_empty(&hidg->completed_out_req)) @@ -339,9 +336,6 @@ static ssize_t f_hidg_write(struct file *file, const char __user *buffer, unsigned long flags; ssize_t status = 
-ENOMEM; - if (!access_ok(buffer, count)) - return -EFAULT; - spin_lock_irqsave(&hidg->write_spinlock, flags); #define WRITE_COND (!hidg->write_pending) diff --git a/drivers/video/fbdev/amifb.c b/drivers/video/fbdev/amifb.c index 20e03e00b66d..6062104f3afb 100644 --- a/drivers/video/fbdev/amifb.c +++ b/drivers/video/fbdev/amifb.c @@ -1855,8 +1855,6 @@ static int ami_get_var_cursorinfo(struct fb_var_cursorinfo *var, var->yspot = par->crsr.spot_y; if (size > var->height * var->width) return -ENAMETOOLONG; - if (!access_ok(data, size)) - return -EFAULT; delta = 1 << par->crsr.fmode; lspr = lofsprite + (delta << 1); if (par->bplcon0 & BPC0_LACE) @@ -1935,8 +1933,6 @@ static int ami_set_var_cursorinfo(struct fb_var_cursorinfo *var, return -EINVAL; if (!var->height) return -EINVAL; - if (!access_ok(data, var->width * var->height)) - return -EFAULT; delta = 1 << fmode; lofsprite = shfsprite = (u_short *)spritememory; lspr = lofsprite + (delta << 1); diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c index 56995f44e76d..f40be68d5aac 100644 --- a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c +++ b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c @@ -482,9 +482,6 @@ static int omapfb_memory_read(struct fb_info *fbi, if (!display || !display->driver->memory_read) return -ENOENT; - if (!access_ok(mr->buffer, mr->buffer_size)) - return -EFAULT; - if (mr->w > 4096 || mr->h > 4096) return -EINVAL; diff --git a/fs/Kconfig b/fs/Kconfig index f08fbbfafd9a..d1ad3935fb85 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -166,7 +166,7 @@ config TMPFS space. If you unmount a tmpfs instance, everything stored therein is lost. - See <file:Documentation/filesystems/tmpfs.txt> for details. + See <file:Documentation/filesystems/tmpfs.rst> for details. config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..04f86b8c100e 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -36,6 +36,12 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_HAVE_ELF_PROT + bool + +config ARCH_USE_GNU_PROPERTY + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF @@ -72,7 +78,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS The core dump behavior can be controlled per process using the /proc/PID/coredump_filter pseudo-file; this setting is - inherited. See Documentation/filesystems/proc.txt for details. + inherited. See Documentation/filesystems/proc.rst for details. This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index df4650dccf68..44738fed6625 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -12,7 +12,7 @@ config ADFS_FS The ADFS partition should be the first partition (i.e., /dev/[hs]d?1) on each of your drives. Please read the file - <file:Documentation/filesystems/adfs.txt> for further details. + <file:Documentation/filesystems/adfs.rst> for further details. To compile this code as a module, choose M here: the module will be called adfs. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 84c46b9025c5..eb9d0ab850cb 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -9,7 +9,7 @@ config AFFS_FS FFS partition on your hard drive. Amiga floppies however cannot be read with this driver due to an incompatibility of the floppy controller used in an Amiga and the standard floppy controller in - PCs and workstations. 
Read <file:Documentation/filesystems/affs.txt> + PCs and workstations. Read <file:Documentation/filesystems/affs.rst> and <file:fs/affs/Changes>. With this driver you can also mount disk files used by Bernd diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 3fb1f559e317..1ad211d72b3b 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -8,7 +8,7 @@ config AFS_FS If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -18,7 +18,7 @@ config AFS_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -37,6 +37,6 @@ config AFS_DEBUG_CURSOR the dmesg log if the server rotation algorithm fails to successfully contact a server. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -176,6 +176,7 @@ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; + struct cred *creds; }; struct poll_iocb { @@ -1589,8 +1590,11 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); + const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + revert_creds(old_cred); + put_cred(iocb->fsync.creds); iocb_put(iocb); } @@ -1604,6 +1608,10 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, if (unlikely(!req->file->f_op->fsync)) return -EINVAL; + req->creds = prepare_creds(); + if (!req->creds) + return -ENOMEM; + req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3e1247f07913..3a757805b585 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -11,7 +11,7 @@ config BFS_FS on your /stand slice from within Linux. You then also need to say Y to "UnixWare slices support", below. More information about the BFS file system is contained in the file - <file:Documentation/filesystems/bfs.txt>. + <file:Documentation/filesystems/bfs.rst>. If you don't know what this is about, say N. 
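The fs/aio.c hunks above close a credentials gap in async fsync: the actual vfs_fsync() call runs from a workqueue, i.e. under the kworker's credentials rather than the submitter's. The fix takes a snapshot of the caller's credentials with prepare_creds() at submission time and installs it around the fsync with the override_creds()/revert_creds() pair. A condensed sketch of the pattern, with names shortened from the patch:

    #include <linux/cred.h>
    #include <linux/fs.h>
    #include <linux/workqueue.h>

    struct fsync_req {
            struct work_struct work;
            struct file *file;
            struct cred *creds;     /* submitter's credentials */
            bool datasync;
    };

    static void fsync_req_work(struct work_struct *work)
    {
            struct fsync_req *req = container_of(work, struct fsync_req, work);
            const struct cred *old = override_creds(req->creds);

            vfs_fsync(req->file, req->datasync);

            revert_creds(old);
            put_cred(req->creds);   /* drop the reference taken at submit */
    }

    static int fsync_req_submit(struct fsync_req *req)
    {
            req->creds = prepare_creds();   /* copy of current's creds */
            if (!req->creds)
                    return -ENOMEM;

            INIT_WORK(&req->work, fsync_req_work);
            schedule_work(&req->work);
            return 0;
    }

The binfmt_elf diff that follows is the other big block in this stretch: it adds parsing of PT_GNU_PROPERTY / NT_GNU_PROPERTY_TYPE_0 notes so architectures can adjust mapping protections per ELF object (arm64's Branch Target Identification being the driving user), and it independently removes the set_fs(KERNEL_DS) games from the coredump paths, with copy_siginfo_to_external() replacing the old copy-to-"user" trick.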
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 25d489bc9453..8945671fe0e5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -40,12 +40,18 @@ #include <linux/sched/coredump.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sizes.h> +#include <linux/types.h> #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <asm/param.h> #include <asm/page.h> +#ifndef ELF_COMPAT +#define ELF_COMPAT 0 +#endif + #ifndef user_long_t #define user_long_t long #endif @@ -539,7 +545,8 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ -static inline int make_prot(u32 p_flags) +static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state, + bool has_interp, bool is_interp) { int prot = 0; @@ -549,7 +556,8 @@ static inline int make_prot(u32 p_flags) prot |= PROT_WRITE; if (p_flags & PF_X) prot |= PROT_EXEC; - return prot; + + return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp); } /* This is much more generalized than the library routine read function, @@ -559,7 +567,8 @@ static inline int make_prot(u32 p_flags) static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, - unsigned long no_base, struct elf_phdr *interp_elf_phdata) + unsigned long no_base, struct elf_phdr *interp_elf_phdata, + struct arch_elf_state *arch_state) { struct elf_phdr *eppnt; unsigned long load_addr = 0; @@ -591,7 +600,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; - int elf_prot = make_prot(eppnt->p_flags); + int elf_prot = make_prot(eppnt->p_flags, arch_state, + true, true); unsigned long vaddr = 0; unsigned long k, map_addr; @@ -682,6 +692,111 @@ out: * libraries. There is no binary dependent code anywhere else. */ +static int parse_elf_property(const char *data, size_t *off, size_t datasz, + struct arch_elf_state *arch, + bool have_prev_type, u32 *prev_type) +{ + size_t o, step; + const struct gnu_property *pr; + int ret; + + if (*off == datasz) + return -ENOENT; + + if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) + return -EIO; + o = *off; + datasz -= *off; + + if (datasz < sizeof(*pr)) + return -ENOEXEC; + pr = (const struct gnu_property *)(data + o); + o += sizeof(*pr); + datasz -= sizeof(*pr); + + if (pr->pr_datasz > datasz) + return -ENOEXEC; + + WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); + step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); + if (step > datasz) + return -ENOEXEC; + + /* Properties are supposed to be unique and sorted on pr_type: */ + if (have_prev_type && pr->pr_type <= *prev_type) + return -ENOEXEC; + *prev_type = pr->pr_type; + + ret = arch_parse_elf_property(pr->pr_type, data + o, + pr->pr_datasz, ELF_COMPAT, arch); + if (ret) + return ret; + + *off = o + step; + return 0; +} + +#define NOTE_DATA_SZ SZ_1K +#define GNU_PROPERTY_TYPE_0_NAME "GNU" +#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) + +static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, + struct arch_elf_state *arch) +{ + union { + struct elf_note nhdr; + char data[NOTE_DATA_SZ]; + } note; + loff_t pos; + ssize_t n; + size_t off, datasz; + int ret; + bool have_prev_type; + u32 prev_type; + + if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) + return 0; + + /* load_elf_binary() shouldn't call us unless this is true... 
*/ + if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) + return -ENOEXEC; + + /* If the properties are crazy large, that's too bad (for now): */ + if (phdr->p_filesz > sizeof(note)) + return -ENOEXEC; + + pos = phdr->p_offset; + n = kernel_read(f, ¬e, phdr->p_filesz, &pos); + + BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); + if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) + return -EIO; + + if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || + note.nhdr.n_namesz != NOTE_NAME_SZ || + strncmp(note.data + sizeof(note.nhdr), + GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + return -ENOEXEC; + + off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, + ELF_GNU_PROPERTY_ALIGN); + if (off > n) + return -ENOEXEC; + + if (note.nhdr.n_descsz > n - off) + return -ENOEXEC; + datasz = off + note.nhdr.n_descsz; + + have_prev_type = false; + do { + ret = parse_elf_property(note.data, &off, datasz, arch, + have_prev_type, &prev_type); + have_prev_type = true; + } while (!ret); + + return ret == -ENOENT ? 0 : ret; +} + static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -689,6 +804,7 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; + struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_bss, elf_brk; int bss_prot = 0; int retval, i; @@ -726,6 +842,11 @@ static int load_elf_binary(struct linux_binprm *bprm) for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; + if (elf_ppnt->p_type == PT_GNU_PROPERTY) { + elf_property_phdata = elf_ppnt; + continue; + } + if (elf_ppnt->p_type != PT_INTERP) continue; @@ -819,9 +940,14 @@ out_free_interp: goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { + case PT_GNU_PROPERTY: + elf_property_phdata = elf_ppnt; + break; + case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, @@ -832,6 +958,11 @@ out_free_interp: } } + retval = parse_elf_properties(interpreter ?: bprm->file, + elf_property_phdata, &arch_state); + if (retval) + goto out_free_dentry; + /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked @@ -913,7 +1044,8 @@ out_free_interp: } } - elf_prot = make_prot(elf_ppnt->p_flags); + elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, + !!interpreter, false); elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; @@ -1056,7 +1188,8 @@ out_free_interp: if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, - load_bias, interp_elf_phdata); + load_bias, interp_elf_phdata, + &arch_state); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation @@ -1355,7 +1488,6 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { u32 __user *header = (u32 __user *) vma->vm_start; u32 word; - mm_segment_t fs = get_fs(); /* * Doing it this way gets the constant folded by GCC. */ @@ -1368,14 +1500,8 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, magic.elfmag[EI_MAG1] = ELFMAG1; magic.elfmag[EI_MAG2] = ELFMAG2; magic.elfmag[EI_MAG3] = ELFMAG3; - /* - * Switch to the user "segment" for get_user(), - * then put back what elf_core_dump() had in place. 
- */ - set_fs(USER_DS); if (unlikely(get_user(word, header))) word = 0; - set_fs(fs); if (word == magic.cmp) return PAGE_SIZE; } @@ -1556,10 +1682,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); - set_fs(old_fs); + copy_siginfo_to_external(csigdata, siginfo); fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } @@ -2186,7 +2309,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - mm_segment_t fs; int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; @@ -2235,13 +2357,10 @@ static int elf_core_dump(struct coredump_params *cprm) * notes. This also sets up the file header. */ if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) - goto cleanup; + goto end_coredump; has_dumped = 1; - fs = get_fs(); - set_fs(KERNEL_DS); - offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ @@ -2369,9 +2488,6 @@ static int elf_core_dump(struct coredump_params *cprm) } end_coredump: - set_fs(fs); - -cleanup: free_note_info(&info); kfree(shdr4extnum); kvfree(vma_filesz); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 240f66663543..d9501a86cec9 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1549,7 +1549,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; - mm_segment_t fs; int segs; int i; struct vm_area_struct *vma; @@ -1589,31 +1588,31 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); if (!elf) - goto cleanup; + goto end_coredump; prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); if (!prstatus) - goto cleanup; + goto end_coredump; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) - goto cleanup; + goto end_coredump; notes = kmalloc_array(NUM_NOTES, sizeof(struct memelfnote), GFP_KERNEL); if (!notes) - goto cleanup; + goto end_coredump; fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); if (!fpu) - goto cleanup; + goto end_coredump; #ifdef ELF_CORE_COPY_XFPREGS xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); if (!xfpu) - goto cleanup; + goto end_coredump; #endif for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) - goto cleanup; + goto end_coredump; tmp->thread = ct->task; list_add(&tmp->list, &thread_list); @@ -1678,9 +1677,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); #endif - fs = get_fs(); - set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ @@ -1788,9 +1784,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) } end_coredump: - set_fs(fs); - -cleanup: while (!list_empty(&thread_list)) { struct list_head *tmp = thread_list.next; list_del(tmp); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c5f41bd86765..6a92ecf9eaa2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7065,13 +7065,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - if 
(!access_ok(arg->clone_sources, - sizeof(*arg->clone_sources) * - arg->clone_sources_count)) { - ret = -EFAULT; - goto out; - } - if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ae559ed5b3b3..ff9ca55a9ae9 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -8,7 +8,7 @@ config CACHEFILES filesystems - primarily networking filesystems - thus allowing fast local disk to enhance the speed of slower devices. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. config CACHEFILES_DEBUG @@ -36,5 +36,5 @@ config CACHEFILES_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. diff --git a/fs/char_dev.c b/fs/char_dev.c index c5e6eff5a381..ba0ded7842a7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -483,6 +483,9 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) p->dev = dev; p->count = count; + if (WARN_ON(dev == WHITEOUT_DEV)) + return -EBUSY; + error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig index ae6759f9594a..c3477eeafb3f 100644 --- a/fs/coda/Kconfig +++ b/fs/coda/Kconfig @@ -15,7 +15,7 @@ config CODA_FS *client*. You will need user level code as well, both for the client and server. Servers are currently user level, i.e. they need no kernel support. Please read - <file:Documentation/filesystems/coda.txt> and check out the Coda + <file:Documentation/filesystems/coda.rst> and check out the Coda home page <http://www.coda.cs.cmu.edu/>. To compile the coda client support as a module, choose M here: the diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index aaad4ca1217e..e61f3fe8e32a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -17,6 +17,8 @@ #include <linux/elfcore-compat.h> #include <linux/time.h> +#define ELF_COMPAT 1 + /* * Rename the basic ELF layout types to refer to the 32-bit class of files. */ @@ -28,18 +30,20 @@ #undef elf_shdr #undef elf_note #undef elf_addr_t +#undef ELF_GNU_PROPERTY_ALIGN #define elfhdr elf32_hdr #define elf_phdr elf32_phdr #define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN /* * Some data types as stored in coredump. */ #define user_long_t compat_long_t #define user_siginfo_t compat_siginfo_t -#define copy_siginfo_to_user copy_siginfo_to_user32 +#define copy_siginfo_to_external copy_siginfo_to_external32 /* * The machine-dependent core note format types are defined in elfcore-compat.h, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fd0b5dd68f9e..8bd6a883c94c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs/configfs.txt for more + * Please see Documentation/filesystems/configfs.rst for more * information. */ diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 6e0f1fcb8a5b..704a4356f137 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. 
* - * Please see the file Documentation/filesystems/configfs/configfs.txt for + * Please see the file Documentation/filesystems/configfs.rst for * critical information about using the config_item interface. */ diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index c8bebb70a971..d98cef0dbb6b 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -9,7 +9,7 @@ config CRAMFS limited to 256MB file systems (with 16MB files), and doesn't support 16/32 bits uid/gid, hard links and timestamps. - See <file:Documentation/filesystems/cramfs.txt> and + See <file:Documentation/filesystems/cramfs.rst> and <file:fs/cramfs/README> for further information. To compile this as a module, choose M here: the module will be called diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 522c35d5292b..1bdeaa6d5790 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -7,7 +7,7 @@ config ECRYPT_FS select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See - <file:Documentation/filesystems/ecryptfs.txt> to learn more about + <file:Documentation/filesystems/ecryptfs.rst> to learn more about eCryptfs. Userspace components are required and can be obtained from <http://ecryptfs.sf.net>. diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 718163d0c621..ca31993dcb47 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -69,7 +69,7 @@ config VFAT_FS The VFAT support enlarges your kernel by about 10 KB and it only works if you said Y to the "DOS FAT fs support" above. Please read - the file <file:Documentation/filesystems/vfat.txt> for details. If + the file <file:Documentation/filesystems/vfat.rst> for details. If unsure, say Y. To compile this as a module, choose M here: the module will be called @@ -82,7 +82,7 @@ config FAT_DEFAULT_CODEPAGE help This option should be set to the codepage of your FAT filesystems. It can be overridden with the "codepage" mount option. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. config FAT_DEFAULT_IOCHARSET string "Default iocharset for FAT" @@ -96,7 +96,7 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here - select the next option instead if you would like to use UTF-8 encoded file names by default. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. Enable any character sets you need in File Systems/Native Language Support. @@ -114,4 +114,4 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 054acd9fd033..b4ddf48fa444 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -804,8 +804,6 @@ static long fat_dir_ioctl(struct file *filp, unsigned int cmd, return fat_generic_ioctl(filp, cmd, arg); } - if (!access_ok(d1, sizeof(struct __fat_dirent[2]))) - return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, @@ -844,8 +842,6 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } - if (!access_ok(d1, sizeof(struct compat_dirent[2]))) - return -EFAULT; /* * Yes, we don't need this put_user() absolutely. 
However old * code didn't return the right value. So, app use this value, diff --git a/fs/fs_context.c b/fs/fs_context.c index fc9f6ef93b55..7d5c5dd2b1d5 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -42,7 +42,6 @@ static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, - { "posixacl", SB_POSIXACL }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, @@ -53,44 +52,15 @@ static const struct constant_table common_clear_sb_flag[] = { { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, - { "silent", SB_SILENT }, { }, }; -static const char *const forbidden_sb_flag[] = { - "bind", - "dev", - "exec", - "move", - "noatime", - "nodev", - "nodiratime", - "noexec", - "norelatime", - "nostrictatime", - "nosuid", - "private", - "rec", - "relatime", - "remount", - "shared", - "slave", - "strictatime", - "suid", - "unbindable", -}; - /* * Check for a common mount option that manipulates s_flags. */ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) - if (strcmp(key, forbidden_sb_flag[i]) == 0) - return -EINVAL; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 506c5e643f0d..5e796e6c38e5 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -8,7 +8,7 @@ config FSCACHE Different sorts of caches can be plugged in, depending on the resources available. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_STATS bool "Gather statistical information on local caching" @@ -25,7 +25,7 @@ config FSCACHE_STATS between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_HISTOGRAM bool "Gather latency information on local caching" @@ -42,7 +42,7 @@ config FSCACHE_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_DEBUG bool "Debug FS-Cache" @@ -52,7 +52,7 @@ config FSCACHE_DEBUG management module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/fscache/parameter/debug. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_OBJECT_LIST bool "Maintain global object list for debugging purposes" diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f78793f3d21e..fcc136361415 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -172,7 +172,7 @@ no_preference: * * Initialise a record of a cache and fill in the name. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_init_cache(struct fscache_cache *cache, @@ -207,7 +207,7 @@ EXPORT_SYMBOL(fscache_init_cache); * * Add a cache to the system, making it available for netfs's to use. 
* - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ int fscache_add_cache(struct fscache_cache *cache, @@ -307,7 +307,7 @@ EXPORT_SYMBOL(fscache_add_cache); * Note that an I/O error occurred in a cache and that it should no longer be * used for anything. This also reports the error into the kernel log. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_io_error(struct fscache_cache *cache) @@ -355,7 +355,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, * Withdraw a cache from service, unbinding all its cache objects from the * netfs cookies they're currently representing. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_withdraw_cache(struct fscache_cache *cache) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0ce39658a620..751bc5b1cddf 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -4,7 +4,7 @@ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/netfs-api.txt for more information on + * See Documentation/filesystems/caching/netfs-api.rst for more information on * the netfs API. */ diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cfeba839a0f2..cb2146e02cd5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -4,7 +4,7 @@ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/object.txt for a description of the + * See Documentation/filesystems/caching/object.rst for a description of the * object state machine and the in-kernel representations. */ @@ -295,7 +295,7 @@ static void fscache_object_work_func(struct work_struct *work) * * Initialise a cache object description to its basic values. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_object_init(struct fscache_object *object, diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 1a22a55f75a0..4a5651d4904e 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/operations.txt + * See Documentation/filesystems/caching/operations.rst */ #define FSCACHE_DEBUG_LEVEL OPERATION diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..774b2618018a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -12,7 +12,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See <file:Documentation/filesystems/fuse.txt> for more information. + See <file:Documentation/filesystems/fuse.rst> for more information. See <file:Documentation/Changes> for needed library/utility version. 
If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..c7a65cf2bcca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2081,7 +2081,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index 44f6e89bcb75..129926b5142d 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -6,7 +6,7 @@ config HFS_FS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. - Please read <file:Documentation/filesystems/hfs.txt> to learn about + Please read <file:Documentation/filesystems/hfs.rst> to learn about the available mount options. To compile this file system support as a module, choose M here: the diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 56aa0336254a..2b36dc6f0a10 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -9,7 +9,7 @@ config HPFS_FS write files to an OS/2 HPFS partition on your hard drive. OS/2 floppies however are in regular MSDOS format, so you don't need this option in order to be able to read them. Read - <file:Documentation/filesystems/hpfs.txt>. + <file:Documentation/filesystems/hpfs.rst>. To compile this file system support as a module, choose M here: the module will be called hpfs. If unsure, say N. diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..37226a9cfa4f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1606,14 +1606,14 @@ EXPORT_SYMBOL(iput); * @inode: inode owning the block number being requested * @block: pointer containing the block to find * - * Replaces the value in *block with the block number on the device holding + * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the - * 4 in *block, with disk block relative to the disk start that holds that + * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a - * hole, returns 0 and *block is also set to 0. + * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..0d467e32dd7e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,7 +126,6 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 5e7419599f50..08ffd37b9bb8 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -8,7 +8,7 @@ config ISO9660_FS long Unix filenames and symbolic links are also supported by this driver. 
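The reworked bmap() kernel-doc in fs/inode.c above describes an in-place translation; a hedged sketch of a caller follows, with a hypothetical helper name and hole policy::

    #include <linux/fs.h>

    /*
     * Sketch: resolve logical block 4 of @inode to a disk-relative block
     * number via bmap().  The caller and its hole handling are hypothetical.
     */
    static sector_t where_is_block_four(struct inode *inode)
    {
            sector_t block = 4;     /* logical block within the file */

            if (bmap(inode, &block))        /* -EINVAL: no ->bmap or error */
                    return 0;
            /* On a hole, bmap() returns 0 and has set 'block' to 0 too. */
            return block;   /* disk-relative block number, 0 for a hole */
    }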
If you have a CD-ROM drive and want to do more with it than just listen to audio CDs and watch its LEDs, say Y (and read - <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO, + <file:Documentation/filesystems/isofs.rst> and the CD-ROM-HOWTO, available from <http://www.tldp.org/docs.html#howto>), thereby enlarging your kernel by about 27 KB; otherwise say N. diff --git a/fs/locks.c b/fs/locks.c index b8a31c1c4fff..1d4f4d5da704 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -61,7 +61,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.rst' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. Added simple functions to diff --git a/fs/mount.h b/fs/mount.h index 711a4093e475..c7abb7b394d8 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -9,7 +9,13 @@ struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; + /* + * Traversal and modification of .list is protected by either + * - taking namespace_sem for write, OR + * - taking namespace_sem for read AND taking .ns_lock. + */ struct list_head list; + spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ @@ -133,9 +139,7 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); - void *cached_mount; - u64 cached_event; - loff_t cached_index; + struct mount cursor; }; extern const struct seq_operations mounts_op; @@ -153,3 +157,5 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } + +extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); diff --git a/fs/namei.c b/fs/namei.c index a320371899cf..d81f73ff1a8b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3505,12 +3505,14 @@ EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { + bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(dir, dentry); if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && + !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -4345,9 +4347,6 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, (flags & RENAME_EXCHANGE)) return -EINVAL; - if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) - return -EPERM; - if (flags & RENAME_EXCHANGE) target_flags = 0; @@ -4483,20 +4482,6 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_whiteout(struct inode *dir, struct dentry *dentry) -{ - int error = may_create(dir, dentry); - if (error) - return error; - - if (!dir->i_op->mknod) - return -EPERM; - - return dir->i_op->mknod(dir, dentry, - S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); -} -EXPORT_SYMBOL(vfs_whiteout); - int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..a6baee3c7904 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,6 +648,21 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } +static inline void lock_ns_list(struct mnt_namespace *ns) +{ + 
spin_lock(&ns->ns_lock); +} + +static inline void unlock_ns_list(struct mnt_namespace *ns) +{ + spin_unlock(&ns->ns_lock); +} + +static inline bool mnt_is_cursor(struct mount *mnt) +{ + return mnt->mnt.mnt_flags & MNT_CURSOR; +} + /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -673,11 +688,15 @@ bool __is_local_mountpoint(struct dentry *dentry) goto out; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } + unlock_ns_list(ns); up_read(&namespace_sem); out: return is_covered; @@ -1245,46 +1264,71 @@ struct vfsmount *mnt_clone_internal(const struct path *path) } #ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct list_head *prev; down_read(&namespace_sem); - if (p->cached_event == p->ns->event) { - void *v = p->cached_mount; - if (*pos == p->cached_index) - return v; - if (*pos == p->cached_index + 1) { - v = seq_list_next(v, &p->ns->list, &p->cached_index); - return p->cached_mount = v; - } + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? */ + if (list_empty(prev)) + return NULL; } - p->cached_event = p->ns->event; - p->cached_mount = seq_list_start(&p->ns->list, *pos); - p->cached_index = *pos; - return p->cached_mount; + return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt = v; - p->cached_mount = seq_list_next(v, &p->ns->list, pos); - p->cached_index = *pos; - return p->cached_mount; + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; - struct mount *r = list_entry(v, struct mount, mnt_list); + struct mount *r = v; return p->show(m, &r->mnt); } @@ -1294,6 +1338,15 @@ const struct seq_operations mounts_op = { .stop = m_stop, .show = m_show, }; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} #endif /* CONFIG_PROC_FS */ /** @@ -3202,6 +3255,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3595,7 +3649,7 @@ EXPORT_SYMBOL(path_is_under); * file system may be mounted on put_old. After all, new_root is a mountpoint. 
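The locking rule stated in the fs/mount.h comment above pairs with the cursor-skipping loops added to fs/namespace.c; a minimal reader-side sketch (only meaningful inside fs/namespace.c, where namespace_sem and these helpers live; the loop body is hypothetical)::

    #include <linux/list.h>

    /*
     * Read-side traversal of ns->list under the new rule: namespace_sem
     * held for read AND ns->ns_lock taken, skipping /proc/mounts cursors.
     */
    static void walk_mounts(struct mnt_namespace *ns)
    {
            struct mount *mnt;

            down_read(&namespace_sem);
            lock_ns_list(ns);
            list_for_each_entry(mnt, &ns->list, mnt_list) {
                    if (mnt_is_cursor(mnt)) /* iterator placeholder, no sb */
                            continue;
                    /* ... read-only inspection of 'mnt' goes here ... */
            }
            unlock_ns_list(ns);
            up_read(&namespace_sem);
    }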
* * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: @@ -3842,10 +3896,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, bool visible = false; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; + if (mnt_is_cursor(mnt)) + continue; + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -3893,6 +3951,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: + unlock_ns_list(ns); up_read(&namespace_sem); return visible; } diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 6736e47d94d8..7715fadd5fff 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -12,6 +12,6 @@ config INOTIFY_USER new features including multiple file events, one-shot support, and unmount notification. - For more information, see <file:Documentation/filesystems/inotify.txt> + For more information, see <file:Documentation/filesystems/inotify.rst> If unsure, say Y. diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index de9fb5cff226..1667a7e590d8 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -18,7 +18,7 @@ config NTFS_FS the Linux 2.4 kernel series is separately available as a patch from the project web site. - For more information see <file:Documentation/filesystems/ntfs.txt> + For more information see <file:Documentation/filesystems/ntfs.rst> and <http://www.linux-ntfs.org/>. To compile this file system support as a module, choose M here: the diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 46bba20da6b5..1177c33df895 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -21,7 +21,7 @@ config OCFS2_FS OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ For more information on OCFS2, see the file - <file:Documentation/filesystems/ocfs2.txt>. + <file:Documentation/filesystems/ocfs2.rst>. 
config OCFS2_FS_O2CB tristate "O2CB Kernelspace Clustering" diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 1de77f1a600b..ea868c6f9800 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -227,7 +227,7 @@ static ssize_t dlmfs_file_read(struct file *filp, loff_t *ppos) { int bytes_left; - ssize_t readlen, got; + ssize_t got; char *lvb_buf; struct inode *inode = file_inode(filp); @@ -237,36 +237,31 @@ static ssize_t dlmfs_file_read(struct file *filp, if (*ppos >= i_size_read(inode)) return 0; + /* don't read past the lvb */ + if (count > i_size_read(inode) - *ppos) + count = i_size_read(inode) - *ppos; + if (!count) return 0; - if (!access_ok(buf, count)) - return -EFAULT; - - /* don't read past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - readlen = i_size_read(inode) - *ppos; - else - readlen = count; - - lvb_buf = kmalloc(readlen, GFP_NOFS); + lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; - got = user_dlm_read_lvb(inode, lvb_buf, readlen); + got = user_dlm_read_lvb(inode, lvb_buf, count); if (got) { - BUG_ON(got != readlen); - bytes_left = __copy_to_user(buf, lvb_buf, readlen); - readlen -= bytes_left; + BUG_ON(got != count); + bytes_left = copy_to_user(buf, lvb_buf, count); + count -= bytes_left; } else - readlen = 0; + count = 0; kfree(lvb_buf); - *ppos = *ppos + readlen; + *ppos = *ppos + count; - mlog(0, "read %zd bytes\n", readlen); - return readlen; + mlog(0, "read %zu bytes\n", count); + return count; } static ssize_t dlmfs_file_write(struct file *filp, @@ -291,9 +286,6 @@ static ssize_t dlmfs_file_write(struct file *filp, if (!count) return 0; - if (!access_ok(buf, count)) - return -EFAULT; - lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/open.c b/fs/open.c index d9467a8a7f6a..6cd48a61cda3 100644 --- a/fs/open.c +++ b/fs/open.c @@ -345,21 +345,14 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -long do_faccessat(int dfd, const char __user *filename, int mode) +static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; - struct path path; - struct inode *inode; - int res; - unsigned int lookup_flags = LOOKUP_FOLLOW; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return NULL; override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -394,6 +387,38 @@ long do_faccessat(int dfd, const char __user *filename, int mode) override_cred->non_rcu = 1; old_cred = override_creds(override_cred); + + /* override_cred() gets its own ref */ + put_cred(override_cred); + + return old_cred; +} + +long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +{ + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *old_cred = NULL; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; + + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (!(flags & AT_EACCESS)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } + retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) @@ -435,19 +460,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); - put_cred(override_cred); + if (old_cred) + revert_creds(old_cred); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } int ksys_chdir(const char __user *filename) diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 714c14c47ca5..dd188c7996b3 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -9,7 +9,7 @@ config OVERLAY_FS 'lower' filesystem is either hidden or, in the case of directories, merged with the 'upper' object. - For more information see Documentation/filesystems/overlayfs.txt + For more information see Documentation/filesystems/overlayfs.rst config OVERLAY_FS_REDIRECT_DIR bool "Overlayfs: turn on redirect directory feature by default" @@ -38,7 +38,7 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW If backward compatibility is not an issue, then it is safe and recommended to say N here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say Y. @@ -103,7 +103,7 @@ config OVERLAY_FS_XINO_AUTO If compatibility with applications that expect 32bit inodes is not an issue, then it is safe and recommended to say Y here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say N. diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 27ef84d99f59..971a42f6357d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -23,7 +23,7 @@ config PROC_FS /proc" or the equivalent line in /etc/fstab does the job. The /proc file system is explained in the file - <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage ("man 5 proc"). This option will enlarge your kernel by about 67 KB. Several @@ -95,7 +95,7 @@ config PROC_CHILDREN default n help Provides a fast way to retrieve first level children pids of a task. See - <file:Documentation/filesystems/proc.txt> for more information. + <file:Documentation/filesystems/proc.rst> for more information. Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. 
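The faccessat2() syscall added above finally exposes flags to userspace; a hedged userspace sketch via raw syscall(2), assuming a libc without a faccessat2 wrapper and an architecture where __NR_faccessat2 is defined (439 on the unified table)::

    #define _GNU_SOURCE
    #include <fcntl.h>          /* AT_FDCWD, AT_EACCESS, AT_SYMLINK_NOFOLLOW */
    #include <unistd.h>         /* W_OK, syscall() */
    #include <sys/syscall.h>    /* __NR_faccessat2 */

    /* Write check with effective IDs that does not follow a final symlink. */
    static int can_write_eaccess(const char *path)
    {
            return syscall(__NR_faccessat2, AT_FDCWD, path, W_OK,
                           AT_EACCESS | AT_SYMLINK_NOFOLLOW);
    }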
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 9bd94b5a9658..ecc63ce01be7 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -103,6 +103,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_zone_page_state(NR_KERNEL_SCS_KB)); +#endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 36dc7417c0df..6ad407d5efe2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -628,9 +628,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_DENYWRITE)] = "dw", -#ifdef CONFIG_X86_INTEL_MPX - [ilog2(VM_MPX)] = "mp", -#endif [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -644,6 +641,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_ARM64_BTI + [ilog2(VM_ARM64_BTI)] = "bt", +#endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 273ee82d8aa9..e4d70c0dffe9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -279,7 +279,8 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - p->cached_event = ~0ULL; + INIT_LIST_HEAD(&p->cursor.mnt_list); + p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -296,6 +297,7 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); + mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index c917c191e78c..aa8e0b65ff1a 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -283,7 +283,7 @@ static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, const void __user *s, unsigned int start, unsigned int count) { struct persistent_ram_buffer *buffer = prz->buffer; - int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + int ret = unlikely(copy_from_user(buffer->data + start, s, count)) ? 
-EFAULT : 0; persistent_ram_update_ecc(prz, start, count); return ret; @@ -348,8 +348,6 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, int rem, ret = 0, c = count; size_t start; - if (unlikely(!access_ok(s, count))) - return -EFAULT; if (unlikely(c > prz->buffer_size)) { s += c - prz->buffer_size; c = prz->buffer_size; diff --git a/fs/readdir.c b/fs/readdir.c index de2eceffdee8..a49f07c11cfb 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -157,17 +157,18 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, } buf->result++; dirent = buf->dirent; - if (!access_ok(dirent, + if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(offset, &dirent->d_offset, efault_end); + unsafe_put_user(namlen, &dirent->d_namlen, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); return 0; +efault_end: + user_write_access_end(); efault: buf->result = -EFAULT; return -EFAULT; @@ -242,7 +243,7 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EINTR; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; - if (!user_access_begin(prev, reclen + prev_reclen)) + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ @@ -251,14 +252,14 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); - user_access_end(); + user_write_access_end(); buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; return 0; efault_end: - user_access_end(); + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -275,9 +276,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -327,7 +325,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EINTR; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; - if (!user_access_begin(prev, reclen + prev_reclen)) + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ @@ -336,7 +334,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); - user_access_end(); + user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; @@ -344,7 +342,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return 0; efault_end: - user_access_end(); + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -361,9 +359,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, 
}; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -376,7 +371,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, typeof(lastdirent->d_off) d_off = buf.ctx.pos; lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; - if (__put_user(d_off, &lastdirent->d_off)) + if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -424,17 +419,18 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, } buf->result++; dirent = buf->dirent; - if (!access_ok(dirent, + if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(offset, &dirent->d_offset, efault_end); + unsafe_put_user(namlen, &dirent->d_namlen, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); return 0; +efault_end: + user_write_access_end(); efault: buf->result = -EFAULT; return -EFAULT; @@ -471,7 +467,7 @@ struct compat_linux_dirent { struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; - struct compat_linux_dirent __user *previous; + int prev_reclen; int count; int error; }; @@ -479,13 +475,17 @@ struct compat_getdents_callback { static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct compat_linux_dirent __user * dirent; + struct compat_linux_dirent __user *dirent, *prev; struct compat_getdents_callback *buf = container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); + int prev_reclen; + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. 
*/ if (reclen > buf->count) return -EINVAL; @@ -494,29 +494,27 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = -EOVERFLOW; return -EOVERFLOW; } - dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) + return -EINTR; dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + prev = (void __user *) dirent - prev_reclen; + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + + unsafe_put_user(offset, &prev->d_off, efault_end); + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); + + buf->prev_reclen = reclen; + buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return 0; +efault_end: + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -526,7 +524,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { struct fd f; - struct compat_linux_dirent __user * lastdirent; struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, @@ -534,9 +531,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -544,8 +538,10 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct compat_linux_dirent __user * lastdirent; + lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index ad4c45788896..9737b8e68878 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -6,7 +6,7 @@ config ROMFS_FS This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for other read-only media as well. Read - <file:Documentation/filesystems/romfs.txt> for details. + <file:Documentation/filesystems/romfs.rst> for details. To compile this file system support as a module, choose M here: the module will be called romfs. 
Note that the file system of your diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index d93e12d9b712..b881b9283b7f 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/percpu.h> #include <linux/buffer_head.h> +#include <linux/local_lock.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,7 +21,8 @@ */ struct squashfs_stream { - void *stream; + void *stream; + local_lock_t lock; }; void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, @@ -41,6 +43,7 @@ void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, err = PTR_ERR(stream->stream); goto out; } + local_lock_init(&stream->lock); } kfree(comp_opts); @@ -75,15 +78,16 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_stream __percpu *percpu; struct squashfs_stream *stream; int res; - percpu = (struct squashfs_stream __percpu *)msblk->stream; - stream = get_cpu_ptr(percpu); + local_lock(&msblk->stream->lock); + stream = this_cpu_ptr(msblk->stream); + res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); - put_cpu_ptr(stream); + + local_unlock(&msblk->stream->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/stat.c b/fs/stat.c index 030008796479..b9faa6cafafe 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -22,6 +22,7 @@ #include <asm/unistd.h> #include "internal.h" +#include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct @@ -70,11 +71,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; /* allow the fs to override these if it really wants to */ - if (IS_NOATIME(inode)) + /* SB_NOATIME means filesystem supplies dummy atime value */ + if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; @@ -199,6 +200,11 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -563,6 +569,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); + tmp.stx_mnt_id = stat->mnt_id; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aa85f2874a9f..59dffd5ca517 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. 
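Stepping back to the fs/readdir.c hunks above: they all convert the same shape, replacing access_ok() plus a series of __put_user() calls with one user_write_access_begin() region and unsafe_put_user() inside it. A minimal sketch of that pattern, with a hypothetical destination layout::

    #include <linux/uaccess.h>

    /* Sketch: publish two adjacent u64s to userspace in one access region. */
    static int fill_pair(u64 __user *udst, u64 a, u64 b)
    {
            if (!user_write_access_begin(udst, 2 * sizeof(u64)))
                    return -EFAULT;
            unsafe_put_user(a, &udst[0], efault_end);
            unsafe_put_user(b, &udst[1], efault_end);
            user_write_access_end();
            return 0;

    efault_end:
            user_write_access_end();        /* always pair begin/end */
            return -EFAULT;
    }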
*/ #define pr_fmt(fmt) "sysfs: " fmt diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 26bbf960e2a2..f275fcda62fb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/module.h> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db81cfbab9d6..e747c135c1d1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c4deecc80f67..5603530a1a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index d4edf7d9ae10..b4e23e03fbeb 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -28,7 +28,7 @@ config SYSV_FS tar" or preferably "info tar"). Note also that this option has nothing whatsoever to do with the option "System V IPC". Read about the System V file system in - <file:Documentation/filesystems/sysv-fs.txt>. + <file:Documentation/filesystems/sysv-fs.rst>. Saying Y here will enlarge your kernel by about 27 KB. To compile this as a module, choose M here: the module will be called diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 6848de581ce1..26e1a49f3ba7 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -9,7 +9,7 @@ config UDF_FS compatible with standard unix file systems, it is also suitable for removable USB disks. Say Y if you intend to mount DVD discs or CDRW's written in packet mode, or if you want to use UDF for removable USB - disks. Please read <file:Documentation/filesystems/udf.txt>. + disks. Please read <file:Documentation/filesystems/udf.rst>. To compile this file system support as a module, choose M here: the module will be called udf. 
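The squashfs change a little further up embeds a local_lock_t in the per-CPU stream instead of relying on get_cpu_ptr(); a hedged sketch of the general pattern with illustrative names (statically allocated here, whereas the patch initialises its locks at runtime with local_lock_init())::

    #include <linux/local_lock.h>
    #include <linux/percpu.h>

    /* Illustrative per-CPU state; names are not from the patch. */
    struct pcpu_scratch {
            local_lock_t lock;      /* serialises this CPU's state */
            int value;
    };
    static DEFINE_PER_CPU(struct pcpu_scratch, pcpu_scratch) = {
            .lock = INIT_LOCAL_LOCK(lock),
    };

    static void bump(void)
    {
            local_lock(&pcpu_scratch.lock); /* preemption off; becomes a
                                             * per-CPU spinlock on PREEMPT_RT */
            this_cpu_inc(pcpu_scratch.value);
            local_unlock(&pcpu_scratch.lock);
    }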
diff --git a/fs/utimes.c b/fs/utimes.c index 1d17ce98cb80..b7b927502d6e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -95,13 +95,13 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, goto out; } - if (flags & ~AT_SYMLINK_NOFOLLOW) + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) goto out; if (filename == NULL && dfd != AT_FDCWD) { struct fd f; - if (flags & AT_SYMLINK_NOFOLLOW) + if (flags) goto out; f = fdget(dfd); @@ -117,6 +117,8 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h index 34785c0f57b0..5a80f8e54300 100644 --- a/include/asm-generic/checksum.h +++ b/include/asm-generic/checksum.h @@ -25,15 +25,6 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); */ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum); -/* - * the same as csum_partial_copy, but copies from user space. - * - * here even more important to align src and dst on a 32-bit (or even - * better 64-bit) boundary - */ -extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, int *csum_err); - #ifndef csum_partial_copy_nocheck #define csum_partial_copy_nocheck(src, dst, len, sum) \ csum_partial_copy((src), (dst), (len), (sum)) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d1779d442aa5..66397ed10acb 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -53,6 +53,9 @@ extern char __ctors_start[], __ctors_end[]; /* Start and end of .opd section - used for function descriptors. */ extern char __start_opd[], __end_opd[]; +/* Start and end of instrumentation protected text section */ +extern char __noinstr_text_start[], __noinstr_text_end[]; + extern __visible const void __nosave_begin, __nosave_end; /* Function descriptor handling (if any). Override in asm/sections.h */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 71e387a5fe90..db600ef218d7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -541,6 +541,15 @@ __end_rodata = .; /* + * Non-instrumentable text section + */ +#define NOINSTR_TEXT \ + ALIGN_FUNCTION(); \ + __noinstr_text_start = .; \ + *(.noinstr.text) \ + __noinstr_text_end = .; + +/* * .text section. 
Map to function alignment to avoid address changes * during second ld run in second ld pass when generating System.map * @@ -551,6 +560,7 @@ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ + NOINSTR_TEXT \ *(.text..refcount) \ *(.ref.text) \ MEM_KEEP(init.text*) \ diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 59494df0f55b..56d6a5c6e353 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -5,12 +5,15 @@ #ifndef __LINUX_ARM_SMCCC_H #define __LINUX_ARM_SMCCC_H +#include <linux/init.h> #include <uapi/linux/const.h> /* * This file provides common defines for ARM SMC Calling Convention as * specified in - * http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html + * https://developer.arm.com/docs/den0028/latest + * + * This code is up-to-date with version DEN 0028 C */ #define ARM_SMCCC_STD_CALL _AC(0,U) @@ -56,6 +59,7 @@ #define ARM_SMCCC_VERSION_1_0 0x10000 #define ARM_SMCCC_VERSION_1_1 0x10001 +#define ARM_SMCCC_VERSION_1_2 0x10002 #define ARM_SMCCC_VERSION_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ @@ -98,6 +102,19 @@ enum arm_smccc_conduit { enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void); /** + * arm_smccc_get_version() + * + * Returns the version to be used for SMCCCv1.1 or later. + * + * When SMCCCv1.1 or above is not present, returns SMCCCv1.0, but this + * does not imply the presence of firmware or a valid conduit. Caller + * handling SMCCCv1.0 must determine the conduit by other means. + */ +u32 arm_smccc_get_version(void); + +void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit); + +/** * struct arm_smccc_res - Result from SMC/HVC call * @a0-a3 result values from registers 0 to 3 */ @@ -314,10 +331,14 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, */ #define arm_smccc_1_1_hvc(...) __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__) -/* Return codes defined in ARM DEN 0070A */ +/* + * Return codes defined in ARM DEN 0070A + * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C + */ #define SMCCC_RET_SUCCESS 0 #define SMCCC_RET_NOT_SUPPORTED -1 #define SMCCC_RET_NOT_REQUIRED -2 +#define SMCCC_RET_INVALID_PARAMETER -3 /* * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED. 
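The arm_smccc_get_version() kernel-doc above warns that a v1.0 return does not imply a usable conduit; a short sketch of the check it asks callers to make (the wrapper name is hypothetical)::

    #include <linux/arm-smccc.h>

    /* Gate an SMCCC v1.2 feature, checking the conduit first. */
    static bool have_smccc_1_2(void)
    {
            if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
                    return false;   /* no way to issue any call at all */
            return arm_smccc_get_version() >= ARM_SMCCC_VERSION_1_2;
    }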
diff --git a/include/linux/capability.h b/include/linux/capability.h index ecce0f43c73a..027d7e4a853b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); +static inline bool perfmon_capable(void) +{ + return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); +} /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/linux/compat.h b/include/linux/compat.h index 0480ba4db592..e90100c0de72 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -402,8 +402,15 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size); -int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo __user *from); -int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from); +int copy_siginfo_from_user32(kernel_siginfo_t *to, + const struct compat_siginfo __user *from); +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#ifndef copy_siginfo_to_user32 +#define copy_siginfo_to_user32 __copy_siginfo_to_user32 +#endif int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 333a6695a918..790c0c6b8552 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -42,3 +42,7 @@ * compilers, like ICC. */ #define barrier() __asm__ __volatile__("" : : : "memory") + +#if __has_feature(shadow_call_stack) +# define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) +#endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 448c91bf543b..6325d64e3c3b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -120,12 +120,65 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(.rodata..c_jump_table) +#ifdef CONFIG_DEBUG_ENTRY +/* Begin/end of an instrumentation safe region */ +#define instrumentation_begin() ({ \ + asm volatile("%c0:\n\t" \ + ".pushsection .discard.instr_begin\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) + +/* + * Because instrumentation_{begin,end}() can nest, objtool validation considers + * _begin() a +1 and _end() a -1 and computes a sum over the instructions. + * When the value is greater than 0, we consider instrumentation allowed. + * + * There is a problem with code like: + * + * noinstr void foo() + * { + * instrumentation_begin(); + * ... + * if (cond) { + * instrumentation_begin(); + * ... 
+ * instrumentation_end(); + * } + * bar(); + * instrumentation_end(); + * } + * + * If instrumentation_end() would be an empty label, like all the other + * annotations, the inner _end(), which is at the end of a conditional block, + * would land on the instruction after the block. + * + * If we then consider the sum of the !cond path, we'll see that the call to + * bar() is with a 0-value, even though, we meant it to happen with a positive + * value. + * + * To avoid this, have _end() be a NOP instruction, this ensures it will be + * part of the condition block and does not escape. + */ +#define instrumentation_end() ({ \ + asm volatile("%c0: nop\n\t" \ + ".pushsection .discard.instr_end\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) +#endif /* CONFIG_DEBUG_ENTRY */ + #else #define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif +#ifndef instrumentation_begin +#define instrumentation_begin() do { } while(0) +#define instrumentation_end() do { } while(0) +#endif + #ifndef ASM_UNREACHABLE # define ASM_UNREACHABLE #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e970f97a7fcb..6fcf73200b67 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -118,6 +118,10 @@ struct ftrace_likely_data { #define notrace __attribute__((__no_instrument_function__)) #endif +/* Section for code which can't be instrumented at all */ +#define noinstr \ + noinline notrace __attribute((__section__(".noinstr.text"))) + /* * it doesn't make sense on ARM (currently the only user of __naked) * to trace naked functions because then mcount is called without @@ -193,6 +197,10 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif +#ifndef __noscs +# define __noscs +#endif + #ifndef asm_volatile_goto #define asm_volatile_goto(x...) asm goto(x) #endif diff --git a/include/linux/configfs.h b/include/linux/configfs.h index fa9490a8874c..2e8c69b43c64 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -13,7 +13,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please read Documentation/filesystems/configfs/configfs.txt before using + * Please read Documentation/filesystems/configfs.rst before using * the configfs interface, ESPECIALLY the parts about reference counts and * item destructors. 
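Putting the pieces above together: noinstr places a function in .noinstr.text, and instrumentation_begin()/instrumentation_end() mark the spans where instrumentation becomes legal again (they emit objtool annotations only under CONFIG_DEBUG_ENTRY and are no-ops otherwise). A hedged sketch with a hypothetical function::

    #include <linux/compiler.h>

    /* Hypothetical entry-path function living in .noinstr.text. */
    noinstr void hypothetical_enter(void)
    {
            /* fragile pre-state: no tracers, no instrumentable calls */

            instrumentation_begin();
            /* instrumentable work is allowed inside this region */
            instrumentation_end();  /* NOP-backed so it cannot escape a
                                     * conditional block (see comment above) */

            /* fragile post-state */
    }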
*/ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index beaed2dc269e..52692587f7fe 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -144,18 +144,8 @@ static inline void get_online_cpus(void) { cpus_read_lock(); } static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP -int __freeze_secondary_cpus(int primary, bool suspend); -static inline int freeze_secondary_cpus(int primary) -{ - return __freeze_secondary_cpus(primary, true); -} - -static inline int disable_nonboot_cpus(void) -{ - return __freeze_secondary_cpus(0, false); -} - -void enable_nonboot_cpus(void); +extern int freeze_secondary_cpus(int primary); +extern void thaw_secondary_cpus(void); static inline int suspend_disable_secondary_cpus(void) { @@ -168,12 +158,11 @@ static inline int suspend_disable_secondary_cpus(void) } static inline void suspend_enable_secondary_cpus(void) { - return enable_nonboot_cpus(); + return thaw_secondary_cpus(); } #else /* !CONFIG_PM_SLEEP_SMP */ -static inline int disable_nonboot_cpus(void) { return 0; } -static inline void enable_nonboot_cpus(void) {} +static inline void thaw_secondary_cpus(void) {} static inline int suspend_disable_secondary_cpus(void) { return 0; } static inline void suspend_enable_secondary_cpus(void) { } #endif /* !CONFIG_PM_SLEEP_SMP */ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 4664fc1871de..bc156285d097 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -97,8 +97,6 @@ extern void unregister_oldmem_pfn_is_ram(void); static inline bool is_kdump_kernel(void) { return 0; } #endif /* CONFIG_CRASH_DUMP */ -extern unsigned long saved_max_pfn; - /* Device Dump information to be filled by drivers */ struct vmcoredd_data { char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 9a72214496e5..d02f32b7514e 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -44,6 +44,9 @@ static inline int devcgroup_inode_mknod(int mode, dev_t dev) if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; + if (S_ISCHR(mode) && dev == WHITEOUT_DEV) + return 0; + if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else diff --git a/include/linux/efi.h b/include/linux/efi.h index 9430d01c0c3d..2c6495f72f79 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -39,6 +39,7 @@ #define EFI_WRITE_PROTECTED ( 8 | (1UL << (BITS_PER_LONG-1))) #define EFI_OUT_OF_RESOURCES ( 9 | (1UL << (BITS_PER_LONG-1))) #define EFI_NOT_FOUND (14 | (1UL << (BITS_PER_LONG-1))) +#define EFI_TIMEOUT (18 | (1UL << (BITS_PER_LONG-1))) #define EFI_ABORTED (21 | (1UL << (BITS_PER_LONG-1))) #define EFI_SECURITY_VIOLATION (26 | (1UL << (BITS_PER_LONG-1))) @@ -379,8 +380,8 @@ typedef union { typedef struct { efi_guid_t guid; - const char *name; unsigned long *ptr; + const char name[16]; } efi_config_table_type_t; #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL) @@ -426,6 +427,7 @@ typedef struct { u32 tables; } efi_system_table_32_t; +typedef union efi_simple_text_input_protocol efi_simple_text_input_protocol_t; typedef union efi_simple_text_output_protocol efi_simple_text_output_protocol_t; typedef union { @@ -434,7 +436,7 @@ typedef union { unsigned long fw_vendor; /* physical addr of CHAR16 vendor string */ u32 fw_revision; unsigned long con_in_handle; - unsigned long con_in; + efi_simple_text_input_protocol_t *con_in; unsigned long con_out_handle; efi_simple_text_output_protocol_t *con_out; 
unsigned long stderr_handle; diff --git a/include/linux/elf.h b/include/linux/elf.h index e3649b3e970e..5d5b0321da0b 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -2,6 +2,7 @@ #ifndef _LINUX_ELF_H #define _LINUX_ELF_H +#include <linux/types.h> #include <asm/elf.h> #include <uapi/linux/elf.h> @@ -21,6 +22,9 @@ SET_PERSONALITY(ex) #endif +#define ELF32_GNU_PROPERTY_ALIGN 4 +#define ELF64_GNU_PROPERTY_ALIGN 8 + #if ELF_CLASS == ELFCLASS32 extern Elf32_Dyn _DYNAMIC []; @@ -31,6 +35,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half #define Elf_Word Elf32_Word +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN #else @@ -42,6 +47,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half #define Elf_Word Elf64_Word +#define ELF_GNU_PROPERTY_ALIGN ELF64_GNU_PROPERTY_ALIGN #endif @@ -56,4 +62,41 @@ static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) { extern int elf_coredump_extra_notes_size(void); extern int elf_coredump_extra_notes_write(struct coredump_params *cprm); #endif + +/* + * NT_GNU_PROPERTY_TYPE_0 header: + * Keep this internal until/unless there is an agreed UAPI definition. + * pr_type values (GNU_PROPERTY_*) are public and defined in the UAPI header. + */ +struct gnu_property { + u32 pr_type; + u32 pr_datasz; +}; + +struct arch_elf_state; + +#ifndef CONFIG_ARCH_USE_GNU_PROPERTY +static inline int arch_parse_elf_property(u32 type, const void *data, + size_t datasz, bool compat, + struct arch_elf_state *arch) +{ + return 0; +} +#else +extern int arch_parse_elf_property(u32 type, const void *data, size_t datasz, + bool compat, struct arch_elf_state *arch); +#endif + +#ifdef CONFIG_ARCH_HAVE_ELF_PROT +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp); +#else +static inline int arch_elf_adjust_prot(int prot, + const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + return prot; +} +#endif + #endif /* _LINUX_ELF_H */ diff --git a/include/linux/frame.h b/include/linux/frame.h index 02d3ca2d9598..303cda600e56 100644 --- a/include/linux/frame.h +++ b/include/linux/frame.h @@ -15,9 +15,20 @@ static void __used __section(.discard.func_stack_frame_non_standard) \ *__func_stack_frame_non_standard_##func = func +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. 
+ */ +#define ANNOTATE_INTRA_FUNCTION_CALL \ + 999: \ + .pushsection .discard.intra_function_calls; \ + .long 999b; \ + .popsection; + #else /* !CONFIG_STACK_VALIDATION */ #define STACK_FRAME_NON_STANDARD(func) +#define ANNOTATE_INTRA_FUNCTION_CALL #endif /* CONFIG_STACK_VALIDATION */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 1434ed801b80..ef6acd2062eb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1727,7 +1727,11 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); -extern int vfs_whiteout(struct inode *, struct dentry *); + +static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index e6c3e4c61dad..5f24fcbfbfb4 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -85,7 +85,7 @@ struct p_log { * Superblock creation fills in ->root whereas reconfiguration begins with this * already set. * - * See Documentation/filesystems/mount_api.txt + * See Documentation/filesystems/mount_api.rst */ struct fs_context { const struct fs_context_operations *ops; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index d5ba431b5d63..ce0b5fbf239d 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -6,7 +6,7 @@ * * NOTE!!! See: * - * Documentation/filesystems/caching/backend-api.txt + * Documentation/filesystems/caching/backend-api.rst * * for a description of the cache backend interface declared here. */ @@ -454,7 +454,7 @@ static inline void fscache_object_lookup_error(struct fscache_object *object) * Set the maximum size an object is permitted to reach, implying the highest * byte that may be written. Intended to be called by the attr_changed() op. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ static inline diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ad044c0cb1f3..a1c928fe98e7 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -6,7 +6,7 @@ * * NOTE!!! See: * - * Documentation/filesystems/caching/netfs-api.txt + * Documentation/filesystems/caching/netfs-api.rst * * for a description of the network filesystem interface declared here. */ @@ -233,7 +233,7 @@ extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_ * * Register a filesystem as desiring caching services if they're available. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -253,7 +253,7 @@ int fscache_register_netfs(struct fscache_netfs *netfs) * Indicate that a filesystem no longer desires caching services for the * moment. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline @@ -270,7 +270,7 @@ void fscache_unregister_netfs(struct fscache_netfs *netfs) * Acquire a specific cache referral tag that can be used to select a specific * cache in which to cache an index. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -288,7 +288,7 @@ struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) * * Release a reference to a cache referral tag previously looked up. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -315,7 +315,7 @@ void fscache_release_cache_tag(struct fscache_cache_tag *tag) * that can be used to locate files. This is done by requesting a cookie for * each index in the path to the file. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -351,7 +351,7 @@ struct fscache_cookie *fscache_acquire_cookie( * provided to update the auxiliary data in the cache before the object is * disconnected. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -394,7 +394,7 @@ int fscache_check_consistency(struct fscache_cookie *cookie, * cookie. The auxiliary data on the cookie will be updated first if @aux_data * is set. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -410,7 +410,7 @@ void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) * * Permit data-storage cache objects to be pinned in the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -425,7 +425,7 @@ int fscache_pin_cookie(struct fscache_cookie *cookie) * * Permit data-storage cache objects to be unpinned from the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -441,7 +441,7 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie) * changed. This includes the data size. These attributes will be obtained * through the get_attr() cookie definition op. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -463,7 +463,7 @@ int fscache_attr_changed(struct fscache_cookie *cookie) * * This can be called with spinlocks held. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -479,7 +479,7 @@ void fscache_invalidate(struct fscache_cookie *cookie) * * Wait for the invalidation of an object to complete. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline @@ -498,7 +498,7 @@ void fscache_wait_on_invalidate(struct fscache_cookie *cookie) * cookie so that a write to that object within the space can always be * honoured. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -533,7 +533,7 @@ int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size) * Else, if the page is unbacked, -ENODATA is returned and a block may have * been allocated in the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -582,7 +582,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie, * regard to different pages, the return values are prioritised in that order. * Any pages submitted for reading are removed from the pages list. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -617,7 +617,7 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, * Else, a block will be allocated if one wasn't already, and 0 will be * returned * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -667,7 +667,7 @@ void fscache_readpages_cancel(struct fscache_cookie *cookie, * be cleared at the completion of the write to indicate the success or failure * of the operation. Note that the completion may happen before the return. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -693,7 +693,7 @@ int fscache_write_page(struct fscache_cookie *cookie, * Note that this cannot cancel any outstanding I/O operations between this * page and the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -711,7 +711,7 @@ void fscache_uncache_page(struct fscache_cookie *cookie, * * Ask the cache if a page is being written to the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -731,7 +731,7 @@ bool fscache_check_page_write(struct fscache_cookie *cookie, * Ask the cache to wake us up when a page is no longer being written to the * cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index ccda97dc7f8b..0abd9a1d2852 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,15 +2,6 @@ #ifndef _LINUX_FTRACE_IRQ_H #define _LINUX_FTRACE_IRQ_H - -#ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_HWLAT_TRACER extern bool trace_hwlat_callback_enabled; extern void trace_hwlat_callback(bool enter); @@ -22,12 +13,10 @@ static inline void ftrace_nmi_enter(void) if (trace_hwlat_callback_enabled) trace_hwlat_callback(true); #endif - arch_ftrace_nmi_enter(); } static inline void ftrace_nmi_exit(void) { - arch_ftrace_nmi_exit(); #ifdef CONFIG_HWLAT_TRACER if (trace_hwlat_callback_enabled) trace_hwlat_callback(false); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 7c8b82f69288..e07cf853aa16 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -2,31 +2,28 @@ #ifndef LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H +#include <linux/context_tracking_state.h> #include <linux/preempt.h> #include <linux/lockdep.h> #include <linux/ftrace_irq.h> #include <linux/vtime.h> #include <asm/hardirq.h> - extern void synchronize_irq(unsigned int irq); extern bool synchronize_hardirq(unsigned int irq); -#if defined(CONFIG_TINY_RCU) - -static inline void rcu_nmi_enter(void) -{ -} +#ifdef CONFIG_NO_HZ_FULL +void __rcu_irq_enter_check_tick(void); +#else +static inline void __rcu_irq_enter_check_tick(void) { } +#endif -static inline void rcu_nmi_exit(void) +static __always_inline void rcu_irq_enter_check_tick(void) { + if (context_tracking_enabled()) + __rcu_irq_enter_check_tick(); } -#else -extern void rcu_nmi_enter(void); -extern void rcu_nmi_exit(void); -#endif - /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -65,14 +62,34 @@ extern void irq_exit(void); #define arch_nmi_exit() do { } while (0) #endif +#ifdef CONFIG_TINY_RCU +static inline void rcu_nmi_enter(void) { } +static inline void rcu_nmi_exit(void) { } +#else +extern void rcu_nmi_enter(void); +extern void rcu_nmi_exit(void); +#endif + +/* + * NMI vs Tracing + * -------------- + * + * We must not land in a tracer until (or after) we've changed preempt_count + * such that in_nmi() becomes true. To that effect all NMI C entry points must + * be marked 'notrace' and call nmi_enter() as soon as possible. + */ + +/* + * nmi_enter() can nest up to 15 times; see NMI_BITS. 
+ */ #define nmi_enter() \ do { \ arch_nmi_enter(); \ printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ - preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ + BUG_ON(in_nmi() == NMI_MASK); \ + __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ lockdep_hardirq_enter(); \ } while (0) @@ -82,7 +99,7 @@ extern void irq_exit(void); lockdep_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ - preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ printk_nmi_exit(); \ diff --git a/include/linux/idr.h b/include/linux/idr.h index ac6e946b6767..3ade03e5c7af 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -171,7 +171,7 @@ static inline bool idr_is_empty(const struct idr *idr) */ static inline void idr_preload_end(void) { - preempt_enable(); + local_unlock(&radix_tree_preloads.lock); } /** diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e2ca0a292e21..fc8d83e91379 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -7,7 +7,7 @@ * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2008 Novell Inc. * - * Please read Documentation/kobject.txt before using the kobject + * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 069aa2ebef90..2b5b64256cf4 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -8,7 +8,7 @@ * * Split from kobject.h by David Howells (dhowells@redhat.com) * - * Please read Documentation/kobject.txt before using the kobject + * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. 
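The nmi_enter() rework above turns in_nmi() from a flag into a nesting counter: each entry adds NMI_OFFSET to preempt_count(), and the BUG_ON() now fires only when all NMI bits saturate rather than on any re-entry. A rough model of the bookkeeping, with nmi_depth() as a purely illustrative helper that is not part of this patch::

    /* after this series: NMI_BITS = 4, so NMI_MASK = 0x00f00000 */
    #define nmi_depth()	((preempt_count() & NMI_MASK) >> NMI_SHIFT)

    /*
     * Each nmi_enter() takes the depth from 0..14 to 1..15;
     * BUG_ON(in_nmi() == NMI_MASK) trips on a 16th entry, matching
     * the "can nest up to 15 times" comment above.
     */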
*/ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 9280209d1f62..d796ec20d114 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -105,7 +105,7 @@ /* === DEPRECATED annotations === */ -#ifndef CONFIG_X86 +#ifndef CONFIG_ARCH_USE_SYM_ANNOTATIONS #ifndef GLOBAL /* deprecated, use SYM_DATA*, SYM_ENTRY, or similar */ #define GLOBAL(name) \ @@ -118,10 +118,10 @@ #define ENTRY(name) \ SYM_FUNC_START(name) #endif -#endif /* CONFIG_X86 */ +#endif /* CONFIG_ARCH_USE_SYM_ANNOTATIONS */ #endif /* LINKER_SCRIPT */ -#ifndef CONFIG_X86 +#ifndef CONFIG_ARCH_USE_SYM_ANNOTATIONS #ifndef WEAK /* deprecated, use SYM_FUNC_START_WEAK* */ #define WEAK(name) \ @@ -143,7 +143,7 @@ #define ENDPROC(name) \ SYM_FUNC_END(name) #endif -#endif /* CONFIG_X86 */ +#endif /* CONFIG_ARCH_USE_SYM_ANNOTATIONS */ /* === generic annotations === */ diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h new file mode 100644 index 000000000000..e55010fa7329 --- /dev/null +++ b/include/linux/local_lock.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LOCAL_LOCK_H +#define _LINUX_LOCAL_LOCK_H + +#include <linux/local_lock_internal.h> + +/** + * local_lock_init - Runtime initialize a lock instance + */ +#define local_lock_init(lock) __local_lock_init(lock) + +/** + * local_lock - Acquire a per CPU local lock + * @lock: The lock variable + */ +#define local_lock(lock) __local_lock(lock) + +/** + * local_lock_irq - Acquire a per CPU local lock and disable interrupts + * @lock: The lock variable + */ +#define local_lock_irq(lock) __local_lock_irq(lock) + +/** + * local_lock_irqsave - Acquire a per CPU local lock, save and disable + * interrupts + * @lock: The lock variable + * @flags: Storage for interrupt flags + */ +#define local_lock_irqsave(lock, flags) \ + __local_lock_irqsave(lock, flags) + +/** + * local_unlock - Release a per CPU local lock + * @lock: The lock variable + */ +#define local_unlock(lock) __local_unlock(lock) + +/** + * local_unlock_irq - Release a per CPU local lock and enable interrupts + * @lock: The lock variable + */ +#define local_unlock_irq(lock) __local_unlock_irq(lock) + +/** + * local_unlock_irqrestore - Release a per CPU local lock and restore + * interrupt flags + * @lock: The lock variable + * @flags: Interrupt flags to restore + */ +#define local_unlock_irqrestore(lock, flags) \ + __local_unlock_irqrestore(lock, flags) + +#endif diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h new file mode 100644 index 000000000000..4a8795b21d77 --- /dev/null +++ b/include/linux/local_lock_internal.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LOCAL_LOCK_H +# error "Do not include directly, include linux/local_lock.h" +#endif + +#include <linux/percpu-defs.h> +#include <linux/lockdep.h> + +typedef struct { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif +} local_lock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LL_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_CONFIG, \ + } +#else +# define LL_DEP_MAP_INIT(lockname) +#endif + +#define INIT_LOCAL_LOCK(lockname) { LL_DEP_MAP_INIT(lockname) } + +#define __local_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ + lockdep_init_map_wait(&(lock)->dep_map, #lock, &__key, 0, LD_WAIT_CONFIG);\ +} while (0) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline 
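The local_lock.h API above gives per-CPU critical sections a named, lockdep-visible lock in place of anonymous preempt_disable()/local_irq_save() pairs. A minimal usage sketch with a hypothetical per-CPU structure (the radix-tree conversion elsewhere in this patch follows the same pattern)::

    struct foo_pcpu {
        local_lock_t lock;
        unsigned int count;
    };
    static DEFINE_PER_CPU(struct foo_pcpu, foo_pcpu) = {
        .lock = INIT_LOCAL_LOCK(lock),
    };

    static void foo_bump(void)
    {
        local_lock(&foo_pcpu.lock);	/* preemption off + lockdep acquire */
        __this_cpu_inc(foo_pcpu.count);
        local_unlock(&foo_pcpu.lock);
    }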
void local_lock_acquire(local_lock_t *l) +{ + lock_map_acquire(&l->dep_map); + DEBUG_LOCKS_WARN_ON(l->owner); + l->owner = current; +} + +static inline void local_lock_release(local_lock_t *l) +{ + DEBUG_LOCKS_WARN_ON(l->owner != current); + l->owner = NULL; + lock_map_release(&l->dep_map); +} + +#else /* CONFIG_DEBUG_LOCK_ALLOC */ +static inline void local_lock_acquire(local_lock_t *l) { } +static inline void local_lock_release(local_lock_t *l) { } +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + +#define __local_lock(lock) \ + do { \ + preempt_disable(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_lock_irq(lock) \ + do { \ + local_irq_disable(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_lock_irqsave(lock, flags) \ + do { \ + local_irq_save(flags); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + preempt_enable(); \ + } while (0) + +#define __local_unlock_irq(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + local_irq_enable(); \ + } while (0) + +#define __local_unlock_irqrestore(lock, flags) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + local_irq_restore(flags); \ + } while (0) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 206774ac6946..8fce5c98a4b0 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -308,8 +308,27 @@ extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_off(void); -extern void lockdep_on(void); +/* + * Split the recursion counter in two to readily detect 'off' vs recursion. + */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + +/* + * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due + * to header dependencies. + */ + +#define lockdep_off() \ +do { \ + current->lockdep_recursion += LOCKDEP_OFF; \ +} while (0) + +#define lockdep_on() \ +do { \ + current->lockdep_recursion -= LOCKDEP_OFF; \ +} while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 988ca0df7824..44d5422c18e4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -77,7 +77,7 @@ * state. This is called immediately after commit_creds(). * * Security hooks for mount using fs_context. - * [See also Documentation/filesystems/mount_api.txt] + * [See also Documentation/filesystems/mount_api.rst] * * @fs_context_dup: * Allocate and attach a security structure to sc->security. This pointer diff --git a/include/linux/mm.h b/include/linux/mm.h index fda41eb7f1c8..6e6c71cdfa13 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -325,17 +325,13 @@ extern unsigned int kobjsize(const void *objp); #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI +#elif defined(CONFIG_ARM64) +# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a.
GP bit */ +# define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif -#if defined(CONFIG_X86_INTEL_MPX) -/* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_HIGH_ARCH_4 -#else -# define VM_MPX VM_NONE -#endif - #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif @@ -1230,7 +1226,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS * scheme). * - * For more information, please see Documentation/vm/pin_user_pages.rst. + * For more information, please see Documentation/core-api/pin_user_pages.rst. * * @page: pointer to page to be queried. * @Return: True, if it is likely that the page has been "dma-pinned". @@ -2874,7 +2870,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * - * Please see Documentation/vm/pin_user_pages.rst for more information. + * Please see Documentation/core-api/pin_user_pages.rst for more information. */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a89f47515eb1..fdd9beb5efed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -156,6 +156,9 @@ enum zone_stat_item { NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + NR_KERNEL_SCS_KB, /* measured in KiB */ +#endif /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..0754b8d71262 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -663,6 +663,7 @@ struct x86_cpu_id { __u16 vendor; __u16 family; __u16 model; + __u16 steppings; __u16 feature; /* bit index */ kernel_ulong_t driver_data; }; @@ -671,6 +672,7 @@ struct x86_cpu_id { #define X86_VENDOR_ANY 0xffff #define X86_FAMILY_ANY 0 #define X86_MODEL_ANY 0 +#define X86_STEPPING_ANY 0 #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ /* diff --git a/include/linux/module.h b/include/linux/module.h index 1ad393e62bef..d849d06e4d44 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -458,6 +458,8 @@ struct module { void __percpu *percpu; unsigned int percpu_size; #endif + void *noinstr_text_start; + unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; @@ -489,6 +491,12 @@ struct module { unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif +#ifdef CONFIG_KPROBES + void *kprobes_text_start; + unsigned int kprobes_text_size; + unsigned long *kprobe_blacklist; + unsigned int num_kprobe_blacklist; +#endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index ca92aea8a6bd..4fa67a8b2265 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,6 +29,11 @@ void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); +/* Determines if the section name is an init section (that is only used during + * module loading). 
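The module_init_section() hook declared just below mirrors the existing module_exit_section() test. A plausible generic fallback is a simple section-name prefix check along these lines (a sketch; the actual implementation lives in kernel/module.c, not in this hunk)::

    bool __weak module_init_section(const char *name)
    {
        return strstarts(name, ".init");
    }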
+ */ +bool module_init_section(const char *name); + /* Determines if the section name is an exit section (that is only used during * module unloading) */ diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..7edac8c7a9c1 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,8 @@ struct fs_context; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ + MNT_CURSOR) #define MNT_INTERNAL 0x4000 @@ -64,6 +65,7 @@ struct fs_context; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 +#define MNT_CURSOR 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..d7b610c4eebd 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,7 +61,7 @@ struct perf_guest_info_callbacks { struct perf_callchain_entry { __u64 nr; - __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ + __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { @@ -113,7 +113,7 @@ struct perf_raw_record { struct perf_branch_stack { __u64 nr; __u64 hw_idx; - struct perf_branch_entry entries[0]; + struct perf_branch_entry entries[]; }; struct task_struct; @@ -1305,7 +1305,7 @@ static inline int perf_is_paranoid(void) static inline int perf_allow_kernel(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); @@ -1313,7 +1313,7 @@ static inline int perf_allow_kernel(struct perf_event_attr *attr) static inline int perf_allow_cpu(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); @@ -1321,7 +1321,7 @@ static inline int perf_allow_cpu(struct perf_event_attr *attr) static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); diff --git a/include/linux/platform_data/ad5761.h b/include/linux/platform_data/ad5761.h index 02bef5177ff5..69e261e2ca14 100644 --- a/include/linux/platform_data/ad5761.h +++ b/include/linux/platform_data/ad5761.h @@ -3,7 +3,7 @@ * AD5721, AD5721R, AD5761, AD5761R, Voltage Output Digital to Analog Converter * * Copyright 2016 Qtechnology A/S - * 2016 Ricardo Ribalda <ricardo.ribalda@gmail.com> + * 2016 Ricardo Ribalda <ribalda@kernel.org> */ #ifndef __LINUX_PLATFORM_DATA_AD5761_H__ #define __LINUX_PLATFORM_DATA_AD5761_H__ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bc3f1aecaa19..7d9c1c0e149c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -26,13 +26,13 @@ * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 + * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 -#define NMI_BITS 1 +#define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define 
SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) diff --git a/include/linux/printk.h b/include/linux/printk.h index 07f2d08f79ff..15c8133b194f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -279,39 +279,116 @@ static inline void printk_safe_flush_on_panic(void) extern int kptr_restrict; +/** + * pr_fmt - used by the pr_*() macros to generate the printk format string + * @fmt: format string passed from a pr_*() macro + * + * This macro can be used to generate a unified format string for pr_*() + * macros. A common use is to prefix all pr_*() messages in a file with a common + * string. For example, defining this at the top of a source file: + * + * #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + * + * would prefix all pr_info, pr_emerg... messages in the file with the module + * name. + */ #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif -/* - * These can be used to print at the various log levels. - * All of these will print unconditionally, although note that pr_debug() - * and other debug macros are compiled out unless either DEBUG is defined - * or CONFIG_DYNAMIC_DEBUG is set. +/** + * pr_emerg - Print an emergency-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to + * generate the format string. */ #define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_alert - Print an alert-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_crit - Print a critical-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_err - Print an error-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_warn - Print a warning-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt() + * to generate the format string. + */ #define pr_warn(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_notice - Print a notice-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_info - Print an info-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -/* - * Like KERN_CONT, pr_cont() should only be used when continuing - * a line with no newline ('\n') enclosed. Otherwise it defaults - * back to KERN_DEFAULT. + +/** + * pr_cont - Continues a previous log message in the same line. 
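The pr_fmt() kernel-doc earlier in this hunk describes the usual idiom; in practice a driver defines the macro before any include that pulls in printk.h. A sketch, with "demo" as a hypothetical module name::

    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    #include <linux/module.h>
    #include <linux/printk.h>

    static int __init demo_init(void)
    {
        pr_info("version %d\n", 2);	/* logs "demo: version 2" */
        return 0;
    }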
+ * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_CONT loglevel. It should only be + * used when continuing a log message with no newline ('\n') enclosed. Otherwise + * it defaults back to KERN_DEFAULT loglevel. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) -/* pr_devel() should produce zero code unless DEBUG is defined */ +/** + * pr_devel - Print a debug-level message conditionally + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is + * defined. Otherwise it does nothing. + * + * It uses pr_fmt() to generate the format string. + */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) @@ -325,8 +402,19 @@ extern int kptr_restrict; #if defined(CONFIG_DYNAMIC_DEBUG) #include <linux/dynamic_debug.h> -/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ -#define pr_debug(fmt, ...) \ +/** + * pr_debug - Print a debug-level message conditionally + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is + * set. Otherwise, if DEBUG is defined, it's equivalent to a printk with + * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing. + * + * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses + * pr_fmt() internally). + */ +#define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) \ diff --git a/include/linux/psci.h b/include/linux/psci.h index a67712b73b6c..14ad9b9ebcd6 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -21,11 +21,6 @@ bool psci_power_state_is_valid(u32 state); int psci_set_osi_mode(void); bool psci_has_osi_support(void); -enum smccc_version { - SMCCC_VERSION_1_0, - SMCCC_VERSION_1_1, -}; - struct psci_operations { u32 (*get_version)(void); int (*cpu_suspend)(u32 state, unsigned long entry_point); @@ -35,8 +30,6 @@ struct psci_operations { int (*affinity_info)(unsigned long target_affinity, unsigned long lowest_affinity_level); int (*migrate_info_type)(void); - enum arm_smccc_conduit conduit; - enum smccc_version smccc_version; }; extern struct psci_operations psci_ops; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 63e62372443a..c2a9f7c90727 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -16,11 +16,20 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/xarray.h> +#include <linux/local_lock.h> /* Keep unconverted code working */ #define radix_tree_root xarray #define radix_tree_node xa_node +struct radix_tree_preload { + local_lock_t lock; + unsigned nr; + /* nodes->parent points to next preallocated node */ + struct radix_tree_node *nodes; +}; +DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); + /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: @@ -245,7 +254,7 @@ int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag); static inline void radix_tree_preload_end(void) { - preempt_enable(); + local_unlock(&radix_tree_preloads.lock); } void __rcu **idr_get_free(struct radix_tree_root *root, diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 1fd61a9af45c..d7db17996322 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -11,7 +11,7 @@ I know it's not the cleaner way, but in C (not in C++) 
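With radix_tree_preloads now guarded by a local_lock rather than a bare preempt_disable(), the preload pattern is unchanged for callers. A typical ID allocation still looks like this (a sketch; myidr and mylock are hypothetical)::

    idr_preload(GFP_KERNEL);		/* takes radix_tree_preloads.lock */
    spin_lock(&mylock);
    id = idr_alloc(&myidr, ptr, 0, 0, GFP_NOWAIT);
    spin_unlock(&mylock);
    idr_preload_end();			/* now local_unlock(), not preempt_enable() */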
to get performances and genericity... - See Documentation/rbtree.txt for documentation and samples. + See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 724b0d036b57..d1c53e9d8c75 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -21,7 +21,7 @@ * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * - * See Documentation/rbtree.txt for documentation and samples. + * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8214cdc715f2..7375bb3da140 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -371,7 +371,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() @@ -646,7 +646,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2678a37c3169..659cbfa7581a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -37,6 +37,7 @@ /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); +void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU @@ -129,25 +130,57 @@ static inline void rcu_init_nohz(void) { } * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ -#ifdef CONFIG_TASKS_RCU -#define rcu_tasks_qs(t) \ - do { \ - if (READ_ONCE((t)->rcu_tasks_holdout)) \ - WRITE_ONCE((t)->rcu_tasks_holdout, false); \ +#ifdef CONFIG_TASKS_RCU_GENERIC + +# ifdef CONFIG_TASKS_RCU +# define rcu_tasks_classic_qs(t, preempt) \ + do { \ + if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ + WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +# else +# define rcu_tasks_classic_qs(t, preempt) do { } while (0) +# define call_rcu_tasks call_rcu +# define synchronize_rcu_tasks synchronize_rcu +# endif + +# ifdef CONFIG_TASKS_RCU_TRACE +# define rcu_tasks_trace_qs(t) \ + do { \ + if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ + !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ + smp_store_release(&(t)->trc_reader_checked, true); \ + smp_mb(); /* Readers partitioned by store. 
*/ \ + } \ + } while (0) +# else +# define rcu_tasks_trace_qs(t) do { } while (0) +# endif + +#define rcu_tasks_qs(t, preempt) \ +do { \ + rcu_tasks_classic_qs((t), (preempt)); \ + rcu_tasks_trace_qs((t)); \ +} while (0) + +# ifdef CONFIG_TASKS_RUDE_RCU +void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); +void synchronize_rcu_tasks_rude(void); +# endif + +#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); -#else /* #ifdef CONFIG_TASKS_RCU */ -#define rcu_tasks_qs(t) do { } while (0) +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +#define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } -#endif /* #else #ifdef CONFIG_TASKS_RCU */ +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU @@ -158,7 +191,7 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - rcu_tasks_qs(current); \ + rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h new file mode 100644 index 000000000000..4c25a41f8b27 --- /dev/null +++ b/include/linux/rcupdate_trace.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion, adapted for tracing. + * + * Copyright (C) 2020 Paul E. McKenney. + */ + +#ifndef __LINUX_RCUPDATE_TRACE_H +#define __LINUX_RCUPDATE_TRACE_H + +#include <linux/sched.h> +#include <linux/rcupdate.h> + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +extern struct lockdep_map rcu_trace_lock_map; + +static inline int rcu_read_lock_trace_held(void) +{ + return lock_is_held(&rcu_trace_lock_map); +} + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +static inline int rcu_read_lock_trace_held(void) +{ + return 1; +} + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); + +/** + * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section + * + * When synchronize_rcu_tasks_trace() is invoked by one task, then that task + * is guaranteed to block until all other tasks exit their read-side + * critical sections. Similarly, if call_rcu_tasks_trace() is invoked on one + * task while other tasks are within RCU read-side critical sections, + * invocation of the corresponding RCU callback is deferred until after + * all the other tasks exit their critical sections. + * + * For more details, please see the documentation for rcu_read_lock(). + */ +static inline void rcu_read_lock_trace(void) +{ + struct task_struct *t = current; + + WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers + rcu_lock_acquire(&rcu_trace_lock_map); +} + +/** + * rcu_read_unlock_trace - mark end of RCU-trace read-side critical section + * + * Pairs with a preceding call to rcu_read_lock_trace(), and nesting is + * allowed. Invoking rcu_read_unlock_trace() when there is no matching + * rcu_read_lock_trace() is verboten, and will result in lockdep complaints.
+ * + * For more details, please see the documentation for rcu_read_unlock(). + */ +static inline void rcu_read_unlock_trace(void) +{ + int nesting; + struct task_struct *t = current; + + rcu_lock_release(&rcu_trace_lock_map); + nesting = READ_ONCE(t->trc_reader_nesting) - 1; + if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { + WRITE_ONCE(t->trc_reader_nesting, nesting); + return; // We assume shallow reader nesting. + } + rcu_read_unlock_trace_special(t, nesting); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +void synchronize_rcu_tasks_trace(void); +void rcu_barrier_tasks_trace(void); + +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + +#endif /* __LINUX_RCUPDATE_TRACE_H */ diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index c0578ba23c1a..699b938358bf 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -31,4 +31,23 @@ do { \ #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) +/** + * synchronize_rcu_mult - Wait concurrently for multiple grace periods + * @...: List of call_rcu() functions for different grace periods to wait on + * + * This macro waits concurrently for multiple types of RCU grace periods. + * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait + * on concurrent RCU and RCU-tasks grace periods. Waiting on a given SRCU + * domain requires you to write a wrapper function for that SRCU domain's + * call_srcu() function, with this wrapper supplying the pointer to the + * corresponding srcu_struct. + * + * The first argument tells Tiny RCU's _wait_rcu_gp() not to + * bother waiting for RCU. The reason for this is because anywhere + * synchronize_rcu_mult() can be called is automatically already a full + * grace period. + */ +#define synchronize_rcu_mult(...) \ + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 045c28b71f4f..8512caeb7682 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -49,7 +49,7 @@ static inline void rcu_softirq_qs(void) #define rcu_note_context_switch(preempt) \ do { \ rcu_qs(); \ - rcu_tasks_qs(current); \ + rcu_tasks_qs(current, (preempt)); \ } while (0) static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) @@ -71,6 +71,8 @@ static inline void rcu_irq_enter(void) { } static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } +static inline void rcu_irq_exit_preempt(void) { } +static inline void rcu_irq_exit_check_preempt(void) { } static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { @@ -85,8 +87,10 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } +static inline bool __rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } +static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. 
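The tasks-trace flavor added above gives tracing code read-side sections that may block. A reader/updater sketch (gp, newp and struct foo are illustrative; the held-check helper comes from the header above)::

    struct foo *old, *p;

    /* reader: sleeping is legal inside the section */
    rcu_read_lock_trace();
    p = rcu_dereference_check(gp, rcu_read_lock_trace_held());
    /* ... use p, possibly across a sleep ... */
    rcu_read_unlock_trace();

    /* updater */
    old = rcu_replace_pointer(gp, newp, true);
    synchronize_rcu_tasks_trace();	/* all trace readers done with old */
    kfree(old);

Where more than one flavor must be waited for at once, the synchronize_rcu_mult() helper documented above can combine them, e.g. synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace).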
*/ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 45f3f66bb04d..d5cc9d675987 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -39,6 +39,7 @@ void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); +bool rcu_gp_might_be_stalled(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); @@ -46,9 +47,16 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); +#ifdef CONFIG_PROVE_RCU +void rcu_irq_exit_check_preempt(void); +#else +static inline void rcu_irq_exit_check_preempt(void) { } +#endif + void exit_rcu(void); void rcu_scheduler_starting(void); @@ -56,6 +64,7 @@ extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); +bool __rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/include/linux/relay.h b/include/linux/relay.h index c759f96e39c1..e13a333e7c37 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -141,7 +141,7 @@ struct rchan_callbacks * cause relay_open() to create a single global buffer rather * than the default set of per-cpu buffers. * - * See Documentation/filesystems/relay.txt for more info. + * See Documentation/filesystems/relay.rst for more info. */ struct dentry *(*create_buf_file)(const char *filename, struct dentry *parent, diff --git a/include/linux/sched.h b/include/linux/sched.h index 12ef0c753284..33bb7c539246 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -613,7 +613,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 deferred_qs; + u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; @@ -724,6 +724,14 @@ struct task_struct { struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + union rcu_special trc_reader_special; + bool trc_reader_checked; + struct list_head trc_holdout_list; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + struct sched_info sched_info; struct list_head tasks; @@ -1289,6 +1297,12 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_X86_MCE + u64 mce_addr; + u64 mce_status; + struct callback_head mce_kill_me; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/include/linux/scs.h b/include/linux/scs.h new file mode 100644 index 000000000000..6dec390cf154 --- /dev/null +++ b/include/linux/scs.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#ifndef _LINUX_SCS_H +#define _LINUX_SCS_H + +#include <linux/gfp.h> +#include <linux/poison.h> +#include <linux/sched.h> +#include <linux/sizes.h> + +#ifdef CONFIG_SHADOW_CALL_STACK + +/* + * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit + * architecture) provided ~40% safety margin on stack usage while keeping + * memory allocation overhead reasonable. 
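A shadow call stack keeps a separate, compiler-maintained stack of return addresses; the constants and helpers that follow let generic code allocate, reset and sanity-check it. A likely consumer of the end-magic check is the scheduler's debug path, roughly (a sketch; the exact call site lives elsewhere in this series)::

    /* e.g. on the schedule() path, before switching away from prev */
    if (task_scs_end_corrupted(prev))
        panic("corrupted shadow stack detected inside scheduler\n");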
+ */ +#define SCS_SIZE SZ_1K +#define GFP_SCS (GFP_KERNEL | __GFP_ZERO) + +/* An illegal pointer value to mark the end of the shadow stack. */ +#define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) + +/* Allocate a static per-CPU shadow stack */ +#define DEFINE_SCS(name) \ + DEFINE_PER_CPU(unsigned long [SCS_SIZE/sizeof(long)], name) \ + +#define task_scs(tsk) (task_thread_info(tsk)->scs_base) +#define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) + +void scs_init(void); +int scs_prepare(struct task_struct *tsk, int node); +void scs_release(struct task_struct *tsk); + +static inline void scs_task_reset(struct task_struct *tsk) +{ + /* + * Reset the shadow stack to the base address in case the task + * is reused. + */ + task_scs_sp(tsk) = task_scs(tsk); +} + +static inline unsigned long *__scs_magic(void *s) +{ + return (unsigned long *)(s + SCS_SIZE) - 1; +} + +static inline bool task_scs_end_corrupted(struct task_struct *tsk) +{ + unsigned long *magic = __scs_magic(task_scs(tsk)); + unsigned long sz = task_scs_sp(tsk) - task_scs(tsk); + + return sz >= SCS_SIZE - 1 || READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC; +} + +#else /* CONFIG_SHADOW_CALL_STACK */ + +static inline void scs_init(void) {} +static inline void scs_task_reset(struct task_struct *tsk) {} +static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } +static inline void scs_release(struct task_struct *tsk) {} +static inline bool task_scs_end_corrupted(struct task_struct *tsk) { return false; } + +#endif /* CONFIG_SHADOW_CALL_STACK */ + +#endif /* _LINUX_SCS_H */ diff --git a/include/linux/signal.h b/include/linux/signal.h index 05bacd2ab135..6bb1a3f0258c 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -24,6 +24,14 @@ static inline void clear_siginfo(kernel_siginfo_t *info) #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo)) +static inline void copy_siginfo_to_external(siginfo_t *to, + const kernel_siginfo_t *from) +{ + memcpy(to, from, sizeof(*from)); + memset(((char *)to) + sizeof(struct kernel_siginfo), 0, + SI_EXPANSION_SIZE); +} + int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from); int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from); diff --git a/include/linux/smp.h b/include/linux/smp.h index cbc9162689d0..04019872c7bc 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -227,8 +227,8 @@ static inline int get_boot_cpu_id(void) */ extern void arch_disable_smp_support(void); -extern void arch_enable_nonboot_cpus_begin(void); -extern void arch_enable_nonboot_cpus_end(void); +extern void arch_thaw_secondary_cpus_begin(void); +extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 38286de779e3..aac57b5b7c21 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -394,6 +394,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * for example doing DMA mapping. Called from threaded * context. * @transfer_one: transfer a single spi_transfer. + * * - return 0 if the transfer is finished, * - return 1 if the transfer is still in progress. 
When * the driver is finished with this transfer it must diff --git a/include/linux/stat.h b/include/linux/stat.h index 528c4baad091..56614af83d4a 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -47,6 +47,7 @@ struct kstat { struct timespec64 ctime; struct timespec64 btime; /* File creation time */ u64 blocks; + u64 mnt_id; }; #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 68ef7638311f..e92176fc8824 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -343,6 +343,7 @@ extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); +extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1815065d52f3..7c354c2955f5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -428,6 +428,8 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); #endif asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode); +asmlinkage long sys_faccessat2(int dfd, const char __user *filename, int mode, + int flags); asmlinkage long sys_chdir(const char __user *filename); asmlinkage long sys_fchdir(unsigned int fd); asmlinkage long sys_chroot(const char __user *filename); @@ -1333,11 +1335,11 @@ static inline int ksys_chmod(const char __user *filename, umode_t mode) return do_fchmodat(AT_FDCWD, filename, mode); } -extern long do_faccessat(int dfd, const char __user *filename, int mode); +long do_faccessat(int dfd, const char __user *filename, int mode, int flags); static inline long ksys_access(const char __user *filename, int mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } extern int do_fchownat(int dfd, const char __user *filename, uid_t user, diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 80bb865b3a33..86067dbe7745 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -7,7 +7,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. 
*/ #ifndef _SYSFS_H_ diff --git a/include/linux/tboot.h b/include/linux/tboot.h index 5424bc6feac8..c7e424766360 100644 --- a/include/linux/tboot.h +++ b/include/linux/tboot.h @@ -121,13 +121,7 @@ struct tboot { #define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - +bool tboot_enabled(void); extern void tboot_probe(void); extern void tboot_shutdown(u32 shutdown_type); extern struct acpi_table_header *tboot_get_dmar_table( diff --git a/include/linux/torture.h b/include/linux/torture.h index 6241f59e2d6f..629b66e6c161 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -89,7 +89,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else -#define torture_preempt_schedule() +#define torture_preempt_schedule() do { } while (0) #endif #endif /* __LINUX_TORTURE_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 67f016010aad..9861c89f93be 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -378,6 +378,14 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif +#ifndef user_write_access_begin +#define user_write_access_begin user_access_begin +#define user_write_access_end user_access_end +#endif +#ifndef user_read_access_begin +#define user_read_access_begin user_access_begin +#define user_read_access_end user_access_end +#endif #ifdef CONFIG_HARDENED_USERCOPY void usercopy_warn(const char *name, const char *detail, bool to_user, diff --git a/include/linux/wait.h b/include/linux/wait.h index feeb6be5cad6..898c890fc153 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1149,4 +1149,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i (wait)->flags = 0; \ } while (0) +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg); + #endif /* _LINUX_WAIT_H */ diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 417d9f37077a..1464ce6ffa31 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -37,15 +37,15 @@ struct watchdog_governor; * * The watchdog_ops structure contains a list of low-level operations * that control a watchdog device. It also contains the module that owns - * these operations. The start and stop function are mandatory, all other + * these operations. The start function is mandatory, all other * functions are optional. */ struct watchdog_ops { struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); - int (*stop)(struct watchdog_device *); /* optional operations */ + int (*stop)(struct watchdog_device *); int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); diff --git a/include/media/cec-notifier.h b/include/media/cec-notifier.h index 38956969fd12..b1c839734124 100644 --- a/include/media/cec-notifier.h +++ b/include/media/cec-notifier.h @@ -2,7 +2,7 @@ /* * cec-notifier.h - notify CEC drivers of physical address changes * - * Copyright 2016 Russell King <rmk+kernel@arm.linux.org.uk> + * Copyright 2016 Russell King. 
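The user_read_access_begin()/user_write_access_begin() pairs above let an architecture open its user-access window for one direction only; where no such distinction exists they fall back to plain user_access_begin(). The calling pattern is unchanged, as the compat.c conversion further down shows; schematically::

    if (!user_read_access_begin(uptr, sizeof(*uptr)))
        return -EFAULT;
    unsafe_get_user(val, uptr, Efault);
    user_read_access_end();
    return 0;
    Efault:
    user_read_access_end();
    return -EFAULT;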
* Copyright 2016-2017 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ diff --git a/include/net/checksum.h b/include/net/checksum.h index 97bf4885a962..46754ba9d7b7 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -26,13 +26,9 @@ static inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) { - if (access_ok(src, len)) - return csum_partial_copy_from_user(src, dst, len, sum, err_ptr); - - if (len) + if (copy_from_user(dst, src, len)) *err_ptr = -EFAULT; - - return sum; + return csum_partial(dst, len, sum); } #endif @@ -42,10 +38,8 @@ static __inline__ __wsum csum_and_copy_to_user { sum = csum_partial(src, len, sum); - if (access_ok(dst, len)) { - if (copy_to_user(dst, src, len) == 0) - return sum; - } + if (copy_to_user(dst, src, len) == 0) + return sum; if (len) *err_ptr = -EFAULT; diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 3a3201e4618e..f4a01305d9a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -855,9 +855,11 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) #undef __NR_syscalls -#define __NR_syscalls 439 +#define __NR_syscalls 440 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 272dc69fa080..e58c9636741b 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -367,8 +367,14 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* + * Allow system performance and observability privileged operations + * using perf_events, i915_perf and other kernel subsystems + */ + +#define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 34c02e4290fe..c6dd0215482e 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -36,6 +36,7 @@ typedef __s64 Elf64_Sxword; #define PT_LOPROC 0x70000000 #define PT_HIPROC 0x7fffffff #define PT_GNU_EH_FRAME 0x6474e550 +#define PT_GNU_PROPERTY 0x6474e553 #define PT_GNU_STACK (PT_LOOS + 0x474e551) @@ -367,6 +368,7 @@ typedef struct elf64_shdr { * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. + * The note name for all these is "LINUX". 
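With __NR_faccessat2 wired up above (number 439 in the asm-generic table), userspace gains the flags argument that faccessat(2) always lacked; until a libc wrapper appears, the call can be issued directly (a sketch)::

    #include <fcntl.h>		/* AT_FDCWD, AT_EACCESS */
    #include <unistd.h>		/* syscall(), R_OK */

    /* test read permission against the effective, not real, IDs */
    long r = syscall(439 /* __NR_faccessat2 */, AT_FDCWD,
                     "/etc/shadow", R_OK, AT_EACCESS);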
*/ #define NT_PRSTATUS 1 #define NT_PRFPREG 2 @@ -429,6 +431,9 @@ typedef struct elf64_shdr { #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +/* Note types with note name "GNU" */ +#define NT_GNU_PROPERTY_TYPE_0 5 + /* Note header in a PT_NOTE section */ typedef struct elf32_note { Elf32_Word n_namesz; /* Name size */ @@ -443,4 +448,10 @@ typedef struct elf64_note { Elf64_Word n_type; /* Content type */ } Elf64_Nhdr; +/* .note.gnu.property types for EM_AARCH64: */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) + #endif /* _UAPI_LINUX_ELF_H */ diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..1711e57f7848 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -2,7 +2,7 @@ /* * include/uapi/linux/ethtool_netlink.h - netlink interface for ethtool * - * See Documentation/networking/ethtool-netlink.txt in kernel source tree for + * See Documentation/networking/ethtool-netlink.rst in kernel source tree for * doucumentation of the interface. */ diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ca88b7bce553..2f86b2ad6d7e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -84,10 +84,20 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it as equivalent to + * AT_EACCESS is valid undefined behavior. + */ #define AT_FDCWD -100 /* Special value used to indicate openat should use the current working directory. */ #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ #define AT_REMOVEDIR 0x200 /* Remove directory instead of unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 1acd2b179aef..7e5b5c10a49c 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -308,7 +308,7 @@ struct fw_cdev_event_iso_interrupt_mc { /** * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed * @closure: See &fw_cdev_event_common; - * set by %FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE) ioctl + * set by ``FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE)`` ioctl * @type: %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED * @handle: Reference by which an allocated resource can be deallocated diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 428c7dde6b4b..fdd632c833b4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
- * For ARM: See Documentation/virt/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.rst */ union { __u32 irq; @@ -1107,7 +1107,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virt/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.rst. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index ad80a5c885d5..6df9348bb277 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -123,7 +123,10 @@ struct statx { __u32 stx_dev_major; /* ID of device containing file [uncond] */ __u32 stx_dev_minor; /* 0x90 */ - __u64 __spare2[14]; /* Spare space for future expansion */ + __u64 stx_mnt_id; + __u64 __spare2; + /* 0xa0 */ + __u64 __spare3[12]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -148,9 +151,19 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ + #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + /* * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * @@ -168,6 +181,7 @@ struct statx { #define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 7b1ec806f8f9..38ab7accb7be 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -36,7 +36,7 @@ #include <linux/types.h> #include <linux/ioctl.h> -/* Documentation/ioctl/ioctl-number.rst */ +/* Documentation/userspace-api/ioctl/ioctl-number.rst */ #define RDMA_IOCTL_MAGIC 0x1b #define RDMA_VERBS_IOCTL \ _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index dab8b1151b56..d72beda824aa 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -28,7 +28,7 @@ static int __init no_initrd(char *str) __setup("noinitrd", no_initrd); -static int __init early_initrd(char *p) +static int __init early_initrdmem(char *p) { phys_addr_t start; unsigned long size; @@ -43,6 +43,17 @@ static int __init early_initrd(char *p) } return 0; } +early_param("initrdmem", early_initrdmem); + +/* + * This is here as the initrd keyword has been in use since 11/2018 + * on ARM, PowerPC, and MIPS. + * It should not be; it is reserved for bootloaders. 
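Both the faccessat2() wiring and the statx() mount-ID additions above are userspace-visible. A minimal sketch exercising the two, assuming headers new enough to provide statx() and STATX_MNT_ID (the __NR_faccessat2 fallback below mirrors the 439 assignment above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_faccessat2
#define __NR_faccessat2 439	/* matches the asm-generic table above */
#endif

int main(void)
{
	struct statx stx;

	/* Like faccessat(), but AT_EACCESS checks the effective IDs. */
	long ok = syscall(__NR_faccessat2, AT_FDCWD, "/etc/shadow",
			  R_OK, AT_EACCESS);
	printf("effective-ID read access: %s\n", ok == 0 ? "yes" : "no");

	/* Ask for the new mount ID alongside the basic stats. */
	if (statx(AT_FDCWD, "/", 0, STATX_BASIC_STATS | STATX_MNT_ID, &stx))
		return 1;
	if (stx.stx_mask & STATX_MNT_ID)
		printf("mount id of /: %llu\n",
		       (unsigned long long)stx.stx_mnt_id);
	return 0;
}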
+ */ +static int __init early_initrd(char *p) +{ + return early_initrdmem(p); +} early_param("initrd", early_initrd); static int init_linuxrc(struct subprocess_info *info, struct cred *new) diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..15303d58d9db 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -11,6 +11,7 @@ #include <linux/mm.h> #include <linux/audit.h> #include <linux/numa.h> +#include <linux/scs.h> #include <asm/pgtable.h> #include <linux/uaccess.h> @@ -50,6 +51,13 @@ static struct sighand_struct init_sighand = { .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), }; +#ifdef CONFIG_SHADOW_CALL_STACK +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] + __init_task_data = { + [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC +}; +#endif + /* * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) @@ -141,6 +149,11 @@ struct task_struct init_task .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, #endif +#ifdef CONFIG_TASKS_TRACE_RCU + .trc_reader_nesting = 0, + .trc_reader_special.s = 0, + .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), +#endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), #endif diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/compat.c b/kernel/compat.c index 843dd17e6078..b8d2800bb4b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); - user_access_end(); + user_read_access_end(); return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); - user_access_end(); + user_write_access_end(); return 0; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..9f892144db6b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -432,7 +432,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each - * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * CPU. 
Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); @@ -1327,7 +1327,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1352,7 +1352,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; @@ -1376,8 +1376,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1385,15 +1385,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1405,7 +1405,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1418,7 +1418,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -6,12 +6,6 @@ #include <linux/export.h> /* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - -/* * stores the physical address of elf header of crash image * * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..b1991043b7d8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; diff --git a/kernel/events/core.c b/kernel/events/core.c index 633b4ae72ed5..e296c5c59c6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -95,11 +95,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. 
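+ * The loop below also drops in a cond_resched() between attempts, so a
+ * caller chasing a task that keeps migrating cannot monopolize the CPU.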
* - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -112,11 +112,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - if (!ret) - ret = data.ret; - } while (ret == -EAGAIN); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); + ret = !ret ? data.ret : -EAGAIN; + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } @@ -9404,7 +9409,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9464,7 +9469,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -11511,7 +11516,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..1b772f2c671b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1558,7 +1558,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1567,10 +1567,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1685,7 +1685,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1694,10 +1694,10 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 48ed22774efa..be98e94cb3cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include <linux/thread_info.h> #include <linux/stackleak.h> #include <linux/kasan.h> +#include <linux/scs.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * 
The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -1683,6 +1692,11 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..b4b9f960b610 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..3f310df4a693 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2179,6 +2179,24 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2211,10 +2229,62 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? 
: arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2295,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2255,6 +2330,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } @@ -2420,6 +2497,7 @@ static const struct file_operations debugfs_kprobes_operations = { /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } @@ -2446,10 +2524,15 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) return 0; } +static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) +{ + mutex_unlock(&kprobe_mutex); +} + static const struct seq_operations kprobe_blacklist_seq_ops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, - .stop = kprobe_seq_stop, /* Reuse void function */ + .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..dd3cc0854c32 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,25 +393,6 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. 
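The kprobes/blacklist seq_file fix above adopts the standard pattern of taking the guarding mutex in ->start and releasing it in ->stop, so the list cannot change in the middle of a traversal. A minimal sketch of that pattern, with hypothetical ex_list/ex_mutex names:

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

static LIST_HEAD(ex_list);
static DEFINE_MUTEX(ex_mutex);

static void *ex_seq_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&ex_mutex);		/* held across the whole traversal */
	return seq_list_start(&ex_list, *pos);
}

static void *ex_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &ex_list, pos);
}

static void ex_seq_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&ex_mutex);	/* dropped even on early termination */
}

static int ex_seq_show(struct seq_file *m, void *v)
{
	seq_puts(m, "entry\n");
	return 0;
}

static const struct seq_operations ex_seq_ops = {
	.start = ex_seq_start,
	.next  = ex_seq_next,
	.stop  = ex_seq_stop,
	.show  = ex_seq_show,
};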
- */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - static inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) @@ -489,7 +470,7 @@ struct lock_trace { struct hlist_node hash_entry; u32 hash; u32 nr_entries; - unsigned long entries[0] __aligned(sizeof(unsigned long)); + unsigned long entries[] __aligned(sizeof(unsigned long)); }; #define LOCK_TRACE_SIZE_IN_LONGS \ (sizeof(struct lock_trace) / sizeof(unsigned long)) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9f090d64f00..cfdd5b93264d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,7 +141,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) @@ -202,7 +201,6 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) diff --git a/kernel/module.c b/kernel/module.c index 086618a0058f..a0f201d2e184 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2400,7 +2400,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2433,7 +2433,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2768,6 +2768,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + bool __weak module_exit_section(const char *name) { return strstarts(name, ".exit"); @@ -3149,6 +3154,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), @@ -3193,6 +3201,13 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ei_funcs), &mod->num_ei_funcs); #endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 
b2b0f526f249..660f9a6bf73a 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..4242403316bb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -10,6 +10,7 @@ #include <linux/cpumask.h> #include <linux/irq_work.h> #include <linux/printk.h> +#include <linux/kprobes.h> #include "internal.h" @@ -293,14 +294,14 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void notrace printk_nmi_enter(void) +void noinstr printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -void notrace printk_nmi_exit(void) +void noinstr printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,37 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. + +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -210,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. 
+ + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4aa02eee8f6c..452feae8de20 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -29,6 +29,8 @@ config RCU_PERF_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -46,6 +48,8 @@ config RCU_TORTURE_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..cf66a3ccd757 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -431,6 +431,7 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void show_rcu_tasks_gp_kthreads(void); void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ @@ -441,6 +442,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR @@ -454,6 +457,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -471,6 +475,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) @@ -498,6 +503,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -507,6 +513,7 @@ static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..16dd1e6b7c09 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,6 +88,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = 
kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -722,6 +723,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..efb792e13fca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -20,7 +20,7 @@ #include <linux/err.h> #include <linux/spinlock.h> #include <linux/smp.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -45,12 +45,25 @@ #include <linux/sched/sysctl.h> #include <linux/oom.h> #include <linux/tick.h> +#include <linux/rcupdate_trace.h> #include "rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ @@ -102,6 +115,9 @@ torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -665,6 +681,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -674,7 +695,7 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .fqs = NULL, @@ -725,6 +746,72 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_rude_torture_deferred_free, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .call = call_rcu_tasks_rude, + .cb_barrier = rcu_barrier_tasks_rude, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks-rude" +}; + +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_tracing_torture_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. */ + .readunlock = tasks_tracing_torture_read_unlock, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -734,7 +821,7 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) static bool __maybe_unused torturing_tasks(void) { - return cur_ops == &tasks_ops; + return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; } /* @@ -833,7 +920,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -843,7 +930,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. 
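The ULONG_CMP_LT()-to-time_before() conversions above switch the jiffies comparisons to the canonical helpers; both forms tolerate counter wraparound, time_before() is simply the idiomatic spelling for jiffies. A self-contained sketch of the signed-difference trick involved:

/* Wrap-safe "a is before b" for a free-running unsigned counter: the
 * unsigned subtraction wraps modulo 2^BITS, so its sign is meaningful
 * whenever the two values are less than half the range apart. */
static int counter_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}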
*/ @@ -914,7 +1001,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } @@ -1147,6 +1234,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + unsigned long flags; int idxnew = -1; int idxold = *readstate; int statesnew = ~*readstate & newstate; @@ -1181,8 +1269,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); - if (statesold & RCUTORTURE_RDR_RCU) + if (statesold & RCUTORTURE_RDR_RCU) { + bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + + if (lockit) + raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + if (lockit) + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -1283,6 +1378,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(srcu_ctlp) || + rcu_read_lock_trace_held() || torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ @@ -1444,9 +1540,9 @@ rcu_torture_stats_print(void) atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1536,6 +1632,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1544,6 +1641,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1599,6 +1697,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1607,26 +1706,37 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. 
*/ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); + cur_ops->readunlock(idx); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1636,7 +1746,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } @@ -1692,8 +1802,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { @@ -2400,7 +2510,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, + &tasks_tracing_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..6d3ef700fb0e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; @@ -1268,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1277,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..ce23f6cc5043 --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,1193 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. McKenney + */ + +#ifdef CONFIG_TASKS_RCU_GENERIC + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. 
+ +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(void); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(struct list_head *hop); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); + +/** + * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. + * @cbs_head: Head of callback list. + * @cbs_tail: Tail pointer for callback list. + * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_lock: Lock protecting callback list. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. + * @n_gps: Number of grace periods completed since boot. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdouts_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). + * @call_func: This flavor's call_rcu()-equivalent function. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. + */ +struct rcu_tasks { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + struct wait_queue_head cbs_wq; + raw_spinlock_t cbs_lock; + int gp_state; + unsigned long gp_jiffies; + unsigned long gp_start; + unsigned long n_gps; + unsigned long n_ipis; + unsigned long n_ipis_fails; + struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; + call_rcu_func_t call_func; + char *name; + char *kname; +}; + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_tail = &rt_name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .name = n, \ + .kname = #rt_name, \ +} + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); + +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (HZ / 2) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); + +/* RCU tasks grace-period state for debugging. 
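The cbs_head/cbs_tail pair in the structure above is the classic tail-pointer queue: enqueue is O(1) at the tail, and the grace-period kthread drains the whole list with a couple of pointer moves. A standalone sketch of the same shape, with hypothetical names:

#include <stddef.h>

struct cb { struct cb *next; };

struct cbq {
	struct cb *head;
	struct cb **tail;	/* points at head, or at the last ->next */
};

static void cbq_init(struct cbq *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

static void cbq_push(struct cbq *q, struct cb *c)
{
	c->next = NULL;
	*q->tail = c;		/* append after the current last element... */
	q->tail = &c->next;	/* ...and advance the tail pointer */
}

static struct cb *cbq_pop_all(struct cbq *q)
{
	struct cb *list = q->head;

	cbq_init(q);		/* queue is empty again */
	return list;		/* caller walks the snapshot privately */
}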
*/ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; + +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +/* Return state name. */ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + needwake = !rtp->cbs_head; + WRITE_ONCE(*rtp->cbs_tail, rhp); + rtp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rtp->kthread_ptr)) + wake_up(&rtp->cbs_wq); +} + +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(rtp->call_func); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_tasks *rtp = arg; + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_FLAG_RCU); + WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + smp_mb__after_spinlock(); // Order updates vs. GP. + list = rtp->cbs_head; + rtp->cbs_head = NULL; + rtp->cbs_tail = &rtp->cbs_head; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rtp->cbs_wq, + READ_ONCE(rtp->cbs_head)); + if (!rtp->cbs_head) { + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + // Wait for one grace period. 
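+		// ("list" is now a private snapshot: new arrivals land on the
+		// emptied ->cbs_head, and gp_func() returns only after a full
+		// grace period, so invoking the snapshot below is safe.)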
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rtp->gp_func(rtp); + rtp->n_gps++; + + /* Invoke the callbacks. */ + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + /* Paranoid sleep to keep this from entering a tight loop */ + schedule_timeout_uninterruptible(HZ/10); + + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + } +} + +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) + return; + smp_mb(); /* Ensure others see full kthread. */ +} + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + rtp->kname, + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), + data_race(rtp->n_gps), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[!!data_race(rtp->cbs_head)], + s); +} + +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(holdouts); + int fract; + + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); + rtp->pregp_func(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. + */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); + rtp->postscan_func(&holdouts); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. 
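+	 * The sleep below lasts HZ/fract jiffies and fract steps down from
+	 * 10 to 1, so each rescan of a still-populated holdout list waits a
+	 * little longer, up to one second.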
*/ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + + if (list_empty(&holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + } + + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); +} + +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* Pre-grace-period preparation. */ +static void rcu_tasks_pregp_step(void) +{ + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); +} + +/* Per-task initial processing. */ +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) +{ + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, hop); + } +} + +/* Processing between scanning the task list and draining the holdout list. */ +void rcu_tasks_postscan(struct list_head *hop) +{ + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); +} + +/* See if tasks are still holding out, complain if so. 
*/ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Scan the holdout lists for tasks no longer holding out. */ +static void check_all_holdout_tasks(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *t, *t1; + + list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, firstreport); + cond_resched(); + } +} + +/* Finish off the Tasks-RCU grace period. */ +static void rcu_tasks_postgp(struct rcu_tasks *rtp) +{ + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + +/** + * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. 
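+ *
+ * A hedged usage sketch with hypothetical tracer-teardown names:
+ *
+ *	unregister_my_hook(&hook);	// stop new entries into the old code
+ *	synchronize_rcu_tasks();	// wait out tasks already inside it
+ *	kfree(hook.trampoline);		// now safe to free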
+ * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +static void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} + +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + struct task_struct *t = current; + + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + preempt_enable(); + exit_tasks_rcu_finish_trace(t); +} + +#else /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) { } +void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. 
+static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + rtp->n_ipis += cpumask_weight(cpu_online_mask); + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, + "RCU Tasks Rude"); + +/** + * call_rcu_tasks_rude() - Queue an RCU callback for invocation after a rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +static void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} + +#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */
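The rude variant's enforcement mechanism is simple enough to see in isolation. A toy sketch of one rude grace period "by hand", resting on the property stated above: workqueue workers are ordinary kthreads, so running even an empty work item on every CPU forces each CPU through at least one context switch (names here are illustrative, not from the patch):

	#include <linux/workqueue.h>

	static void toy_noop_work(struct work_struct *unused)
	{
		/* Merely running here means this CPU switched to a kworker. */
	}

	/* Editorial sketch; must be called from a context that may sleep. */
	static void toy_rude_gp(void)
	{
		/* Queues the empty work on every online CPU and waits for
		 * each instance to run, i.e. one context switch per CPU. */
		schedule_on_each_cpu(toy_noop_work);
	}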
+ +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instructions, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // Wait queue for the grace-period kthread. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +unsigned long n_heavy_reader_attempts; +unsigned long n_heavy_reader_updates; +unsigned long n_heavy_reader_ofl_updates; + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/* + * This irq_work handler allows rcu_read_unlock_trace() to be invoked + * while the scheduler locks are held. + */ +static void rcu_read_unlock_iw(struct irq_work *iwp) +{ + wake_up(&trc_wait); +} +static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) +{ + int nq = t->trc_reader_special.b.need_qs; + + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) + irq_work_queue(&rcu_tasks_trace_iw); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +}
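For orientation, the read side that all of the following update-side machinery is probing looks roughly like this. The hook and payload names are hypothetical, but rcu_read_lock_trace() and rcu_read_unlock_trace() are the explicit markers this variant provides:

	#include <linux/rcupdate_trace.h>

	struct hook_payload {			/* hypothetical */
		struct rcu_head rh;
		void (*func)(void);
	};

	static struct hook_payload *active_hook;	/* hypothetical */

	static void run_hook(void)
	{
		struct hook_payload *p;

		rcu_read_lock_trace();	/* legal from idle, exception
					 * entry/exit, and preemptible code */
		p = READ_ONCE(active_hook);
		if (p)
			p->func();
		rcu_read_unlock_trace();
	}

A retiring update would swap the pointer and then use call_rcu_tasks_trace(), defined later in this file, to free the old payload.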
+ +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + int cpu = task_cpu(t); + bool in_qs = false; + bool ofl = cpu_is_offline(cpu); + + if (task_curr(t)) { + WARN_ON_ONCE(ofl && !is_idle_task(t)); + + // If no chance of heavyweight readers, do it the hard way. + if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + return false; + + // If heavyweight readers are enabled on the remote task, + // we can inspect its state even though it is currently running. + // However, we cannot safely change its state. + n_heavy_reader_attempts++; + if (!ofl && // Check for "running" idle tasks on offline CPUs. + !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + return false; // No quiescent state, do it the hard way. + n_heavy_reader_updates++; + if (ofl) + n_heavy_reader_ofl_updates++; + in_qs = true; + } else { + in_qs = likely(!t->trc_reader_nesting); + } + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + if (in_qs) + return true; // Already in quiescent state, done!!! + + // The task is in a read-side critical section, so set up its + // state so that it will awaken the grace-period kthread upon exit + // from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection.
+ get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If the task is currently running, send an IPI; either way, add it to the list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + rcu_tasks_trace.n_ipis++; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + rcu_tasks_trace.n_ipis_fails++; + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_checked, false); + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* + * Do intermediate processing between task and holdout scans and + * pick up the idle tasks. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) +{ + int cpu; + + for_each_possible_cpu(cpu) + rcu_tasks_trace_pertask(idle_task(cpu), hop); + + // Re-enable CPU hotplug now that the tasklist scan has completed. + cpus_read_unlock(); + + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). + cpu = task_cpu(t); + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], + ".i"[is_idle_task(t)], + ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], + t->trc_reader_nesting, + " N"[!!t->trc_reader_special.b.need_qs], + cpu); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *g, *t; + + // Disable CPU hotplug across the holdout list scan.
+ cpus_read_lock(); + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + + if (needreport) { + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) +{ + bool firstreport; + struct task_struct *g, *t; + LIST_HEAD(holdouts); + long ret; + + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + set_tasks_gp_state(rtp, RTGS_WAIT_READERS); + for (;;) { + ret = wait_event_idle_exclusive_timeout( + trc_wait, + atomic_read(&trc_n_readers_need_end) == 0, + READ_ONCE(rcu_task_stall_timeout)); + if (ret) + break; // Count reached zero. + // Stall warning time, so make a list of the offenders. + for_each_process_thread(g, t) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) + trc_add_holdout(t, &holdouts); + firstreport = true; + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + show_stalled_task_trace(t, &firstreport); + trc_del_holdout(t); + } + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/tasklist mismatch?)\n"); + show_stalled_ipi_trace(); + pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); + } + smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. +} + +/* Report any needed quiescent state for this exiting task. */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); +} + +/** + * call_rcu_tasks_trace() - Queue an RCU callback for invocation after a trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. Unlike the other + * Tasks-RCU flavors, this variant does have read-side markers: its + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). As before, this primitive is intended + * to determine that all tasks have passed through a safe state, not + * so much for data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees.
+ */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to + * rcu_read_lock_trace() and rcu_read_unlock_trace(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +static void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[64]; + + sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_ofl_updates), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static inline void show_rcu_tasks_trace_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} + +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9a49cd6065a..c716eadc7617 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -67,6 +67,19 @@ #endif #define MODULE_PARAM_PREFIX "rcutree."
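The hunk below stubs out three KCSAN annotations so that tree.c can use them unconditionally; with KCSAN enabled, the real definitions report or suppress data-race findings instead of compiling away. A minimal sketch of the intended usage, with hypothetical variable and function names:

	#include <linux/compiler.h>

	static unsigned long gp_diag_counter;	/* hypothetical diagnostic counter */

	/* Reader: the race is intentional, so the access is wrapped in
	 * data_race() rather than READ_ONCE(), telling KCSAN to stay quiet. */
	static unsigned long gp_diag_peek(void)
	{
		return data_race(gp_diag_counter);
	}

	/* Writer: assert that this is the only thread updating the counter,
	 * then store with WRITE_ONCE() to avoid store tearing. */
	static void gp_diag_step(void)
	{
		ASSERT_EXCLUSIVE_WRITER(gp_diag_counter);
		WRITE_ONCE(gp_diag_counter, gp_diag_counter + 1);
	}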
+#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Data structures. */ /* @@ -75,9 +88,6 @@ */ #define RCU_DYNTICK_CTRL_MASK 0x1 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, @@ -100,7 +110,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; @@ -225,9 +235,11 @@ void rcu_softirq_qs(void) /* * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. */ -static void rcu_dynticks_eqs_enter(void) +static noinstr void rcu_dynticks_eqs_enter(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -237,8 +249,9 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); /* Better not have special action (TLB flush) pending! */ @@ -248,9 +261,10 @@ static void rcu_dynticks_eqs_enter(void) /* * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. */ -static void rcu_dynticks_eqs_exit(void) +static noinstr void rcu_dynticks_eqs_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -261,13 +275,13 @@ static void rcu_dynticks_eqs_exit(void) * critical section. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ - /* Prefer duplicate flushes to losing a flush. */ - rcu_eqs_special_exit(); } } @@ -295,7 +309,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. 
*/ -static bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -333,6 +347,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) } /* + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. + */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int snap; + + // If not quiescent, force back to earlier extended quiescent state. + snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | + RCU_DYNTICK_CTRL_CTR); + + smp_rmb(); // Order ->dynticks and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure. + smp_rmb(); // Order *vp read and ->dynticks re-read. + + // If still in the same extended quiescent state, we are good! + return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); +} + +/* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the * next exit from an extended quiescent state. Returns true if @@ -382,16 +418,23 @@ void rcu_momentary_dyntick_idle(void) EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** - * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * * If the current CPU is idle and running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. + * interrupt, or directly, from idle, return true. + * + * The caller must have at least disabled IRQs. */ static int rcu_is_cpu_rrupt_from_idle(void) { - /* Called only from within the scheduling-clock interrupt */ - lockdep_assert_in_irq(); + long nesting; + + /* + * Usually called from the tick; but also used from smp_call_function() + * for expedited grace periods. This latter can result in running from + * the idle task, instead of an actual IPI. + */ + lockdep_assert_irqs_disabled(); /* Check for counter underflows */ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, @@ -400,9 +443,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + if (nesting > 1) return false; + + /* + * If we're not in an interrupt, we must be in the idle task! + */ + WARN_ON_ONCE(!nesting && !is_idle_task(current)); + /* Does CPU appear to be idle from an RCU standpoint? */ return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } @@ -562,7 +611,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter(bool user) +static noinstr void rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -571,19 +620,24 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdp->dynticks_nesting == 0); if (rdp->dynticks_nesting != 1) { + // RCU will still be watching, so just do accounting and leave.
rdp->dynticks_nesting--; + return; + } + lockdep_assert_irqs_disabled(); + instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. rcu_dynticks_task_enter(); } @@ -616,23 +670,25 @@ void rcu_idle_enter(void) * If you add or remove a call to rcu_user_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_enter(void) +noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ -/* +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * * If we are returning from the outermost NMI handler that interrupted an * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit_common(), be sure to test + * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_exit_common(bool irq) +noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -649,38 +705,33 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { + instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); + instrumentation_end(); return; } + instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (irq) + if (!in_nmi()) rcu_prepare_for_idle(); + instrumentation_end(); + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. - if (irq) + if (!in_nmi()) rcu_dynticks_task_enter(); } /** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_exit(void) -{ - rcu_nmi_exit_common(false); -} - -/** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * * Exit from an interrupt handler, which might possibly result in entering @@ -699,12 +750,52 @@ void rcu_nmi_exit(void) * If you add or remove a call to rcu_irq_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_exit(void) +void noinstr rcu_irq_exit(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); +}
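The nesting arithmetic above is compact enough to be easy to misread. A standalone toy model (editorial, not from the patch) of how dynticks_nmi_nesting evolves: the outermost entry from an extended quiescent state adds 1, nested entries add 2, and exit reverses the process, so a value of 1 always means "outermost handler, CPU was previously idle":

	#include <stdio.h>

	static long nmi_nesting;	/* models rdp->dynticks_nmi_nesting */

	static void model_enter(int from_eqs)
	{
		nmi_nesting += from_eqs ? 1 : 2;	/* incby in rcu_nmi_enter() */
	}

	static void model_exit(void)
	{
		nmi_nesting -= (nmi_nesting == 1) ? 1 : 2;
	}

	int main(void)
	{
		model_enter(1);	/* irq from idle:  0 -> 1, RCU starts watching */
		model_enter(0);	/* nested NMI:     1 -> 3 */
		model_exit();	/* NMI return:     3 -> 1 */
		model_exit();	/* irq return:     1 -> 0, RCU stops watching */
		printf("final nesting: %ld\n", nmi_nesting);
		return 0;
	}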
+ +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * toward in-kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU's point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_exit_common(true); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); } +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) +{ + lockdep_assert_irqs_disabled(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} +#endif /* #ifdef CONFIG_PROVE_RCU */ + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * @@ -728,7 +819,7 @@ void rcu_irq_exit_irqson(void) * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ -static void rcu_eqs_exit(bool user) +static void noinstr rcu_eqs_exit(bool user) { struct rcu_data *rdp; long oldval; @@ -738,17 +829,22 @@ static void rcu_eqs_exit(bool user) oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { + // RCU was already watching, so just do accounting and leave. rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. + instrumentation_begin(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); } /** @@ -779,14 +875,75 @@ void rcu_idle_exit(void) * If you add or remove a call to rcu_user_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_exit(void) +void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, including RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU.
+ * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter_common - inform RCU of entry to NMI context + * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and @@ -795,10 +952,10 @@ void rcu_user_exit(void) * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter_common(), be sure to test + * If you add or remove a call to rcu_nmi_enter(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_enter_common(bool irq) +noinstr void rcu_nmi_enter(void) { long incby = 2; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -816,45 +973,33 @@ */ if (rcu_dynticks_curr_cpu_in_eqs()) { - if (irq) + if (!in_nmi()) rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. - if (irq) + if (!in_nmi()) rcu_cleanup_after_idle(); incby = 1; - } else if (irq && tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - raw_spin_lock_rcu_node(rdp->mynode); - // Recheck under lock. - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); + } else if (!in_nmi()) { + instrumentation_begin(); + rcu_irq_enter_check_tick(); + instrumentation_end(); } + instrumentation_begin(); trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. 
*/ rdp->dynticks_nmi_nesting + incby); barrier(); } /** - * rcu_nmi_enter - inform RCU of entry to NMI context - */ -void rcu_nmi_enter(void) -{ - rcu_nmi_enter_common(false); -} -NOKPROBE_SYMBOL(rcu_nmi_enter); - -/** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * * Enter an interrupt handler, which might possibly result in exiting @@ -876,10 +1021,10 @@ NOKPROBE_SYMBOL(rcu_nmi_enter); * If you add or remove a call to rcu_irq_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_enter(void) +noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_enter_common(true); + rcu_nmi_enter(); } /* @@ -913,6 +1058,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -921,7 +1071,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. */ -bool notrace rcu_is_watching(void) +bool rcu_is_watching(void) { bool ret; @@ -973,12 +1123,12 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi() || !rcu_scheduler_fully_active) return true; - preempt_disable(); + preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ret = true; - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); @@ -1217,7 +1367,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1473,6 +1623,31 @@ static void rcu_gp_slow(int delay) schedule_timeout_uninterruptible(delay); } +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_uninterruptible(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1506,6 +1681,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1611,12 +1787,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. 
+ if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1654,13 +1834,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1673,6 +1855,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1680,12 +1863,16 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1705,6 +1892,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } @@ -1781,6 +1969,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); @@ -1821,6 +2010,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) @@ -2811,6 +3001,8 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; + // Number of objects for which GP not started + int count; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); @@ -2924,6 +3116,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } + WRITE_ONCE(krcp->count, 0); + /* * One work is per one batch, so there are two "free channels", * "bhead_free" and "head_free" the batch can handle. It can be @@ -3060,6 +3254,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } + WRITE_ONCE(krcp->count, krcp->count + 1); + // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3074,6 +3270,56 @@ unlock_return: } EXPORT_SYMBOL_GPL(kfree_call_rcu); +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += READ_ONCE(krcp->count); + } + + return count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -3599,6 +3845,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -3994,6 +4241,8 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..43991a40b084 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -359,6 +359,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ @@ -454,6 +455,8 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +static void rcu_dynticks_task_trace_enter(void); +static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..72952edad1e4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. 
*/ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -639,6 +639,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) */ static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -649,7 +650,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!rcu_preempt_depth()) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -673,7 +674,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (rcu_preempt_depth() > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -683,30 +684,8 @@ static void rcu_exp_handler(void *unused) return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_deferred_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->exp_deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + // Finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); } /* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. 
*/ @@ -721,17 +700,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..352223664ebd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -331,6 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -345,9 +346,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. */ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. */ #define RCU_NEST_PMAX (INT_MAX / 2) static void rcu_preempt_read_enter(void) @@ -355,9 +354,9 @@ static void rcu_preempt_read_enter(void) current->rcu_read_lock_nesting++; } -static void rcu_preempt_read_exit(void) +static int rcu_preempt_read_exit(void) { - current->rcu_read_lock_nesting--; + return --current->rcu_read_lock_nesting; } static void rcu_preempt_depth_set(int val) @@ -390,21 +389,15 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (rcu_preempt_depth() != 1) { - rcu_preempt_read_exit(); - } else { + if (rcu_preempt_read_exit() == 0) { barrier(); /* critical section before exit code. */ - rcu_preempt_depth_set(-RCU_NEST_BIAS); - barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -500,12 +493,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -556,7 +549,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - rcu_preempt_depth() <= 0; + rcu_preempt_depth() == 0; } /* @@ -569,16 +562,11 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -615,19 +603,18 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - tick_nohz_full_cpu(rdp->cpu); + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && - (in_interrupt() || - (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { - // Using softirq, safe to awaken, and we get - // no help from enabling irqs, unlike bh/preempt. + if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is an expedited GP. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting or NO_HZ_FULL, slow is OK. + // Also if no expediting, slow is OK. + // Plus nohz_full CPUs eventually get tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && @@ -640,7 +627,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } @@ -699,7 +685,7 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!rcu_preempt_depth()) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. 
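/*
 * Editorial aside, not part of the patch above: the rewritten
 * rcu_read_unlock_special() condition boils down to "take the softirq
 * path only when it is free (already in_irq()) or urgently needed (an
 * expedited GP is pending and irqs were enabled)".  Modeled below as a
 * pure predicate over the inputs the hunk tests, with use_softirq
 * assumed true:
 */
#include <stdbool.h>
#include <stdio.h>

static bool use_softirq_path(bool in_irq, bool exp_gp_pending,
			     bool irqs_were_disabled)
{
	return in_irq || (exp_gp_pending && !irqs_were_disabled);
}

int main(void)
{
	printf("%d\n", use_softirq_path(true, false, false));	/* 1: wakeup is free */
	printf("%d\n", use_softirq_path(false, true, false));	/* 1: expedited GP */
	printf("%d\n", use_softirq_path(false, true, true));	/* 0: resched path */
	return 0;
}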
*/ return; } @@ -760,8 +746,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -854,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } @@ -1036,7 +1021,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); @@ -1079,9 +1065,9 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); @@ -2536,7 +2522,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; @@ -2553,7 +2539,7 @@ static void rcu_bind_gp_kthread(void) } /* Record the current task on dyntick-idle entry. */ -static void rcu_dynticks_task_enter(void) +static void noinstr rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); @@ -2561,9 +2547,27 @@ static void rcu_dynticks_task_enter(void) } /* Record no current task on dyntick-idle exit. */ -static void rcu_dynticks_task_exit(void) +static void noinstr rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. 
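/*
 * Editorial aside, not part of the patch above: several hunks here
 * replace ULONG_CMP_LT/ULONG_CMP_GE on jiffies with time_before() and
 * time_after(), which stay correct across counter wraparound because
 * they compare a signed difference.  The trick in isolation, with the
 * macros reproduced minus the kernel's typecheck():
 */
#include <stdio.h>

#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

int main(void)
{
	unsigned long near_wrap = (unsigned long)-5;	/* jiffies about to wrap */
	unsigned long wrapped = 10;			/* jiffies after the wrap */

	printf("%d\n", time_before(near_wrap, wrapped));	/* 1: correct */
	printf("%d\n", near_wrap < wrapped);			/* 0: naive compare lies */
	return 0;
}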
*/ +static void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..ae76bd329582 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } @@ -192,14 +224,40 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static bool check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_node *rnp; + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. 
+ rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rnp = t->rcu_blocked_node; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return true; +} + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static int rcu_print_task_stall(struct rcu_node *rnp) { - struct task_struct *t; int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -208,7 +266,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); + if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); ndetected++; } pr_cont("\n"); @@ -299,6 +365,16 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } +/* Is the RCU grace-period kthread being starved of CPU time? */ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -313,6 +389,7 @@ static const char *gp_state_getname(short gs) static void print_cpu_stall_info(int cpu) { unsigned long delta; + bool falsepositive; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; @@ -333,7 +410,9 @@ static void print_cpu_stall_info(int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -345,8 +424,9 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_snap(rdp) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz, + falsepositive ? " (false positive?)" : ""); } /* Complain about starvation of grace-period kthread. */ @@ -355,15 +435,15 @@ static void rcu_check_gp_kthread_starvation(void) struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { + if (rcu_is_gp_kthread_starving(&j)) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), + data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); if (gpk) { + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); wake_up_process(gpk); @@ -371,7 +451,7 @@ static void rcu_check_gp_kthread_starvation(void) } } -static void print_other_cpu_stall(unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -408,7 +488,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(); @@ -421,13 +501,11 @@ static void print_other_cpu_stall(unsigned long gp_seq) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); + gpa = data_race(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), + data_race(jiffies_till_next_fqs), rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ @@ -442,7 +520,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(void) +static void print_cpu_stall(unsigned long gps) { int cpu; unsigned long flags; @@ -467,7 +545,7 @@ static void print_cpu_stall(void) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, + jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); rcu_check_gp_kthread_starvation(); @@ -546,7 +624,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); + print_cpu_stall(gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); @@ -555,7 +633,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); + print_other_cpu_stall(gs2, gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); } @@ -581,23 +659,23 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); + ja = j - data_race(rcu_state.gp_activity); + jr = j - data_race(rcu_state.gp_req_activity); + jw = j - data_race(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? 
t->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); + ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + (long)data_race(rcu_state.gp_seq), + (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), - (long)READ_ONCE(rnp->gp_seq_needed)); + rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), + (long)data_race(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -607,7 +685,7 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)READ_ONCE(rdp->gp_seq_needed)); + cpu, (long)data_race(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -615,7 +693,7 @@ void show_rcu_gp_kthreads(void) if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } - /* sched_show_task(rcu_state.gp_kthread); */ + show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..84843adfd939 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -41,6 +41,7 @@ #include <linux/sched/isolation.h> #include <linux/kprobes.h> #include <linux/slab.h> +#include <linux/irq_work.h> #define CREATE_TRACE_POINTS @@ -51,6 +52,19 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); @@ -63,12 +77,12 @@ module_param(rcu_normal_after_boot, int, 0); * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? * @ret: Best guess answer if lockdep cannot be relied on * - * Returns true if lockdep must be ignored, in which case *ret contains + * Returns true if lockdep must be ignored, in which case ``*ret`` contains * the best guess described below. Otherwise returns false, in which - * case *ret tells the caller nothing and the caller should instead + * case ``*ret`` tells the caller nothing and the caller should instead * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -82,7 +96,7 @@ module_param(rcu_normal_after_boot, int, 0); * * Note that if the CPU is in the idle loop from an RCU point of view (ie: * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) - * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). 
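/*
 * Editorial aside, not part of the patch above: the update.c hunk adds
 * a data_race() fallback so trees without KCSAN still build, and the
 * stall/kthread printouts above switch diagnostic-only loads from
 * READ_ONCE() to data_race().  The intended split, shown with a
 * hypothetical diagnostic field:
 */
#include <stdio.h>

#define data_race(expr) ({ expr; })	/* same fallback shape as the hunk */

struct state {
	unsigned long gp_flags;		/* written under a lock elsewhere */
};

static struct state st;

static void print_diagnostics(void)
{
	/*
	 * A stale or torn value is tolerable in a printout, so the load
	 * is annotated as a known data race instead of being given
	 * READ_ONCE() semantics it does not need.
	 */
	printf("gp_flags=%#lx\n", data_race(st.gp_flags));
}

int main(void)
{
	st.gp_flags = 0x1;
	print_diagnostics();
	return 0;
}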
The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, * so such a CPU is effectively never in an RCU read-side critical section @@ -98,15 +112,15 @@ module_param(rcu_normal_after_boot, int, 0); static bool rcu_read_lock_held_common(bool *ret) { if (!debug_lockdep_rcu_enabled()) { - *ret = 1; + *ret = true; return true; } if (!rcu_is_watching()) { - *ret = 0; + *ret = false; return true; } if (!rcu_lockdep_current_cpu_online()) { - *ret = 0; + *ret = false; return true; } return false; @@ -270,13 +284,12 @@ struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); EXPORT_SYMBOL_GPL(rcu_callback_map); -int notrace debug_lockdep_rcu_enabled(void) +noinstr int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? @@ -501,370 +514,6 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. 
- * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); - -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. 
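/*
 * Editorial aside, not part of the patch above: the call_rcu_tasks()
 * code being moved out of update.c appends callbacks through a head
 * pointer plus a pointer to the tail ->next slot, giving O(1) enqueue
 * and O(1) bulk drain with no back-pointer.  That queue in miniature:
 */
#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

static struct cb *head;
static struct cb **tail = &head;	/* points at the last ->next slot */

static void enqueue(struct cb *c)
{
	c->next = NULL;
	*tail = c;		/* link in at the end ... */
	tail = &c->next;	/* ... and advance the tail slot */
}

static struct cb *drain_all(void)	/* detach the whole list at once */
{
	struct cb *list = head;

	head = NULL;
	tail = &head;
	return list;
}

int main(void)
{
	struct cb a = { .id = 1 }, b = { .id = 2 };
	struct cb *c;

	enqueue(&a);
	enqueue(&b);
	for (c = drain_all(); c; c = c->next)
		printf("cb %d\n", c->id);
	return 0;
}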
*/ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. 
- * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. 
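/*
 * Editorial aside, not part of the patch above: the relocated
 * exit_tasks_rcu_start()/exit_tasks_rcu_finish() pair brackets the
 * late task-exit path in an SRCU read-side critical section so that
 * synchronize_srcu() can wait out exiting tasks.  The bracketing shape
 * with hypothetical stand-ins for the SRCU primitives:
 */
#include <stdio.h>

static int readers;		/* stand-in for the SRCU reader count */

static int region_enter(void)	/* models __srcu_read_lock() */
{
	readers++;
	return 0;		/* the index a real SRCU lock returns */
}

static void region_exit(int idx)	/* models __srcu_read_unlock() */
{
	(void)idx;
	readers--;
}

static void task_exit_path(void)
{
	int idx = region_enter();	/* exit_tasks_rcu_start() */

	printf("tearing down; %d reader(s) visible to waiters\n", readers);
	region_exit(idx);		/* exit_tasks_rcu_finish() */
}

int main(void)
{
	task_exit_path();
	return 0;
}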
- */ -static void __init rcu_tasks_bootup_oddness(void) -{ -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ -} - -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_PROVE_RCU /* @@ -935,6 +584,8 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..d0c9c287680a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..0ae29fd57817 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include <linux/nospec.h> #include <linux/kcov.h> +#include <linux/scs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -2561,6 +2562,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2635,6 +2638,52 @@ out: } /** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. 
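/*
 * Editorial aside, not part of the patch above: callers of the new
 * try_invoke_on_locked_down_task() hand it a predicate plus a cookie,
 * exactly as rcu_print_task_stall() does with check_slow_task() and a
 * struct rcu_stall_chk_rdr.  The calling convention in miniature, with
 * hypothetical task/report types:
 */
#include <stdbool.h>
#include <stdio.h>

struct task { int pid; bool running; };
struct report { int pid; };

/* Models the @func callback; returning false declines the inspection. */
static bool inspect(struct task *t, void *arg)
{
	struct report *r = arg;

	if (t->running)
		return false;	/* still running, nothing stable to read */
	r->pid = t->pid;
	return true;
}

/* Stand-in for try_invoke_on_locked_down_task(): pin, call, unpin. */
static bool invoke_locked(struct task *t,
			  bool (*func)(struct task *, void *), void *arg)
{
	/* the real helper holds ->pi_lock or the rq lock around @func */
	return func(t, arg);
}

int main(void)
{
	struct task t = { .pid = 42, .running = false };
	struct report r;

	if (invoke_locked(&t, inspect, &r))
		printf("stable snapshot of P%d\n", r.pid);
	return 0;
}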
* @@ -3877,6 +3926,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -6040,6 +6092,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..222a7a9ad543 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include <linux/kasan.h> +#include <linux/mm.h> +#include <linux/scs.h> +#include <linux/slab.h> +#include <linux/vmstat.h> + +static struct kmem_cache *scs_cache; + +static void __scs_account(void *s, int account) +{ + struct page *scs_page = virt_to_page(s); + + mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / SZ_1K)); +} + +static void *scs_alloc(int node) +{ + void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + + if (!s) + return NULL; + + *__scs_magic(s) = SCS_END_MAGIC; + + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + __scs_account(s, 1); + return s; +} + +static void scs_free(void *s) +{ + __scs_account(s, -1); + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = task_scs_sp(tsk) = s; + return 0; +} + +static void scs_check_usage(struct task_struct *tsk) +{ + static unsigned long highest; + + unsigned long *p, prev, curr = highest, used = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) + return; + + for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + if (!READ_ONCE_NOCHECK(*p)) + break; + used++; + } + + while (used > curr) { + prev = cmpxchg_relaxed(&highest, curr, used); + + if (prev == curr) { + pr_info("%s (%d): highest shadow stack usage: %lu bytes\n", + tsk->comm, task_pid_nr(tsk), used); + break; + } + + curr = prev; + } +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); + scs_check_usage(tsk); + scs_free(s); +} diff --git a/kernel/signal.c b/kernel/signal.c index 284fc1600063..5ca48cc5da76 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This 
function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; -#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct 
compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..84303197caf9 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -620,7 +620,7 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -void on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/sys.c b/kernel/sys.c index 180a2fa33f7f..891667a49bb7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2634,6 +2634,7 @@ struct compat_sysinfo { COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; + struct compat_sysinfo s_32; do_sysinfo(&s); @@ -2658,23 +2659,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(info, sizeof(struct compat_sysinfo)) || - __put_user(s.uptime, &info->uptime) || - __put_user(s.loads[0], &info->loads[0]) || - __put_user(s.loads[1], &info->loads[1]) || - __put_user(s.loads[2], &info->loads[2]) || - __put_user(s.totalram, &info->totalram) || - __put_user(s.freeram, &info->freeram) || - __put_user(s.sharedram, &info->sharedram) || - __put_user(s.bufferram, &info->bufferram) || - __put_user(s.totalswap, &info->totalswap) || - __put_user(s.freeswap, &info->freeswap) || - __put_user(s.procs, &info->procs) || - __put_user(s.totalhigh, &info->totalhigh) || - __put_user(s.freehigh, &info->freehigh) || - __put_user(s.mem_unit, &info->mem_unit)) + memset(&s_32, 0, sizeof(s_32)); + s_32.uptime = s.uptime; + s_32.loads[0] = s.loads[0]; + s_32.loads[1] = s.loads[1]; + s_32.loads[2] = s.loads[2]; + s_32.totalram = s.totalram; + s_32.freeram = s.freeram; + s_32.sharedram = s.sharedram; + s_32.bufferram = s.bufferram; + s_32.totalswap = s.totalswap; + s_32.freeswap = s.freeswap; + s_32.procs = s.procs; + s_32.totalhigh = s.totalhigh; + s_32.freehigh = s.freehigh; + s_32.mem_unit = s.mem_unit; + if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; - return 0; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 743647005f64..24876faac753 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB @@ -158,6 +148,7 @@ config FUNCTION_TRACER select CONTEXT_SWITCH_TRACER select GLOB select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. 
This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a010edc37ee0..92ba69b716dc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1500,7 +1500,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bd030b1b9514..b5765aeea698 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -160,17 +160,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_rcu(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ @@ -256,7 +245,7 @@ static void update_ftrace_function(void) * Make sure all CPUs see this. Yes this is slow, but static * tracing is slow and nasty to have enabled. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* Now all cpus are using the list ops. */ function_trace_op = set_function_trace_op; /* Make sure the function_trace_op is visible on all CPUs */ @@ -2932,7 +2921,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* * When the kernel is preeptive, tasks can be preempted @@ -5888,7 +5877,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } diff --git a/lib/Kconfig b/lib/Kconfig index 8ec05335426c..df3f3da95990 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -83,6 +83,9 @@ config ARCH_USE_CMPXCHG_LOCKREF config ARCH_HAS_FAST_MULTIPLIER bool +config ARCH_USE_SYM_ANNOTATIONS + bool + config INDIRECT_PIO bool "Access I/O in non-MMIO mode" depends on ARM64 @@ -430,7 +433,7 @@ config INTERVAL_TREE See: - Documentation/rbtree.txt + Documentation/core-api/rbtree.rst for more information. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f3322a620674..b3b05adab575 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -369,6 +369,11 @@ config STACK_VALIDATION For more information, see tools/objtool/Documentation/stack-validation.txt. +config VMLINUX_VALIDATION + bool + depends on STACK_VALIDATION && DEBUG_ENTRY && !PARAVIRT + default y + config DEBUG_FORCE_WEAK_PER_CPU bool "Force weak per-cpu definitions" depends on DEBUG_KERNEL @@ -1510,7 +1515,7 @@ config PROVIDE_OHCI1394_DMA_INIT This code (~1k) is freed after boot. By then, the firewire stack in charge of the OHCI-1394 controllers should be used instead. - See Documentation/debugging-via-ohci1394.txt for more information. + See Documentation/core-api/debugging-via-ohci1394.rst for more information. 
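/*
 * Editorial aside, not part of the patch above: the compat sysinfo hunk
 * replaces fourteen __put_user() calls with "memset a local struct,
 * fill the fields, one copy_to_user()", which avoids leaking
 * uninitialized padding and needs a single fault check.  The same
 * pattern in plain C, with memcpy standing in for copy_to_user():
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wide { uint64_t uptime; uint64_t procs; };	/* native layout */
struct narrow { uint32_t uptime; uint32_t procs; };	/* compat layout */

static int export_narrow(void *user_buf, const struct wide *s)
{
	struct narrow n;

	memset(&n, 0, sizeof(n));		/* no padding bytes escape */
	n.uptime = (uint32_t)s->uptime;
	n.procs = (uint32_t)s->procs;
	memcpy(user_buf, &n, sizeof(n));	/* one bulk copy */
	return 0;
}

int main(void)
{
	struct wide s = { .uptime = 123, .procs = 7 };
	struct narrow out;

	export_narrow(&out, &s);
	printf("uptime=%u procs=%u\n", (unsigned)out.uptime,
	       (unsigned)out.procs);
	return 0;
}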
source "samples/Kconfig" diff --git a/lib/bitmap.c b/lib/bitmap.c index 89260aa342d6..21a7640c5eed 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -182,21 +182,22 @@ EXPORT_SYMBOL(__bitmap_shift_left); * * In pictures, example for a big-endian 32-bit architecture: * - * @src: - * 31 63 - * | | - * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 - * | | | | - * 16 14 0 32 - * - * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is: - * - * 31 63 - * | | - * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 - * | | | - * 14 (bit 17 0 32 - * from @src) + * The @src bitmap is:: + * + * 31 63 + * | | + * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 + * | | | | + * 16 14 0 32 + * + * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is:: + * + * 31 63 + * | | + * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 + * | | | + * 14 (bit 17 0 32 + * from @src) * * Note that @dst and @src might overlap partially or entirely. * diff --git a/lib/checksum.c b/lib/checksum.c index de032ad96f4a..7ac65a0000ff 100644 --- a/lib/checksum.c +++ b/lib/checksum.c @@ -146,26 +146,6 @@ __sum16 ip_compute_csum(const void *buff, int len) EXPORT_SYMBOL(ip_compute_csum); /* - * copy from fs while checksumming, otherwise like csum_partial - */ -__wsum -csum_partial_copy_from_user(const void __user *src, void *dst, int len, - __wsum sum, int *csum_err) -{ - int missing; - - missing = __copy_from_user(dst, src, len); - if (missing) { - memset(dst + len - missing, 0, missing); - *csum_err = -EFAULT; - } else - *csum_err = 0; - - return csum_partial(dst, len, sum); -} -EXPORT_SYMBOL(csum_partial_copy_from_user); - -/* * copy from ds while checksumming, otherwise like csum_partial */ __wsum diff --git a/lib/kobject.c b/lib/kobject.c index 83198cb37d8d..65fa7bf70c57 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2007 Novell Inc. * - * Please see the file Documentation/kobject.txt for critical information + * Please see the file Documentation/core-api/kobject.rst for critical information * about using the kobject interface. */ @@ -670,7 +670,7 @@ static void kobject_cleanup(struct kobject *kobj) kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) - pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n", + pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* send "remove" if the caller did not do it but sent "add" */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 2ee6ae3b0ade..34e406fe561f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> +#include <linux/local_lock.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> @@ -27,7 +28,6 @@ #include <linux/string.h> #include <linux/xarray.h> - /* * Radix tree node cache. 
*/ @@ -58,12 +58,10 @@ struct kmem_cache *radix_tree_node_cachep; /* * Per-cpu pool of preloaded nodes */ -struct radix_tree_preload { - unsigned nr; - /* nodes->parent points to next preallocated node */ - struct radix_tree_node *nodes; +DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { + .lock = INIT_LOCAL_LOCK(lock), }; -static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { @@ -332,14 +330,14 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) */ gfp_mask &= ~__GFP_ACCOUNT; - preempt_disable(); + local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { - preempt_enable(); + local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; - preempt_disable(); + local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; @@ -381,7 +379,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ - preempt_disable(); + local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); @@ -1470,7 +1468,7 @@ EXPORT_SYMBOL(radix_tree_tagged); void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) - preempt_disable(); + local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 706020b06617..b90ec550183a 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -116,9 +116,9 @@ long strncpy_from_user(char *dst, const char __user *src, long count) kasan_check_write(dst, count); check_object_size(dst, count, false); - if (user_access_begin(src, max)) { + if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); - user_access_end(); + user_read_access_end(); return retval; } } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 41670d4a5816..1616710b8a82 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -109,9 +109,9 @@ long strnlen_user(const char __user *str, long count) if (max > count) max = count; - if (user_access_begin(str, max)) { + if (user_read_access_begin(str, max)) { retval = do_strnlen_user(str, count, max); - user_access_end(); + user_read_access_end(); return retval; } } diff --git a/lib/usercopy.c b/lib/usercopy.c index cbb4d9ec00f2..ca2a697a2061 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -58,7 +58,7 @@ int check_zeroed_user(const void __user *from, size_t size) from -= align; size += align; - if (!user_access_begin(from, size)) + if (!user_read_access_begin(from, size)) return -EFAULT; unsafe_get_user(val, (unsigned long __user *) from, err_fault); @@ -79,10 +79,10 @@ int check_zeroed_user(const void __user *from, size_t size) val &= aligned_byte_mask(size); done: - user_access_end(); + user_read_access_end(); return (val == 0); err_fault: - user_access_end(); + user_read_access_end(); return -EFAULT; } EXPORT_SYMBOL(check_zeroed_user); diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..c9d659e6a02c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2243,15 +2243,11 @@ check_drain: * would succeed. 
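/*
 * Editorial aside, not part of the patch above: the radix-tree hunks
 * swap bare preempt_disable() for a local_lock_t embedded in the
 * per-CPU data it protects, so the preloads name their own lock and
 * lockdep can see it.  A stand-alone model of "the lock lives inside
 * the data", with a plain mutex playing the local lock's role:
 */
#include <pthread.h>
#include <stdio.h>

struct preload {
	pthread_mutex_t lock;	/* models local_lock_t ... */
	int nr;			/* ... guarding its sibling fields */
};

static struct preload pl = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void preload_add(int n)
{
	pthread_mutex_lock(&pl.lock);	/* local_lock(&radix_tree_preloads.lock) */
	pl.nr += n;
	pthread_mutex_unlock(&pl.lock);	/* local_unlock(...) */
}

int main(void)
{
	preload_add(4);
	printf("nr=%d\n", pl.nr);
	return 0;
}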
*/ if (cc->order > 0 && last_migrated_pfn) { - int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); if (last_migrated_pfn < current_block_start) { - cpu = get_cpu(); - lru_add_drain_cpu(cpu); - drain_local_pages(cc->zone); - put_cpu(); + lru_add_drain_cpu_zone(cc->zone); /* No more flushing until we migrate again */ last_migrated_pfn = 0; } @@ -382,13 +382,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, } /* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pte's, + * but only after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte)); +} + +/* + * A (separate) COW fault might break the page the other way and + * get_user_pages() would return the page from what is now the wrong + * VM. So we need to force a COW break at GUP time even for reads. + */ +static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags) +{ + return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -1066,9 +1075,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto out; } if (is_vm_hugetlb_page(vma)) { + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags, locked); + foll_flags, locked); if (locked && *locked == 0) { /* * We've got a VM_FAULT_RETRY @@ -1082,6 +1093,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } } + + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; + retry: /* * If we have a pending SIGKILL, don't keep faulting pages and @@ -2696,6 +2711,10 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) * * If the architecture does not support this function, simply return with no * pages pinned. + * + * Careful, careful! COW breaking can go either way, so a non-write + * access can get ambiguous page results. If you call this function without + * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -2731,6 +2750,12 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * * We do not adopt an rcu_read_lock(.) here as we also want to * block IPIs that come from THPs splitting. + * + * NOTE! We allow read-only gup_fast() here, but you'd better be + * careful about possible COW pages. You'll get _a_ COW page, but + * not necessarily the one you intended to get depending on what + * COW event happens after this. COW may break the page copy in a + * random direction. */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && @@ -2788,10 +2813,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; + /* + * The FAST_GUP case requires FOLL_WRITE even for pure reads, + * because get_user_pages() may need to cause an early COW in + * order to avoid confusing the normal COW routines. 
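/*
 * Editorial aside, not part of the patch above: the gup.c hunks force a
 * COW break by adding FOLL_WRITE whenever a page is about to be pinned
 * or referenced from a private COW mapping, even for reads.  The added
 * predicate and its use, reduced to plain C; the flag values are
 * copied from mainline but should be treated as illustrative here:
 */
#include <stdbool.h>
#include <stdio.h>

#define FOLL_WRITE	0x01
#define FOLL_GET	0x04
#define FOLL_PIN	0x40000

static bool should_force_cow_break(bool is_cow_mapping, unsigned int flags)
{
	/* pinning or taking a reference? then break COW up front */
	return is_cow_mapping && (flags & (FOLL_GET | FOLL_PIN));
}

int main(void)
{
	unsigned int flags = FOLL_PIN;

	if (should_force_cow_break(true, flags))
		flags |= FOLL_WRITE;	/* as __get_user_pages() now does */
	printf("flags=%#x\n", flags);
	return 0;
}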
So only + * targets that are already writable are safe to do by just + * looking at the page tables. + */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { local_irq_disable(); - gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned); + gup_pgd_range(addr, end, gup_flags | FOLL_WRITE, pages, &nr_pinned); local_irq_enable(); ret = nr_pinned; } @@ -2867,9 +2899,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. + * see Documentation/core-api/pin_user_pages.rst for further details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, @@ -2907,9 +2939,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -2943,9 +2975,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ecd1045113b..11fe0b4dbe67 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1515,13 +1515,12 @@ out_unlock: } /* - * FOLL_FORCE can write to even unwritable pmd's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pmd's, + * but only after we've gone through a COW cycle and they are dirty. 
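+ *
+ * (The explicit FOLL_FORCE test is dropped because a forced COW break,
+ * see should_force_cow_break() in mm/gup.c above, reaches this point
+ * with FOLL_COW set but not necessarily FOLL_FORCE; FOLL_COW plus a
+ * dirty pmd is sufficient evidence of a completed COW cycle either way.)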
*/ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 45ad73122e82..ca864102bebe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5417,6 +5417,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " managed:%lukB" " mlocked:%lukB" " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5439,6 +5442,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + zone_page_state(zone, NR_KERNEL_SCS_KB), +#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), diff --git a/mm/swap.c b/mm/swap.c index bf9a79fed62d..0ac463d44cff 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -35,6 +35,7 @@ #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> +#include <linux/local_lock.h> #include "internal.h" @@ -44,14 +45,32 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); +/* Protecting only lru_rotate.pvec which requires disabling interrupts */ +struct lru_rotate { + local_lock_t lock; + struct pagevec pvec; +}; +static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +/* + * The following struct pagevec are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). 
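+ *
+ * A minimal sketch of the access pattern the helpers below follow:
+ *
+ *	local_lock(&lru_pvecs.lock);
+ *	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ *	... fill or drain the pagevec ...
+ *	local_unlock(&lru_pvecs.lock);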
+ */ +struct lru_pvecs { + local_lock_t lock; + struct pagevec lru_add; + struct pagevec lru_deactivate_file; + struct pagevec lru_deactivate; + struct pagevec lru_lazyfree; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + struct pagevec activate_page; #endif +}; +static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* * This path almost never happens for VM activity - pages are normally @@ -254,11 +273,11 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; get_page(page); - local_irq_save(flags); - pvec = this_cpu_ptr(&lru_rotate_pvecs); + local_lock_irqsave(&lru_rotate.lock, flags); + pvec = this_cpu_ptr(&lru_rotate.pvec); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } } @@ -293,7 +312,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, #ifdef CONFIG_SMP static void activate_page_drain(int cpu) { - struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, __activate_page, NULL); @@ -301,19 +320,21 @@ static void activate_page_drain(int cpu) static bool need_activate_page_drain(int cpu) { - return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -335,9 +356,12 @@ void activate_page(struct page *page) static void __lru_cache_activate_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec; int i; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + /* * Search backwards on the optimistic assumption that the page being * activated has just been added to this pagevec. Note that only @@ -357,7 +381,7 @@ static void __lru_cache_activate_page(struct page *page) } } - put_cpu_var(lru_add_pvec); + local_unlock(&lru_pvecs.lock); } /* @@ -385,7 +409,7 @@ void mark_page_accessed(struct page *page) } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via - * activate_page_pvecs. Otherwise, assume the page is on a + * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. 
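 *
 * Either way the pagevec is now reached under lru_pvecs.lock rather
 * than the old get_cpu_var()/put_cpu_var() pinning.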
*/ @@ -404,12 +428,14 @@ EXPORT_SYMBOL(mark_page_accessed); static void __lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvec); + local_unlock(&lru_pvecs.lock); } /** @@ -593,30 +619,30 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = &per_cpu(lru_rotate.pvec, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(&lru_rotate.lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } - pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - pvec = &per_cpu(lru_lazyfree_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -641,11 +667,14 @@ void deactivate_file_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - put_cpu_var(lru_deactivate_file_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -660,12 +689,14 @@ void deactivate_file_page(struct page *page) void deactivate_page(struct page *page) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -680,19 +711,30 @@ void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); - put_cpu_var(lru_lazyfree_pvecs); + local_unlock(&lru_pvecs.lock); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&lru_pvecs.lock); +} + +void lru_add_drain_cpu_zone(struct zone *zone) +{ + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + drain_local_pages(zone); + local_unlock(&lru_pvecs.lock); } #ifdef CONFIG_SMP @@ 
-743,11 +785,11 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || - pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || + if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); diff --git a/mm/vmstat.c b/mm/vmstat.c index b1582fdf757c..5e241434cab2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1119,6 +1119,9 @@ const char * const vmstat_text[] = { "nr_mlock", "nr_page_table_pages", "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", diff --git a/samples/Kconfig b/samples/Kconfig index 9d236c346de5..205076cf234e 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -171,7 +171,7 @@ config SAMPLE_VFIO_MDEV_MBOCHS config SAMPLE_ANDROID_BINDERFS bool "Build Android binderfs example" - depends on CONFIG_ANDROID_BINDERFS + depends on ANDROID_BINDERFS help Builds a sample program to illustrate the use of the Android binderfs filesystem. diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile index ea4c93d36256..a3ac5476338a 100644 --- a/samples/binderfs/Makefile +++ b/samples/binderfs/Makefile @@ -1,2 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs_example.o +ifndef CROSS_COMPILE +ifdef CONFIG_SAMPLE_ANDROID_BINDERFS +hostprogs := binderfs_example +endif +endif diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index e04229d21475..c13a5bc5095b 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -20,18 +20,22 @@ static unsigned long my_ip = (unsigned long)schedule; asm ( " .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" " my_tramp1:" " pushq %rbp\n" " movq %rsp, %rbp\n" " call my_direct_func1\n" " leave\n" +" .size my_tramp1, .-my_tramp1\n" " ret\n" +" .type my_tramp2, @function\n" " my_tramp2:" " pushq %rbp\n" " movq %rsp, %rbp\n" " call my_direct_func2\n" " leave\n" " ret\n" +" .size my_tramp2, .-my_tramp2\n" " .popsection\n" ); diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 27efa5f6ff52..d5c5022be664 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -15,6 +15,7 @@ extern void my_tramp(void *); asm ( " .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" " my_tramp:" " pushq %rbp\n" " movq %rsp, %rbp\n" @@ -27,6 +28,7 @@ asm ( " popq %rdi\n" " leave\n" " ret\n" +" .size my_tramp, .-my_tramp\n" " .popsection\n" ); diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index a2e3063bd306..63ca06d42c80 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -13,6 +13,7 @@ extern void my_tramp(void *); asm ( " .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" " my_tramp:" " pushq %rbp\n" " movq %rsp, %rbp\n" @@ -21,6 +22,7 @@ 
asm ( " popq %rdi\n" " leave\n" " ret\n" +" .size my_tramp, .-my_tramp\n" " .popsection\n" ); diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index d693c23a85e8..501911d1b327 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -25,7 +25,7 @@ static struct kprobe kp = { }; /* kprobe pre_handler: called just before the probed instruction is executed */ -static int handler_pre(struct kprobe *p, struct pt_regs *regs) +static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs) { #ifdef CONFIG_X86 pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", @@ -54,7 +54,7 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs) } /* kprobe post_handler: called after the probed instruction is executed */ -static void handler_post(struct kprobe *p, struct pt_regs *regs, +static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { #ifdef CONFIG_X86 @@ -90,6 +90,8 @@ static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) /* Return 0 because we don't handle the fault. */ return 0; } +/* NOKPROBE_SYMBOL() is also available */ +NOKPROBE_SYMBOL(handler_fault); static int __init kprobe_init(void) { diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 186315ca88b3..013e8e6ebae9 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -48,6 +48,7 @@ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) data->entry_stamp = ktime_get(); return 0; } +NOKPROBE_SYMBOL(entry_handler); /* * Return-probe handler: Log the return value and duration. Duration may turn @@ -67,6 +68,7 @@ static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) func_name, retval, (long long)delta); return 0; } +NOKPROBE_SYMBOL(ret_handler); static struct kretprobe my_kretprobe = { .handler = ret_handler, diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c index a3d68159fb51..76c577ea4fd8 100644 --- a/samples/vfs/test-statx.c +++ b/samples/vfs/test-statx.c @@ -216,7 +216,7 @@ int main(int argc, char **argv) struct statx stx; int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; - unsigned int mask = STATX_ALL; + unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; for (argv++; *argv; argv++) { if (strcmp(*argv, "-F") == 0) { diff --git a/scripts/kernel-doc b/scripts/kernel-doc index f746ca8fa403..f68d76dd97ba 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -213,7 +213,9 @@ my $type_constant = '\b``([^\`]+)``\b'; my $type_constant2 = '\%([-_\w]+)'; my $type_func = '(\w+)\(\)'; my $type_param = '\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; +my $type_param_ref = '([\!]?)\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; my $type_fp_param = '\@(\w+)\(\)'; # Special RST handling for func ptr params +my $type_fp_param2 = '\@(\w+->\S+)\(\)'; # Special RST handling for structs with func ptr params my $type_env = '(\$\w+)'; my $type_enum = '\&(enum\s*([_\w]+))'; my $type_struct = '\&(struct\s*([_\w]+))'; @@ -236,6 +238,7 @@ my @highlights_man = ( [$type_typedef, "\\\\fI\$1\\\\fP"], [$type_union, "\\\\fI\$1\\\\fP"], [$type_param, "\\\\fI\$1\\\\fP"], + [$type_param_ref, "\\\\fI\$1\$2\\\\fP"], [$type_member, "\\\\fI\$1\$2\$3\\\\fP"], [$type_fallback, "\\\\fI\$1\\\\fP"] ); @@ -249,6 +252,7 @@ my @highlights_rst = ( [$type_member_func, "\\:c\\:type\\:`\$1\$2\$3\\\\(\\\\) <\$1>`"], [$type_member, "\\:c\\:type\\:`\$1\$2\$3 <\$1>`"], [$type_fp_param, "**\$1\\\\(\\\\)**"], + 
[$type_fp_param2, "**\$1\\\\(\\\\)**"], [$type_func, "\$1()"], [$type_enum, "\\:c\\:type\\:`\$1 <\$2>`"], [$type_struct, "\\:c\\:type\\:`\$1 <\$2>`"], @@ -256,7 +260,7 @@ my @highlights_rst = ( [$type_union, "\\:c\\:type\\:`\$1 <\$2>`"], # in rst this can refer to any type [$type_fallback, "\\:c\\:type\\:`\$1`"], - [$type_param, "**\$1**"] + [$type_param_ref, "**\$1\$2**"] ); my $blankline_rst = "\n"; @@ -327,13 +331,14 @@ my $lineprefix=""; # Parser states use constant { - STATE_NORMAL => 0, # normal code - STATE_NAME => 1, # looking for function name - STATE_BODY_MAYBE => 2, # body - or maybe more description - STATE_BODY => 3, # the body of the comment - STATE_PROTO => 4, # scanning prototype - STATE_DOCBLOCK => 5, # documentation block - STATE_INLINE => 6, # gathering documentation outside main block + STATE_NORMAL => 0, # normal code + STATE_NAME => 1, # looking for function name + STATE_BODY_MAYBE => 2, # body - or maybe more description + STATE_BODY => 3, # the body of the comment + STATE_BODY_WITH_BLANK_LINE => 4, # the body, which has a blank line + STATE_PROTO => 5, # scanning prototype + STATE_DOCBLOCK => 6, # documentation block + STATE_INLINE => 7, # gathering doc outside main block }; my $state; my $in_doc_sect; @@ -1953,6 +1958,12 @@ sub process_body($$) { } } + if ($state == STATE_BODY_WITH_BLANK_LINE && /^\s*\*\s?\S/) { + dump_section($file, $section, $contents); + $section = $section_default; + $contents = ""; + } + if (/$doc_sect/i) { # case insensitive for supported section names $newsection = $1; $newcontents = $2; @@ -2006,18 +2017,21 @@ sub process_body($$) { $state = STATE_PROTO; $brcount = 0; } elsif (/$doc_content/) { - # miguel-style comment kludge, look for blank lines after - # @parameter line to signify start of description if ($1 eq "") { - if ($section =~ m/^@/ || $section eq $section_context) { + if ($section eq $section_context) { dump_section($file, $section, $contents); $section = $section_default; $contents = ""; $new_start_line = $.; + $state = STATE_BODY; } else { + if ($section ne $section_default) { + $state = STATE_BODY_WITH_BLANK_LINE; + } else { + $state = STATE_BODY; + } $contents .= "\n"; } - $state = STATE_BODY; } elsif ($state == STATE_BODY_MAYBE) { # Continued declaration purpose chomp($declaration_purpose); @@ -2169,7 +2183,8 @@ sub process_file($) { process_normal(); } elsif ($state == STATE_NAME) { process_name($file, $_); - } elsif ($state == STATE_BODY || $state == STATE_BODY_MAYBE) { + } elsif ($state == STATE_BODY || $state == STATE_BODY_MAYBE || + $state == STATE_BODY_WITH_BLANK_LINE) { process_body($file, $_); } elsif ($state == STATE_INLINE) { # scanning for inline parameters process_inline($file, $_); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index d09ab4afbda4..3adef49250af 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -55,6 +55,29 @@ modpost_link() ${LD} ${KBUILD_LDFLAGS} -r -o ${1} ${objects} } +objtool_link() +{ + local objtoolopt; + + if [ -n "${CONFIG_VMLINUX_VALIDATION}" ]; then + objtoolopt="check" + if [ -z "${CONFIG_FRAME_POINTER}" ]; then + objtoolopt="${objtoolopt} --no-fp" + fi + if [ -n "${CONFIG_GCOV_KERNEL}" ]; then + objtoolopt="${objtoolopt} --no-unreachable" + fi + if [ -n "${CONFIG_RETPOLINE}" ]; then + objtoolopt="${objtoolopt} --retpoline" + fi + if [ -n "${CONFIG_X86_SMAP}" ]; then + objtoolopt="${objtoolopt} --uaccess" + fi + info OBJTOOL ${1} + tools/objtool/objtool ${objtoolopt} ${1} + fi +} + # Link of vmlinux # ${1} - output file # ${2}, ${3}, ... 
- optional extra .o files @@ -251,6 +274,7 @@ ${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init need-builtin=1 #link vmlinux.o info LD vmlinux.o modpost_link vmlinux.o +objtool_link vmlinux.o # modpost vmlinux.o to check for section mismatches ${MAKE} -f "${srctree}/scripts/Makefile.modpost" MODPOST_VMLINUX=1 diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5c3c50c5ec52..0053d4fea847 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -948,7 +948,7 @@ static void check_section(const char *modname, struct elf_info *elf, #define DATA_SECTIONS ".data", ".data.rel" #define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \ - ".kprobes.text", ".cpuidle.text" + ".kprobes.text", ".cpuidle.text", ".noinstr.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ ".coldtext" diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index fa3fb05cd54b..c680c3efb176 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0-or-later use strict; -# Copyright (c) 2017-2019 Mauro Carvalho Chehab <mchehab@kernel.org> +# Copyright (c) 2017-2020 Mauro Carvalho Chehab <mchehab@kernel.org> # my $prefix = "./"; @@ -22,10 +22,16 @@ my $need = 0; my $optional = 0; my $need_symlink = 0; my $need_sphinx = 0; +my $need_venv = 0; +my $need_virtualenv = 0; my $rec_sphinx_upgrade = 0; my $install = ""; my $virtenv_dir = ""; +my $python_cmd = ""; my $min_version; +my $cur_version; +my $rec_version = "1.7.9"; # PDF won't build here +my $min_pdf_version = "2.4.4"; # Min version where pdf builds # # Command line arguments @@ -142,12 +148,30 @@ sub findprog($) } } +sub find_python_no_venv() +{ + my $prog = shift; + + my $cur_dir = qx(pwd); + $cur_dir =~ s/\s+$//; + + foreach my $dir (split(/:/, $ENV{PATH})) { + next if ($dir =~ m,($cur_dir)/sphinx,); + return "$dir/python3" if(-x "$dir/python3"); + } + foreach my $dir (split(/:/, $ENV{PATH})) { + next if ($dir =~ m,($cur_dir)/sphinx,); + return "$dir/python" if(-x "$dir/python"); + } + return "python"; +} + sub check_program($$) { my $prog = shift; my $is_optional = shift; - return if findprog($prog); + return $prog if findprog($prog); add_package($prog, $is_optional); } @@ -168,9 +192,9 @@ sub check_python_module($$) my $prog = shift; my $is_optional = shift; - my $err = system("python3 -c 'import $prog' 2>/dev/null /dev/null"); - return if ($err == 0); - my $err = system("python -c 'import $prog' 2>/dev/null /dev/null"); + return if (!$python_cmd); + + my $err = system("$python_cmd -c 'import $prog' 2>/dev/null /dev/null"); return if ($err == 0); add_package($prog, $is_optional); @@ -225,23 +249,33 @@ sub get_sphinx_fname() return $fname; } - if ($virtualenv) { - my $prog = findprog("virtualenv-3"); - $prog = findprog("virtualenv-3.5") if (!$prog); + return ""; +} - check_program("virtualenv", 0) if (!$prog); - $need_sphinx = 1; - } else { - add_package("python-sphinx", 0); - } +sub get_sphinx_version($) +{ + my $cmd = shift; + my $ver; - return ""; + open IN, "$cmd --version 2>&1 |"; + while (<IN>) { + if (m/^\s*sphinx-build\s+([\d\.]+)(\+\/[\da-f]+)?$/) { + $ver=$1; + last; + } + # Sphinx 1.2.x uses a different format + if (m/^\s*Sphinx.*\s+([\d\.]+)$/) { + $ver=$1; + last; + } + } + close IN; + return $ver; } sub check_sphinx() { - my $rec_version; - my $cur_version; + my $default_version; open IN, $conf or die "Can't open $conf"; while (<IN>) { @@ -257,45 +291,39 @@ sub 
check_sphinx() open IN, $requirement_file or die "Can't open $requirement_file"; while (<IN>) { if (m/^\s*Sphinx\s*==\s*([\d\.]+)$/) { - $rec_version=$1; + $default_version=$1; last; } } close IN; - die "Can't get recommended sphinx version from $requirement_file" if (!$min_version); + die "Can't get default sphinx version from $requirement_file" if (!$default_version); - $virtenv_dir = $virtenv_prefix . $rec_version; + $virtenv_dir = $virtenv_prefix . $default_version; my $sphinx = get_sphinx_fname(); - return if ($sphinx eq ""); - - open IN, "$sphinx --version 2>&1 |" or die "$sphinx returned an error"; - while (<IN>) { - if (m/^\s*sphinx-build\s+([\d\.]+)(\+\/[\da-f]+)?$/) { - $cur_version=$1; - last; - } - # Sphinx 1.2.x uses a different format - if (m/^\s*Sphinx.*\s+([\d\.]+)$/) { - $cur_version=$1; - last; - } + if ($sphinx eq "") { + $need_sphinx = 1; + return; } - close IN; + + $cur_version = get_sphinx_version($sphinx); + die ("$sphinx returned an error") if (!$cur_version); die "$sphinx didn't return its version" if (!$cur_version); if ($cur_version lt $min_version) { printf "ERROR: Sphinx version is %s. It should be >= %s (recommended >= %s)\n", - $cur_version, $min_version, $rec_version;; + $cur_version, $min_version, $default_version; $need_sphinx = 1; return; } if ($cur_version lt $rec_version) { - printf "Sphinx version %s\n", $cur_version; - print "Warning: It is recommended at least Sphinx version $rec_version.\n"; + $rec_sphinx_upgrade = 1; + return; + } + if ($cur_version lt $min_pdf_version) { $rec_sphinx_upgrade = 1; return; } @@ -336,6 +364,7 @@ sub give_debian_hints() my %map = ( "python-sphinx" => "python3-sphinx", "sphinx_rtd_theme" => "python3-sphinx-rtd-theme", + "ensurepip" => "python3-venv", "virtualenv" => "virtualenv", "dot" => "graphviz", "convert" => "imagemagick", @@ -349,7 +378,8 @@ sub give_debian_hints() "fonts-dejavu", 2); check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc"], + "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc"], "fonts-noto-cjk", 2); } @@ -446,9 +476,11 @@ sub give_opensuse_hints() "convert" => "ImageMagick", "Pod::Usage" => "perl-Pod-Usage", "xelatex" => "texlive-xetex-bin", - "rsvg-convert" => "rsvg-view", ); + # On Tumbleweed, this package is also named rsvg-convert + $map{"rsvg-convert"} = "rsvg-view" if (!($system_release =~ /Tumbleweed/)); + my @suse_tex_pkgs = ( "texlive-babel-english", "texlive-caption", @@ -491,7 +523,7 @@ sub give_mageia_hints() "convert" => "ImageMagick", "Pod::Usage" => "perl-Pod-Usage", "xelatex" => "texlive", - "rsvg-convert" => "librsvg2-tools", + "rsvg-convert" => "librsvg2", ); my @tex_pkgs = ( @@ -500,16 +532,29 @@ sub give_mageia_hints() $map{"latexmk"} = "texlive-collection-basic"; + my $packager_cmd; + my $noto_sans; + if ($system_release =~ /OpenMandriva/) { + $packager_cmd = "dnf install"; + $noto_sans = "noto-sans-cjk-fonts"; + @tex_pkgs = ( "texlive-collection-fontsextra" ); + } else { + $packager_cmd = "urpmi"; + $noto_sans = "google-noto-sans-cjk-ttc-fonts"; + } + + if ($pdf) { - check_missing_file(["/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc"], - "google-noto-sans-cjk-ttc-fonts", 2); + check_missing_file(["/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/TTF/NotoSans-Regular.ttf"], + $noto_sans, 2); } check_rpm_missing(\@tex_pkgs, 2) if ($pdf); check_missing(\%map); return if (!$need && !$optional); - 
printf("You should run:\n\n\tsudo urpmi $install\n"); + printf("You should run:\n\n\tsudo $packager_cmd $install\n"); } sub give_arch_linux_hints() @@ -557,7 +602,8 @@ sub give_gentoo_hints() "media-fonts/dejavu", 2) if ($pdf); if ($pdf) { - check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf"], + check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf", + "/usr/share/fonts/noto-cjk/NotoSerifCJK-Regular.ttc"], "media-fonts/noto-cjk", 2); } @@ -572,10 +618,10 @@ sub give_gentoo_hints() my $portage_imagemagick = "/etc/portage/package.use/imagemagick"; my $portage_cairo = "/etc/portage/package.use/graphviz"; - if (qx(cat $portage_imagemagick) ne "$imagemagick\n") { + if (qx(grep imagemagick $portage_imagemagick 2>/dev/null) eq "") { printf("\tsudo su -c 'echo \"$imagemagick\" > $portage_imagemagick'\n") } - if (qx(cat $portage_cairo) ne "$cairo\n") { + if (qx(grep graphviz $portage_cairo 2>/dev/null) eq "") { printf("\tsudo su -c 'echo \"$cairo\" > $portage_cairo'\n"); } @@ -622,6 +668,10 @@ sub check_distros() give_mageia_hints; return; } + if ($system_release =~ /OpenMandriva/) { + give_mageia_hints; + return; + } if ($system_release =~ /Arch Linux/) { give_arch_linux_hints; return; @@ -651,22 +701,58 @@ sub check_distros() sub deactivate_help() { - printf "\tIf you want to exit the virtualenv, you can use:\n"; + printf "\nIf you want to exit the virtualenv, you can use:\n"; printf "\tdeactivate\n"; } sub check_needs() { - # Check for needed programs/tools + # Check if Sphinx is already accessible from current environment check_sphinx(); if ($system_release) { - print "Detected OS: $system_release.\n\n"; + print "Detected OS: $system_release.\n"; } else { - print "Unknown OS\n\n"; + print "Unknown OS\n"; + } + printf "Sphinx version: %s\n\n", $cur_version if ($cur_version); + + # Check python command line, trying first python3 + $python_cmd = findprog("python3"); + $python_cmd = check_program("python", 0) if (!$python_cmd); + + # Check the type of virtual env, depending on Python version + if ($python_cmd) { + if ($virtualenv) { + my $tmp = qx($python_cmd --version 2>&1); + if ($tmp =~ m/(\d+\.)(\d+\.)/) { + if ($1 >= 3 && $2 >= 3) { + $need_venv = 1; # python 3.3 or upper + } else { + $need_virtualenv = 1; + } + if ($1 < 3) { + # Complain if it finds python2 (or worse) + printf "Warning: python$1 support is deprecated. 
Use it with caution!\n"; + } + } else { + die "Warning: couldn't identify $python_cmd version!"; + } + } else { + add_package("python-sphinx", 0); + } } - print "To upgrade Sphinx, use:\n\n" if ($rec_sphinx_upgrade); + # Set virtualenv command line, if python < 3.3 + my $virtualenv_cmd; + if ($need_virtualenv) { + $virtualenv_cmd = findprog("virtualenv-3"); + $virtualenv_cmd = findprog("virtualenv-3.5") if (!$virtualenv_cmd); + if (!$virtualenv_cmd) { + check_program("virtualenv", 0); + $virtualenv_cmd = "virtualenv"; + } + } # Check for needed programs/tools check_perl_module("Pod::Usage", 0); @@ -681,46 +767,81 @@ sub check_needs() check_program("rsvg-convert", 2) if ($pdf); check_program("latexmk", 2) if ($pdf); + if ($need_sphinx || $rec_sphinx_upgrade) { + check_python_module("ensurepip", 0) if ($need_venv); + } + + # Do distro-specific checks and output distro-install commands check_distros(); + if (!$python_cmd) { + if ($need == 1) { + die "Can't build as $need mandatory dependency is missing"; + } elsif ($need) { + die "Can't build as $need mandatory dependencies are missing"; + } + } + + # Check if sphinx-build is called sphinx-build-3 if ($need_symlink) { printf "\tsudo ln -sf %s /usr/bin/sphinx-build\n\n", which("sphinx-build-3"); } + + # NOTE: if the system has a too old Sphinx version installed, + # it will recommend installing a newer version using virtualenv + if ($need_sphinx || $rec_sphinx_upgrade) { my $min_activate = "$ENV{'PWD'}/${virtenv_prefix}${min_version}/bin/activate"; my @activates = glob "$ENV{'PWD'}/${virtenv_prefix}*/bin/activate"; + if ($cur_version lt $rec_version) { + print "Warning: It is recommended at least Sphinx version $rec_version.\n"; + print " If you want pdf, you need at least $min_pdf_version.\n"; + } + if ($cur_version lt $min_pdf_version) { + print "Note: It is recommended at least Sphinx version $min_pdf_version if you need PDF support.\n"; + } @activates = sort {$b cmp $a} @activates; - - if ($need_sphinx && scalar @activates > 0 && $activates[0] ge $min_activate) { - printf "\nNeed to activate a compatible Sphinx version on virtualenv with:\n"; - printf "\t. $activates[0]\n"; - deactivate_help(); - exit (1); - } else { - my $rec_activate = "$virtenv_dir/bin/activate"; - my $virtualenv = findprog("virtualenv-3"); - my $rec_python3 = ""; - $virtualenv = findprog("virtualenv-3.5") if (!$virtualenv); - $virtualenv = findprog("virtualenv") if (!$virtualenv); - $virtualenv = "virtualenv" if (!$virtualenv); - - my $rel = ""; - if (index($system_release, "Ubuntu") != -1) { - $rel = $1 if ($system_release =~ /Ubuntu\s+(\d+)[.]/); - if ($rel && $rel >= 16) { - $rec_python3 = " -p python3"; - } + my ($activate, $ver); + foreach my $f (@activates) { + next if ($f lt $min_activate); + + my $sphinx_cmd = $f; + $sphinx_cmd =~ s/activate/sphinx-build/; + next if (! -f $sphinx_cmd); + + $ver = get_sphinx_version($sphinx_cmd); + if ($need_sphinx && ($ver ge $min_version)) { + $activate = $f; + last; + } elsif ($ver gt $cur_version) { + $activate = $f; + last; } - if (index($system_release, "Debian") != -1) { - $rel = $1 if ($system_release =~ /Debian\s+(\d+)/); - if ($rel && $rel >= 7) { - $rec_python3 = " -p python3"; - } + } + if ($activate ne "") { + if ($need_sphinx) { + printf "\nNeed to activate Sphinx (version $ver) on virtualenv with:\n"; + printf "\t. $activate\n"; + deactivate_help(); + exit (1); + } else { + printf "\nYou may also use a newer Sphinx (version $ver) with:\n"; + printf "\tdeactivate && . 
$activate\n"; } + } else { + my $rec_activate = "$virtenv_dir/bin/activate"; + + print "To upgrade Sphinx, use:\n\n" if ($rec_sphinx_upgrade); + + $python_cmd = find_python_no_venv(); - printf "\t$virtualenv$rec_python3 $virtenv_dir\n"; + if ($need_venv) { + printf "\t$python_cmd -m venv $virtenv_dir\n"; + } else { + printf "\t$virtualenv_cmd $virtenv_dir\n"; + } printf "\t. $rec_activate\n"; printf "\tpip install -r $requirement_file\n"; deactivate_help(); @@ -780,6 +901,24 @@ $system_release = catcheck("/etc/system-release") if !$system_release; $system_release = catcheck("/etc/redhat-release") if !$system_release; $system_release = catcheck("/etc/lsb-release") if !$system_release; $system_release = catcheck("/etc/gentoo-release") if !$system_release; + +# This seems more common than LSB these days +if (!$system_release) { + my %os_var; + if (open IN, "cat /etc/os-release|") { + while (<IN>) { + if (m/^([\w\d\_]+)=\"?([^\"]*)\"?\n/) { + $os_var{$1}=$2; + } + } + $system_release = $os_var{"NAME"}; + if (defined($os_var{"VERSION_ID"})) { + $system_release .= " " . $os_var{"VERSION_ID"} if (defined($os_var{"VERSION_ID"})); + } else { + $system_release .= " " . $os_var{"VERSION"}; + } + } +} $system_release = catcheck("/etc/issue") if !$system_release; $system_release =~ s/\s+$//; diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 986f3ac14282..d233ab3f1533 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read" + "wake_alarm", "block_suspend", "audit_read", "perfmon" -#if CAP_LAST_CAP > CAP_AUDIT_READ +#if CAP_LAST_CAP > CAP_PERFMON #error New capability defined, please update COMMON_CAP2_PERMS. #endif diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index 1b467381986f..f93f8acd05f7 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2662,8 +2662,6 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, if (!head->write) return -EINVAL; - if (!access_ok(buffer, buffer_len)) - return -EFAULT; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; head->read_user_buf_avail = 0; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index aef860256278..47838f57a647 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3093,7 +3093,8 @@ static int snd_pcm_xferi_frames_ioctl(struct snd_pcm_substream *substream, result = snd_pcm_lib_write(substream, xferi.buf, xferi.frames); else result = snd_pcm_lib_read(substream, xferi.buf, xferi.frames); - __put_user(result, &_xferi->result); + if (put_user(result, &_xferi->result)) + return -EFAULT; return result < 0 ? result : 0; } @@ -3122,7 +3123,8 @@ static int snd_pcm_xfern_frames_ioctl(struct snd_pcm_substream *substream, else result = snd_pcm_lib_readv(substream, bufs, xfern.frames); kfree(bufs); - __put_user(result, &_xfern->result); + if (put_user(result, &_xfern->result)) + return -EFAULT; return result < 0 ? result : 0; } @@ -3137,7 +3139,8 @@ static int snd_pcm_rewind_ioctl(struct snd_pcm_substream *substream, if (put_user(0, _frames)) return -EFAULT; result = snd_pcm_rewind(substream, frames); - __put_user(result, _frames); + if (put_user(result, _frames)) + return -EFAULT; return result < 0 ? 
result : 0; } @@ -3152,7 +3155,8 @@ static int snd_pcm_forward_ioctl(struct snd_pcm_substream *substream, if (put_user(0, _frames)) return -EFAULT; result = snd_pcm_forward(substream, frames); - __put_user(result, _frames); + if (put_user(result, _frames)) + return -EFAULT; return result < 0 ? result : 0; } diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index 6e060907c163..d25534940bde 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -58,8 +58,7 @@ #define ORC_TYPE_CALL 0 #define ORC_TYPE_REGS 1 #define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_SAVE 3 -#define UNWIND_HINT_TYPE_RESTORE 4 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 #ifndef __ASSEMBLY__ /* diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 3e0c019ef297..3abd4316cd4f 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -98,7 +98,8 @@ FEATURE_TESTS_EXTRA := \ llvm \ llvm-version \ clang \ - libbpf + libbpf \ + libpfm4 FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 92012381393a..84f845b9627d 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -69,7 +69,8 @@ FILES= \ test-libaio.bin \ test-libzstd.bin \ test-clang-bpf-global-var.bin \ - test-file-handle.bin + test-file-handle.bin \ + test-libpfm4.bin FILES := $(addprefix $(OUTPUT),$(FILES)) @@ -331,6 +332,9 @@ $(OUTPUT)test-clang-bpf-global-var.bin: $(OUTPUT)test-file-handle.bin: $(BUILD) +$(OUTPUT)test-libpfm4.bin: + $(BUILD) -lpfm + ############################### clean: diff --git a/tools/build/feature/test-libopencsd.c b/tools/build/feature/test-libopencsd.c index 2b0e02c38870..1547bc2c0950 100644 --- a/tools/build/feature/test-libopencsd.c +++ b/tools/build/feature/test-libopencsd.c @@ -4,9 +4,9 @@ /* * Check OpenCSD library version is sufficient to provide required features */ -#define OCSD_MIN_VER ((0 << 16) | (11 << 8) | (0)) +#define OCSD_MIN_VER ((0 << 16) | (14 << 8) | (0)) #if !defined(OCSD_VER_NUM) || (OCSD_VER_NUM < OCSD_MIN_VER) -#error "OpenCSD >= 0.11.0 is required" +#error "OpenCSD >= 0.14.0 is required" #endif int main(void) diff --git a/tools/build/feature/test-libpfm4.c b/tools/build/feature/test-libpfm4.c new file mode 100644 index 000000000000..af49b259459e --- /dev/null +++ b/tools/build/feature/test-libpfm4.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <sys/types.h> +#include <perfmon/pfmlib.h> + +int main(void) +{ + pfm_initialize(); + return 0; +} diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index e03b1ea23e0e..30dd21f976c3 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -11,7 +11,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - See Documentation/rbtree.txt for documentation and samples. + See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef __TOOLS_LINUX_PERF_RBTREE_H diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 381aa948610d..570bb9794421 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -23,7 +23,7 @@ * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * - * See Documentation/rbtree.txt for documentation and samples. 
+ * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 428c7dde6b4b..fdd632c833b4 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virt/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.rst */ union { __u32 irq; @@ -1107,7 +1107,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virt/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.rst. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index ad80a5c885d5..d1192783139a 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -148,9 +148,18 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ + #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + /* * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 027b18f7ed8c..82f53d81a7a7 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -90,6 +90,7 @@ struct fs { const char * const *mounts; char path[PATH_MAX]; bool found; + bool checked; long magic; }; @@ -111,31 +112,37 @@ static struct fs fs__entries[] = { .name = "sysfs", .mounts = sysfs__fs_known_mountpoints, .magic = SYSFS_MAGIC, + .checked = false, }, [FS__PROCFS] = { .name = "proc", .mounts = procfs__known_mountpoints, .magic = PROC_SUPER_MAGIC, + .checked = false, }, [FS__DEBUGFS] = { .name = "debugfs", .mounts = debugfs__known_mountpoints, .magic = DEBUGFS_MAGIC, + .checked = false, }, [FS__TRACEFS] = { .name = "tracefs", .mounts = tracefs__known_mountpoints, .magic = TRACEFS_MAGIC, + .checked = false, }, [FS__HUGETLBFS] = { .name = "hugetlbfs", .mounts = hugetlbfs__known_mountpoints, .magic = HUGETLBFS_MAGIC, + .checked = false, }, [FS__BPF_FS] = { .name = "bpf", .mounts = bpf_fs__known_mountpoints, .magic = BPF_FS_MAGIC, + .checked = false, }, }; @@ -158,6 +165,7 @@ static bool fs__read_mounts(struct fs *fs) } fclose(fp); + fs->checked = true; return fs->found = found; } @@ -220,6 +228,7 @@ static bool fs__env_override(struct fs *fs) return false; fs->found = true; + fs->checked = true; strncpy(fs->path, override_path, sizeof(fs->path) - 1); fs->path[sizeof(fs->path) - 1] = '\0'; return true; @@ -246,6 +255,14 @@ static const char *fs__mountpoint(int idx) if (fs->found) return (const char *)fs->path; + /* the mount point was already checked for + * but did not exist, so return NULL to avoid scanning again.
+ * This makes the found and not found paths cost equivalent + * in case of multiple calls. + */ + if (fs->checked) + return NULL; + return fs__get_mountpoint(fs); } diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 936edb95e1f3..aa222ca30311 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -18,6 +18,18 @@ const char *name##__mount(void); \ bool name##__configured(void); \ +/* + * The xxxx__mountpoint() entry points find the first matching mount point for + * each of the filesystems listed below, where xxxx is the filesystem type. + * + * The interface is as follows: + * + * - If a mount point is found on first call, it is cached and used for all + * subsequent calls. + * + * - If a mount point is not found, NULL is returned on first call and all + * subsequent calls. + */ FS(sysfs) FS(procfs) FS(debugfs) diff --git a/tools/lib/api/io.h b/tools/lib/api/io.h new file mode 100644 index 000000000000..777c20f6b604 --- /dev/null +++ b/tools/lib/api/io.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Lightweight buffered reading library. + * + * Copyright 2019 Google LLC. + */ +#ifndef __API_IO__ +#define __API_IO__ + +#include <stdlib.h> +#include <unistd.h> + +struct io { + /* File descriptor being read. */ + int fd; + /* Size of the read buffer. */ + unsigned int buf_len; + /* Pointer to storage for buffering read. */ + char *buf; + /* End of the storage. */ + char *end; + /* Currently accessed data pointer. */ + char *data; + /* Set true when end of file is reached or on a read error. */ + bool eof; +}; + +static inline void io__init(struct io *io, int fd, + char *buf, unsigned int buf_len) +{ + io->fd = fd; + io->buf_len = buf_len; + io->buf = buf; + io->end = buf; + io->data = buf; + io->eof = false; +} + +/* Reads one character from the "io" file with similar semantics to fgetc. */ +static inline int io__get_char(struct io *io) { + char *ptr = io->data; + + if (io->eof) + return -1; + + if (ptr == io->end) { + ssize_t n = read(io->fd, io->buf, io->buf_len); + + if (n <= 0) { + io->eof = true; + return -1; + } + ptr = &io->buf[0]; + io->end = &io->buf[n]; + } + io->data = ptr + 1; + return *ptr; +} + +/* Read a hexadecimal value with no 0x prefix into the out argument hex. If the + * first character isn't hexadecimal, returns -2; on io->eof, returns -1; otherwise + * returns the character after the hexadecimal value, which may be -1 for eof. + * If the read value is larger than a u64 the high-order bits will be dropped. + */ +static inline int io__get_hex(struct io *io, __u64 *hex) { + bool first_read = true; + + *hex = 0; + while (true) { + int ch = io__get_char(io); + + if (ch < 0) + return ch; + if (ch >= '0' && ch <= '9') + *hex = (*hex << 4) | (ch - '0'); + else if (ch >= 'a' && ch <= 'f') + *hex = (*hex << 4) | (ch - 'a' + 10); + else if (ch >= 'A' && ch <= 'F') + *hex = (*hex << 4) | (ch - 'A' + 10); + else if (first_read) + return -2; + else + return ch; + first_read = false; + } +} + +/* Read a positive decimal value with out argument dec. If the first character + * isn't a decimal digit, returns -2; on io->eof, returns -1; otherwise returns the + * character after the decimal value, which may be -1 for eof. If the read value + * is larger than a u64 the high-order bits will be dropped.
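+ *
+ * Usage sketch, assuming a struct io set up with io__init() and
+ * positioned at the hypothetical input "42 ": io__get_dec() stores 42
+ * in *dec and returns ' '.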
+ */ +static inline int io__get_dec(struct io *io, __u64 *dec) +{ + bool first_read = true; + + *dec = 0; + while (true) { + int ch = io__get_char(io); + + if (ch < 0) + return ch; + if (ch >= '0' && ch <= '9') + *dec = (*dec * 10) + ch - '0'; + else if (first_read) + return -2; + else + return ch; + first_read = false; + } +} + +#endif /* __API_IO__ */ diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index f93f4e703e4c..ca0215047c32 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -247,7 +247,7 @@ out: int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx) { - if (idx < cpus->nr) + if (cpus && idx < cpus->nr) return cpus->map[idx]; return -1; diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 5b9f2ca50591..6a875a0f01bb 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -11,10 +11,8 @@ #include <internal/mmap.h> #include <internal/cpumap.h> #include <internal/threadmap.h> -#include <internal/xyarray.h> #include <internal/lib.h> #include <linux/zalloc.h> -#include <sys/ioctl.h> #include <stdlib.h> #include <errno.h> #include <unistd.h> @@ -125,8 +123,10 @@ static void perf_evlist__purge(struct perf_evlist *evlist) void perf_evlist__exit(struct perf_evlist *evlist) { perf_cpu_map__put(evlist->cpus); + perf_cpu_map__put(evlist->all_cpus); perf_thread_map__put(evlist->threads); evlist->cpus = NULL; + evlist->all_cpus = NULL; evlist->threads = NULL; fdarray__exit(&evlist->pollfd); } diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index af9def589863..d2414144eb8c 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -151,6 +151,8 @@ struct option { { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb } #define OPT_CALLBACK(s, l, v, a, h, f) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f) } +#define OPT_CALLBACK_SET(s, l, v, os, a, h, f) \ + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f), .set = check_vtype(os, bool *)} #define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG } #define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \ diff --git a/tools/lib/symbol/kallsyms.c b/tools/lib/symbol/kallsyms.c index 1a7a9f877095..e335ac2b9e19 100644 --- a/tools/lib/symbol/kallsyms.c +++ b/tools/lib/symbol/kallsyms.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "symbol/kallsyms.h" +#include "api/io.h" #include <stdio.h> -#include <stdlib.h> +#include <sys/stat.h> +#include <fcntl.h> u8 kallsyms2elf_type(char type) { @@ -15,74 +17,62 @@ bool kallsyms__is_function(char symbol_type) return symbol_type == 'T' || symbol_type == 'W'; } -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. 
- */ -int hex2u64(const char *ptr, u64 *long_val) +static void read_to_eol(struct io *io) { - char *p; + int ch; - *long_val = strtoull(ptr, &p, 16); - - return p - ptr; + for (;;) { + ch = io__get_char(io); + if (ch < 0 || ch == '\n') + return; + } } int kallsyms__parse(const char *filename, void *arg, int (*process_symbol)(void *arg, const char *name, char type, u64 start)) { - char *line = NULL; - size_t n; - int err = -1; - FILE *file = fopen(filename, "r"); - - if (file == NULL) - goto out_failure; - - err = 0; + struct io io; + char bf[BUFSIZ]; + int err; - while (!feof(file)) { - u64 start; - int line_len, len; - char symbol_type; - char *symbol_name; + io.fd = open(filename, O_RDONLY, 0); - line_len = getline(&line, &n, file); - if (line_len < 0 || !line) - break; + if (io.fd < 0) + return -1; - line[--line_len] = '\0'; /* \n */ + io__init(&io, io.fd, bf, sizeof(bf)); - len = hex2u64(line, &start); + err = 0; + while (!io.eof) { + __u64 start; + int ch; + size_t i; + char symbol_type; + char symbol_name[KSYM_NAME_LEN + 1]; - /* Skip the line if we failed to parse the address. */ - if (!len) + if (io__get_hex(&io, &start) != ' ') { + read_to_eol(&io); continue; - - len++; - if (len + 2 >= line_len) + } + symbol_type = io__get_char(&io); + if (io__get_char(&io) != ' ') { + read_to_eol(&io); continue; - - symbol_type = line[len]; - len += 2; - symbol_name = line + len; - len = line_len - len; - - if (len >= KSYM_NAME_LEN) { - err = -1; - break; } + for (i = 0; i < sizeof(symbol_name); i++) { + ch = io__get_char(&io); + if (ch < 0 || ch == '\n') + break; + symbol_name[i] = ch; + } + symbol_name[i] = '\0'; err = process_symbol(arg, symbol_name, symbol_type, start); if (err) break; } - free(line); - fclose(file); + close(io.fd); return err; - -out_failure: - return -1; } diff --git a/tools/lib/symbol/kallsyms.h b/tools/lib/symbol/kallsyms.h index bd988f7b18d4..72ab9870454b 100644 --- a/tools/lib/symbol/kallsyms.h +++ b/tools/lib/symbol/kallsyms.h @@ -18,8 +18,6 @@ static inline u8 kallsyms2elf_binding(char type) return isupper(type) ? STB_GLOBAL : STB_LOCAL; } -int hex2u64(const char *ptr, u64 *long_val); - u8 kallsyms2elf_type(char type); bool kallsyms__is_function(char symbol_type); diff --git a/tools/lib/traceevent/kbuffer-parse.c b/tools/lib/traceevent/kbuffer-parse.c index b887e7437d67..27f3b07fdae8 100644 --- a/tools/lib/traceevent/kbuffer-parse.c +++ b/tools/lib/traceevent/kbuffer-parse.c @@ -438,7 +438,7 @@ void *kbuffer_translate_data(int swap, void *data, unsigned int *size) case KBUFFER_TYPE_TIME_EXTEND: case KBUFFER_TYPE_TIME_STAMP: return NULL; - }; + } *size = length; diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 20eed719542e..c271aeeb227d 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1958,7 +1958,8 @@ static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *a default: break; } - asprintf(&str, val ? "TRUE" : "FALSE"); + if (asprintf(&str, val ? "TRUE" : "FALSE") < 0) + str = NULL; break; } } @@ -1976,7 +1977,8 @@ static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *a break; } - asprintf(&str, "(%s) %s (%s)", left, op, right); + if (asprintf(&str, "(%s) %s (%s)", left, op, right) < 0) + str = NULL; break; case TEP_FILTER_OP_NOT: @@ -1992,10 +1994,12 @@ static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *a right_val = 0; if (right_val >= 0) { /* just return the opposite */ - asprintf(&str, right_val ? 
"FALSE" : "TRUE"); + if (asprintf(&str, right_val ? "FALSE" : "TRUE") < 0) + str = NULL; break; } - asprintf(&str, "%s(%s)", op, right); + if (asprintf(&str, "%s(%s)", op, right) < 0) + str = NULL; break; default: @@ -2011,7 +2015,8 @@ static char *val_to_str(struct tep_event_filter *filter, struct tep_filter_arg * { char *str = NULL; - asprintf(&str, "%lld", arg->value.val); + if (asprintf(&str, "%lld", arg->value.val) < 0) + str = NULL; return str; } @@ -2069,7 +2074,8 @@ static char *exp_to_str(struct tep_event_filter *filter, struct tep_filter_arg * break; } - asprintf(&str, "%s %s %s", lstr, op, rstr); + if (asprintf(&str, "%s %s %s", lstr, op, rstr) < 0) + str = NULL; out: free(lstr); free(rstr); @@ -2113,7 +2119,8 @@ static char *num_to_str(struct tep_event_filter *filter, struct tep_filter_arg * if (!op) op = "<="; - asprintf(&str, "%s %s %s", lstr, op, rstr); + if (asprintf(&str, "%s %s %s", lstr, op, rstr) < 0) + str = NULL; break; default: @@ -2148,8 +2155,9 @@ static char *str_to_str(struct tep_event_filter *filter, struct tep_filter_arg * if (!op) op = "!~"; - asprintf(&str, "%s %s \"%s\"", - arg->str.field->name, op, arg->str.val); + if (asprintf(&str, "%s %s \"%s\"", + arg->str.field->name, op, arg->str.val) < 0) + str = NULL; break; default: @@ -2165,7 +2173,8 @@ static char *arg_to_str(struct tep_event_filter *filter, struct tep_filter_arg * switch (arg->type) { case TEP_FILTER_ARG_BOOLEAN: - asprintf(&str, arg->boolean.value ? "TRUE" : "FALSE"); + if (asprintf(&str, arg->boolean.value ? "TRUE" : "FALSE") < 0) + str = NULL; return str; case TEP_FILTER_ARG_OP: diff --git a/tools/objtool/Build b/tools/objtool/Build index 66f44f5cd2a6..b7222d5cc7bc 100644 --- a/tools/objtool/Build +++ b/tools/objtool/Build @@ -1,11 +1,16 @@ objtool-y += arch/$(SRCARCH)/ + +objtool-y += weak.o + +objtool-$(SUBCMD_CHECK) += check.o +objtool-$(SUBCMD_CHECK) += special.o +objtool-$(SUBCMD_ORC) += check.o +objtool-$(SUBCMD_ORC) += orc_gen.o +objtool-$(SUBCMD_ORC) += orc_dump.o + objtool-y += builtin-check.o objtool-y += builtin-orc.o -objtool-y += check.o -objtool-y += orc_gen.o -objtool-y += orc_dump.o objtool-y += elf.o -objtool-y += special.o objtool-y += objtool.o objtool-y += libstring.o diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt index de094670050b..0542e46c7552 100644 --- a/tools/objtool/Documentation/stack-validation.txt +++ b/tools/objtool/Documentation/stack-validation.txt @@ -289,6 +289,47 @@ they mean, and suggestions for how to fix them. might be corrupt due to a gcc bug. For more details, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646 +9. file.o: warning: objtool: funcA() call to funcB() with UACCESS enabled + + This means that an unexpected call to a non-whitelisted function exists + outside of arch-specific guards. + X86: SMAP (stac/clac): __uaccess_begin()/__uaccess_end() + ARM: PAN: uaccess_enable()/uaccess_disable() + + These functions should be called to denote a minimal critical section around + access to __user variables. See also: https://lwn.net/Articles/517475/ + + The intention of the warning is to prevent calls to funcB() from eventually + calling schedule(), potentially leaking the AC flags state, and not + restoring them correctly. + + It also helps verify that there are no unexpected calls to funcB() which may + access user space pages with protections against doing so disabled. + + To fix, either: + 1) remove explicit calls to funcB() from funcA(). 
+ 2) add the correct guards before and after calls to low-level functions like + __get_user_size()/__put_user_size(). + 3) add funcB to the uaccess_safe_builtin whitelist in tools/objtool/check.c, if + funcB obviously does not call schedule(), and is marked notrace (since + function tracing inserts additional calls, which is not obvious from the + sources). + +10. file.o: warning: func()+0x5c: alternative modifies stack + + This means that an alternative includes instructions that modify the + stack. The problem is that there is only one ORC unwind table, which means + that the ORC unwind entries must be valid for each of the alternatives. + The easiest way to enforce this is to ensure alternatives do not contain + any ORC entries, which in turn implies the above constraint. + +11. file.o: warning: unannotated intra-function call + + This warning means that a direct call is done to a destination which + is not at the beginning of a function. If this is a legitimate call, you + can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL + directive right before the call. + If the error doesn't seem to make sense, it could be a bug in objtool. Feel free to ask the objtool maintainer for help. diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index f591c4d1b6fe..7770edcda3a0 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -35,7 +35,8 @@ all: $(OBJTOOL) INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ - -I$(srctree)/tools/arch/$(SRCARCH)/include + -I$(srctree)/tools/arch/$(SRCARCH)/include \ + -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) LDFLAGS += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) @@ -45,14 +46,24 @@ elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) AWK = awk + +SUBCMD_CHECK := n +SUBCMD_ORC := n + +ifeq ($(SRCARCH),x86) + SUBCMD_CHECK := y + SUBCMD_ORC := y +endif + +export SUBCMD_CHECK SUBCMD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include $(OBJTOOL_IN): fixdep FORCE + @$(CONFIG_SHELL) ./sync-check.sh @$(MAKE) $(build)=objtool $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) - @$(CONFIG_SHELL) ./sync-check.sh $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index ced3765c4f44..eda15a5a285e 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -8,9 +8,11 @@ #include <stdbool.h> #include <linux/list.h> -#include "elf.h" +#include "objtool.h" #include "cfi.h" + +#include <asm/orc_types.h> + enum insn_type { INSN_JUMP_CONDITIONAL, INSN_JUMP_UNCONDITIONAL, @@ -20,7 +22,6 @@ enum insn_type { INSN_CALL_DYNAMIC, INSN_RETURN, INSN_CONTEXT_SWITCH, - INSN_STACK, INSN_BUG, INSN_NOP, INSN_STAC, @@ -64,15 +65,23 @@ struct op_src { struct stack_op { struct op_dest dest; struct op_src src; + struct list_head list; }; -void arch_initial_func_cfi_state(struct cfi_state *state); +struct instruction; + +void arch_initial_func_cfi_state(struct cfi_init_state *state); -int arch_decode_instruction(struct elf *elf, struct section *sec, +int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, - unsigned long *immediate, struct stack_op *op); + unsigned long *immediate, + struct
list_head *ops_list); bool arch_callee_saved_reg(unsigned char reg); +unsigned long arch_jump_destination(struct instruction *insn); + +unsigned long arch_dest_rela_offset(int addend); + #endif /* _ARCH_H */ diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index a62e032863a8..4b504fc90bbb 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -11,6 +11,7 @@ #include "../../../arch/x86/lib/inat.c" #include "../../../arch/x86/lib/insn.c" +#include "../../check.h" #include "../../elf.h" #include "../../arch.h" #include "../../warn.h" @@ -26,7 +27,7 @@ static unsigned char op_to_cfi_reg[][2] = { {CFI_DI, CFI_R15}, }; -static int is_x86_64(struct elf *elf) +static int is_x86_64(const struct elf *elf) { switch (elf->ehdr.e_machine) { case EM_X86_64: @@ -66,16 +67,34 @@ bool arch_callee_saved_reg(unsigned char reg) } } -int arch_decode_instruction(struct elf *elf, struct section *sec, +unsigned long arch_dest_rela_offset(int addend) +{ + return addend + 4; +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return insn->offset + insn->len + insn->immediate; +} + +#define ADD_OP(op) \ + if (!(op = calloc(1, sizeof(*op)))) \ + return -1; \ + else for (list_add_tail(&op->list, ops_list); op; op = NULL) + +int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, - unsigned long *immediate, struct stack_op *op) + unsigned long *immediate, + struct list_head *ops_list) { struct insn insn; int x86_64, sign; unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, sib = 0; + struct stack_op *op = NULL; + struct symbol *sym; x86_64 = is_x86_64(elf); if (x86_64 == -1) @@ -85,7 +104,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, insn_get_length(&insn); if (!insn_complete(&insn)) { - WARN_FUNC("can't decode instruction", sec, offset); + WARN("can't decode instruction at %s:0x%lx", sec->name, offset); return -1; } @@ -123,40 +142,44 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { /* add/sub reg, %rsp */ - *type = INSN_STACK; - op->src.type = OP_SRC_ADD; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } } break; case 0x50 ... 0x57: /* push reg */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; - op->dest.type = OP_DEST_PUSH; + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->dest.type = OP_DEST_PUSH; + } break; case 0x58 ... 0x5f: /* pop reg */ - *type = INSN_STACK; - op->src.type = OP_SRC_POP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + ADD_OP(op) { + op->src.type = OP_SRC_POP; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + } break; case 0x68: case 0x6a: /* push immediate */ - *type = INSN_STACK; - op->src.type = OP_SRC_CONST; - op->dest.type = OP_DEST_PUSH; + ADD_OP(op) { + op->src.type = OP_SRC_CONST; + op->dest.type = OP_DEST_PUSH; + } break; case 0x70 ... 
0x7f: @@ -170,12 +193,13 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, if (modrm == 0xe4) { /* and imm, %rsp */ - *type = INSN_STACK; - op->src.type = OP_SRC_AND; - op->src.reg = CFI_SP; - op->src.offset = insn.immediate.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_AND; + op->src.reg = CFI_SP; + op->src.offset = insn.immediate.value; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } break; } @@ -187,34 +211,37 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, break; /* add/sub imm, %rsp */ - *type = INSN_STACK; - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_SP; - op->src.offset = insn.immediate.value * sign; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = insn.immediate.value * sign; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } break; case 0x89: if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) { /* mov %rsp, reg */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG; - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + } break; } if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { /* mov reg, %rsp */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } break; } @@ -224,22 +251,24 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, (modrm_mod == 1 || modrm_mod == 2) && modrm_rm == 5) { /* mov reg, disp(%rbp) */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; - op->dest.type = OP_DEST_REG_INDIRECT; - op->dest.reg = CFI_BP; - op->dest.offset = insn.displacement.value; + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = CFI_BP; + op->dest.offset = insn.displacement.value; + } } else if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { /* mov reg, disp(%rsp) */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; - op->dest.type = OP_DEST_REG_INDIRECT; - op->dest.reg = CFI_SP; - op->dest.offset = insn.displacement.value; + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = CFI_SP; + op->dest.offset = insn.displacement.value; + } } break; @@ -248,23 +277,25 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, if (rex_w && !rex_b && modrm_mod == 1 && modrm_rm == 5) { /* mov disp(%rbp), reg */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG_INDIRECT; - op->src.reg = CFI_BP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + ADD_OP(op) { + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = CFI_BP; + op->src.offset = insn.displacement.value; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + } } else if (rex_w && !rex_b && sib == 0x24 && modrm_mod != 3 && modrm_rm == 4) { /* mov disp(%rsp), 
reg */ - *type = INSN_STACK; - op->src.type = OP_SRC_REG_INDIRECT; - op->src.reg = CFI_SP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + ADD_OP(op) { + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = CFI_SP; + op->src.offset = insn.displacement.value; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + } } break; @@ -272,28 +303,30 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, case 0x8d: if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - *type = INSN_STACK; - if (!insn.displacement.value) { - /* lea (%rsp), reg */ - op->src.type = OP_SRC_REG; - } else { - /* lea disp(%rsp), reg */ - op->src.type = OP_SRC_ADD; - op->src.offset = insn.displacement.value; + ADD_OP(op) { + if (!insn.displacement.value) { + /* lea (%rsp), reg */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%rsp), reg */ + op->src.type = OP_SRC_ADD; + op->src.offset = insn.displacement.value; + } + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; } - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; } else if (rex == 0x48 && modrm == 0x65) { /* lea disp(%rbp), %rsp */ - *type = INSN_STACK; - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_BP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_BP; + op->src.offset = insn.displacement.value; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } } else if (rex == 0x49 && modrm == 0x62 && insn.displacement.value == -8) { @@ -304,12 +337,13 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, * Restoring rsp back to its original value after a * stack realignment. */ - *type = INSN_STACK; - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R10; - op->src.offset = -8; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_R10; + op->src.offset = -8; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } } else if (rex == 0x49 && modrm == 0x65 && insn.displacement.value == -16) { @@ -320,21 +354,23 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, * Restoring rsp back to its original value after a * stack realignment. 
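 *
 * For example, a stack-realigning prologue may have saved the original
 * stack pointer with 'lea 0x10(%rsp), %r13'; the encoding matched here,
 * 'lea -0x10(%r13), %rsp' (rex 0x49, modrm 0x65, displacement -16),
 * undoes that save.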
*/ - *type = INSN_STACK; - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R13; - op->src.offset = -16; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_R13; + op->src.offset = -16; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } } break; case 0x8f: /* pop to mem */ - *type = INSN_STACK; - op->src.type = OP_SRC_POP; - op->dest.type = OP_DEST_MEM; + ADD_OP(op) { + op->src.type = OP_SRC_POP; + op->dest.type = OP_DEST_MEM; + } break; case 0x90: @@ -343,16 +379,18 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, case 0x9c: /* pushf */ - *type = INSN_STACK; - op->src.type = OP_SRC_CONST; - op->dest.type = OP_DEST_PUSHF; + ADD_OP(op) { + op->src.type = OP_SRC_CONST; + op->dest.type = OP_DEST_PUSHF; + } break; case 0x9d: /* popf */ - *type = INSN_STACK; - op->src.type = OP_SRC_POPF; - op->dest.type = OP_DEST_MEM; + ADD_OP(op) { + op->src.type = OP_SRC_POPF; + op->dest.type = OP_DEST_MEM; + } break; case 0x0f: @@ -387,16 +425,18 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, } else if (op2 == 0xa0 || op2 == 0xa8) { /* push fs/gs */ - *type = INSN_STACK; - op->src.type = OP_SRC_CONST; - op->dest.type = OP_DEST_PUSH; + ADD_OP(op) { + op->src.type = OP_SRC_CONST; + op->dest.type = OP_DEST_PUSH; + } } else if (op2 == 0xa1 || op2 == 0xa9) { /* pop fs/gs */ - *type = INSN_STACK; - op->src.type = OP_SRC_POP; - op->dest.type = OP_DEST_MEM; + ADD_OP(op) { + op->src.type = OP_SRC_POP; + op->dest.type = OP_DEST_MEM; + } } break; @@ -409,8 +449,8 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, * mov bp, sp * pop bp */ - *type = INSN_STACK; - op->dest.type = OP_DEST_LEAVE; + ADD_OP(op) + op->dest.type = OP_DEST_LEAVE; break; @@ -429,14 +469,41 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, *type = INSN_RETURN; break; + case 0xcf: /* iret */ + /* + * Handle sync_core(), which has an IRET to self. + * All other IRET are in STT_NONE entry code. + */ + sym = find_symbol_containing(sec, offset); + if (sym && sym->type == STT_FUNC) { + ADD_OP(op) { + /* add $40, %rsp */ + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = 5*8; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } + break; + } + + /* fallthrough */ + case 0xca: /* retf */ case 0xcb: /* retf */ - case 0xcf: /* iret */ *type = INSN_CONTEXT_SWITCH; break; case 0xe8: *type = INSN_CALL; + /* + * For the impact on the stack, a CALL behaves like + * a PUSH of an immediate value (the return address). 
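+ *
+ * E.g. a 5-byte 'call foo' ('foo' being any callee) pushes its return
+ * address; only the resulting 8-byte stack growth matters for
+ * unwinding, hence the OP_SRC_CONST/OP_DEST_PUSH model below.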
+ */ + ADD_OP(op) { + op->src.type = OP_SRC_CONST; + op->dest.type = OP_DEST_PUSH; + } break; case 0xfc: @@ -464,9 +531,10 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, else if (modrm_reg == 6) { /* push from mem */ - *type = INSN_STACK; - op->src.type = OP_SRC_CONST; - op->dest.type = OP_DEST_PUSH; + ADD_OP(op) { + op->src.type = OP_SRC_CONST; + op->dest.type = OP_DEST_PUSH; + } } break; @@ -480,7 +548,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, return 0; } -void arch_initial_func_cfi_state(struct cfi_state *state) +void arch_initial_func_cfi_state(struct cfi_init_state *state) { int i; diff --git a/tools/objtool/arch/x86/include/cfi_regs.h b/tools/objtool/arch/x86/include/cfi_regs.h new file mode 100644 index 000000000000..79bc517efba8 --- /dev/null +++ b/tools/objtool/arch/x86/include/cfi_regs.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_CFI_REGS_H +#define _OBJTOOL_CFI_REGS_H + +#define CFI_AX 0 +#define CFI_DX 1 +#define CFI_CX 2 +#define CFI_BX 3 +#define CFI_SI 4 +#define CFI_DI 5 +#define CFI_BP 6 +#define CFI_SP 7 +#define CFI_R8 8 +#define CFI_R9 9 +#define CFI_R10 10 +#define CFI_R11 11 +#define CFI_R12 12 +#define CFI_R13 13 +#define CFI_R14 14 +#define CFI_R15 15 +#define CFI_RA 16 +#define CFI_NUM_REGS 17 + +#endif /* _OBJTOOL_CFI_REGS_H */ diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 10fbe75ab43d..7a44174967b5 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -14,10 +14,11 @@ */ #include <subcmd/parse-options.h> +#include <string.h> #include "builtin.h" -#include "check.h" +#include "objtool.h" -bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats; +bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux; static const char * const check_usage[] = { "objtool check [<options>] file.o", @@ -32,12 +33,14 @@ const struct option check_options[] = { OPT_BOOLEAN('b', "backtrace", &backtrace, "unwind on error"), OPT_BOOLEAN('a', "uaccess", &uaccess, "enable uaccess checking"), OPT_BOOLEAN('s', "stats", &stats, "print statistics"), + OPT_BOOLEAN('d', "duplicate", &validate_dup, "duplicate validation for vmlinux.o"), + OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), OPT_END(), }; int cmd_check(int argc, const char **argv) { - const char *objname; + const char *objname, *s; argc = parse_options(argc, argv, check_options, check_usage, 0); @@ -46,5 +49,9 @@ int cmd_check(int argc, const char **argv) objname = argv[0]; + s = strstr(objname, "vmlinux.o"); + if (s && !s[9]) + vmlinux = true; + return check(objname, false); } diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 5f7cc6157edd..b1dfe2007962 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -14,8 +14,7 @@ #include <string.h> #include "builtin.h" -#include "check.h" - +#include "objtool.h" static const char *orc_usage[] = { "objtool orc generate [<options>] file.o", diff --git a/tools/objtool/builtin.h b/tools/objtool/builtin.h index 0b907902ee79..85c979caa367 100644 --- a/tools/objtool/builtin.h +++ b/tools/objtool/builtin.h @@ -8,7 +8,7 @@ #include <subcmd/parse-options.h> extern const struct option check_options[]; -extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats; +extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux; extern int cmd_check(int argc, const char 
**argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/cfi.h b/tools/objtool/cfi.h index 4427bf8ed686..c7c59c6a44ee 100644 --- a/tools/objtool/cfi.h +++ b/tools/objtool/cfi.h @@ -6,38 +6,33 @@ #ifndef _OBJTOOL_CFI_H #define _OBJTOOL_CFI_H +#include "cfi_regs.h" + #define CFI_UNDEFINED -1 #define CFI_CFA -2 #define CFI_SP_INDIRECT -3 #define CFI_BP_INDIRECT -4 -#define CFI_AX 0 -#define CFI_DX 1 -#define CFI_CX 2 -#define CFI_BX 3 -#define CFI_SI 4 -#define CFI_DI 5 -#define CFI_BP 6 -#define CFI_SP 7 -#define CFI_R8 8 -#define CFI_R9 9 -#define CFI_R10 10 -#define CFI_R11 11 -#define CFI_R12 12 -#define CFI_R13 13 -#define CFI_R14 14 -#define CFI_R15 15 -#define CFI_RA 16 -#define CFI_NUM_REGS 17 - struct cfi_reg { int base; int offset; }; -struct cfi_state { +struct cfi_init_state { + struct cfi_reg regs[CFI_NUM_REGS]; struct cfi_reg cfa; +}; + +struct cfi_state { struct cfi_reg regs[CFI_NUM_REGS]; + struct cfi_reg vals[CFI_NUM_REGS]; + struct cfi_reg cfa; + int stack_size; + int drap_reg, drap_offset; + unsigned char type; + bool bp_scratch; + bool drap; + bool end; }; #endif /* _OBJTOOL_CFI_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3c6da70e6084..63d65a702900 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -7,10 +7,10 @@ #include <stdlib.h> #include "builtin.h" +#include "cfi.h" +#include "arch.h" #include "check.h" -#include "elf.h" #include "special.h" -#include "arch.h" #include "warn.h" #include <linux/hashtable.h> @@ -27,16 +27,17 @@ struct alternative { }; const char *objname; -struct cfi_state initial_func_cfi; +struct cfi_init_state initial_func_cfi; struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset) { struct instruction *insn; - hash_for_each_possible(file->insn_hash, insn, hash, offset) + hash_for_each_possible(file->insn_hash, insn, hash, sec_offset_hash(sec, offset)) { if (insn->sec == sec && insn->offset == offset) return insn; + } return NULL; } @@ -226,18 +227,31 @@ static bool dead_end_function(struct objtool_file *file, struct symbol *func) return __dead_end_function(file, func, 0); } -static void clear_insn_state(struct insn_state *state) +static void init_cfi_state(struct cfi_state *cfi) { int i; - memset(state, 0, sizeof(*state)); - state->cfa.base = CFI_UNDEFINED; for (i = 0; i < CFI_NUM_REGS; i++) { - state->regs[i].base = CFI_UNDEFINED; - state->vals[i].base = CFI_UNDEFINED; + cfi->regs[i].base = CFI_UNDEFINED; + cfi->vals[i].base = CFI_UNDEFINED; } - state->drap_reg = CFI_UNDEFINED; - state->drap_offset = -1; + cfi->cfa.base = CFI_UNDEFINED; + cfi->drap_reg = CFI_UNDEFINED; + cfi->drap_offset = -1; +} + +static void init_insn_state(struct insn_state *state, struct section *sec) +{ + memset(state, 0, sizeof(*state)); + init_cfi_state(&state->cfi); + + /* + * We need the full vmlinux for noinstr validation; otherwise we + * cannot correctly determine insn->call_dest->sec (external symbols + * do not have a section).
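+ *
+ * E.g. when a single .o file is checked, a call to an external symbol
+ * leaves insn->call_dest->sec == NULL, so the ->noinstr test in
+ * validate_call() could not be evaluated.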
+ */ + if (vmlinux && sec) + state->noinstr = sec->noinstr; } /* @@ -263,6 +277,10 @@ static int decode_instructions(struct objtool_file *file) strncmp(sec->name, ".discard.", 9)) sec->text = true; + if (!strcmp(sec->name, ".noinstr.text") || + !strcmp(sec->name, ".entry.text")) + sec->noinstr = true; + for (offset = 0; offset < sec->len; offset += insn->len) { insn = malloc(sizeof(*insn)); if (!insn) { @@ -271,7 +289,8 @@ static int decode_instructions(struct objtool_file *file) } memset(insn, 0, sizeof(*insn)); INIT_LIST_HEAD(&insn->alts); - clear_insn_state(&insn->state); + INIT_LIST_HEAD(&insn->stack_ops); + init_cfi_state(&insn->cfi); insn->sec = sec; insn->offset = offset; @@ -280,11 +299,11 @@ static int decode_instructions(struct objtool_file *file) sec->len - offset, &insn->len, &insn->type, &insn->immediate, - &insn->stack_op); + &insn->stack_ops); if (ret) goto err; - hash_add(file->insn_hash, &insn->hash, insn->offset); + hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset)); list_add_tail(&insn->list, &file->insn_list); nr_insns++; } @@ -314,6 +333,19 @@ err: return ret; } +static struct instruction *find_last_insn(struct objtool_file *file, + struct section *sec) +{ + struct instruction *insn = NULL; + unsigned int offset; + unsigned int end = (sec->len > 10) ? sec->len - 10 : 0; + + for (offset = sec->len - 1; offset >= end && !insn; offset--) + insn = find_insn(file, sec, offset); + + return insn; +} + /* * Mark "ud2" instructions and manually annotated dead ends. */ @@ -322,7 +354,6 @@ static int add_dead_ends(struct objtool_file *file) struct section *sec; struct rela *rela; struct instruction *insn; - bool found; /* * By default, "ud2" is a dead end unless otherwise annotated, because @@ -348,15 +379,8 @@ static int add_dead_ends(struct objtool_file *file) if (insn) insn = list_prev_entry(insn, list); else if (rela->addend == rela->sym->sec->len) { - found = false; - list_for_each_entry_reverse(insn, &file->insn_list, list) { - if (insn->sec == rela->sym->sec) { - found = true; - break; - } - } - - if (!found) { + insn = find_last_insn(file, rela->sym->sec); + if (!insn) { WARN("can't find unreachable insn at %s+0x%x", rela->sym->sec->name, rela->addend); return -1; @@ -390,15 +414,8 @@ reachable: if (insn) insn = list_prev_entry(insn, list); else if (rela->addend == rela->sym->sec->len) { - found = false; - list_for_each_entry_reverse(insn, &file->insn_list, list) { - if (insn->sec == rela->sym->sec) { - found = true; - break; - } - } - - if (!found) { + insn = find_last_insn(file, rela->sym->sec); + if (!insn) { WARN("can't find reachable insn at %s+0x%x", rela->sym->sec->name, rela->addend); return -1; @@ -490,6 +507,7 @@ static const char *uaccess_safe_builtin[] = { "__asan_report_store16_noabort", /* KCOV */ "write_comp_data", + "check_kcov_mode", "__sanitizer_cov_trace_pc", "__sanitizer_cov_trace_const_cmp1", "__sanitizer_cov_trace_const_cmp2", @@ -585,13 +603,14 @@ static int add_jump_destinations(struct objtool_file *file) insn->offset, insn->len); if (!rela) { dest_sec = insn->sec; - dest_off = insn->offset + insn->len + insn->immediate; + dest_off = arch_jump_destination(insn); } else if (rela->sym->type == STT_SECTION) { dest_sec = rela->sym->sec; - dest_off = rela->addend + 4; + dest_off = arch_dest_rela_offset(rela->addend); } else if (rela->sym->sec->idx) { dest_sec = rela->sym->sec; - dest_off = rela->sym->sym.st_value + rela->addend + 4; + dest_off = rela->sym->sym.st_value + + arch_dest_rela_offset(rela->addend); } else if 
(strstr(rela->sym->name, "_indirect_thunk_")) { /* * Retpoline jumps are really dynamic jumps in @@ -665,6 +684,16 @@ static int add_jump_destinations(struct objtool_file *file) return 0; } +static void remove_insn_ops(struct instruction *insn) +{ + struct stack_op *op, *tmp; + + list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) { + list_del(&op->list); + free(op); + } +} + /* * Find the destination instructions for all calls. */ @@ -681,7 +710,7 @@ static int add_call_destinations(struct objtool_file *file) rela = find_rela_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); if (!rela) { - dest_off = insn->offset + insn->len + insn->immediate; + dest_off = arch_jump_destination(insn); insn->call_dest = find_func_by_offset(insn->sec, dest_off); if (!insn->call_dest) insn->call_dest = find_symbol_by_offset(insn->sec, dest_off); @@ -690,10 +719,7 @@ continue; if (!insn->call_dest) { - WARN_FUNC("unsupported intra-function call", - insn->sec, insn->offset); - if (retpoline) - WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE."); + WARN_FUNC("unannotated intra-function call", insn->sec, insn->offset); return -1; } @@ -704,17 +730,27 @@ } } else if (rela->sym->type == STT_SECTION) { + dest_off = arch_dest_rela_offset(rela->addend); insn->call_dest = find_func_by_offset(rela->sym->sec, - rela->addend+4); + dest_off); if (!insn->call_dest) { - WARN_FUNC("can't find call dest symbol at %s+0x%x", + WARN_FUNC("can't find call dest symbol at %s+0x%lx", insn->sec, insn->offset, rela->sym->sec->name, - rela->addend + 4); + dest_off); return -1; } } else insn->call_dest = rela->sym; + + /* + * Whatever stack impact regular CALLs have should be undone + * by the RETURN of the called function. + * + * Annotated intra-function calls retain the stack_ops but + * are converted to JUMP, see read_intra_function_calls().
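+ *
+ * E.g. the OP_DEST_PUSH op recorded for a 0xe8 CALL by
+ * arch_decode_instruction() is dropped by remove_insn_ops() below; a
+ * matched call/return pair thus has no net effect on the tracked
+ * stack size.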
+ */ + remove_insn_ops(insn); } return 0; @@ -742,7 +778,9 @@ static int handle_group_alt(struct objtool_file *file, struct instruction *orig_insn, struct instruction **new_insn) { + static unsigned int alt_group_next_index = 1; struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL; + unsigned int alt_group = alt_group_next_index++; unsigned long dest_off; last_orig_insn = NULL; @@ -751,7 +789,7 @@ static int handle_group_alt(struct objtool_file *file, if (insn->offset >= special_alt->orig_off + special_alt->orig_len) break; - insn->alt_group = true; + insn->alt_group = alt_group; last_orig_insn = insn; } @@ -763,7 +801,8 @@ static int handle_group_alt(struct objtool_file *file, } memset(fake_jump, 0, sizeof(*fake_jump)); INIT_LIST_HEAD(&fake_jump->alts); - clear_insn_state(&fake_jump->state); + INIT_LIST_HEAD(&fake_jump->stack_ops); + init_cfi_state(&fake_jump->cfi); fake_jump->sec = special_alt->new_sec; fake_jump->offset = FAKE_JUMP_OFFSET; @@ -784,6 +823,7 @@ static int handle_group_alt(struct objtool_file *file, } last_new_insn = NULL; + alt_group = alt_group_next_index++; insn = *new_insn; sec_for_each_insn_from(file, insn) { if (insn->offset >= special_alt->new_off + special_alt->new_len) @@ -793,6 +833,7 @@ static int handle_group_alt(struct objtool_file *file, insn->ignore = orig_insn->ignore_alts; insn->func = orig_insn->func; + insn->alt_group = alt_group; /* * Since alternative replacement code is copy/pasted by the @@ -821,7 +862,7 @@ static int handle_group_alt(struct objtool_file *file, if (!insn->immediate) continue; - dest_off = insn->offset + insn->len + insn->immediate; + dest_off = arch_jump_destination(insn); if (dest_off == special_alt->new_off + special_alt->new_len) { if (!fake_jump) { WARN("%s: alternative jump to end of section", @@ -916,6 +957,12 @@ static int add_special_section_alts(struct objtool_file *file) } if (special_alt->group) { + if (!special_alt->orig_len) { + WARN_FUNC("empty alternative entry", + orig_insn->sec, orig_insn->offset); + continue; + } + ret = handle_group_alt(file, special_alt, orig_insn, &new_insn); if (ret) @@ -1253,15 +1300,10 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - cfa = &insn->state.cfa; - - if (hint->type == UNWIND_HINT_TYPE_SAVE) { - insn->save = true; - continue; + cfa = &insn->cfi.cfa; - } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) { - insn->restore = true; - insn->hint = true; + if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) { + insn->ret_offset = hint->sp_offset; continue; } @@ -1299,8 +1341,8 @@ static int read_unwind_hints(struct objtool_file *file) } cfa->offset = hint->sp_offset; - insn->state.type = hint->type; - insn->state.end = hint->end; + insn->cfi.type = hint->type; + insn->cfi.end = hint->end; } return 0; @@ -1341,6 +1383,104 @@ static int read_retpoline_hints(struct objtool_file *file) return 0; } +static int read_instr_hints(struct objtool_file *file) +{ + struct section *sec; + struct instruction *insn; + struct rela *rela; + + sec = find_section_by_name(file->elf, ".rela.discard.instr_end"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", sec->name); + return -1; + } + + insn = find_insn(file, rela->sym->sec, rela->addend); + if (!insn) { + WARN("bad .discard.instr_end entry"); + return -1; + } + + insn->instr--; + } + + sec = find_section_by_name(file->elf, ".rela.discard.instr_begin"); + if (!sec) + return 0; + + 
list_for_each_entry(rela, &sec->rela_list, list) { + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", sec->name); + return -1; + } + + insn = find_insn(file, rela->sym->sec, rela->addend); + if (!insn) { + WARN("bad .discard.instr_begin entry"); + return -1; + } + + insn->instr++; + } + + return 0; +} + +static int read_intra_function_calls(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + struct rela *rela; + + sec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + unsigned long dest_off; + + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", + sec->name); + return -1; + } + + insn = find_insn(file, rela->sym->sec, rela->addend); + if (!insn) { + WARN("bad .discard.intra_function_call entry"); + return -1; + } + + if (insn->type != INSN_CALL) { + WARN_FUNC("intra_function_call not a direct call", + insn->sec, insn->offset); + return -1; + } + + /* + * Treat intra-function CALLs as JMPs, but with a stack_op. + * See add_call_destinations(), which strips stack_ops from + * normal CALLs. + */ + insn->type = INSN_JUMP_UNCONDITIONAL; + + dest_off = insn->offset + insn->len + insn->immediate; + insn->jump_dest = find_insn(file, insn->sec, dest_off); + if (!insn->jump_dest) { + WARN_FUNC("can't find call dest at %s+0x%lx", + insn->sec, insn->offset, + insn->sec->name, dest_off); + return -1; + } + } + + return 0; +} + static void mark_rodata(struct objtool_file *file) { struct section *sec; @@ -1357,8 +1497,8 @@ static void mark_rodata(struct objtool_file *file) * .rodata.str1.* sections are ignored; they don't contain jump tables. */ for_each_sec(file, sec) { - if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) || - !strcmp(sec->name, C_JUMP_TABLE_SECTION)) { + if (!strncmp(sec->name, ".rodata", 7) && + !strstr(sec->name, ".str1.")) { sec->rodata = true; found = true; } @@ -1396,6 +1536,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + ret = read_intra_function_calls(file); + if (ret) + return ret; + ret = add_call_destinations(file); if (ret) return ret; @@ -1412,12 +1556,16 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + ret = read_instr_hints(file); + if (ret) + return ret; + return 0; } static bool is_fentry_call(struct instruction *insn) { - if (insn->type == INSN_CALL && + if (insn->type == INSN_CALL && insn->call_dest && insn->call_dest->type == STT_NOTYPE && !strcmp(insn->call_dest->name, "__fentry__")) return true; @@ -1425,40 +1573,57 @@ static bool is_fentry_call(struct instruction *insn) return false; } -static bool has_modified_stack_frame(struct insn_state *state) +static bool has_modified_stack_frame(struct instruction *insn, struct insn_state *state) { + u8 ret_offset = insn->ret_offset; + struct cfi_state *cfi = &state->cfi; int i; - if (state->cfa.base != initial_func_cfi.cfa.base || - state->cfa.offset != initial_func_cfi.cfa.offset || - state->stack_size != initial_func_cfi.cfa.offset || - state->drap) + if (cfi->cfa.base != initial_func_cfi.cfa.base || cfi->drap) + return true; + + if (cfi->cfa.offset != initial_func_cfi.cfa.offset + ret_offset) + return true; + + if (cfi->stack_size != initial_func_cfi.cfa.offset + ret_offset) return true; - for (i = 0; i < CFI_NUM_REGS; i++) - if (state->regs[i].base != initial_func_cfi.regs[i].base || - state->regs[i].offset != 
initial_func_cfi.regs[i].offset) + /* + * If there is a ret offset hint then don't check registers + * because a callee-saved register might have been pushed on + * the stack. + */ + if (ret_offset) + return false; + + for (i = 0; i < CFI_NUM_REGS; i++) { + if (cfi->regs[i].base != initial_func_cfi.regs[i].base || + cfi->regs[i].offset != initial_func_cfi.regs[i].offset) return true; + } return false; } static bool has_valid_stack_frame(struct insn_state *state) { - if (state->cfa.base == CFI_BP && state->regs[CFI_BP].base == CFI_CFA && - state->regs[CFI_BP].offset == -16) + struct cfi_state *cfi = &state->cfi; + + if (cfi->cfa.base == CFI_BP && cfi->regs[CFI_BP].base == CFI_CFA && + cfi->regs[CFI_BP].offset == -16) return true; - if (state->drap && state->regs[CFI_BP].base == CFI_BP) + if (cfi->drap && cfi->regs[CFI_BP].base == CFI_BP) return true; return false; } -static int update_insn_state_regs(struct instruction *insn, struct insn_state *state) +static int update_cfi_state_regs(struct instruction *insn, + struct cfi_state *cfi, + struct stack_op *op) { - struct cfi_reg *cfa = &state->cfa; - struct stack_op *op = &insn->stack_op; + struct cfi_reg *cfa = &cfi->cfa; if (cfa->base != CFI_SP && cfa->base != CFI_SP_INDIRECT) return 0; @@ -1479,20 +1644,19 @@ static int update_insn_state_regs(struct instruction *insn, struct insn_state *s return 0; } -static void save_reg(struct insn_state *state, unsigned char reg, int base, - int offset) +static void save_reg(struct cfi_state *cfi, unsigned char reg, int base, int offset) { if (arch_callee_saved_reg(reg) && - state->regs[reg].base == CFI_UNDEFINED) { - state->regs[reg].base = base; - state->regs[reg].offset = offset; + cfi->regs[reg].base == CFI_UNDEFINED) { + cfi->regs[reg].base = base; + cfi->regs[reg].offset = offset; } } -static void restore_reg(struct insn_state *state, unsigned char reg) +static void restore_reg(struct cfi_state *cfi, unsigned char reg) { - state->regs[reg].base = CFI_UNDEFINED; - state->regs[reg].offset = 0; + cfi->regs[reg].base = initial_func_cfi.regs[reg].base; + cfi->regs[reg].offset = initial_func_cfi.regs[reg].offset; } /* @@ -1548,11 +1712,11 @@ static void restore_reg(struct insn_state *state, unsigned char reg) * 41 5d pop %r13 * c3 retq */ -static int update_insn_state(struct instruction *insn, struct insn_state *state) +static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, + struct stack_op *op) { - struct stack_op *op = &insn->stack_op; - struct cfi_reg *cfa = &state->cfa; - struct cfi_reg *regs = state->regs; + struct cfi_reg *cfa = &cfi->cfa; + struct cfi_reg *regs = cfi->regs; /* stack operations don't make sense with an undefined CFA */ if (cfa->base == CFI_UNDEFINED) { @@ -1563,8 +1727,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) return 0; } - if (state->type == ORC_TYPE_REGS || state->type == ORC_TYPE_REGS_IRET) - return update_insn_state_regs(insn, state); + if (cfi->type == ORC_TYPE_REGS || cfi->type == ORC_TYPE_REGS_IRET) + return update_cfi_state_regs(insn, cfi, op); switch (op->dest.type) { @@ -1579,16 +1743,16 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) /* mov %rsp, %rbp */ cfa->base = op->dest.reg; - state->bp_scratch = false; + cfi->bp_scratch = false; } else if (op->src.reg == CFI_SP && - op->dest.reg == CFI_BP && state->drap) { + op->dest.reg == CFI_BP && cfi->drap) { /* drap: mov %rsp, %rbp */ regs[CFI_BP].base = CFI_BP; - regs[CFI_BP].offset = -state->stack_size; - state->bp_scratch 
= false; + regs[CFI_BP].offset = -cfi->stack_size; + cfi->bp_scratch = false; } else if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { @@ -1603,8 +1767,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) * ... * mov %rax, %rsp */ - state->vals[op->dest.reg].base = CFI_CFA; - state->vals[op->dest.reg].offset = -state->stack_size; + cfi->vals[op->dest.reg].base = CFI_CFA; + cfi->vals[op->dest.reg].offset = -cfi->stack_size; } else if (op->src.reg == CFI_BP && op->dest.reg == CFI_SP && @@ -1615,14 +1779,14 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) * * Restore the original stack pointer (Clang). */ - state->stack_size = -state->regs[CFI_BP].offset; + cfi->stack_size = -cfi->regs[CFI_BP].offset; } else if (op->dest.reg == cfa->base) { /* mov %reg, %rsp */ if (cfa->base == CFI_SP && - state->vals[op->src.reg].base == CFI_CFA) { + cfi->vals[op->src.reg].base == CFI_CFA) { /* * This is needed for the rare case @@ -1632,8 +1796,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) * ... * mov %rcx, %rsp */ - cfa->offset = -state->vals[op->src.reg].offset; - state->stack_size = cfa->offset; + cfa->offset = -cfi->vals[op->src.reg].offset; + cfi->stack_size = cfa->offset; } else { cfa->base = CFI_UNDEFINED; @@ -1647,7 +1811,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) if (op->dest.reg == CFI_SP && op->src.reg == CFI_SP) { /* add imm, %rsp */ - state->stack_size -= op->src.offset; + cfi->stack_size -= op->src.offset; if (cfa->base == CFI_SP) cfa->offset -= op->src.offset; break; @@ -1656,14 +1820,14 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) if (op->dest.reg == CFI_SP && op->src.reg == CFI_BP) { /* lea disp(%rbp), %rsp */ - state->stack_size = -(op->src.offset + regs[CFI_BP].offset); + cfi->stack_size = -(op->src.offset + regs[CFI_BP].offset); break; } if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { /* drap: lea disp(%rsp), %drap */ - state->drap_reg = op->dest.reg; + cfi->drap_reg = op->dest.reg; /* * lea disp(%rsp), %reg @@ -1675,25 +1839,25 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) * ... 
* mov %rcx, %rsp */ - state->vals[op->dest.reg].base = CFI_CFA; - state->vals[op->dest.reg].offset = \ - -state->stack_size + op->src.offset; + cfi->vals[op->dest.reg].base = CFI_CFA; + cfi->vals[op->dest.reg].offset = \ + -cfi->stack_size + op->src.offset; break; } - if (state->drap && op->dest.reg == CFI_SP && - op->src.reg == state->drap_reg) { + if (cfi->drap && op->dest.reg == CFI_SP && + op->src.reg == cfi->drap_reg) { /* drap: lea disp(%drap), %rsp */ cfa->base = CFI_SP; - cfa->offset = state->stack_size = -op->src.offset; - state->drap_reg = CFI_UNDEFINED; - state->drap = false; + cfa->offset = cfi->stack_size = -op->src.offset; + cfi->drap_reg = CFI_UNDEFINED; + cfi->drap = false; break; } - if (op->dest.reg == state->cfa.base) { + if (op->dest.reg == cfi->cfa.base) { WARN_FUNC("unsupported stack register modification", insn->sec, insn->offset); return -1; @@ -1703,18 +1867,18 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) case OP_SRC_AND: if (op->dest.reg != CFI_SP || - (state->drap_reg != CFI_UNDEFINED && cfa->base != CFI_SP) || - (state->drap_reg == CFI_UNDEFINED && cfa->base != CFI_BP)) { + (cfi->drap_reg != CFI_UNDEFINED && cfa->base != CFI_SP) || + (cfi->drap_reg == CFI_UNDEFINED && cfa->base != CFI_BP)) { WARN_FUNC("unsupported stack pointer realignment", insn->sec, insn->offset); return -1; } - if (state->drap_reg != CFI_UNDEFINED) { + if (cfi->drap_reg != CFI_UNDEFINED) { /* drap: and imm, %rsp */ - cfa->base = state->drap_reg; - cfa->offset = state->stack_size = 0; - state->drap = true; + cfa->base = cfi->drap_reg; + cfa->offset = cfi->stack_size = 0; + cfi->drap = true; } /* @@ -1726,57 +1890,55 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) case OP_SRC_POP: case OP_SRC_POPF: - if (!state->drap && op->dest.type == OP_DEST_REG && - op->dest.reg == cfa->base) { + if (!cfi->drap && op->dest.reg == cfa->base) { /* pop %rbp */ cfa->base = CFI_SP; } - if (state->drap && cfa->base == CFI_BP_INDIRECT && - op->dest.type == OP_DEST_REG && - op->dest.reg == state->drap_reg && - state->drap_offset == -state->stack_size) { + if (cfi->drap && cfa->base == CFI_BP_INDIRECT && + op->dest.reg == cfi->drap_reg && + cfi->drap_offset == -cfi->stack_size) { /* drap: pop %drap */ - cfa->base = state->drap_reg; + cfa->base = cfi->drap_reg; cfa->offset = 0; - state->drap_offset = -1; + cfi->drap_offset = -1; - } else if (regs[op->dest.reg].offset == -state->stack_size) { + } else if (regs[op->dest.reg].offset == -cfi->stack_size) { /* pop %reg */ - restore_reg(state, op->dest.reg); + restore_reg(cfi, op->dest.reg); } - state->stack_size -= 8; + cfi->stack_size -= 8; if (cfa->base == CFI_SP) cfa->offset -= 8; break; case OP_SRC_REG_INDIRECT: - if (state->drap && op->src.reg == CFI_BP && - op->src.offset == state->drap_offset) { + if (cfi->drap && op->src.reg == CFI_BP && + op->src.offset == cfi->drap_offset) { /* drap: mov disp(%rbp), %drap */ - cfa->base = state->drap_reg; + cfa->base = cfi->drap_reg; cfa->offset = 0; - state->drap_offset = -1; + cfi->drap_offset = -1; } - if (state->drap && op->src.reg == CFI_BP && + if (cfi->drap && op->src.reg == CFI_BP && op->src.offset == regs[op->dest.reg].offset) { /* drap: mov disp(%rbp), %reg */ - restore_reg(state, op->dest.reg); + restore_reg(cfi, op->dest.reg); } else if (op->src.reg == cfa->base && op->src.offset == regs[op->dest.reg].offset + cfa->offset) { /* mov disp(%rbp), %reg */ /* mov disp(%rsp), %reg */ - restore_reg(state, op->dest.reg); + restore_reg(cfi, 
op->dest.reg); } break; @@ -1791,78 +1953,78 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) case OP_DEST_PUSH: case OP_DEST_PUSHF: - state->stack_size += 8; + cfi->stack_size += 8; if (cfa->base == CFI_SP) cfa->offset += 8; if (op->src.type != OP_SRC_REG) break; - if (state->drap) { - if (op->src.reg == cfa->base && op->src.reg == state->drap_reg) { + if (cfi->drap) { + if (op->src.reg == cfa->base && op->src.reg == cfi->drap_reg) { /* drap: push %drap */ cfa->base = CFI_BP_INDIRECT; - cfa->offset = -state->stack_size; + cfa->offset = -cfi->stack_size; /* save drap so we know when to restore it */ - state->drap_offset = -state->stack_size; + cfi->drap_offset = -cfi->stack_size; - } else if (op->src.reg == CFI_BP && cfa->base == state->drap_reg) { + } else if (op->src.reg == CFI_BP && cfa->base == cfi->drap_reg) { /* drap: push %rbp */ - state->stack_size = 0; + cfi->stack_size = 0; } else if (regs[op->src.reg].base == CFI_UNDEFINED) { /* drap: push %reg */ - save_reg(state, op->src.reg, CFI_BP, -state->stack_size); + save_reg(cfi, op->src.reg, CFI_BP, -cfi->stack_size); } } else { /* push %reg */ - save_reg(state, op->src.reg, CFI_CFA, -state->stack_size); + save_reg(cfi, op->src.reg, CFI_CFA, -cfi->stack_size); } /* detect when asm code uses rbp as a scratch register */ if (!no_fp && insn->func && op->src.reg == CFI_BP && cfa->base != CFI_BP) - state->bp_scratch = true; + cfi->bp_scratch = true; break; case OP_DEST_REG_INDIRECT: - if (state->drap) { - if (op->src.reg == cfa->base && op->src.reg == state->drap_reg) { + if (cfi->drap) { + if (op->src.reg == cfa->base && op->src.reg == cfi->drap_reg) { /* drap: mov %drap, disp(%rbp) */ cfa->base = CFI_BP_INDIRECT; cfa->offset = op->dest.offset; /* save drap offset so we know when to restore it */ - state->drap_offset = op->dest.offset; + cfi->drap_offset = op->dest.offset; } else if (regs[op->src.reg].base == CFI_UNDEFINED) { /* drap: mov reg, disp(%rbp) */ - save_reg(state, op->src.reg, CFI_BP, op->dest.offset); + save_reg(cfi, op->src.reg, CFI_BP, op->dest.offset); } } else if (op->dest.reg == cfa->base) { /* mov reg, disp(%rbp) */ /* mov reg, disp(%rsp) */ - save_reg(state, op->src.reg, CFI_CFA, - op->dest.offset - state->cfa.offset); + save_reg(cfi, op->src.reg, CFI_CFA, + op->dest.offset - cfi->cfa.offset); } break; case OP_DEST_LEAVE: - if ((!state->drap && cfa->base != CFI_BP) || - (state->drap && cfa->base != state->drap_reg)) { + if ((!cfi->drap && cfa->base != CFI_BP) || + (cfi->drap && cfa->base != cfi->drap_reg)) { WARN_FUNC("leave instruction with modified stack frame", insn->sec, insn->offset); return -1; @@ -1870,10 +2032,10 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) /* leave (mov %rbp, %rsp; pop %rbp) */ - state->stack_size = -state->regs[CFI_BP].offset - 8; - restore_reg(state, CFI_BP); + cfi->stack_size = -cfi->regs[CFI_BP].offset - 8; + restore_reg(cfi, CFI_BP); - if (!state->drap) { + if (!cfi->drap) { cfa->base = CFI_SP; cfa->offset -= 8; } @@ -1888,7 +2050,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) } /* pop mem */ - state->stack_size -= 8; + cfi->stack_size -= 8; if (cfa->base == CFI_SP) cfa->offset -= 8; @@ -1903,41 +2065,86 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) return 0; } -static bool insn_state_match(struct instruction *insn, struct insn_state *state) +static int handle_insn_ops(struct instruction *insn, struct insn_state *state) +{ + struct stack_op 
*op; + + list_for_each_entry(op, &insn->stack_ops, list) { + struct cfi_state old_cfi = state->cfi; + int res; + + res = update_cfi_state(insn, &state->cfi, op); + if (res) + return res; + + if (insn->alt_group && memcmp(&state->cfi, &old_cfi, sizeof(struct cfi_state))) { + WARN_FUNC("alternative modifies stack", insn->sec, insn->offset); + return -1; + } + + if (op->dest.type == OP_DEST_PUSHF) { + if (!state->uaccess_stack) { + state->uaccess_stack = 1; + } else if (state->uaccess_stack >> 31) { + WARN_FUNC("PUSHF stack exhausted", + insn->sec, insn->offset); + return 1; + } + state->uaccess_stack <<= 1; + state->uaccess_stack |= state->uaccess; + } + + if (op->src.type == OP_SRC_POPF) { + if (state->uaccess_stack) { + state->uaccess = state->uaccess_stack & 1; + state->uaccess_stack >>= 1; + if (state->uaccess_stack == 1) + state->uaccess_stack = 0; + } + } + } + + return 0; +} + +static bool insn_cfi_match(struct instruction *insn, struct cfi_state *cfi2) { - struct insn_state *state1 = &insn->state, *state2 = state; + struct cfi_state *cfi1 = &insn->cfi; int i; - if (memcmp(&state1->cfa, &state2->cfa, sizeof(state1->cfa))) { + if (memcmp(&cfi1->cfa, &cfi2->cfa, sizeof(cfi1->cfa))) { + WARN_FUNC("stack state mismatch: cfa1=%d%+d cfa2=%d%+d", insn->sec, insn->offset, - state1->cfa.base, state1->cfa.offset, - state2->cfa.base, state2->cfa.offset); + cfi1->cfa.base, cfi1->cfa.offset, + cfi2->cfa.base, cfi2->cfa.offset); - } else if (memcmp(&state1->regs, &state2->regs, sizeof(state1->regs))) { + } else if (memcmp(&cfi1->regs, &cfi2->regs, sizeof(cfi1->regs))) { for (i = 0; i < CFI_NUM_REGS; i++) { - if (!memcmp(&state1->regs[i], &state2->regs[i], + if (!memcmp(&cfi1->regs[i], &cfi2->regs[i], sizeof(struct cfi_reg))) continue; WARN_FUNC("stack state mismatch: reg1[%d]=%d%+d reg2[%d]=%d%+d", insn->sec, insn->offset, - i, state1->regs[i].base, state1->regs[i].offset, - i, state2->regs[i].base, state2->regs[i].offset); + i, cfi1->regs[i].base, cfi1->regs[i].offset, + i, cfi2->regs[i].base, cfi2->regs[i].offset); break; } - } else if (state1->type != state2->type) { + } else if (cfi1->type != cfi2->type) { + WARN_FUNC("stack state mismatch: type1=%d type2=%d", - insn->sec, insn->offset, state1->type, state2->type); + insn->sec, insn->offset, cfi1->type, cfi2->type); + + } else if (cfi1->drap != cfi2->drap || + (cfi1->drap && cfi1->drap_reg != cfi2->drap_reg) || + (cfi1->drap && cfi1->drap_offset != cfi2->drap_offset)) { - } else if (state1->drap != state2->drap || - (state1->drap && state1->drap_reg != state2->drap_reg) || - (state1->drap && state1->drap_offset != state2->drap_offset)) { WARN_FUNC("stack state mismatch: drap1=%d(%d,%d) drap2=%d(%d,%d)", insn->sec, insn->offset, - state1->drap, state1->drap_reg, state1->drap_offset, - state2->drap, state2->drap_reg, state2->drap_offset); + cfi1->drap, cfi1->drap_reg, cfi1->drap_offset, + cfi2->drap, cfi2->drap_reg, cfi2->drap_offset); } else return true; @@ -1963,6 +2170,13 @@ static inline const char *call_dest_name(struct instruction *insn) static int validate_call(struct instruction *insn, struct insn_state *state) { + if (state->noinstr && state->instr <= 0 && + (!insn->call_dest || !insn->call_dest->sec->noinstr)) { + WARN_FUNC("call to %s() leaves .noinstr.text section", + insn->sec, insn->offset, call_dest_name(insn)); + return 1; + } + if (state->uaccess && !func_uaccess_safe(insn->call_dest)) { WARN_FUNC("call to %s() with UACCESS enabled", insn->sec, insn->offset, call_dest_name(insn)); @@ -1980,7 +2194,7 @@ static int 
validate_call(struct instruction *insn, struct insn_state *state) static int validate_sibling_call(struct instruction *insn, struct insn_state *state) { - if (has_modified_stack_frame(state)) { + if (has_modified_stack_frame(insn, state)) { WARN_FUNC("sibling call from callable instruction with modified stack frame", insn->sec, insn->offset); return 1; @@ -1991,6 +2205,12 @@ static int validate_return(struct symbol *func, struct instruction *insn, struct insn_state *state) { + if (state->noinstr && state->instr > 0) { + WARN_FUNC("return with instrumentation enabled", + insn->sec, insn->offset); + return 1; + } + if (state->uaccess && !func_uaccess_safe(func)) { WARN_FUNC("return with UACCESS enabled", insn->sec, insn->offset); @@ -2009,13 +2229,13 @@ return 1; } - if (func && has_modified_stack_frame(state)) { + if (func && has_modified_stack_frame(insn, state)) { WARN_FUNC("return with modified stack frame", insn->sec, insn->offset); return 1; } - if (state->bp_scratch) { + if (state->cfi.bp_scratch) { WARN_FUNC("BP used as a scratch register", insn->sec, insn->offset); return 1; @@ -2025,29 +2245,46 @@ return 0; } /* + * Alternatives should not contain any ORC entries, which in turn means they + * should not contain any CFI ops, which implies all instructions should have + * the same CFI state. + * + * It is possible to construct alternatives that have unreachable holes that go + * unreported (because they're NOPs); such holes would result in CFI_UNDEFINED + * states, which then result in ORC entries, which we just said we didn't want. + * + * Avoid them by copying the CFI entry of the first instruction into the whole + * alternative. + */ +static void fill_alternative_cfi(struct objtool_file *file, struct instruction *insn) +{ + struct instruction *first_insn = insn; + int alt_group = insn->alt_group; + + sec_for_each_insn_continue(file, insn) { + if (insn->alt_group != alt_group) + break; + insn->cfi = first_insn->cfi; + } +} + +/* * Follow the branch starting at the given instruction, and recursively follow * any other branches (jumps). Meanwhile, track the frame pointer state at * each instruction and validate all the rules described in * tools/objtool/Documentation/stack-validation.txt.
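 *
 * Returns 0 when the branch validates cleanly; a non-zero return is
 * accumulated into the caller's warning count (see validate_symbol()
 * and validate_unwind_hints()).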
*/ static int validate_branch(struct objtool_file *file, struct symbol *func, - struct instruction *first, struct insn_state state) + struct instruction *insn, struct insn_state state) { struct alternative *alt; - struct instruction *insn, *next_insn; + struct instruction *next_insn; struct section *sec; u8 visited; int ret; - insn = first; sec = insn->sec; - if (insn->alt_group && list_empty(&insn->alts)) { - WARN_FUNC("don't know how to handle branch to middle of alternative instruction group", - sec, insn->offset); - return 1; - } - while (1) { next_insn = next_insn_same_sec(file, insn); @@ -2065,59 +2302,24 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, visited = 1 << state.uaccess; if (insn->visited) { - if (!insn->hint && !insn_state_match(insn, &state)) + if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) return 1; if (insn->visited & visited) return 0; } - if (insn->hint) { - if (insn->restore) { - struct instruction *save_insn, *i; - - i = insn; - save_insn = NULL; - sym_for_each_insn_continue_reverse(file, func, i) { - if (i->save) { - save_insn = i; - break; - } - } - - if (!save_insn) { - WARN_FUNC("no corresponding CFI save for CFI restore", - sec, insn->offset); - return 1; - } - - if (!save_insn->visited) { - /* - * Oops, no state to copy yet. - * Hopefully we can reach this - * instruction from another branch - * after the save insn has been - * visited. - */ - if (insn == first) - return 0; - - WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", - sec, insn->offset); - return 1; - } - - insn->state = save_insn->state; - } - - state = insn->state; + if (state.noinstr) + state.instr += insn->instr; - } else - insn->state = state; + if (insn->hint) + state.cfi = insn->cfi; + else + insn->cfi = state.cfi; insn->visited |= visited; - if (!insn->ignore_alts) { + if (!insn->ignore_alts && !list_empty(&insn->alts)) { bool skip_orig = false; list_for_each_entry(alt, &insn->alts, list) { @@ -2132,10 +2334,16 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, } } + if (insn->alt_group) + fill_alternative_cfi(file, insn); + if (skip_orig) return 0; } + if (handle_insn_ops(insn, &state)) + return 1; + switch (insn->type) { case INSN_RETURN: @@ -2202,32 +2410,6 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, } return 0; - case INSN_STACK: - if (update_insn_state(insn, &state)) - return 1; - - if (insn->stack_op.dest.type == OP_DEST_PUSHF) { - if (!state.uaccess_stack) { - state.uaccess_stack = 1; - } else if (state.uaccess_stack >> 31) { - WARN_FUNC("PUSHF stack exhausted", sec, insn->offset); - return 1; - } - state.uaccess_stack <<= 1; - state.uaccess_stack |= state.uaccess; - } - - if (insn->stack_op.src.type == OP_SRC_POPF) { - if (state.uaccess_stack) { - state.uaccess = state.uaccess_stack & 1; - state.uaccess_stack >>= 1; - if (state.uaccess_stack == 1) - state.uaccess_stack = 0; - } - } - - break; - case INSN_STAC: if (state.uaccess) { WARN_FUNC("recursive UACCESS enable", sec, insn->offset); @@ -2273,7 +2455,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; if (!next_insn) { - if (state.cfa.base == CFI_UNDEFINED) + if (state.cfi.cfa.base == CFI_UNDEFINED) return 0; WARN("%s: unexpected end of section", sec->name); return 1; @@ -2285,24 +2467,34 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; } -static int validate_unwind_hints(struct objtool_file *file) +static int 
validate_unwind_hints(struct objtool_file *file, struct section *sec) { struct instruction *insn; - int ret, warnings = 0; struct insn_state state; + int ret, warnings = 0; if (!file->hints) return 0; - clear_insn_state(&state); + init_insn_state(&state, sec); - for_each_insn(file, insn) { + if (sec) { + insn = find_insn(file, sec, 0); + if (!insn) + return 0; + } else { + insn = list_first_entry(&file->insn_list, typeof(*insn), list); + } + + while (&insn->list != &file->insn_list && (!sec || insn->sec == sec)) { if (insn->hint && !insn->visited) { ret = validate_branch(file, insn->func, insn, state); if (ret && backtrace) BT_FUNC("<=== (hint)", insn); warnings += ret; } + + insn = list_next_entry(insn, list); } return warnings; @@ -2417,43 +2609,69 @@ static bool ignore_unreachable_insn(struct instruction *insn) return false; } -static int validate_section(struct objtool_file *file, struct section *sec) +static int validate_symbol(struct objtool_file *file, struct section *sec, + struct symbol *sym, struct insn_state *state) { - struct symbol *func; struct instruction *insn; - struct insn_state state; - int ret, warnings = 0; + int ret; + + if (!sym->len) { + WARN("%s() is missing an ELF size annotation", sym->name); + return 1; + } + + if (sym->pfunc != sym || sym->alias != sym) + return 0; - clear_insn_state(&state); + insn = find_insn(file, sec, sym->offset); + if (!insn || insn->ignore || insn->visited) + return 0; + + state->uaccess = sym->uaccess_safe; + + ret = validate_branch(file, insn->func, insn, *state); + if (ret && backtrace) + BT_FUNC("<=== (sym)", insn); + return ret; +} - state.cfa = initial_func_cfi.cfa; - memcpy(&state.regs, &initial_func_cfi.regs, - CFI_NUM_REGS * sizeof(struct cfi_reg)); - state.stack_size = initial_func_cfi.cfa.offset; +static int validate_section(struct objtool_file *file, struct section *sec) +{ + struct insn_state state; + struct symbol *func; + int warnings = 0; list_for_each_entry(func, &sec->symbol_list, list) { if (func->type != STT_FUNC) continue; - if (!func->len) { - WARN("%s() is missing an ELF size annotation", - func->name); - warnings++; - } + init_insn_state(&state, sec); + state.cfi.cfa = initial_func_cfi.cfa; + memcpy(&state.cfi.regs, &initial_func_cfi.regs, + CFI_NUM_REGS * sizeof(struct cfi_reg)); + state.cfi.stack_size = initial_func_cfi.cfa.offset; - if (func->pfunc != func || func->alias != func) - continue; + warnings += validate_symbol(file, sec, func, &state); + } - insn = find_insn(file, sec, func->offset); - if (!insn || insn->ignore || insn->visited) - continue; + return warnings; +} - state.uaccess = func->uaccess_safe; +static int validate_vmlinux_functions(struct objtool_file *file) +{ + struct section *sec; + int warnings = 0; - ret = validate_branch(file, func, insn, state); - if (ret && backtrace) - BT_FUNC("<=== (func)", insn); - warnings += ret; + sec = find_section_by_name(file->elf, ".noinstr.text"); + if (sec) { + warnings += validate_section(file, sec); + warnings += validate_unwind_hints(file, sec); + } + + sec = find_section_by_name(file->elf, ".entry.text"); + if (sec) { + warnings += validate_section(file, sec); + warnings += validate_unwind_hints(file, sec); } return warnings; @@ -2464,8 +2682,12 @@ static int validate_functions(struct objtool_file *file) struct section *sec; int warnings = 0; - for_each_sec(file, sec) + for_each_sec(file, sec) { + if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + continue; + warnings += validate_section(file, sec); + } return warnings; } @@ -2496,7 +2718,7 @@ int check(const 
char *_objname, bool orc) objname = _objname; - file.elf = elf_read(objname, orc ? O_RDWR : O_RDONLY); + file.elf = elf_open_read(objname, orc ? O_RDWR : O_RDONLY); if (!file.elf) return 1; @@ -2516,6 +2738,15 @@ int check(const char *_objname, bool orc) if (list_empty(&file.insn_list)) goto out; + if (vmlinux && !validate_dup) { + ret = validate_vmlinux_functions(&file); + if (ret < 0) + goto out; + + warnings += ret; + goto out; + } + if (retpoline) { ret = validate_retpoline(&file); if (ret < 0) @@ -2528,7 +2759,7 @@ int check(const char *_objname, bool orc) goto out; warnings += ret; - ret = validate_unwind_hints(&file); + ret = validate_unwind_hints(&file, NULL); if (ret < 0) goto out; warnings += ret; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index f0ce8ffe7135..906b5210f7ca 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -7,22 +7,16 @@ #define _CHECK_H #include <stdbool.h> -#include "elf.h" #include "cfi.h" #include "arch.h" -#include "orc.h" -#include <linux/hashtable.h> struct insn_state { - struct cfi_reg cfa; - struct cfi_reg regs[CFI_NUM_REGS]; - int stack_size; - unsigned char type; - bool bp_scratch; - bool drap, end, uaccess, df; + struct cfi_state cfi; unsigned int uaccess_stack; - int drap_reg, drap_offset; - struct cfi_reg vals[CFI_NUM_REGS]; + bool uaccess; + bool df; + bool noinstr; + s8 instr; }; struct instruction { @@ -33,29 +27,24 @@ struct instruction { unsigned int len; enum insn_type type; unsigned long immediate; - bool alt_group, dead_end, ignore, hint, save, restore, ignore_alts; + bool dead_end, ignore, ignore_alts; + bool hint; bool retpoline_safe; + s8 instr; u8 visited; + u8 ret_offset; + int alt_group; struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src; struct rela *jump_table; struct list_head alts; struct symbol *func; - struct stack_op stack_op; - struct insn_state state; + struct list_head stack_ops; + struct cfi_state cfi; struct orc_entry orc; }; -struct objtool_file { - struct elf *elf; - struct list_head insn_list; - DECLARE_HASHTABLE(insn_hash, 20); - bool ignore_unreachables, c_file, hints, rodata; -}; - -int check(const char *objname, bool orc); - struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset); diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index c4857fa3f1d1..84225679f96d 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -27,6 +27,22 @@ static inline u32 str_hash(const char *str) return jhash(str, strlen(str), 0); } +static inline int elf_hash_bits(void) +{ + return vmlinux ? 
ELF_HASH_BITS : 16; +} + +#define elf_hash_add(hashtable, node, key) \ + hlist_add_head(node, &hashtable[hash_min(key, elf_hash_bits())]) + +static void elf_hash_init(struct hlist_head *table) +{ + __hash_init(table, 1U << elf_hash_bits()); +} + +#define elf_hash_for_each_possible(name, obj, member, key) \ + hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member) + static void rb_add(struct rb_root *tree, struct rb_node *node, int (*cmp)(struct rb_node *, const struct rb_node *)) { @@ -45,7 +61,7 @@ static void rb_add(struct rb_root *tree, struct rb_node *node, rb_insert_color(node, tree); } -static struct rb_node *rb_find_first(struct rb_root *tree, const void *key, +static struct rb_node *rb_find_first(const struct rb_root *tree, const void *key, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; @@ -111,11 +127,11 @@ static int symbol_by_offset(const void *key, const struct rb_node *node) return 0; } -struct section *find_section_by_name(struct elf *elf, const char *name) +struct section *find_section_by_name(const struct elf *elf, const char *name) { struct section *sec; - hash_for_each_possible(elf->section_name_hash, sec, name_hash, str_hash(name)) + elf_hash_for_each_possible(elf->section_name_hash, sec, name_hash, str_hash(name)) if (!strcmp(sec->name, name)) return sec; @@ -127,7 +143,7 @@ static struct section *find_section_by_index(struct elf *elf, { struct section *sec; - hash_for_each_possible(elf->section_hash, sec, hash, idx) + elf_hash_for_each_possible(elf->section_hash, sec, hash, idx) if (sec->idx == idx) return sec; @@ -138,7 +154,7 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx) { struct symbol *sym; - hash_for_each_possible(elf->symbol_hash, sym, hash, idx) + elf_hash_for_each_possible(elf->symbol_hash, sym, hash, idx) if (sym->idx == idx) return sym; @@ -173,7 +189,7 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) return NULL; } -struct symbol *find_symbol_containing(struct section *sec, unsigned long offset) +struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset) { struct rb_node *node; @@ -201,18 +217,18 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) return NULL; } -struct symbol *find_symbol_by_name(struct elf *elf, const char *name) +struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) { struct symbol *sym; - hash_for_each_possible(elf->symbol_name_hash, sym, name_hash, str_hash(name)) + elf_hash_for_each_possible(elf->symbol_name_hash, sym, name_hash, str_hash(name)) if (!strcmp(sym->name, name)) return sym; return NULL; } -struct rela *find_rela_by_dest_range(struct elf *elf, struct section *sec, +struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len) { struct rela *rela, *r = NULL; @@ -224,7 +240,7 @@ struct rela *find_rela_by_dest_range(struct elf *elf, struct section *sec, sec = sec->rela; for_offset_range(o, offset, offset + len) { - hash_for_each_possible(elf->rela_hash, rela, hash, + elf_hash_for_each_possible(elf->rela_hash, rela, hash, sec_offset_hash(sec, o)) { if (rela->sec != sec) continue; @@ -241,7 +257,7 @@ struct rela *find_rela_by_dest_range(struct elf *elf, struct section *sec, return NULL; } -struct rela *find_rela_by_dest(struct elf *elf, struct section *sec, unsigned long offset) +struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned 
long offset) { return find_rela_by_dest_range(elf, sec, offset, 1); } @@ -309,8 +325,8 @@ static int read_sections(struct elf *elf) sec->len = sec->sh.sh_size; list_add_tail(&sec->list, &elf->sections); - hash_add(elf->section_hash, &sec->hash, sec->idx); - hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); + elf_hash_add(elf->section_hash, &sec->hash, sec->idx); + elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); } if (stats) @@ -327,12 +343,14 @@ static int read_sections(struct elf *elf) static int read_symbols(struct elf *elf) { - struct section *symtab, *sec; + struct section *symtab, *symtab_shndx, *sec; struct symbol *sym, *pfunc; struct list_head *entry; struct rb_node *pnode; int symbols_nr, i; char *coldstr; + Elf_Data *shndx_data = NULL; + Elf32_Word shndx; symtab = find_section_by_name(elf, ".symtab"); if (!symtab) { @@ -340,6 +358,10 @@ static int read_symbols(struct elf *elf) return -1; } + symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); + if (symtab_shndx) + shndx_data = symtab_shndx->data; + symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize; for (i = 0; i < symbols_nr; i++) { @@ -353,8 +375,9 @@ static int read_symbols(struct elf *elf) sym->idx = i; - if (!gelf_getsym(symtab->data, i, &sym->sym)) { - WARN_ELF("gelf_getsym"); + if (!gelf_getsymshndx(symtab->data, shndx_data, i, &sym->sym, + &shndx)) { + WARN_ELF("gelf_getsymshndx"); goto err; } @@ -368,10 +391,13 @@ static int read_symbols(struct elf *elf) sym->type = GELF_ST_TYPE(sym->sym.st_info); sym->bind = GELF_ST_BIND(sym->sym.st_info); - if (sym->sym.st_shndx > SHN_UNDEF && - sym->sym.st_shndx < SHN_LORESERVE) { - sym->sec = find_section_by_index(elf, - sym->sym.st_shndx); + if ((sym->sym.st_shndx > SHN_UNDEF && + sym->sym.st_shndx < SHN_LORESERVE) || + (shndx_data && sym->sym.st_shndx == SHN_XINDEX)) { + if (sym->sym.st_shndx != SHN_XINDEX) + shndx = sym->sym.st_shndx; + + sym->sec = find_section_by_index(elf, shndx); if (!sym->sec) { WARN("couldn't find section for symbol %s", sym->name); @@ -394,8 +420,8 @@ static int read_symbols(struct elf *elf) else entry = &sym->sec->symbol_list; list_add(&sym->list, entry); - hash_add(elf->symbol_hash, &sym->hash, sym->idx); - hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); + elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx); + elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); } if (stats) @@ -456,6 +482,14 @@ err: return -1; } +void elf_add_rela(struct elf *elf, struct rela *rela) +{ + struct section *sec = rela->sec; + + list_add_tail(&rela->list, &sec->rela_list); + elf_hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); +} + static int read_relas(struct elf *elf) { struct section *sec; @@ -503,8 +537,7 @@ static int read_relas(struct elf *elf) return -1; } - list_add_tail(&rela->list, &sec->rela_list); - hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); + elf_add_rela(elf, rela); nr_rela++; } max_rela = max(max_rela, nr_rela); @@ -519,7 +552,7 @@ static int read_relas(struct elf *elf) return 0; } -struct elf *elf_read(const char *name, int flags) +struct elf *elf_open_read(const char *name, int flags) { struct elf *elf; Elf_Cmd cmd; @@ -531,15 +564,16 @@ struct elf *elf_read(const char *name, int flags) perror("malloc"); return NULL; } - memset(elf, 0, sizeof(*elf)); + memset(elf, 0, offsetof(struct elf, sections)); - hash_init(elf->symbol_hash); - hash_init(elf->symbol_name_hash); - hash_init(elf->section_hash); - hash_init(elf->section_name_hash); - 
hash_init(elf->rela_hash); INIT_LIST_HEAD(&elf->sections); + elf_hash_init(elf->symbol_hash); + elf_hash_init(elf->symbol_name_hash); + elf_hash_init(elf->section_hash); + elf_hash_init(elf->section_name_hash); + elf_hash_init(elf->rela_hash); + elf->fd = open(name, flags); if (elf->fd == -1) { fprintf(stderr, "objtool: Can't open '%s': %s\n", @@ -676,8 +710,8 @@ struct section *elf_create_section(struct elf *elf, const char *name, shstrtab->changed = true; list_add_tail(&sec->list, &elf->sections); - hash_add(elf->section_hash, &sec->hash, sec->idx); - hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); + elf_hash_add(elf->section_hash, &sec->hash, sec->idx); + elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); return sec; } @@ -745,7 +779,7 @@ int elf_rebuild_rela_section(struct section *sec) return 0; } -int elf_write(struct elf *elf) +int elf_write(const struct elf *elf) { struct section *sec; Elf_Scn *s; diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index 12e01ac190ec..f4fe1d6ea392 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -39,7 +39,7 @@ struct section { char *name; int idx; unsigned int len; - bool changed, text, rodata; + bool changed, text, rodata, noinstr; }; struct symbol { @@ -70,17 +70,19 @@ struct rela { bool jump_table_start; }; +#define ELF_HASH_BITS 20 + struct elf { Elf *elf; GElf_Ehdr ehdr; int fd; char *name; struct list_head sections; - DECLARE_HASHTABLE(symbol_hash, 20); - DECLARE_HASHTABLE(symbol_name_hash, 20); - DECLARE_HASHTABLE(section_hash, 16); - DECLARE_HASHTABLE(section_name_hash, 16); - DECLARE_HASHTABLE(rela_hash, 20); + DECLARE_HASHTABLE(symbol_hash, ELF_HASH_BITS); + DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS); + DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS); + DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS); + DECLARE_HASHTABLE(rela_hash, ELF_HASH_BITS); }; #define OFFSET_STRIDE_BITS 4 @@ -112,22 +114,23 @@ static inline u32 rela_hash(struct rela *rela) return sec_offset_hash(rela->sec, rela->offset); } -struct elf *elf_read(const char *name, int flags); -struct section *find_section_by_name(struct elf *elf, const char *name); +struct elf *elf_open_read(const char *name, int flags); +struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr); +struct section *elf_create_rela_section(struct elf *elf, struct section *base); +void elf_add_rela(struct elf *elf, struct rela *rela); +int elf_write(const struct elf *elf); +void elf_close(struct elf *elf); + +struct section *find_section_by_name(const struct elf *elf, const char *name); struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); -struct symbol *find_symbol_by_name(struct elf *elf, const char *name); -struct symbol *find_symbol_containing(struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest(struct elf *elf, struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest_range(struct elf *elf, struct section *sec, +struct symbol *find_symbol_by_name(const struct elf *elf, const char *name); +struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset); +struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); +struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section 
*sec, unsigned long offset); -struct section *elf_create_section(struct elf *elf, const char *name, size_t - entsize, int nr); -struct section *elf_create_rela_section(struct elf *elf, struct section *base); int elf_rebuild_rela_section(struct section *sec); -int elf_write(struct elf *elf); -void elf_close(struct elf *elf); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 0b3528f05053..58fdda510653 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -58,7 +58,9 @@ static void cmd_usage(void) printf("\n"); - exit(129); + if (!help) + exit(129); + exit(0); } static void handle_options(int *argc, const char ***argv) diff --git a/tools/objtool/objtool.h b/tools/objtool/objtool.h new file mode 100644 index 000000000000..528028a66816 --- /dev/null +++ b/tools/objtool/objtool.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Matt Helsley <mhelsley@vmware.com> + */ + +#ifndef _OBJTOOL_H +#define _OBJTOOL_H + +#include <stdbool.h> +#include <linux/list.h> +#include <linux/hashtable.h> + +#include "elf.h" + +struct objtool_file { + struct elf *elf; + struct list_head insn_list; + DECLARE_HASHTABLE(insn_hash, 20); + bool ignore_unreachables, c_file, hints, rodata; +}; + +int check(const char *objname, bool orc); +int orc_dump(const char *objname); +int create_orc(struct objtool_file *file); +int create_orc_sections(struct objtool_file *file); + +#endif /* _OBJTOOL_H */ diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h deleted file mode 100644 index ee2832221e62..000000000000 --- a/tools/objtool/orc.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> - */ - -#ifndef _ORC_H -#define _ORC_H - -#include <asm/orc_types.h> - -struct objtool_file; - -int create_orc(struct objtool_file *file); -int create_orc_sections(struct objtool_file *file); - -int orc_dump(const char *objname); - -#endif /* _ORC_H */ diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index ba4cbb1cdd63..fca46e006fc2 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -4,7 +4,8 @@ */ #include <unistd.h> -#include "orc.h" +#include <asm/orc_types.h> +#include "objtool.h" #include "warn.h" static const char *reg_name(unsigned int reg) diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 4c0dabd28000..c9549988121a 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -6,7 +6,6 @@ #include <stdlib.h> #include <string.h> -#include "orc.h" #include "check.h" #include "warn.h" @@ -16,10 +15,10 @@ int create_orc(struct objtool_file *file) for_each_insn(file, insn) { struct orc_entry *orc = &insn->orc; - struct cfi_reg *cfa = &insn->state.cfa; - struct cfi_reg *bp = &insn->state.regs[CFI_BP]; + struct cfi_reg *cfa = &insn->cfi.cfa; + struct cfi_reg *bp = &insn->cfi.regs[CFI_BP]; - orc->end = insn->state.end; + orc->end = insn->cfi.end; if (cfa->base == CFI_UNDEFINED) { orc->sp_reg = ORC_REG_UNDEFINED; @@ -75,7 +74,7 @@ int create_orc(struct objtool_file *file) orc->sp_offset = cfa->offset; orc->bp_offset = bp->offset; - orc->type = insn->state.type; + orc->type = insn->cfi.type; } return 0; @@ -130,8 +129,7 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti rela->offset = idx * sizeof(int); rela->sec = ip_relasec; - list_add_tail(&rela->list, &ip_relasec->rela_list); - 
hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); + elf_add_rela(elf, rela); return 0; } diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c new file mode 100644 index 000000000000..942ea5e8ac36 --- /dev/null +++ b/tools/objtool/weak.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Matt Helsley <mhelsley@vmware.com> + * Weak definitions necessary to compile objtool without + * some subcommands (e.g. check, orc). + */ + +#include <stdbool.h> +#include <errno.h> +#include "objtool.h" + +#define __weak __attribute__((weak)) + +#define UNSUPPORTED(name) \ +({ \ + fprintf(stderr, "error: objtool: " name " not implemented\n"); \ + return ENOSYS; \ +}) + +const char __weak *objname; + +int __weak check(const char *_objname, bool orc) +{ + UNSUPPORTED("check subcommand"); +} + +int __weak orc_dump(const char *_objname) +{ + UNSUPPORTED("orc"); +} + +int __weak create_orc(struct objtool_file *file) +{ + UNSUPPORTED("orc"); +} + +int __weak create_orc_sections(struct objtool_file *file) +{ + UNSUPPORTED("orc"); +} diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 31824d5269cc..6e54979c2124 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -48,7 +48,7 @@ man5dir=$(mandir)/man5 man7dir=$(mandir)/man7 ASCIIDOC=asciidoc -ASCIIDOC_EXTRA = --unsafe -f asciidoc.conf +ASCIIDOC_EXTRA += --unsafe -f asciidoc.conf ASCIIDOC_HTML = xhtml11 MANPAGE_XSL = manpage-normal.xsl XMLTO_EXTRA = @@ -59,7 +59,7 @@ HTML_REF = origin/html ifdef USE_ASCIIDOCTOR ASCIIDOC = asciidoctor -ASCIIDOC_EXTRA = -a compat-mode +ASCIIDOC_EXTRA += -a compat-mode ASCIIDOC_EXTRA += -I. -rasciidoctor-extensions ASCIIDOC_EXTRA += -a mansource="perf" -a manmanual="perf Manual" ASCIIDOC_HTML = xhtml5 diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index 82ff7dad40c2..271484754fee 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -10,7 +10,9 @@ e synthesize error events d create a debug log g synthesize a call chain (use with i or x) + G synthesize a call chain on existing event records l synthesize last branch entries (use with i or x) + L synthesize last branch entries on existing event records s skip initial number of events The default is all events i.e. the same as --itrace=ibxwpe, @@ -31,6 +33,10 @@ Also the number of last branch entries (default 64, max. 1024) for instructions or transactions events can be specified. + Similar to options g and l, size may also be specified for options G and L. + On x86, note that G and L work poorly when data has been recorded with + large PEBS. Refer to the linkperf:perf-intel-pt[1] man page for details. + It is also possible to skip events generated (instructions, branches, transactions, ptwrite, power) at the beginning. This is useful to ignore initialization code. diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 0921a3c67381..bad16512c48d 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -61,6 +61,9 @@ SUBSYSTEM 'epoll':: Eventpoll (epoll) stressing benchmarks. +'internals':: + Benchmark internal perf functionality. + 'all':: All benchmark subsystems. @@ -214,6 +217,11 @@ Suite for evaluating concurrent epoll_wait calls. *ctl*:: Suite for evaluating multiple epoll_ctl calls. +SUITES FOR 'internals' +~~~~~~~~~~~~~~~~~~~~~~ +*synthesize*:: +Suite for evaluating perf's event synthesis performance.
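The tools/objtool/weak.c file added above depends on weak-versus-strong symbol resolution at link time, which is easy to miss when reading the stubs in isolation. A minimal standalone sketch of the same pattern follows; the main() driver and the object name are illustrative, not part of the patch:

	/* Standalone sketch of the weak-symbol fallback pattern; not from the patch. */
	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define __weak __attribute__((weak))

	/* Weak fallback: chosen by the linker only when no strong check() is linked in. */
	int __weak check(const char *objname, bool orc)
	{
		(void)orc;
		fprintf(stderr, "error: objtool: check subcommand not implemented (%s)\n", objname);
		return ENOSYS;
	}

	int main(void)
	{
		/* With only this file linked, the weak stub above is selected. */
		return check("vmlinux.o", false) ? 1 : 0;
	}

When a translation unit providing a strong check() is linked in, it silently overrides the weak stub, which is how objtool can build with or without the check/orc subcommands.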
+ SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index e6150f21267d..2133eb320cb0 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -111,6 +111,17 @@ REPORT OPTIONS --display:: Switch to HITM type (rmt, lcl) to display and sort on. Total HITMs as default. +--stitch-lbr:: + Show callgraph with stitched LBRs, which may produce a more complete + callgraph. The perf.data file must have been obtained using + perf c2c record --call-graph lbr. + Disabled by default. In common cases with call stack overflows, + it can recreate better call stacks than the default lbr call stack + output. But this approach is not foolproof. There can be cases + where it creates incorrect call stacks from incorrect matches. + The known limitations include exception handling: for example, + setjmp/longjmp will have calls/returns that do not match. + C2C RECORD ---------- The perf c2c record command setup options related to HITM cacheline analysis diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 456fdcbf26ac..eb8b7d42591a 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -69,22 +69,22 @@ And profiled with 'perf report' e.g. To also trace kernel space presents a problem, namely kernel self-modifying code. A fairly good kernel image is available in /proc/kcore but to get an accurate image a copy of /proc/kcore needs to be made under the same conditions -as the data capture. A script perf-with-kcore can do that, but beware that the -script makes use of 'sudo' to copy /proc/kcore. If you have perf installed -locally from the source tree you can do: +as the data capture. 'perf record' can make a copy of /proc/kcore if the option +--kcore is used, but access to /proc/kcore is restricted, e.g. - ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls + sudo perf record -o pt_ls --kcore -e intel_pt// -- ls -which will create a directory named 'pt_ls' and put the perf.data file and -copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use -'perf report' becomes: +which will create a directory named 'pt_ls' and put the perf.data file (named +simply 'data') and copies of /proc/kcore, /proc/kallsyms and /proc/modules into +it. The other tools understand the directory format, so to use 'perf report' +becomes: - ~/libexec/perf-core/perf-with-kcore report pt_ls + sudo perf report -i pt_ls Because samples are synthesized after-the-fact, the sampling period can be selected for reporting. e.g. sample every microsecond - ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge + sudo perf report -i pt_ls --itrace=i1usge See the sections below for more information about the --itrace option. @@ -821,7 +821,9 @@ The letters are: e synthesize tracing error events d create a debug log g synthesize a call chain (use with i or x) + G synthesize a call chain on existing event records l synthesize last branch entries (use with i or x) + L synthesize last branch entries on existing event records s skip initial number of events "Instructions" events look like they were recorded by "perf record -e @@ -912,6 +914,39 @@ transactions events can be specified. e.g. Note that last branch entries are cleared for each sample, so there is no overlap from one sample to the next.
+The G and L options are designed in particular for sample mode, and work much +like g and l but add call chain and branch stack to the other selected events +instead of synthesized events. For example, to record branch-misses events for +'ls' and then add a call chain derived from the Intel PT trace: + + perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' -- ls + perf report --itrace=Ge + +Although in fact G is a default for perf report, so that is the same as just: + + perf report + +One caveat with the G and L options is that they work poorly with "Large PEBS". +Large PEBS means PEBS records will be accumulated by hardware and then written +into the event buffer in one go. That reduces interrupts, but can give very +late timestamps. Because the Intel PT trace is synchronized by timestamps, +the PEBS events do not match the trace. Currently, Large PEBS is used only in +certain circumstances: + - hardware supports it + - PEBS is used + - event period is specified, instead of frequency + - the sample type is limited to the following flags: + PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_PERIOD (and sometimes) | PERF_SAMPLE_TIME +Because Intel PT sample mode uses a different sample type from the list above, +Large PEBS is not used with Intel PT sample mode. To avoid Large PEBS in other +cases, avoid specifying the event period, i.e. avoid the 'perf record' -c option, +--count option, or 'period' config term (a minimal attribute sketch appears below). + To disable trace decoding entirely, use the option --no-itrace. It is also possible to skip events generated (instructions, branches, transactions) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 6345db33c533..376a50b3452d 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -115,6 +115,11 @@ raw encoding of 0x1A8 can be used: perf stat -e r1a8 -a sleep 1 perf record -e r1a8 ... +It's also possible to use pmu syntax: + + perf record -e r1a8 -a sleep 1 + perf record -e cpu/r1a8/ ... + You should refer to the processor specific documentation for getting these details. Some of them are referenced in the SEE ALSO section below. @@ -258,6 +263,9 @@ Normally all events in an event group sample, but with :S only the first event (the leader) samples, and it only reads the values of the other events in the group. +However, in the case of AUX area events (e.g. Intel PT or CoreSight), the AUX +area event must be the leader, so it is the second event that samples, not the first. + OPTIONS ------- diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index b3f3b3f1c161..561ef55743e2 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -556,6 +556,19 @@ overhead. You can still switch them on with: --switch-output --no-no-buildid --no-no-buildid-cache +--switch-output-event:: +Events that will cause the switch of the perf.data file, auto-selecting +--switch-output=signal; the results are similar, as internally the sideband +thread will also send a SIGUSR2 to the main one. + +Uses the same syntax as --event; it will just not be recorded, serving only to +switch the perf.data file as soon as the --switch-output event is processed by +a separate sideband thread.
+ +This sideband thread is also used for other purposes, like processing the +PERF_RECORD_BPF_EVENT records as they happen, asking the kernel for extra BPF +information, etc. + --switch-max-files=N:: When rotating perf.data with --switch-output, only keep N files. @@ -596,6 +609,10 @@ Make a copy of /proc/kcore and place it into a directory with the perf data file Limit the sample data max size, <size> is expected to be a number with appended unit character - B/K/M/G +--num-thread-synthesize:: + The number of threads to run when synthesizing events for existing processes. + By default, the number of threads equals 1. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f569b9ea4002..d068103690cc 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -488,6 +488,17 @@ include::itrace.txt[] This option extends the perf report to show reference callgraphs, which collected by reference event, in no callgraph event. +--stitch-lbr:: + Show callgraph with stitched LBRs, which may produce a more complete + callgraph. The perf.data file must have been obtained using + perf record --call-graph lbr. + Disabled by default. In common cases with call stack overflows, + it can recreate better call stacks than the default lbr call stack + output. But this approach is not foolproof. There can be cases + where it creates incorrect call stacks from incorrect matches. + The known limitations include exception handling: for example, + setjmp/longjmp will have calls/returns that do not match. + --socket-filter:: Only report the samples on the processor socket that match with this filter diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 963487e82edc..372dfd110e6d 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -440,6 +440,17 @@ include::itrace.txt[] --show-on-off-events:: Show the --switch-on/off events too. +--stitch-lbr:: + Show callgraph with stitched LBRs, which may produce a more complete + callgraph. The perf.data file must have been obtained using + perf record --call-graph lbr. + Disabled by default. In common cases with call stack overflows, + it can recreate better call stacks than the default lbr call stack + output. But this approach is not foolproof. There can be cases + where it creates incorrect call stacks from incorrect matches. + The known limitations include exception handling: for example, + setjmp/longjmp will have calls/returns that do not match. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4d56586b2fb9..3fb5028aef08 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -176,6 +176,8 @@ Print count deltas every N milliseconds (minimum: 1ms) The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals. Use with caution. example: 'perf stat -I 1000 -e cycles -a sleep 5' +If the metric exists, it is calculated from the counts generated in this interval and the metric is printed after '#'. + --interval-count times:: Print count deltas for fixed number of times. This option should be used together with "-I" option.
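Tying together the Large PEBS caveat from the perf-intel-pt.txt hunk above: a minimal sketch, under the documented conditions only, of a perf_event_open() setup that avoids Large PEBS by requesting a sampling frequency rather than a fixed period. The event choice and frequency are arbitrary illustrations, not values taken from the patch:

	#include <linux/perf_event.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_BRANCH_MISSES;	/* arbitrary sampling event */
		attr.disabled = 1;
		attr.exclude_kernel = 1;
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME;
		/*
		 * Frequency mode: no fixed period is specified, so the
		 * "event period is specified" condition for Large PEBS
		 * is not met and the kernel picks the period itself.
		 */
		attr.freq = 1;
		attr.sample_freq = 4000;

		fd = syscall(SYS_perf_event_open, &attr, 0 /* this process */,
			     -1 /* any CPU */, -1 /* no group */, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}
		close(fd);
		return 0;
	}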
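The --stitch-lbr option added above for perf c2c, report and script (and below for perf top) pairs with an LBR call-graph recording. A typical two-step invocation, consistent with those option descriptions (the workload name is a placeholder):

	perf record --call-graph lbr -- ./workload
	perf report --stitch-lbr

The same shape applies to perf c2c, using 'perf c2c record --call-graph lbr' followed by 'perf c2c report --stitch-lbr'.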
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 487737a725e9..20227dabc208 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -319,6 +319,15 @@ Default is to monitor all CPUS. go straight to the histogram browser, just like 'perf top' with no events explicitly specified does. +--stitch-lbr:: + Show callgraph with stitched LBRs, which may produce a more complete + callgraph. The option must be used with --call-graph lbr recording. + Disabled by default. In common cases with call stack overflows, + it can recreate better call stacks than the default lbr call stack + output. But this approach is not foolproof. There can be cases + where it creates incorrect call stacks from incorrect matches. + The known limitations include exception handling: for example, + setjmp/longjmp will have calls/returns that do not match. INTERACTIVE PROMPTING KEYS -------------------------- diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index b0152e1095c5..b6472e463284 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -373,6 +373,22 @@ struct { Indicates that trace contains records of PERF_RECORD_COMPRESSED type that have perf_events records in compressed form. + HEADER_CPU_PMU_CAPS = 28, + + A list of cpu PMU capabilities. The format of the data is as below. + +struct { + u32 nr_cpu_pmu_caps; + { + char name[]; + char value[]; + } [nr_cpu_pmu_caps] +}; + + +Example: + cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake + other bits are reserved and should be ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index d15a311408f1..94a495594e99 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -188,7 +188,7 @@ AWK = awk # non-config cases config := 1 -NON_CONFIG_TARGETS := clean python-clean TAGS tags cscope help install-doc install-man install-html install-info install-pdf doc man html info pdf +NON_CONFIG_TARGETS := clean python-clean TAGS tags cscope help ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),) @@ -832,7 +832,7 @@ INSTALL_DOC_TARGETS += quick-install-doc quick-install-man quick-install-html # 'make doc' should call 'make -C Documentation all' $(DOC_TARGETS): - $(Q)$(MAKE) -C $(DOC_DIR) O=$(OUTPUT) $(@:doc=all) + $(Q)$(MAKE) -C $(DOC_DIR) O=$(OUTPUT) $(@:doc=all) ASCIIDOC_EXTRA=$(ASCIIDOC_EXTRA) TAG_FOLDERS= .
../lib ../include TAG_FILES= ../../include/uapi/linux/perf_event.h @@ -959,7 +959,7 @@ install-python_ext: # 'make install-doc' should call 'make -C Documentation install' $(INSTALL_DOC_TARGETS): - $(Q)$(MAKE) -C $(DOC_DIR) O=$(OUTPUT) $(@:-doc=) + $(Q)$(MAKE) -C $(DOC_DIR) O=$(OUTPUT) $(@:-doc=) ASCIIDOC_EXTRA=$(ASCIIDOC_EXTRA) ### Cleaning rules diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 941f814820b8..97aa02c4491d 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -23,6 +23,7 @@ #include "../../util/event.h" #include "../../util/evlist.h" #include "../../util/evsel.h" +#include "../../util/perf_api_probe.h" #include "../../util/evsel_config.h" #include "../../util/pmu.h" #include "../../util/cs-etm.h" @@ -232,7 +233,7 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu, ret = perf_pmu__scan_file(pmu, path, "%x", &hash); if (ret != 1) { pr_err("failed to set sink \"%s\" on event %s with %d (%s)\n", - sink, perf_evsel__name(evsel), errno, + sink, evsel__name(evsel), errno, str_error_r(errno, msg, sizeof(msg))); return ret; } @@ -401,7 +402,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * when a context switch happened. */ if (!perf_cpu_map__empty(cpus)) { - perf_evsel__set_sample_bit(cs_etm_evsel, CPU); + evsel__set_sample_bit(cs_etm_evsel, CPU); err = cs_etm_set_option(itr, cs_etm_evsel, ETM_OPT_CTXTID | ETM_OPT_TS); @@ -425,7 +426,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, /* In per-cpu case, always need the time of mmap events etc */ if (!perf_cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, TIME); } out: diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 27653be24447..e3593063b3d1 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -120,9 +120,9 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, */ perf_evlist__to_front(evlist, arm_spe_evsel); - perf_evsel__set_sample_bit(arm_spe_evsel, CPU); - perf_evsel__set_sample_bit(arm_spe_evsel, TIME); - perf_evsel__set_sample_bit(arm_spe_evsel, TID); + evsel__set_sample_bit(arm_spe_evsel, CPU); + evsel__set_sample_bit(arm_spe_evsel, TIME); + evsel__set_sample_bit(arm_spe_evsel, TID); /* Add dummy event to keep tracking */ err = parse_events(evlist, "dummy:u", NULL); @@ -134,9 +134,9 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, tracking_evsel->core.attr.freq = 0; tracking_evsel->core.attr.sample_period = 1; - perf_evsel__set_sample_bit(tracking_evsel, TIME); - perf_evsel__set_sample_bit(tracking_evsel, CPU); - perf_evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); + evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, CPU); + evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); return 0; } diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c index 3b4cdfc5efd6..d4870074f14c 100644 --- a/tools/perf/arch/powerpc/util/header.c +++ b/tools/perf/arch/powerpc/util/header.c @@ -7,6 +7,8 @@ #include <string.h> #include <linux/stringify.h> #include "header.h" +#include "metricgroup.h" +#include <api/fs/fs.h> #define mfspr(rn) ({unsigned long rval; \ asm volatile("mfspr %0," __stringify(rn) \ @@ -44,3 +46,9 @@ get_cpuid_str(struct perf_pmu *pmu __maybe_unused) return bufp; } + +int arch_get_runtimeparam(void) +{ + int count; + return 
sysfs__read_int("/devices/hv_24x7/interface/sockets", &count) < 0 ? 1 : count; +} diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index 16807269317c..eed9e5a42935 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -39,7 +39,7 @@ static void hcall_event_get_key(struct evsel *evsel, struct event_key *key) { key->info = 0; - key->key = perf_evsel__intval(evsel, sample, "req"); + key->key = evsel__intval(evsel, sample, "req"); } static const char *get_hcall_exit_reason(u64 exit_code) diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/arch/s390/util/kvm-stat.c index 0fd4e9f49ed0..34da89ced29a 100644 --- a/tools/perf/arch/s390/util/kvm-stat.c +++ b/tools/perf/arch/s390/util/kvm-stat.c @@ -30,7 +30,7 @@ static void event_icpt_insn_get_key(struct evsel *evsel, { unsigned long insn; - insn = perf_evsel__intval(evsel, sample, "instruction"); + insn = evsel__intval(evsel, sample, "instruction"); key->key = icpt_insn_decoder(insn); key->exit_reasons = sie_icpt_insn_codes; } @@ -39,7 +39,7 @@ static void event_sigp_get_key(struct evsel *evsel, struct perf_sample *sample, struct event_key *key) { - key->key = perf_evsel__intval(evsel, sample, "order_code"); + key->key = evsel__intval(evsel, sample, "order_code"); key->exit_reasons = sie_sigp_order_codes; } @@ -47,7 +47,7 @@ static void event_diag_get_key(struct evsel *evsel, struct perf_sample *sample, struct event_key *key) { - key->key = perf_evsel__intval(evsel, sample, "code"); + key->key = evsel__intval(evsel, sample, "code"); key->exit_reasons = sie_diagnose_codes; } @@ -55,7 +55,7 @@ static void event_icpt_prog_get_key(struct evsel *evsel, struct perf_sample *sample, struct event_key *key) { - key->key = perf_evsel__intval(evsel, sample, "code"); + key->key = evsel__intval(evsel, sample, "code"); key->exit_reasons = sie_icpt_prog_codes; } diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 909ead08a6f6..026d32ed078e 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -130,13 +130,11 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe goto next_event; if (strcmp(event->comm.comm, comm1) == 0) { - CHECK__(perf_evsel__parse_sample(evsel, event, - &sample)); + CHECK__(evsel__parse_sample(evsel, event, &sample)); comm1_time = sample.time; } if (strcmp(event->comm.comm, comm2) == 0) { - CHECK__(perf_evsel__parse_sample(evsel, event, - &sample)); + CHECK__(evsel__parse_sample(evsel, event, &sample)); comm2_time = sample.time; } next_event: diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 09f93800bffd..0dc09b5809c1 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -224,7 +224,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, * AUX event. 
*/ if (!perf_cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(intel_bts_evsel, CPU); + evsel__set_sample_bit(intel_bts_evsel, CPU); } /* Add dummy event to keep tracking */ diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 1643aed8c4c8..3f7c20cc7b79 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -25,6 +25,7 @@ #include "../../../util/pmu.h" #include "../../../util/debug.h" #include "../../../util/auxtrace.h" +#include "../../../util/perf_api_probe.h" #include "../../../util/record.h" #include "../../../util/target.h" #include "../../../util/tsc.h" @@ -420,8 +421,8 @@ static int intel_pt_track_switches(struct evlist *evlist) evsel = evlist__last(evlist); - perf_evsel__set_sample_bit(evsel, CPU); - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TIME); evsel->core.system_wide = true; evsel->no_aux_samples = true; @@ -801,10 +802,10 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, switch_evsel->no_aux_samples = true; switch_evsel->immediate = true; - perf_evsel__set_sample_bit(switch_evsel, TID); - perf_evsel__set_sample_bit(switch_evsel, TIME); - perf_evsel__set_sample_bit(switch_evsel, CPU); - perf_evsel__reset_sample_bit(switch_evsel, BRANCH_STACK); + evsel__set_sample_bit(switch_evsel, TID); + evsel__set_sample_bit(switch_evsel, TIME); + evsel__set_sample_bit(switch_evsel, CPU); + evsel__reset_sample_bit(switch_evsel, BRANCH_STACK); opts->record_switch_events = false; ptr->have_sched_switch = 3; @@ -838,7 +839,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * AUX event. */ if (!perf_cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(intel_pt_evsel, CPU); + evsel__set_sample_bit(intel_pt_evsel, CPU); } /* Add dummy event to keep tracking */ @@ -862,11 +863,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, /* In per-cpu case, always need the time of mmap events etc */ if (!perf_cpu_map__empty(cpus)) { - perf_evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, TIME); /* And the CPU for switch events */ - perf_evsel__set_sample_bit(tracking_evsel, CPU); + evsel__set_sample_bit(tracking_evsel, CPU); } - perf_evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); + evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); } /* diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c index c0775c39227f..072920475b65 100644 --- a/tools/perf/arch/x86/util/kvm-stat.c +++ b/tools/perf/arch/x86/util/kvm-stat.c @@ -31,8 +31,8 @@ const char *kvm_exit_trace = "kvm:kvm_exit"; static void mmio_event_get_key(struct evsel *evsel, struct perf_sample *sample, struct event_key *key) { - key->key = perf_evsel__intval(evsel, sample, "gpa"); - key->info = perf_evsel__intval(evsel, sample, "type"); + key->key = evsel__intval(evsel, sample, "gpa"); + key->info = evsel__intval(evsel, sample, "type"); } #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 @@ -48,7 +48,7 @@ static bool mmio_event_begin(struct evsel *evsel, /* MMIO write begin event in kernel. 
*/ if (!strcmp(evsel->name, "kvm:kvm_mmio") && - perf_evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_WRITE) { + evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_WRITE) { mmio_event_get_key(evsel, sample, key); return true; } @@ -65,7 +65,7 @@ static bool mmio_event_end(struct evsel *evsel, struct perf_sample *sample, /* MMIO read end event in kernel.*/ if (!strcmp(evsel->name, "kvm:kvm_mmio") && - perf_evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_READ) { + evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_READ) { mmio_event_get_key(evsel, sample, key); return true; } @@ -94,8 +94,8 @@ static void ioport_event_get_key(struct evsel *evsel, struct perf_sample *sample, struct event_key *key) { - key->key = perf_evsel__intval(evsel, sample, "port"); - key->info = perf_evsel__intval(evsel, sample, "rw"); + key->key = evsel__intval(evsel, sample, "port"); + key->info = evsel__intval(evsel, sample, "rw"); } static bool ioport_event_begin(struct evsel *evsel, diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index e4e321b6f883..768e408757a0 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -6,9 +6,10 @@ perf-y += futex-wake.o perf-y += futex-wake-parallel.o perf-y += futex-requeue.o perf-y += futex-lock-pi.o - perf-y += epoll-wait.o perf-y += epoll-ctl.o +perf-y += synthesize.o +perf-y += kallsyms-parse.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 4aa6de1aa67d..61cae4966cae 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -41,9 +41,10 @@ int bench_futex_wake_parallel(int argc, const char **argv); int bench_futex_requeue(int argc, const char **argv); /* pi futexes */ int bench_futex_lock_pi(int argc, const char **argv); - int bench_epoll_wait(int argc, const char **argv); int bench_epoll_ctl(int argc, const char **argv); +int bench_synthesize(int argc, const char **argv); +int bench_kallsyms_parse(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index f938c585d512..cf797362675b 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -519,7 +519,8 @@ int bench_epoll_wait(int argc, const char **argv) qsort(worker, nthreads, sizeof(struct worker), cmpworker); for (i = 0; i < nthreads; i++) { - unsigned long t = worker[i].ops / bench__runtime.tv_sec; + unsigned long t = bench__runtime.tv_sec > 0 ? + worker[i].ops / bench__runtime.tv_sec : 0; update_stats(&throughput_stats, t); diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 65eebe06c04d..915bf3da7ce2 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -205,7 +205,8 @@ int bench_futex_hash(int argc, const char **argv) pthread_mutex_destroy(&thread_lock); for (i = 0; i < nthreads; i++) { - unsigned long t = worker[i].ops / bench__runtime.tv_sec; + unsigned long t = bench__runtime.tv_sec > 0 ? 
+ worker[i].ops / bench__runtime.tv_sec : 0; update_stats(&throughput_stats, t); if (!silent) { if (nfutexes == 1) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 89fd8f325f38..bb25d8beb3b8 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -211,7 +211,8 @@ int bench_futex_lock_pi(int argc, const char **argv) pthread_mutex_destroy(&thread_lock); for (i = 0; i < nthreads; i++) { - unsigned long t = worker[i].ops / bench__runtime.tv_sec; + unsigned long t = bench__runtime.tv_sec > 0 ? + worker[i].ops / bench__runtime.tv_sec : 0; update_stats(&throughput_stats, t); if (!silent) diff --git a/tools/perf/bench/kallsyms-parse.c b/tools/perf/bench/kallsyms-parse.c new file mode 100644 index 000000000000..2b0d0f980ae9 --- /dev/null +++ b/tools/perf/bench/kallsyms-parse.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark of /proc/kallsyms parsing. + * + * Copyright 2020 Google LLC. + */ +#include <stdlib.h> +#include "bench.h" +#include "../util/stat.h" +#include <linux/time64.h> +#include <subcmd/parse-options.h> +#include <symbol/kallsyms.h> + +static unsigned int iterations = 100; + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals kallsyms-parse <options>", + NULL +}; + +static int bench_process_symbol(void *arg __maybe_unused, + const char *name __maybe_unused, + char type __maybe_unused, + u64 start __maybe_unused) +{ + return 0; +} + +static int do_kallsyms_parse(void) +{ + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev; + int err; + struct stats time_stats; + + init_stats(&time_stats); + + for (i = 0; i < iterations; i++) { + gettimeofday(&start, NULL); + err = kallsyms__parse("/proc/kallsyms", NULL, + bench_process_symbol); + if (err) + return err; + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + } + + time_average = avg_stats(&time_stats) / USEC_PER_MSEC; + time_stddev = stddev_stats(&time_stats) / USEC_PER_MSEC; + printf(" Average kallsyms__parse took: %.3f ms (+- %.3f ms)\n", + time_average, time_stddev); + return 0; +} + +int bench_kallsyms_parse(int argc, const char **argv) +{ + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + return do_kallsyms_parse(); +} diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c new file mode 100644 index 000000000000..8d624aea1c5e --- /dev/null +++ b/tools/perf/bench/synthesize.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark synthesis of perf events such as at the start of a 'perf + * record'. Synthesis is done on the current process and the 'dummy' event + * handlers are invoked that support dump_trace but otherwise do nothing. + * + * Copyright 2019 Google LLC. 
+ */ +#include <stdio.h> +#include "bench.h" +#include "../util/debug.h" +#include "../util/session.h" +#include "../util/stat.h" +#include "../util/synthetic-events.h" +#include "../util/target.h" +#include "../util/thread_map.h" +#include "../util/tool.h" +#include "../util/util.h" +#include <linux/atomic.h> +#include <linux/err.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int min_threads = 1; +static unsigned int max_threads = UINT_MAX; +static unsigned int single_iterations = 10000; +static unsigned int multi_iterations = 10; +static bool run_st; +static bool run_mt; + +static const struct option options[] = { + OPT_BOOLEAN('s', "st", &run_st, "Run single threaded benchmark"), + OPT_BOOLEAN('t', "mt", &run_mt, "Run multi-threaded benchmark"), + OPT_UINTEGER('m', "min-threads", &min_threads, + "Minimum number of threads in multithreaded bench"), + OPT_UINTEGER('M', "max-threads", &max_threads, + "Maximum number of threads in multithreaded bench"), + OPT_UINTEGER('i', "single-iterations", &single_iterations, + "Number of iterations used to compute single-threaded average"), + OPT_UINTEGER('I', "multi-iterations", &multi_iterations, + "Number of iterations used to compute multi-threaded average"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals synthesize <options>", + NULL +}; + +static atomic_t event_count; + +static int process_synthesized_event(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + atomic_inc(&event_count); + return 0; +} + +static int do_run_single_threaded(struct perf_session *session, + struct perf_thread_map *threads, + struct target *target, bool data_mmap) +{ + const unsigned int nr_threads_synthesize = 1; + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev, event_average, event_stddev; + int err; + struct stats time_stats, event_stats; + + init_stats(&time_stats); + init_stats(&event_stats); + + for (i = 0; i < single_iterations; i++) { + atomic_set(&event_count, 0); + gettimeofday(&start, NULL); + err = __machine__synthesize_threads(&session->machines.host, + NULL, + target, threads, + process_synthesized_event, + data_mmap, + nr_threads_synthesize); + if (err) + return err; + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&event_stats, atomic_read(&event_count)); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average %ssynthesis took: %.3f usec (+- %.3f usec)\n", + data_mmap ? "data " : "", time_average, time_stddev); + + event_average = avg_stats(&event_stats); + event_stddev = stddev_stats(&event_stats); + printf(" Average num. 
events: %.3f (+- %.3f)\n", + event_average, event_stddev); + + printf(" Average time per event %.3f usec\n", + time_average / event_average); + return 0; +} + +static int run_single_threaded(void) +{ + struct perf_session *session; + struct target target = { + .pid = "self", + }; + struct perf_thread_map *threads; + int err; + + perf_set_singlethreaded(); + session = perf_session__new(NULL, false, NULL); + if (IS_ERR(session)) { + pr_err("Session creation failed.\n"); + return PTR_ERR(session); + } + threads = thread_map__new_by_pid(getpid()); + if (!threads) { + pr_err("Thread map creation failed.\n"); + err = -ENOMEM; + goto err_out; + } + + puts( +"Computing performance of single threaded perf event synthesis by\n" +"synthesizing events on the perf process itself:"); + + err = do_run_single_threaded(session, threads, &target, false); + if (err) + goto err_out; + + err = do_run_single_threaded(session, threads, &target, true); + +err_out: + if (threads) + perf_thread_map__put(threads); + + perf_session__delete(session); + return err; +} + +static int do_run_multi_threaded(struct target *target, + unsigned int nr_threads_synthesize) +{ + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev, event_average, event_stddev; + int err; + struct stats time_stats, event_stats; + struct perf_session *session; + + init_stats(&time_stats); + init_stats(&event_stats); + for (i = 0; i < multi_iterations; i++) { + session = perf_session__new(NULL, false, NULL); + if (!session) + return -ENOMEM; + + atomic_set(&event_count, 0); + gettimeofday(&start, NULL); + err = __machine__synthesize_threads(&session->machines.host, + NULL, + target, NULL, + process_synthesized_event, + false, + nr_threads_synthesize); + if (err) { + perf_session__delete(session); + return err; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&event_stats, atomic_read(&event_count)); + perf_session__delete(session); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average synthesis took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + event_average = avg_stats(&event_stats); + event_stddev = stddev_stats(&event_stats); + printf(" Average num. 
events: %.3f (+- %.3f)\n", + event_average, event_stddev); + + printf(" Average time per event %.3f usec\n", + time_average / event_average); + return 0; +} + +static int run_multi_threaded(void) +{ + struct target target = { + .cpu_list = "0" + }; + unsigned int nr_threads_synthesize; + int err; + + if (max_threads == UINT_MAX) + max_threads = sysconf(_SC_NPROCESSORS_ONLN); + + puts( +"Computing performance of multi threaded perf event synthesis by\n" +"synthesizing events on CPU 0:"); + + for (nr_threads_synthesize = min_threads; + nr_threads_synthesize <= max_threads; + nr_threads_synthesize++) { + if (nr_threads_synthesize == 1) + perf_set_singlethreaded(); + else + perf_set_multithreaded(); + + printf(" Number of synthesis threads: %u\n", + nr_threads_synthesize); + + err = do_run_multi_threaded(&target, nr_threads_synthesize); + if (err) + return err; + } + perf_set_singlethreaded(); + return 0; +} + +int bench_synthesize(int argc, const char **argv) +{ + int err = 0; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + /* + * If neither single threaded nor multi-threaded is specified, default + * to running just single threaded. + */ + if (!run_st && !run_mt) + run_st = true; + + if (run_st) + err = run_single_threaded(); + + if (!err && run_mt) + err = run_multi_threaded(); + + return err; +} diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 6c0a0412502e..d3e5a84f87a2 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -212,11 +212,9 @@ static bool has_annotation(struct perf_annotate *ann) return ui__has_annotation() || ann->use_stdio2; } -static int perf_evsel__add_sample(struct evsel *evsel, - struct perf_sample *sample, - struct addr_location *al, - struct perf_annotate *ann, - struct machine *machine) +static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample, + struct addr_location *al, struct perf_annotate *ann, + struct machine *machine) { struct hists *hists = evsel__hists(evsel); struct hist_entry *he; @@ -278,7 +276,7 @@ static int process_sample_event(struct perf_tool *tool, goto out_put; if (!al.filtered && - perf_evsel__add_sample(evsel, sample, &al, ann, machine)) { + evsel__add_sample(evsel, sample, &al, ann, machine)) { pr_warning("problem incrementing symbol count, " "skipping event\n"); ret = -1; @@ -433,11 +431,10 @@ static int __cmd_annotate(struct perf_annotate *ann) total_nr_samples += nr_samples; hists__collapse_resort(hists, NULL); /* Don't sort callchain */ - perf_evsel__reset_sample_bit(pos, CALLCHAIN); + evsel__reset_sample_bit(pos, CALLCHAIN); perf_evsel__output_resort(pos, NULL); - if (symbol_conf.event_group && - !perf_evsel__is_group_leader(pos)) + if (symbol_conf.event_group && !evsel__is_group_leader(pos)) continue; hists__find_annotations(hists, pos, ann); diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index c06fe21c8613..083273209c88 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -76,6 +76,12 @@ static struct bench epoll_benchmarks[] = { }; #endif // HAVE_EVENTFD +static struct bench internals_benchmarks[] = { + { "synthesize", "Benchmark perf event synthesis", bench_synthesize }, + { "kallsyms-parse", "Benchmark kallsyms parsing", bench_kallsyms_parse }, + { NULL, NULL, NULL } +}; + struct collection { const char *name; const char *summary; @@ -92,6 +98,7 @@ static struct collection collections[] = { #ifdef HAVE_EVENTFD
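/*
 * (Editor's note: with the "internals" collection entry added just below,
 * the synthesize benchmark from bench/synthesize.c earlier in this patch
 * becomes reachable from the command line. A hypothetical invocation,
 * using only the switches declared in that file's options[] table, might
 * look like:
 *
 *     $ perf bench internals synthesize --mt --min-threads 1 --max-threads 8 --multi-iterations 20
 *
 * i.e. sweep the multi-threaded variant from 1 to 8 synthesis threads,
 * averaging 20 measurement iterations per thread count. With no flags at
 * all, bench_synthesize() above defaults to the single-threaded run.)
 */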
{"epoll", "Epoll stressing benchmarks", epoll_benchmarks }, #endif + { "internals", "Perf-internals benchmarks", internals_benchmarks }, { "all", "All benchmarks", NULL }, { NULL, NULL, NULL } }; diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 246ac0b4d54f..1baf4cae086f 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -95,6 +95,7 @@ struct perf_c2c { bool use_stdio; bool stats_only; bool symbol_full; + bool stitch_lbr; /* HITM shared clines stats */ struct c2c_stats hitm_stats; @@ -273,6 +274,9 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, return -1; } + if (c2c.stitch_lbr) + al.thread->lbr_stitch_enable = true; + ret = sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, sysctl_perf_event_max_stack); if (ret) @@ -1705,7 +1709,7 @@ static struct c2c_dimension *get_dimension(const char *name) if (!strcmp(dim->name, name)) return dim; - }; + } return NULL; } @@ -1921,7 +1925,7 @@ static bool he__display(struct hist_entry *he, struct c2c_stats *stats) FILTER_HITM(tot_hitm); default: break; - }; + } #undef FILTER_HITM @@ -2255,8 +2259,7 @@ static void print_c2c_info(FILE *out, struct perf_session *session) fprintf(out, "=================================================\n"); evlist__for_each_entry(evlist, evsel) { - fprintf(out, "%-36s: %s\n", first ? " Events" : "", - perf_evsel__name(evsel)); + fprintf(out, "%-36s: %s\n", first ? " Events" : "", evsel__name(evsel)); first = false; } fprintf(out, " Cachelines sort on : %s HITMs\n", @@ -2601,6 +2604,12 @@ static int setup_callchain(struct evlist *evlist) } } + if (c2c.stitch_lbr && (mode != CALLCHAIN_LBR)) { + ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n" + "Please apply --call-graph lbr when recording.\n"); + c2c.stitch_lbr = false; + } + callchain_param.record_mode = mode; callchain_param.min_percent = 0; return 0; @@ -2752,6 +2761,8 @@ static int perf_c2c__report(int argc, const char **argv) OPT_STRING('c', "coalesce", &coalesce, "coalesce fields", "coalesce fields: pid,tid,iaddr,dso"), OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"), + OPT_BOOLEAN(0, "stitch-lbr", &c2c.stitch_lbr, + "Enable LBR callgraph stitching approach"), OPT_PARENT(c2c_options), OPT_END() }; @@ -2947,7 +2958,7 @@ static int perf_c2c__record(int argc, const char **argv) rec_argv[i++] = "-e"; rec_argv[i++] = perf_mem_events__name(j); - }; + } if (all_user) rec_argv[i++] = "--all-user"; diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index c94a002f295e..f8c9bdd8269a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -467,7 +467,7 @@ static struct evsel *evsel_match(struct evsel *evsel, struct evsel *e; evlist__for_each_entry(evlist, e) { - if (perf_evsel__match2(evsel, e)) + if (evsel__match2(evsel, e)) return e; } @@ -981,7 +981,7 @@ static void data_process(void) if (!quiet) { fprintf(stdout, "%s# Event '%s'\n#\n", first ? 
"" : "\n", - perf_evsel__name(evsel_base)); + evsel__name(evsel_base)); } first = false; @@ -990,7 +990,7 @@ static void data_process(void) data__fprintf(); /* Don't sort callchain for perf diff */ - perf_evsel__reset_sample_bit(evsel_base, CALLCHAIN); + evsel__reset_sample_bit(evsel_base, CALLCHAIN); hists__process(hists_base); } @@ -1562,7 +1562,7 @@ hpp__entry_pair(struct hist_entry *he, struct hist_entry *pair, default: BUG_ON(1); - }; + } } static void diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index d5adc417a4ca..55eda54240fb 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -284,10 +284,11 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) .events = POLLIN, }; - if (!perf_cap__capable(CAP_SYS_ADMIN)) { + if (!(perf_cap__capable(CAP_PERFMON) || + perf_cap__capable(CAP_SYS_ADMIN))) { pr_err("ftrace only works for %s!\n", #ifdef HAVE_LIBCAP_SUPPORT - "users with the SYS_ADMIN capability" + "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" #else "root" #endif diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 7e124a7b8bfd..53932db97a79 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -536,7 +536,7 @@ static int perf_inject__sched_stat(struct perf_tool *tool, union perf_event *event_sw; struct perf_sample sample_sw; struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - u32 pid = perf_evsel__intval(evsel, sample, "pid"); + u32 pid = evsel__intval(evsel, sample, "pid"); list_for_each_entry(ent, &inject->samples, node) { if (pid == ent->tid) @@ -546,7 +546,7 @@ static int perf_inject__sched_stat(struct perf_tool *tool, return 0; found: event_sw = &ent->event[0]; - perf_evsel__parse_sample(evsel, event_sw, &sample_sw); + evsel__parse_sample(evsel, event_sw, &sample_sw); sample_sw.period = sample->period; sample_sw.time = sample->time; @@ -561,11 +561,10 @@ static void sig_handler(int sig __maybe_unused) session_done = 1; } -static int perf_evsel__check_stype(struct evsel *evsel, - u64 sample_type, const char *sample_msg) +static int evsel__check_stype(struct evsel *evsel, u64 sample_type, const char *sample_msg) { struct perf_event_attr *attr = &evsel->core.attr; - const char *name = perf_evsel__name(evsel); + const char *name = evsel__name(evsel); if (!(attr->sample_type & sample_type)) { pr_err("Samples for %s event do not have %s attribute set.", @@ -622,10 +621,10 @@ static int __cmd_inject(struct perf_inject *inject) struct evsel *evsel; evlist__for_each_entry(session->evlist, evsel) { - const char *name = perf_evsel__name(evsel); + const char *name = evsel__name(evsel); if (!strcmp(name, "sched:sched_switch")) { - if (perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID")) + if (evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID")) return -EINVAL; evsel->handler = perf_inject__sched_switch; @@ -684,14 +683,14 @@ static int __cmd_inject(struct perf_inject *inject) perf_header__clear_feat(&session->header, HEADER_AUXTRACE); - if (inject->itrace_synth_opts.last_branch) + if (inject->itrace_synth_opts.last_branch || + inject->itrace_synth_opts.add_last_branch) perf_header__set_feat(&session->header, HEADER_BRANCH_STACK); evsel = perf_evlist__id2evsel_strict(session->evlist, inject->aux_id); if (evsel) { - pr_debug("Deleting %s\n", - perf_evsel__name(evsel)); + pr_debug("Deleting %s\n", evsel__name(evsel)); evlist__remove(session->evlist, evsel); evsel__delete(evsel); } diff --git a/tools/perf/builtin-kmem.c 
b/tools/perf/builtin-kmem.c index 003c85f5f56c..38a5ab683ebc 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -169,13 +169,12 @@ static int insert_caller_stat(unsigned long call_site, return 0; } -static int perf_evsel__process_alloc_event(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_alloc_event(struct evsel *evsel, struct perf_sample *sample) { - unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr"), - call_site = perf_evsel__intval(evsel, sample, "call_site"); - int bytes_req = perf_evsel__intval(evsel, sample, "bytes_req"), - bytes_alloc = perf_evsel__intval(evsel, sample, "bytes_alloc"); + unsigned long ptr = evsel__intval(evsel, sample, "ptr"), + call_site = evsel__intval(evsel, sample, "call_site"); + int bytes_req = evsel__intval(evsel, sample, "bytes_req"), + bytes_alloc = evsel__intval(evsel, sample, "bytes_alloc"); if (insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, sample->cpu) || insert_caller_stat(call_site, bytes_req, bytes_alloc)) @@ -188,14 +187,13 @@ static int perf_evsel__process_alloc_event(struct evsel *evsel, return 0; } -static int perf_evsel__process_alloc_node_event(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_alloc_node_event(struct evsel *evsel, struct perf_sample *sample) { - int ret = perf_evsel__process_alloc_event(evsel, sample); + int ret = evsel__process_alloc_event(evsel, sample); if (!ret) { int node1 = cpu__get_node(sample->cpu), - node2 = perf_evsel__intval(evsel, sample, "node"); + node2 = evsel__intval(evsel, sample, "node"); if (node1 != node2) nr_cross_allocs++; @@ -232,10 +230,9 @@ static struct alloc_stat *search_alloc_stat(unsigned long ptr, return NULL; } -static int perf_evsel__process_free_event(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_free_event(struct evsel *evsel, struct perf_sample *sample) { - unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr"); + unsigned long ptr = evsel__intval(evsel, sample, "ptr"); struct alloc_stat *s_alloc, *s_caller; s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp); @@ -784,13 +781,12 @@ static int parse_gfp_flags(struct evsel *evsel, struct perf_sample *sample, return 0; } -static int perf_evsel__process_page_alloc_event(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_page_alloc_event(struct evsel *evsel, struct perf_sample *sample) { u64 page; - unsigned int order = perf_evsel__intval(evsel, sample, "order"); - unsigned int gfp_flags = perf_evsel__intval(evsel, sample, "gfp_flags"); - unsigned int migrate_type = perf_evsel__intval(evsel, sample, + unsigned int order = evsel__intval(evsel, sample, "order"); + unsigned int gfp_flags = evsel__intval(evsel, sample, "gfp_flags"); + unsigned int migrate_type = evsel__intval(evsel, sample, "migratetype"); u64 bytes = kmem_page_size << order; u64 callsite; @@ -802,9 +798,9 @@ static int perf_evsel__process_page_alloc_event(struct evsel *evsel, }; if (use_pfn) - page = perf_evsel__intval(evsel, sample, "pfn"); + page = evsel__intval(evsel, sample, "pfn"); else - page = perf_evsel__intval(evsel, sample, "page"); + page = evsel__intval(evsel, sample, "page"); nr_page_allocs++; total_page_alloc_bytes += bytes; @@ -857,11 +853,10 @@ static int perf_evsel__process_page_alloc_event(struct evsel *evsel, return 0; } -static int perf_evsel__process_page_free_event(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_page_free_event(struct evsel *evsel, struct 
perf_sample *sample) { u64 page; - unsigned int order = perf_evsel__intval(evsel, sample, "order"); + unsigned int order = evsel__intval(evsel, sample, "order"); u64 bytes = kmem_page_size << order; struct page_stat *pstat; struct page_stat this = { @@ -869,9 +864,9 @@ static int perf_evsel__process_page_free_event(struct evsel *evsel, }; if (use_pfn) - page = perf_evsel__intval(evsel, sample, "pfn"); + page = evsel__intval(evsel, sample, "pfn"); else - page = perf_evsel__intval(evsel, sample, "page"); + page = evsel__intval(evsel, sample, "page"); nr_page_frees++; total_page_free_bytes += bytes; @@ -1371,15 +1366,15 @@ static int __cmd_kmem(struct perf_session *session) struct evsel *evsel; const struct evsel_str_handler kmem_tracepoints[] = { /* slab allocator */ - { "kmem:kmalloc", perf_evsel__process_alloc_event, }, - { "kmem:kmem_cache_alloc", perf_evsel__process_alloc_event, }, - { "kmem:kmalloc_node", perf_evsel__process_alloc_node_event, }, - { "kmem:kmem_cache_alloc_node", perf_evsel__process_alloc_node_event, }, - { "kmem:kfree", perf_evsel__process_free_event, }, - { "kmem:kmem_cache_free", perf_evsel__process_free_event, }, + { "kmem:kmalloc", evsel__process_alloc_event, }, + { "kmem:kmem_cache_alloc", evsel__process_alloc_event, }, + { "kmem:kmalloc_node", evsel__process_alloc_node_event, }, + { "kmem:kmem_cache_alloc_node", evsel__process_alloc_node_event, }, + { "kmem:kfree", evsel__process_free_event, }, + { "kmem:kmem_cache_free", evsel__process_free_event, }, /* page allocator */ - { "kmem:mm_page_alloc", perf_evsel__process_page_alloc_event, }, - { "kmem:mm_page_free", perf_evsel__process_page_free_event, }, + { "kmem:mm_page_alloc", evsel__process_page_alloc_event, }, + { "kmem:mm_page_free", evsel__process_page_free_event, }, }; if (!perf_session__has_traces(session, "kmem record")) @@ -1391,8 +1386,8 @@ static int __cmd_kmem(struct perf_session *session) } evlist__for_each_entry(session->evlist, evsel) { - if (!strcmp(perf_evsel__name(evsel), "kmem:mm_page_alloc") && - perf_evsel__field(evsel, "pfn")) { + if (!strcmp(evsel__name(evsel), "kmem:mm_page_alloc") && + evsel__field(evsel, "pfn")) { use_pfn = true; break; } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 577af4f3297a..95a77058023e 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -69,7 +69,7 @@ void exit_event_get_key(struct evsel *evsel, struct event_key *key) { key->info = 0; - key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason); + key->key = evsel__intval(evsel, sample, kvm_exit_reason); } bool kvm_exit_event(struct evsel *evsel) @@ -416,8 +416,7 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread, return NULL; } - vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, - vcpu_id_str); + vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str); thread__set_priv(thread, vcpu_record); } @@ -1033,16 +1032,16 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm) struct perf_event_attr *attr = &pos->core.attr; /* make sure these *are* set */ - perf_evsel__set_sample_bit(pos, TID); - perf_evsel__set_sample_bit(pos, TIME); - perf_evsel__set_sample_bit(pos, CPU); - perf_evsel__set_sample_bit(pos, RAW); + evsel__set_sample_bit(pos, TID); + evsel__set_sample_bit(pos, TIME); + evsel__set_sample_bit(pos, CPU); + evsel__set_sample_bit(pos, RAW); /* make sure these are *not*; want as small a sample as possible */ - perf_evsel__reset_sample_bit(pos, PERIOD); - perf_evsel__reset_sample_bit(pos, IP); - perf_evsel__reset_sample_bit(pos, 
CALLCHAIN); - perf_evsel__reset_sample_bit(pos, ADDR); - perf_evsel__reset_sample_bit(pos, READ); + evsel__reset_sample_bit(pos, PERIOD); + evsel__reset_sample_bit(pos, IP); + evsel__reset_sample_bit(pos, CALLCHAIN); + evsel__reset_sample_bit(pos, ADDR); + evsel__reset_sample_bit(pos, READ); attr->mmap = 0; attr->comm = 0; attr->task = 0; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 474dfd59d7eb..f0a1dbacb46c 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -48,7 +48,7 @@ struct lock_stat { struct rb_node rb; /* used for sorting */ /* - * FIXME: perf_evsel__intval() returns u64, + * FIXME: evsel__intval() returns u64, * so address of lockdep_map should be dealed as 64bit. * Is there more better solution? */ @@ -404,9 +404,9 @@ static int report_lock_acquire_event(struct evsel *evsel, struct lock_stat *ls; struct thread_stat *ts; struct lock_seq_stat *seq; - const char *name = perf_evsel__strval(evsel, sample, "name"); - u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr"); - int flag = perf_evsel__intval(evsel, sample, "flag"); + const char *name = evsel__strval(evsel, sample, "name"); + u64 tmp = evsel__intval(evsel, sample, "lockdep_addr"); + int flag = evsel__intval(evsel, sample, "flag"); memcpy(&addr, &tmp, sizeof(void *)); @@ -477,8 +477,8 @@ static int report_lock_acquired_event(struct evsel *evsel, struct thread_stat *ts; struct lock_seq_stat *seq; u64 contended_term; - const char *name = perf_evsel__strval(evsel, sample, "name"); - u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr"); + const char *name = evsel__strval(evsel, sample, "name"); + u64 tmp = evsel__intval(evsel, sample, "lockdep_addr"); memcpy(&addr, &tmp, sizeof(void *)); @@ -539,8 +539,8 @@ static int report_lock_contended_event(struct evsel *evsel, struct lock_stat *ls; struct thread_stat *ts; struct lock_seq_stat *seq; - const char *name = perf_evsel__strval(evsel, sample, "name"); - u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr"); + const char *name = evsel__strval(evsel, sample, "name"); + u64 tmp = evsel__intval(evsel, sample, "lockdep_addr"); memcpy(&addr, &tmp, sizeof(void *)); @@ -594,8 +594,8 @@ static int report_lock_release_event(struct evsel *evsel, struct lock_stat *ls; struct thread_stat *ts; struct lock_seq_stat *seq; - const char *name = perf_evsel__strval(evsel, sample, "name"); - u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr"); + const char *name = evsel__strval(evsel, sample, "name"); + u64 tmp = evsel__intval(evsel, sample, "lockdep_addr"); memcpy(&addr, &tmp, sizeof(void *)); @@ -657,32 +657,28 @@ static struct trace_lock_handler report_lock_ops = { static struct trace_lock_handler *trace_handler; -static int perf_evsel__process_lock_acquire(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_lock_acquire(struct evsel *evsel, struct perf_sample *sample) { if (trace_handler->acquire_event) return trace_handler->acquire_event(evsel, sample); return 0; } -static int perf_evsel__process_lock_acquired(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_lock_acquired(struct evsel *evsel, struct perf_sample *sample) { if (trace_handler->acquired_event) return trace_handler->acquired_event(evsel, sample); return 0; } -static int perf_evsel__process_lock_contended(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_lock_contended(struct evsel *evsel, struct perf_sample *sample) { if (trace_handler->contended_event) return 
trace_handler->contended_event(evsel, sample); return 0; } -static int perf_evsel__process_lock_release(struct evsel *evsel, - struct perf_sample *sample) +static int evsel__process_lock_release(struct evsel *evsel, struct perf_sample *sample) { if (trace_handler->release_event) return trace_handler->release_event(evsel, sample); @@ -775,7 +771,7 @@ static void dump_threads(void) pr_info("%10d: %s\n", st->tid, thread__comm_str(t)); node = rb_next(node); thread__put(t); - }; + } } static void dump_map(void) @@ -849,10 +845,10 @@ static void sort_result(void) } static const struct evsel_str_handler lock_tracepoints[] = { - { "lock:lock_acquire", perf_evsel__process_lock_acquire, }, /* CONFIG_LOCKDEP */ - { "lock:lock_acquired", perf_evsel__process_lock_acquired, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */ - { "lock:lock_contended", perf_evsel__process_lock_contended, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */ - { "lock:lock_release", perf_evsel__process_lock_release, }, /* CONFIG_LOCKDEP */ + { "lock:lock_acquire", evsel__process_lock_acquire, }, /* CONFIG_LOCKDEP */ + { "lock:lock_acquired", evsel__process_lock_acquired, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */ + { "lock:lock_contended", evsel__process_lock_contended, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */ + { "lock:lock_release", evsel__process_lock_release, }, /* CONFIG_LOCKDEP */ }; static bool force; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index a13f5817d6fc..68a7eb84561a 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -123,7 +123,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = "-e"; rec_argv[i++] = perf_mem_events__name(j); - }; + } if (all_user) rec_argv[i++] = "--all-user"; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1ab349abe904..e4efdbf1a81e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -34,6 +34,7 @@ #include "util/tsc.h" #include "util/parse-branch-options.h" #include "util/parse-regs-options.h" +#include "util/perf_api_probe.h" #include "util/llvm-utils.h" #include "util/bpf-loader.h" #include "util/trigger.h" @@ -43,6 +44,7 @@ #include "util/time-utils.h" #include "util/units.h" #include "util/bpf-event.h" +#include "util/util.h" #include "asm/bug.h" #include "perf.h" @@ -50,6 +52,7 @@ #include <inttypes.h> #include <locale.h> #include <poll.h> +#include <pthread.h> #include <unistd.h> #include <sched.h> #include <signal.h> @@ -84,7 +87,10 @@ struct record { struct auxtrace_record *itr; struct evlist *evlist; struct perf_session *session; + struct evlist *sb_evlist; + pthread_t thread_id; int realtime_prio; + bool switch_output_event_set; bool no_buildid; bool no_buildid_set; bool no_buildid_cache; @@ -503,6 +509,20 @@ static int process_synthesized_event(struct perf_tool *tool, return record__write(rec, NULL, event, event->header.size); } +static int process_locked_synthesized_event(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER; + int ret; + + pthread_mutex_lock(&synth_lock); + ret = process_synthesized_event(tool, event, sample, machine); + pthread_mutex_unlock(&synth_lock); + return ret; +} + static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) { struct record *rec = to; @@ -825,7 +845,7 @@ static int record__open(struct record *rec) evlist__for_each_entry(evlist, pos) { try_again: if 
(evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { - if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { + if (evsel__fallback(pos, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); goto try_again; @@ -837,8 +857,7 @@ try_again: goto try_again; } rc = -errno; - perf_evsel__open_strerror(pos, &opts->target, - errno, msg, sizeof(msg)); + evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg)); ui__error("%s\n", msg); goto out; } @@ -859,7 +878,7 @@ try_again: if (perf_evlist__apply_filters(evlist, &pos)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", - pos->filter, perf_evsel__name(pos), errno, + pos->filter, evsel__name(pos), errno, str_error_r(errno, msg, sizeof(msg))); rc = -1; goto out; @@ -1288,6 +1307,7 @@ static int record__synthesize(struct record *rec, bool tail) struct perf_tool *tool = &rec->tool; int fd = perf_data__fd(data); int err = 0; + event_op f = process_synthesized_event; if (rec->opts.tail_synthesize != tail) return 0; @@ -1402,13 +1422,67 @@ static int record__synthesize(struct record *rec, bool tail) if (err < 0) pr_warning("Couldn't synthesize cgroup events.\n"); + if (rec->opts.nr_threads_synthesize > 1) { + perf_set_multithreaded(); + f = process_locked_synthesized_event; + } + err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads, - process_synthesized_event, opts->sample_address, - 1); + f, opts->sample_address, + rec->opts.nr_threads_synthesize); + + if (rec->opts.nr_threads_synthesize > 1) + perf_set_singlethreaded(); + out: return err; } +static int record__process_signal_event(union perf_event *event __maybe_unused, void *data) +{ + struct record *rec = data; + pthread_kill(rec->thread_id, SIGUSR2); + return 0; +} + +static int record__setup_sb_evlist(struct record *rec) +{ + struct record_opts *opts = &rec->opts; + + if (rec->sb_evlist != NULL) { + /* + * We get here if --switch-output-event populated the + * sb_evlist, so associate a callback that will send a SIGUSR2 + * to the main thread. + */ + evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec); + rec->thread_id = pthread_self(); + } + + if (!opts->no_bpf_event) { + if (rec->sb_evlist == NULL) { + rec->sb_evlist = evlist__new(); + + if (rec->sb_evlist == NULL) { + pr_err("Couldn't create side band evlist.\n."); + return -1; + } + } + + if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) { + pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); + return -1; + } + } + + if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) { + pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); + opts->no_bpf_event = true; + } + + return 0; +} + static int __cmd_record(struct record *rec, int argc, const char **argv) { int err; @@ -1420,7 +1494,6 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) struct perf_data *data = &rec->data; struct perf_session *session; bool disabled = false, draining = false; - struct evlist *sb_evlist = NULL; int fd; float ratio = 0; @@ -1546,21 +1619,17 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) goto out_child; } + err = -1; if (!rec->no_buildid && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { pr_err("Couldn't generate buildids. 
" "Use --no-buildid to profile anyway.\n"); - err = -1; goto out_child; } - if (!opts->no_bpf_event) - bpf_event__add_sb_event(&sb_evlist, &session->header.env); - - if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) { - pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); - opts->no_bpf_event = true; - } + err = record__setup_sb_evlist(rec); + if (err) + goto out_child; err = record__synthesize(rec, false); if (err < 0) @@ -1831,7 +1900,7 @@ out_delete_session: perf_session__delete(session); if (!opts->no_bpf_event) - perf_evlist__stop_sb_thread(sb_evlist); + perf_evlist__stop_sb_thread(rec->sb_evlist); return status; } @@ -2142,10 +2211,19 @@ static int switch_output_setup(struct record *rec) }; unsigned long val; + /* + * If we're using --switch-output-events, then we imply its + * --switch-output=signal, as we'll send a SIGUSR2 from the side band + * thread to its parent. + */ + if (rec->switch_output_event_set) + goto do_signal; + if (!s->set) return 0; if (!strcmp(s->str, "signal")) { +do_signal: s->signal = true; pr_debug("switch-output with SIGUSR2 signal\n"); goto enabled; @@ -2232,6 +2310,7 @@ static struct record record = { .default_per_cpu = true, }, .mmap_flush = MMAP_FLUSH_DEFAULT, + .nr_threads_synthesize = 1, }, .tool = { .sample = process_sample_event, @@ -2402,6 +2481,9 @@ static struct option __record_options[] = { &record.switch_output.set, "signal or size[BKMG] or time[smhd]", "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", "signal"), + OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event", + "switch output event selector. use 'perf list' to list available events", + parse_events_option_new_evlist), OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, "Limit number of switch output generated files"), OPT_BOOLEAN(0, "dry-run", &dry_run, @@ -2421,6 +2503,9 @@ static struct option __record_options[] = { #endif OPT_CALLBACK(0, "max-size", &record.output_max_size, "size", "Limit the maximum size of the output file", parse_output_max_size), + OPT_UINTEGER(0, "num-thread-synthesize", + &record.opts.nr_threads_synthesize, + "number of threads to run for event synthesis"), OPT_END() }; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 26d8fc27e427..ba63390246c2 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -84,6 +84,7 @@ struct report { bool header_only; bool nonany_branch_mode; bool group_set; + bool stitch_lbr; int max_stack; struct perf_read_values show_threads_values; struct annotation_options annotation_opts; @@ -267,6 +268,9 @@ static int process_sample_event(struct perf_tool *tool, return -1; } + if (rep->stitch_lbr) + al.thread->lbr_stitch_enable = true; + if (symbol_conf.hide_unresolved && al.sym == NULL) goto out_put; @@ -317,7 +321,7 @@ static int process_read_event(struct perf_tool *tool, struct report *rep = container_of(tool, struct report, tool); if (rep->show_threads) { - const char *name = perf_evsel__name(evsel); + const char *name = evsel__name(evsel); int err = perf_read_values_add_value(&rep->show_threads_values, event->read.pid, event->read.tid, evsel->idx, @@ -339,12 +343,14 @@ static int report__setup_sample_type(struct report *rep) bool is_pipe = perf_data__is_pipe(session->data); if (session->itrace_synth_opts->callchain || + session->itrace_synth_opts->add_callchain || (!is_pipe && 
perf_header__has_feat(&session->header, HEADER_AUXTRACE) && !session->itrace_synth_opts->set)) sample_type |= PERF_SAMPLE_CALLCHAIN; - if (session->itrace_synth_opts->last_branch) + if (session->itrace_synth_opts->last_branch || + session->itrace_synth_opts->add_last_branch) sample_type |= PERF_SAMPLE_BRANCH_STACK; if (!is_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) { @@ -407,6 +413,12 @@ static int report__setup_sample_type(struct report *rep) callchain_param.record_mode = CALLCHAIN_FP; } + if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) { + ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n" + "Please apply --call-graph lbr when recording.\n"); + rep->stitch_lbr = false; + } + /* ??? handle more cases than just ANY? */ if (!(perf_evlist__combined_branch_type(session->evlist) & PERF_SAMPLE_BRANCH_ANY)) @@ -447,10 +459,10 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report nr_events = hists->stats.total_non_filtered_period; } - if (perf_evsel__is_group_event(evsel)) { + if (evsel__is_group_event(evsel)) { struct evsel *pos; - perf_evsel__group_desc(evsel, buf, size); + evsel__group_desc(evsel, buf, size); evname = buf; for_each_group_member(pos, evsel) { @@ -525,10 +537,9 @@ static int perf_evlist__tty_browse_hists(struct evlist *evlist, evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); - const char *evname = perf_evsel__name(pos); + const char *evname = evsel__name(pos); - if (symbol_conf.event_group && - !perf_evsel__is_group_leader(pos)) + if (symbol_conf.event_group && !evsel__is_group_leader(pos)) continue; hists__fprintf_nr_sample_events(hists, rep, evname, stdout); @@ -670,8 +681,7 @@ static int report__collapse_hists(struct report *rep) break; /* Non-group events are considered as leader */ - if (symbol_conf.event_group && - !perf_evsel__is_group_leader(pos)) { + if (symbol_conf.event_group && !evsel__is_group_leader(pos)) { struct hists *leader_hists = evsel__hists(pos->leader); hists__match(leader_hists, hists); @@ -1257,6 +1267,8 @@ int cmd_report(int argc, const char **argv) "Show full source file name path for source lines"), OPT_BOOLEAN(0, "show-ref-call-graph", &symbol_conf.show_ref_callgraph, "Show callgraph from reference event"), + OPT_BOOLEAN(0, "stitch-lbr", &report.stitch_lbr, + "Enable LBR callgraph stitching approach"), OPT_INTEGER(0, "socket-filter", &report.socket_filter, "only show processor socket that match with this filter"), OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace, @@ -1332,7 +1344,7 @@ int cmd_report(int argc, const char **argv) if (symbol_conf.cumulate_callchain && !callchain_param.order_set) callchain_param.order = ORDER_CALLER; - if (itrace_synth_opts.callchain && + if ((itrace_synth_opts.callchain || itrace_synth_opts.add_callchain) && (int)itrace_synth_opts.callchain_sz > report.max_stack) report.max_stack = itrace_synth_opts.callchain_sz; @@ -1380,7 +1392,7 @@ repeat: goto error; } - if (itrace_synth_opts.last_branch) + if (itrace_synth_opts.last_branch || itrace_synth_opts.add_last_branch) has_br_stack = true; if (has_br_stack && branch_call_mode) @@ -1400,7 +1412,7 @@ repeat: } if (branch_call_mode) { callchain_param.key = CCKEY_ADDRESS; - callchain_param.branch_callstack = 1; + callchain_param.branch_callstack = true; symbol_conf.use_callchain = true; callchain_register_param(&callchain_param); if (sort_order == NULL) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 82fcc2c15fe4..459e4229945e 100644 --- 
a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -811,8 +811,8 @@ replay_wakeup_event(struct perf_sched *sched, struct evsel *evsel, struct perf_sample *sample, struct machine *machine __maybe_unused) { - const char *comm = perf_evsel__strval(evsel, sample, "comm"); - const u32 pid = perf_evsel__intval(evsel, sample, "pid"); + const char *comm = evsel__strval(evsel, sample, "comm"); + const u32 pid = evsel__intval(evsel, sample, "pid"); struct task_desc *waker, *wakee; if (verbose > 0) { @@ -833,11 +833,11 @@ static int replay_switch_event(struct perf_sched *sched, struct perf_sample *sample, struct machine *machine __maybe_unused) { - const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"), - *next_comm = perf_evsel__strval(evsel, sample, "next_comm"); - const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"), - next_pid = perf_evsel__intval(evsel, sample, "next_pid"); - const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state"); + const char *prev_comm = evsel__strval(evsel, sample, "prev_comm"), + *next_comm = evsel__strval(evsel, sample, "next_comm"); + const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"), + next_pid = evsel__intval(evsel, sample, "next_pid"); + const u64 prev_state = evsel__intval(evsel, sample, "prev_state"); struct task_desc *prev, __maybe_unused *next; u64 timestamp0, timestamp = sample->time; int cpu = sample->cpu; @@ -1106,9 +1106,9 @@ static int latency_switch_event(struct perf_sched *sched, struct perf_sample *sample, struct machine *machine) { - const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"), - next_pid = perf_evsel__intval(evsel, sample, "next_pid"); - const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state"); + const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"), + next_pid = evsel__intval(evsel, sample, "next_pid"); + const u64 prev_state = evsel__intval(evsel, sample, "prev_state"); struct work_atoms *out_events, *in_events; struct thread *sched_out, *sched_in; u64 timestamp0, timestamp = sample->time; @@ -1176,8 +1176,8 @@ static int latency_runtime_event(struct perf_sched *sched, struct perf_sample *sample, struct machine *machine) { - const u32 pid = perf_evsel__intval(evsel, sample, "pid"); - const u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); + const u32 pid = evsel__intval(evsel, sample, "pid"); + const u64 runtime = evsel__intval(evsel, sample, "runtime"); struct thread *thread = machine__findnew_thread(machine, -1, pid); struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); u64 timestamp = sample->time; @@ -1211,7 +1211,7 @@ static int latency_wakeup_event(struct perf_sched *sched, struct perf_sample *sample, struct machine *machine) { - const u32 pid = perf_evsel__intval(evsel, sample, "pid"); + const u32 pid = evsel__intval(evsel, sample, "pid"); struct work_atoms *atoms; struct work_atom *atom; struct thread *wakee; @@ -1272,7 +1272,7 @@ static int latency_migrate_task_event(struct perf_sched *sched, struct perf_sample *sample, struct machine *machine) { - const u32 pid = perf_evsel__intval(evsel, sample, "pid"); + const u32 pid = evsel__intval(evsel, sample, "pid"); u64 timestamp = sample->time; struct work_atoms *atoms; struct work_atom *atom; @@ -1526,7 +1526,7 @@ map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid static int map_switch_event(struct perf_sched *sched, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) { - const u32 
next_pid = perf_evsel__intval(evsel, sample, "next_pid"); + const u32 next_pid = evsel__intval(evsel, sample, "next_pid"); struct thread *sched_in; struct thread_runtime *tr; int new_shortname; @@ -1670,8 +1670,8 @@ static int process_sched_switch_event(struct perf_tool *tool, { struct perf_sched *sched = container_of(tool, struct perf_sched, tool); int this_cpu = sample->cpu, err = 0; - u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"), - next_pid = perf_evsel__intval(evsel, sample, "next_pid"); + u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"), + next_pid = evsel__intval(evsel, sample, "next_pid"); if (sched->curr_pid[this_cpu] != (u32)-1) { /* @@ -1848,7 +1848,7 @@ static inline void print_sched_time(unsigned long long nsecs, int width) * returns runtime data for event, allocating memory for it the * first time it is used. */ -static struct evsel_runtime *perf_evsel__get_runtime(struct evsel *evsel) +static struct evsel_runtime *evsel__get_runtime(struct evsel *evsel) { struct evsel_runtime *r = evsel->priv; @@ -1863,10 +1863,9 @@ static struct evsel_runtime *perf_evsel__get_runtime(struct evsel *evsel) /* * save last time event was seen per cpu */ -static void perf_evsel__save_time(struct evsel *evsel, - u64 timestamp, u32 cpu) +static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu) { - struct evsel_runtime *r = perf_evsel__get_runtime(evsel); + struct evsel_runtime *r = evsel__get_runtime(evsel); if (r == NULL) return; @@ -1890,9 +1889,9 @@ static void perf_evsel__save_time(struct evsel *evsel, } /* returns last time this event was seen on the given cpu */ -static u64 perf_evsel__get_time(struct evsel *evsel, u32 cpu) +static u64 evsel__get_time(struct evsel *evsel, u32 cpu) { - struct evsel_runtime *r = perf_evsel__get_runtime(evsel); + struct evsel_runtime *r = evsel__get_runtime(evsel); if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu)) return 0; @@ -2004,8 +2003,8 @@ static void timehist_print_sample(struct perf_sched *sched, u64 t, int state) { struct thread_runtime *tr = thread__priv(thread); - const char *next_comm = perf_evsel__strval(evsel, sample, "next_comm"); - const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid"); + const char *next_comm = evsel__strval(evsel, sample, "next_comm"); + const u32 next_pid = evsel__intval(evsel, sample, "next_pid"); u32 max_cpus = sched->max_cpu + 1; char tstr[64]; char nstr[30]; @@ -2136,8 +2135,8 @@ static bool is_idle_sample(struct perf_sample *sample, struct evsel *evsel) { /* pid 0 == swapper == idle task */ - if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0) - return perf_evsel__intval(evsel, sample, "prev_pid") == 0; + if (strcmp(evsel__name(evsel), "sched:sched_switch") == 0) + return evsel__intval(evsel, sample, "prev_pid") == 0; return sample->pid == 0; } @@ -2334,7 +2333,7 @@ static struct thread *timehist_get_thread(struct perf_sched *sched, itr->last_thread = thread; /* copy task callchain when entering to idle */ - if (perf_evsel__intval(evsel, sample, "next_pid") == 0) + if (evsel__intval(evsel, sample, "next_pid") == 0) save_idle_callchain(sched, itr, sample); } } @@ -2355,10 +2354,10 @@ static bool timehist_skip_sample(struct perf_sched *sched, } if (sched->idle_hist) { - if (strcmp(perf_evsel__name(evsel), "sched:sched_switch")) + if (strcmp(evsel__name(evsel), "sched:sched_switch")) rc = true; - else if (perf_evsel__intval(evsel, sample, "prev_pid") != 0 && - perf_evsel__intval(evsel, sample, "next_pid") != 0) + else if (evsel__intval(evsel, 
sample, "prev_pid") != 0 && + evsel__intval(evsel, sample, "next_pid") != 0) rc = true; } @@ -2409,7 +2408,7 @@ static int timehist_sched_wakeup_event(struct perf_tool *tool, struct thread *thread; struct thread_runtime *tr = NULL; /* want pid of awakened task not pid in sample */ - const u32 pid = perf_evsel__intval(evsel, sample, "pid"); + const u32 pid = evsel__intval(evsel, sample, "pid"); thread = machine__findnew_thread(machine, 0, pid); if (thread == NULL) @@ -2445,8 +2444,8 @@ static void timehist_print_migration_event(struct perf_sched *sched, return; max_cpus = sched->max_cpu + 1; - ocpu = perf_evsel__intval(evsel, sample, "orig_cpu"); - dcpu = perf_evsel__intval(evsel, sample, "dest_cpu"); + ocpu = evsel__intval(evsel, sample, "orig_cpu"); + dcpu = evsel__intval(evsel, sample, "dest_cpu"); thread = machine__findnew_thread(machine, sample->pid, sample->tid); if (thread == NULL) @@ -2493,7 +2492,7 @@ static int timehist_migrate_task_event(struct perf_tool *tool, struct thread *thread; struct thread_runtime *tr = NULL; /* want pid of migrated task not pid in sample */ - const u32 pid = perf_evsel__intval(evsel, sample, "pid"); + const u32 pid = evsel__intval(evsel, sample, "pid"); thread = machine__findnew_thread(machine, 0, pid); if (thread == NULL) @@ -2524,8 +2523,7 @@ static int timehist_sched_change_event(struct perf_tool *tool, struct thread_runtime *tr = NULL; u64 tprev, t = sample->time; int rc = 0; - int state = perf_evsel__intval(evsel, sample, "prev_state"); - + int state = evsel__intval(evsel, sample, "prev_state"); if (machine__resolve(machine, &al, sample) < 0) { pr_err("problem processing %d event. skipping it\n", @@ -2549,7 +2547,7 @@ static int timehist_sched_change_event(struct perf_tool *tool, goto out; } - tprev = perf_evsel__get_time(evsel, sample->cpu); + tprev = evsel__get_time(evsel, sample->cpu); /* * If start time given: @@ -2632,7 +2630,7 @@ out: tr->ready_to_run = 0; } - perf_evsel__save_time(evsel, sample->time, sample->cpu); + evsel__save_time(evsel, sample->time, sample->cpu); return rc; } @@ -2942,7 +2940,7 @@ static int timehist_check_attr(struct perf_sched *sched, struct evsel_runtime *er; list_for_each_entry(evsel, &evlist->core.entries, core.node) { - er = perf_evsel__get_runtime(evsel); + er = evsel__get_runtime(evsel); if (er == NULL) { pr_err("Failed to allocate memory for evsel runtime data\n"); return -1; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 1f57a7ecdf3d..56d7bcd12671 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -273,7 +273,7 @@ static struct evsel_script *perf_evsel_script__new(struct evsel *evsel, struct evsel_script *es = zalloc(sizeof(*es)); if (es != NULL) { - if (asprintf(&es->filename, "%s.%s.dump", data->file.path, perf_evsel__name(evsel)) < 0) + if (asprintf(&es->filename, "%s.%s.dump", data->file.path, evsel__name(evsel)) < 0) goto out_free; es->fp = fopen(es->filename, "w"); if (es->fp == NULL) @@ -351,10 +351,8 @@ static const char *output_field2str(enum perf_output_field field) #define PRINT_FIELD(x) (output[output_type(attr->type)].fields & PERF_OUTPUT_##x) -static int perf_evsel__do_check_stype(struct evsel *evsel, - u64 sample_type, const char *sample_msg, - enum perf_output_field field, - bool allow_user_set) +static int evsel__do_check_stype(struct evsel *evsel, u64 sample_type, const char *sample_msg, + enum perf_output_field field, bool allow_user_set) { struct perf_event_attr *attr = &evsel->core.attr; int type = output_type(attr->type); @@ -366,7 
+364,7 @@ static int perf_evsel__do_check_stype(struct evsel *evsel, if (output[type].user_set_fields & field) { if (allow_user_set) return 0; - evname = perf_evsel__name(evsel); + evname = evsel__name(evsel); pr_err("Samples for '%s' event do not have %s attribute set. " "Cannot print '%s' field.\n", evname, sample_msg, output_field2str(field)); @@ -375,7 +373,7 @@ static int perf_evsel__do_check_stype(struct evsel *evsel, /* user did not ask for it explicitly so remove from the default list */ output[type].fields &= ~field; - evname = perf_evsel__name(evsel); + evname = evsel__name(evsel); pr_debug("Samples for '%s' event do not have %s attribute set. " "Skipping '%s' field.\n", evname, sample_msg, output_field2str(field)); @@ -383,16 +381,13 @@ static int perf_evsel__do_check_stype(struct evsel *evsel, return 0; } -static int perf_evsel__check_stype(struct evsel *evsel, - u64 sample_type, const char *sample_msg, - enum perf_output_field field) +static int evsel__check_stype(struct evsel *evsel, u64 sample_type, const char *sample_msg, + enum perf_output_field field) { - return perf_evsel__do_check_stype(evsel, sample_type, sample_msg, field, - false); + return evsel__do_check_stype(evsel, sample_type, sample_msg, field, false); } -static int perf_evsel__check_attr(struct evsel *evsel, - struct perf_session *session) +static int perf_evsel__check_attr(struct evsel *evsel, struct perf_session *session) { struct perf_event_attr *attr = &evsel->core.attr; bool allow_user_set; @@ -404,32 +399,28 @@ static int perf_evsel__check_attr(struct evsel *evsel, HEADER_AUXTRACE); if (PRINT_FIELD(TRACE) && - !perf_session__has_traces(session, "record -R")) + !perf_session__has_traces(session, "record -R")) return -EINVAL; if (PRINT_FIELD(IP)) { - if (perf_evsel__check_stype(evsel, PERF_SAMPLE_IP, "IP", - PERF_OUTPUT_IP)) + if (evsel__check_stype(evsel, PERF_SAMPLE_IP, "IP", PERF_OUTPUT_IP)) return -EINVAL; } if (PRINT_FIELD(ADDR) && - perf_evsel__do_check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR", - PERF_OUTPUT_ADDR, allow_user_set)) + evsel__do_check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR", PERF_OUTPUT_ADDR, allow_user_set)) return -EINVAL; if (PRINT_FIELD(DATA_SRC) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC", - PERF_OUTPUT_DATA_SRC)) + evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC", PERF_OUTPUT_DATA_SRC)) return -EINVAL; if (PRINT_FIELD(WEIGHT) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", - PERF_OUTPUT_WEIGHT)) + evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", PERF_OUTPUT_WEIGHT)) return -EINVAL; if (PRINT_FIELD(SYM) && - !(evsel->core.attr.sample_type & (PERF_SAMPLE_IP|PERF_SAMPLE_ADDR))) { + !(evsel->core.attr.sample_type & (PERF_SAMPLE_IP|PERF_SAMPLE_ADDR))) { pr_err("Display of symbols requested but neither sample IP nor " "sample address\navailable. 
Hence, no addresses to convert " "to symbols.\n"); @@ -441,7 +432,7 @@ static int perf_evsel__check_attr(struct evsel *evsel, return -EINVAL; } if (PRINT_FIELD(DSO) && - !(evsel->core.attr.sample_type & (PERF_SAMPLE_IP|PERF_SAMPLE_ADDR))) { + !(evsel->core.attr.sample_type & (PERF_SAMPLE_IP|PERF_SAMPLE_ADDR))) { pr_err("Display of DSO requested but no address to convert.\n"); return -EINVAL; } @@ -458,33 +449,27 @@ static int perf_evsel__check_attr(struct evsel *evsel, return -EINVAL; } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID", - PERF_OUTPUT_TID|PERF_OUTPUT_PID)) + evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID", PERF_OUTPUT_TID|PERF_OUTPUT_PID)) return -EINVAL; if (PRINT_FIELD(TIME) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_TIME, "TIME", - PERF_OUTPUT_TIME)) + evsel__check_stype(evsel, PERF_SAMPLE_TIME, "TIME", PERF_OUTPUT_TIME)) return -EINVAL; if (PRINT_FIELD(CPU) && - perf_evsel__do_check_stype(evsel, PERF_SAMPLE_CPU, "CPU", - PERF_OUTPUT_CPU, allow_user_set)) + evsel__do_check_stype(evsel, PERF_SAMPLE_CPU, "CPU", PERF_OUTPUT_CPU, allow_user_set)) return -EINVAL; if (PRINT_FIELD(IREGS) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_REGS_INTR, "IREGS", - PERF_OUTPUT_IREGS)) + evsel__check_stype(evsel, PERF_SAMPLE_REGS_INTR, "IREGS", PERF_OUTPUT_IREGS)) return -EINVAL; if (PRINT_FIELD(UREGS) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_REGS_USER, "UREGS", - PERF_OUTPUT_UREGS)) + evsel__check_stype(evsel, PERF_SAMPLE_REGS_USER, "UREGS", PERF_OUTPUT_UREGS)) return -EINVAL; if (PRINT_FIELD(PHYS_ADDR) && - perf_evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", - PERF_OUTPUT_PHYS_ADDR)) + evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR)) return -EINVAL; return 0; @@ -604,8 +589,6 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r), val); } - fprintf(fp, "\n"); - return printed; } @@ -1697,6 +1680,7 @@ struct perf_script { bool show_cgroup_events; bool allocated; bool per_event_dump; + bool stitch_lbr; struct evswitch evswitch; struct perf_cpu_map *cpus; struct perf_thread_map *threads; @@ -1713,7 +1697,7 @@ static int perf_evlist__max_name_len(struct evlist *evlist) int max = 0; evlist__for_each_entry(evlist, evsel) { - int len = strlen(perf_evsel__name(evsel)); + int len = strlen(evsel__name(evsel)); max = MAX(len, max); } @@ -1887,7 +1871,7 @@ static void process_event(struct perf_script *script, fprintf(fp, "%10" PRIu64 " ", sample->period); if (PRINT_FIELD(EVNAME)) { - const char *evname = perf_evsel__name(evsel); + const char *evname = evsel__name(evsel); if (!script->name_width) script->name_width = perf_evlist__max_name_len(script->session->evlist); @@ -1923,6 +1907,9 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(IP)) { struct callchain_cursor *cursor = NULL; + if (script->stitch_lbr) + al->thread->lbr_stitch_enable = true; + if (symbol_conf.use_callchain && sample->callchain && thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, scripting_max_stack) == 0) @@ -1946,7 +1933,7 @@ static void process_event(struct perf_script *script, else if (PRINT_FIELD(BRSTACKOFF)) perf_sample__fprintf_brstackoff(sample, thread, attr, fp); - if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) + if (evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) perf_sample__fprintf_bpf_output(sample, fp); 
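/*
 * (Editor's note: the per-thread lbr_stitch_enable toggle near the top of
 * this function mirrors the builtin-c2c.c and builtin-report.c hunks
 * earlier in this patch: in all three tools the new --stitch-lbr option
 * is only honoured when callchains were recorded in LBR mode. A condensed
 * sketch of that shared guard, which appears in the
 * script__setup_sample_type() hunk at the end of this file's diff:
 *
 *     if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 *             pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n");
 *             script->stitch_lbr = false;
 *     }
 * )
 */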
perf_sample__fprintf_insn(sample, attr, thread, machine, fp); @@ -1975,7 +1962,7 @@ static struct scripting_ops *scripting_ops; static void __process_stat(struct evsel *counter, u64 tstamp) { int nthreads = perf_thread_map__nr(counter->core.threads); - int ncpus = perf_evsel__nr_cpus(counter); + int ncpus = evsel__nr_cpus(counter); int cpu, thread; static int header_printed; @@ -2001,7 +1988,7 @@ static void __process_stat(struct evsel *counter, u64 tstamp) counts->ena, counts->run, tstamp, - perf_evsel__name(counter)); + evsel__name(counter)); } } } @@ -2040,7 +2027,7 @@ static int cleanup_scripting(void) static bool filter_cpu(struct perf_sample *sample) { - if (cpu_list) + if (cpu_list && sample->cpu != (u32)-1) return !test_bit(sample->cpu, cpu_bitmap); return false; } @@ -2138,41 +2125,59 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, return err; } -static int process_comm_event(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) +static int print_event_with_time(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine, + pid_t pid, pid_t tid, u64 timestamp) { - struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); struct perf_session *session = script->session; struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - int ret = -1; + struct thread *thread = NULL; - thread = machine__findnew_thread(machine, event->comm.pid, event->comm.tid); - if (thread == NULL) { - pr_debug("problem processing COMM event, skipping it.\n"); - return -1; + if (evsel && !evsel->core.attr.sample_id_all) { + sample->cpu = 0; + sample->time = timestamp; + sample->pid = pid; + sample->tid = tid; } - if (perf_event__process_comm(tool, event, sample, machine) < 0) - goto out; + if (filter_cpu(sample)) + return 0; - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->comm.tid; - sample->pid = event->comm.pid; - } - if (!filter_cpu(sample)) { + if (tid != -1) + thread = machine__findnew_thread(machine, pid, tid); + + if (thread && evsel) { perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_COMM, stdout); - perf_event__fprintf(event, stdout); + event->header.type, stdout); } - ret = 0; -out: + + perf_event__fprintf(event, stdout); + thread__put(thread); - return ret; + + return 0; +} + +static int print_event(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine, + pid_t pid, pid_t tid) +{ + return print_event_with_time(tool, event, sample, machine, pid, tid, 0); +} + +static int process_comm_event(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + if (perf_event__process_comm(tool, event, sample, machine) < 0) + return -1; + + return print_event(tool, event, sample, machine, event->comm.pid, + event->comm.tid); } static int process_namespaces_event(struct perf_tool *tool, @@ -2180,37 +2185,11 @@ static int process_namespaces_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - int ret = -1; - - thread = machine__findnew_thread(machine, event->namespaces.pid, - event->namespaces.tid); - if 
(thread == NULL) { - pr_debug("problem processing NAMESPACES event, skipping it.\n"); - return -1; - } - if (perf_event__process_namespaces(tool, event, sample, machine) < 0) - goto out; + return -1; - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->namespaces.tid; - sample->pid = event->namespaces.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_NAMESPACES, stdout); - perf_event__fprintf(event, stdout); - } - ret = 0; -out: - thread__put(thread); - return ret; + return print_event(tool, event, sample, machine, event->namespaces.pid, + event->namespaces.tid); } static int process_cgroup_event(struct perf_tool *tool, @@ -2218,34 +2197,11 @@ static int process_cgroup_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - int ret = -1; - - thread = machine__findnew_thread(machine, sample->pid, sample->tid); - if (thread == NULL) { - pr_debug("problem processing CGROUP event, skipping it.\n"); - return -1; - } - if (perf_event__process_cgroup(tool, event, sample, machine) < 0) - goto out; + return -1; - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_CGROUP, stdout); - perf_event__fprintf(event, stdout); - } - ret = 0; -out: - thread__put(thread); - return ret; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static int process_fork_event(struct perf_tool *tool, @@ -2253,69 +2209,24 @@ static int process_fork_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (perf_event__process_fork(tool, event, sample, machine) < 0) return -1; - thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); - if (thread == NULL) { - pr_debug("problem processing FORK event, skipping it.\n"); - return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = event->fork.time; - sample->tid = event->fork.tid; - sample->pid = event->fork.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_FORK, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - - return 0; + return print_event_with_time(tool, event, sample, machine, + event->fork.pid, event->fork.tid, + event->fork.time); } static int process_exit_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { - int err = 0; - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - - thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); - if (thread == NULL) { - pr_debug("problem processing EXIT event, skipping it.\n"); + /* Print before 'exit' deletes anything */ + if (print_event_with_time(tool, event, sample, machine, 
event->fork.pid, + event->fork.tid, event->fork.time)) return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->fork.tid; - sample->pid = event->fork.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_EXIT, stdout); - perf_event__fprintf(event, stdout); - } - - if (perf_event__process_exit(tool, event, sample, machine) < 0) - err = -1; - thread__put(thread); - return err; + return perf_event__process_exit(tool, event, sample, machine); } static int process_mmap_event(struct perf_tool *tool, @@ -2323,33 +2234,11 @@ static int process_mmap_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (perf_event__process_mmap(tool, event, sample, machine) < 0) return -1; - thread = machine__findnew_thread(machine, event->mmap.pid, event->mmap.tid); - if (thread == NULL) { - pr_debug("problem processing MMAP event, skipping it.\n"); - return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->mmap.tid; - sample->pid = event->mmap.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_MMAP, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, event->mmap.pid, + event->mmap.tid); } static int process_mmap2_event(struct perf_tool *tool, @@ -2357,33 +2246,11 @@ static int process_mmap2_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (perf_event__process_mmap2(tool, event, sample, machine) < 0) return -1; - thread = machine__findnew_thread(machine, event->mmap2.pid, event->mmap2.tid); - if (thread == NULL) { - pr_debug("problem processing MMAP2 event, skipping it.\n"); - return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->mmap2.tid; - sample->pid = event->mmap2.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_MMAP2, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, event->mmap2.pid, + event->mmap2.tid); } static int process_switch_event(struct perf_tool *tool, @@ -2391,10 +2258,7 @@ static int process_switch_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); if (perf_event__process_switch(tool, event, sample, machine) < 0) return -1; @@ -2405,20 +2269,8 @@ static int process_switch_event(struct perf_tool *tool, if (!script->show_switch_events) return 0; - thread = machine__findnew_thread(machine, sample->pid, - sample->tid); - if (thread == NULL) { - pr_debug("problem processing SWITCH event, skipping it.\n"); - 
return -1; - } - - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_SWITCH, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static int @@ -2427,23 +2279,8 @@ process_lost_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - struct thread *thread; - - thread = machine__findnew_thread(machine, sample->pid, - sample->tid); - if (thread == NULL) - return -1; - - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_LOST, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static int @@ -2462,33 +2299,11 @@ process_bpf_events(struct perf_tool *tool __maybe_unused, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (machine__process_ksymbol(machine, event, sample) < 0) return -1; - if (!evsel->core.attr.sample_id_all) { - perf_event__fprintf(event, stdout); - return 0; - } - - thread = machine__findnew_thread(machine, sample->pid, sample->tid); - if (thread == NULL) { - pr_debug("problem processing MMAP event, skipping it.\n"); - return -1; - } - - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - event->header.type, stdout); - perf_event__fprintf(event, stdout); - } - - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static void sig_handler(int sig __maybe_unused) @@ -3145,7 +2960,7 @@ static int check_ev_match(char *dir_name, char *scriptname, match = 0; evlist__for_each_entry(session->evlist, pos) { - if (!strcmp(perf_evsel__name(pos), evname)) { + if (!strcmp(evsel__name(pos), evname)) { match = 1; break; } @@ -3342,6 +3157,12 @@ static void script__setup_sample_type(struct perf_script *script) else callchain_param.record_mode = CALLCHAIN_FP; } + + if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) { + pr_warning("Can't find LBR callchain. 
Switch off --stitch-lbr.\n" + "Please apply --call-graph lbr when recording.\n"); + script->stitch_lbr = false; + } } static int process_stat_round_event(struct perf_session *session, @@ -3653,6 +3474,8 @@ int cmd_script(int argc, const char **argv) "file", "file saving guest os /proc/kallsyms"), OPT_STRING(0, "guestmodules", &symbol_conf.default_guest_modules, "file", "file saving guest os /proc/modules"), + OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr, + "Enable LBR callgraph stitching approach"), OPTS_EVSWITCH(&script.evswitch), OPT_END() }; @@ -3709,7 +3532,7 @@ int cmd_script(int argc, const char **argv) return -1; } - if (itrace_synth_opts.callchain && + if ((itrace_synth_opts.callchain || itrace_synth_opts.add_callchain) && itrace_synth_opts.callchain_sz > scripting_max_stack) scripting_max_stack = itrace_synth_opts.callchain_sz; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index ec053dc1e35c..e0c1ad23c768 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -238,9 +238,8 @@ static int write_stat_round_event(u64 tm, u64 type) #define SID(e, x, y) xyarray__entry(e->core.sample_id, x, y) -static int -perf_evsel__write_stat_event(struct evsel *counter, u32 cpu, u32 thread, - struct perf_counts_values *count) +static int evsel__write_stat_event(struct evsel *counter, u32 cpu, u32 thread, + struct perf_counts_values *count) { struct perf_sample_id *sid = SID(counter, cpu, thread); @@ -259,7 +258,7 @@ static int read_single_counter(struct evsel *counter, int cpu, count->val = val; return 0; } - return perf_evsel__read_counter(counter, cpu, thread); + return evsel__read_counter(counter, cpu, thread); } /* @@ -284,7 +283,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu) /* * The leader's group read loads data into its group members - * (via perf_evsel__read_counter()) and sets their count->loaded. + * (via evsel__read_counter()) and sets their count->loaded. 
*/ if (!perf_counts__is_loaded(counter->counts, cpu, thread) && read_single_counter(counter, cpu, thread, rs)) { @@ -297,7 +296,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu) perf_counts__set_loaded(counter->counts, cpu, thread, false); if (STAT_RECORD) { - if (perf_evsel__write_stat_event(counter, cpu, thread, count)) { + if (evsel__write_stat_event(counter, cpu, thread, count)) { pr_err("failed to write stat event\n"); return -1; } @@ -306,7 +305,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu) if (verbose > 1) { fprintf(stat_config.output, "%s: %d: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - perf_evsel__name(counter), + evsel__name(counter), cpu, count->val, count->ena, count->run); } @@ -359,6 +358,7 @@ static void process_interval(void) clock_gettime(CLOCK_MONOTONIC, &ts); diff_timespec(&rs, &ts, &ref_time); + perf_stat__reset_shadow_per_stat(&rt_stat); read_counters(&rs); if (STAT_RECORD) { @@ -409,7 +409,7 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf workload_exec_errno = info->si_value.sival_int; } -static bool perf_evsel__should_store_id(struct evsel *counter) +static bool evsel__should_store_id(struct evsel *counter) { return STAT_RECORD || counter->core.attr.read_format & PERF_FORMAT_ID; } @@ -454,7 +454,7 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) errno == ENXIO) { if (verbose > 0) ui__warning("%s event is not supported by the kernel.\n", - perf_evsel__name(counter)); + evsel__name(counter)); counter->supported = false; /* * errored is a sticky flag that means one of the counter's @@ -465,7 +465,7 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) if ((counter->leader != counter) || !(counter->leader->core.nr_members > 1)) return COUNTER_SKIP; - } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { + } else if (evsel__fallback(counter, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); return COUNTER_RETRY; @@ -483,8 +483,7 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) } } - perf_evsel__open_strerror(counter, &target, - errno, msg, sizeof(msg)); + evsel__open_strerror(counter, &target, errno, msg, sizeof(msg)); ui__error("%s\n", msg); if (child_pid != -1) @@ -604,7 +603,7 @@ try_again: if (!counter->reset_group) continue; try_again_reset: - pr_debug2("reopening weak %s\n", perf_evsel__name(counter)); + pr_debug2("reopening weak %s\n", evsel__name(counter)); if (create_perf_stat_counter(counter, &stat_config, &target, counter->cpu_iter - 1) < 0) { @@ -635,14 +634,14 @@ try_again_reset: if (l > stat_config.unit_width) stat_config.unit_width = l; - if (perf_evsel__should_store_id(counter) && - perf_evsel__store_ids(counter, evsel_list)) + if (evsel__should_store_id(counter) && + evsel__store_ids(counter, evsel_list)) return -1; } if (perf_evlist__apply_filters(evsel_list, &counter)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", - counter->filter, perf_evsel__name(counter), errno, + counter->filter, evsel__name(counter), errno, str_error_r(errno, msg, sizeof(msg))); return -1; } @@ -686,8 +685,11 @@ try_again_reset: break; } } - if (child_pid != -1) + if (child_pid != -1) { + if (timeout) + kill(child_pid, SIGTERM); wait4(child_pid, &status, 0, &stat_config.ru_data); + } if (workload_exec_errno) { const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); diff --git a/tools/perf/builtin-timechart.c 
b/tools/perf/builtin-timechart.c index 9e84fae9b096..c76f84b174c4 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -579,8 +579,8 @@ process_sample_cpu_idle(struct timechart *tchart __maybe_unused, struct perf_sample *sample, const char *backtrace __maybe_unused) { - u32 state = perf_evsel__intval(evsel, sample, "state"); - u32 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id"); + u32 state = evsel__intval(evsel, sample, "state"); + u32 cpu_id = evsel__intval(evsel, sample, "cpu_id"); if (state == (u32)PWR_EVENT_EXIT) c_state_end(tchart, cpu_id, sample->time); @@ -595,8 +595,8 @@ process_sample_cpu_frequency(struct timechart *tchart, struct perf_sample *sample, const char *backtrace __maybe_unused) { - u32 state = perf_evsel__intval(evsel, sample, "state"); - u32 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id"); + u32 state = evsel__intval(evsel, sample, "state"); + u32 cpu_id = evsel__intval(evsel, sample, "cpu_id"); p_state_change(tchart, cpu_id, sample->time, state); return 0; @@ -608,9 +608,9 @@ process_sample_sched_wakeup(struct timechart *tchart, struct perf_sample *sample, const char *backtrace) { - u8 flags = perf_evsel__intval(evsel, sample, "common_flags"); - int waker = perf_evsel__intval(evsel, sample, "common_pid"); - int wakee = perf_evsel__intval(evsel, sample, "pid"); + u8 flags = evsel__intval(evsel, sample, "common_flags"); + int waker = evsel__intval(evsel, sample, "common_pid"); + int wakee = evsel__intval(evsel, sample, "pid"); sched_wakeup(tchart, sample->cpu, sample->time, waker, wakee, flags, backtrace); return 0; @@ -622,9 +622,9 @@ process_sample_sched_switch(struct timechart *tchart, struct perf_sample *sample, const char *backtrace) { - int prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"); - int next_pid = perf_evsel__intval(evsel, sample, "next_pid"); - u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state"); + int prev_pid = evsel__intval(evsel, sample, "prev_pid"); + int next_pid = evsel__intval(evsel, sample, "next_pid"); + u64 prev_state = evsel__intval(evsel, sample, "prev_state"); sched_switch(tchart, sample->cpu, sample->time, prev_pid, next_pid, prev_state, backtrace); @@ -638,8 +638,8 @@ process_sample_power_start(struct timechart *tchart __maybe_unused, struct perf_sample *sample, const char *backtrace __maybe_unused) { - u64 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id"); - u64 value = perf_evsel__intval(evsel, sample, "value"); + u64 cpu_id = evsel__intval(evsel, sample, "cpu_id"); + u64 value = evsel__intval(evsel, sample, "value"); c_state_start(cpu_id, sample->time, value); return 0; @@ -661,8 +661,8 @@ process_sample_power_frequency(struct timechart *tchart, struct perf_sample *sample, const char *backtrace __maybe_unused) { - u64 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id"); - u64 value = perf_evsel__intval(evsel, sample, "value"); + u64 cpu_id = evsel__intval(evsel, sample, "cpu_id"); + u64 value = evsel__intval(evsel, sample, "value"); p_state_change(tchart, cpu_id, sample->time, value); return 0; @@ -843,7 +843,7 @@ process_enter_read(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long fd = perf_evsel__intval(evsel, sample, "fd"); + long fd = evsel__intval(evsel, sample, "fd"); return pid_begin_io_sample(tchart, sample->tid, IOTYPE_READ, sample->time, fd); } @@ -853,7 +853,7 @@ process_exit_read(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long ret = perf_evsel__intval(evsel, sample, "ret"); + long ret = 
evsel__intval(evsel, sample, "ret"); return pid_end_io_sample(tchart, sample->tid, IOTYPE_READ, sample->time, ret); } @@ -863,7 +863,7 @@ process_enter_write(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long fd = perf_evsel__intval(evsel, sample, "fd"); + long fd = evsel__intval(evsel, sample, "fd"); return pid_begin_io_sample(tchart, sample->tid, IOTYPE_WRITE, sample->time, fd); } @@ -873,7 +873,7 @@ process_exit_write(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long ret = perf_evsel__intval(evsel, sample, "ret"); + long ret = evsel__intval(evsel, sample, "ret"); return pid_end_io_sample(tchart, sample->tid, IOTYPE_WRITE, sample->time, ret); } @@ -883,7 +883,7 @@ process_enter_sync(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long fd = perf_evsel__intval(evsel, sample, "fd"); + long fd = evsel__intval(evsel, sample, "fd"); return pid_begin_io_sample(tchart, sample->tid, IOTYPE_SYNC, sample->time, fd); } @@ -893,7 +893,7 @@ process_exit_sync(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long ret = perf_evsel__intval(evsel, sample, "ret"); + long ret = evsel__intval(evsel, sample, "ret"); return pid_end_io_sample(tchart, sample->tid, IOTYPE_SYNC, sample->time, ret); } @@ -903,7 +903,7 @@ process_enter_tx(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long fd = perf_evsel__intval(evsel, sample, "fd"); + long fd = evsel__intval(evsel, sample, "fd"); return pid_begin_io_sample(tchart, sample->tid, IOTYPE_TX, sample->time, fd); } @@ -913,7 +913,7 @@ process_exit_tx(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long ret = perf_evsel__intval(evsel, sample, "ret"); + long ret = evsel__intval(evsel, sample, "ret"); return pid_end_io_sample(tchart, sample->tid, IOTYPE_TX, sample->time, ret); } @@ -923,7 +923,7 @@ process_enter_rx(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long fd = perf_evsel__intval(evsel, sample, "fd"); + long fd = evsel__intval(evsel, sample, "fd"); return pid_begin_io_sample(tchart, sample->tid, IOTYPE_RX, sample->time, fd); } @@ -933,7 +933,7 @@ process_exit_rx(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long ret = perf_evsel__intval(evsel, sample, "ret"); + long ret = evsel__intval(evsel, sample, "ret"); return pid_end_io_sample(tchart, sample->tid, IOTYPE_RX, sample->time, ret); } @@ -943,7 +943,7 @@ process_enter_poll(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long fd = perf_evsel__intval(evsel, sample, "fd"); + long fd = evsel__intval(evsel, sample, "fd"); return pid_begin_io_sample(tchart, sample->tid, IOTYPE_POLL, sample->time, fd); } @@ -953,7 +953,7 @@ process_exit_poll(struct timechart *tchart, struct evsel *evsel, struct perf_sample *sample) { - long ret = perf_evsel__intval(evsel, sample, "ret"); + long ret = evsel__intval(evsel, sample, "ret"); return pid_end_io_sample(tchart, sample->tid, IOTYPE_POLL, sample->time, ret); } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 289cf83e658a..372c38254654 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -33,6 +33,7 @@ #include "util/map.h" #include "util/mmap.h" #include "util/session.h" +#include "util/thread.h" #include "util/symbol.h" #include "util/synthetic-events.h" #include "util/top.h" @@ -254,7 +255,7 @@ static void perf_top__show_details(struct perf_top *top) if 
(notes->src == NULL) goto out_unlock; - printf("Showing %s for %s\n", perf_evsel__name(top->sym_evsel), symbol->name); + printf("Showing %s for %s\n", evsel__name(top->sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", top->annotation_opts.min_pcnt); more = symbol__annotate_printf(&he->ms, top->sym_evsel, &top->annotation_opts); @@ -297,8 +298,7 @@ static void perf_top__resort_hists(struct perf_top *t) hists__collapse_resort(hists, NULL); /* Non-group events are considered as leader */ - if (symbol_conf.event_group && - !perf_evsel__is_group_leader(pos)) { + if (symbol_conf.event_group && !evsel__is_group_leader(pos)) { struct hists *leader_hists = evsel__hists(pos->leader); hists__match(leader_hists, hists); @@ -441,7 +441,7 @@ static void perf_top__print_mapped_keys(struct perf_top *top) fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top->print_entries); if (top->evlist->core.nr_entries > 1) - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", perf_evsel__name(top->sym_evsel)); + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", evsel__name(top->sym_evsel)); fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter); @@ -528,13 +528,13 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c) fprintf(stderr, "\nAvailable events:"); evlist__for_each_entry(top->evlist, top->sym_evsel) - fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, perf_evsel__name(top->sym_evsel)); + fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, evsel__name(top->sym_evsel)); prompt_integer(&counter, "Enter details event counter"); if (counter >= top->evlist->core.nr_entries) { top->sym_evsel = evlist__first(top->evlist); - fprintf(stderr, "Sorry, no such event, using %s.\n", perf_evsel__name(top->sym_evsel)); + fprintf(stderr, "Sorry, no such event, using %s.\n", evsel__name(top->sym_evsel)); sleep(1); break; } @@ -775,6 +775,9 @@ static void perf_event__process_sample(struct perf_tool *tool, if (machine__resolve(machine, &al, sample) < 0) return; + if (top->stitch_lbr) + al.thread->lbr_stitch_enable = true; + if (!machine->kptr_restrict_warned && symbol_conf.kptr_restrict && al.cpumode == PERF_RECORD_MISC_KERNEL) { @@ -1042,14 +1045,13 @@ try_again: perf_top_overwrite_fallback(top, counter)) goto try_again; - if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { + if (evsel__fallback(counter, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); goto try_again; } - perf_evsel__open_strerror(counter, &opts->target, - errno, msg, sizeof(msg)); + evsel__open_strerror(counter, &opts->target, errno, msg, sizeof(msg)); ui__error("%s\n", msg); goto out_err; } @@ -1571,10 +1573,11 @@ int cmd_top(int argc, const char **argv) "Sort the output by the event at the index n in group. " "If n is invalid, sort by the first event. 
" "WARNING: should be used on grouped events."), + OPT_BOOLEAN(0, "stitch-lbr", &top.stitch_lbr, + "Enable LBR callgraph stitching approach"), OPTS_EVSWITCH(&top.evswitch), OPT_END() }; - struct evlist *sb_evlist = NULL; const char * const top_usage[] = { "perf top [<options>]", NULL @@ -1640,6 +1643,11 @@ int cmd_top(int argc, const char **argv) } } + if (top.stitch_lbr && !(callchain_param.record_mode == CALLCHAIN_LBR)) { + pr_err("Error: --stitch-lbr must be used with --call-graph lbr\n"); + goto out_delete_evlist; + } + if (opts->branch_stack && callchain_param.enabled) symbol_conf.show_branchflag_count = true; @@ -1732,10 +1740,21 @@ int cmd_top(int argc, const char **argv) goto out_delete_evlist; } - if (!top.record_opts.no_bpf_event) - bpf_event__add_sb_event(&sb_evlist, &perf_env); + if (!top.record_opts.no_bpf_event) { + top.sb_evlist = evlist__new(); + + if (top.sb_evlist == NULL) { + pr_err("Couldn't create side band evlist.\n."); + goto out_delete_evlist; + } + + if (evlist__add_bpf_sb_event(top.sb_evlist, &perf_env)) { + pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); + goto out_delete_evlist; + } + } - if (perf_evlist__start_sb_thread(sb_evlist, target)) { + if (perf_evlist__start_sb_thread(top.sb_evlist, target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); opts->no_bpf_event = true; } @@ -1743,7 +1762,7 @@ int cmd_top(int argc, const char **argv) status = __cmd_top(&top); if (!opts->no_bpf_event) - perf_evlist__stop_sb_thread(sb_evlist); + perf_evlist__stop_sb_thread(top.sb_evlist); out_delete_evlist: evlist__delete(top.evlist); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 01d542007c8b..a46efb907bd4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -366,11 +366,9 @@ out_delete: return NULL; } -static int perf_evsel__init_tp_uint_field(struct evsel *evsel, - struct tp_field *field, - const char *name) +static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name) { - struct tep_format_field *format_field = perf_evsel__field(evsel, name); + struct tep_format_field *format_field = evsel__field(evsel, name); if (format_field == NULL) return -1; @@ -380,13 +378,11 @@ static int perf_evsel__init_tp_uint_field(struct evsel *evsel, #define perf_evsel__init_sc_tp_uint_field(evsel, name) \ ({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\ - perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); }) + evsel__init_tp_uint_field(evsel, &sc->name, #name); }) -static int perf_evsel__init_tp_ptr_field(struct evsel *evsel, - struct tp_field *field, - const char *name) +static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name) { - struct tep_format_field *format_field = perf_evsel__field(evsel, name); + struct tep_format_field *format_field = evsel__field(evsel, name); if (format_field == NULL) return -1; @@ -396,7 +392,7 @@ static int perf_evsel__init_tp_ptr_field(struct evsel *evsel, #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \ ({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\ - perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); }) + evsel__init_tp_ptr_field(evsel, &sc->name, #name); }) static void evsel__delete_priv(struct evsel *evsel) { @@ -404,13 +400,13 @@ static void evsel__delete_priv(struct evsel *evsel) evsel__delete(evsel); } -static int perf_evsel__init_syscall_tp(struct evsel *evsel) +static int evsel__init_syscall_tp(struct 
evsel *evsel) { struct syscall_tp *sc = evsel__syscall_tp(evsel); if (sc != NULL) { - if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") && - perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr")) + if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") && + evsel__init_tp_uint_field(evsel, &sc->id, "nr")) return -ENOENT; return 0; } @@ -418,14 +414,14 @@ static int perf_evsel__init_syscall_tp(struct evsel *evsel) return -ENOMEM; } -static int perf_evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp) +static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp) { struct syscall_tp *sc = evsel__syscall_tp(evsel); if (sc != NULL) { - struct tep_format_field *syscall_id = perf_evsel__field(tp, "id"); + struct tep_format_field *syscall_id = evsel__field(tp, "id"); if (syscall_id == NULL) - syscall_id = perf_evsel__field(tp, "__syscall_nr"); + syscall_id = evsel__field(tp, "__syscall_nr"); if (syscall_id == NULL || __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap)) return -EINVAL; @@ -436,21 +432,21 @@ static int perf_evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evs return -ENOMEM; } -static int perf_evsel__init_augmented_syscall_tp_args(struct evsel *evsel) +static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel) { struct syscall_tp *sc = __evsel__syscall_tp(evsel); return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)); } -static int perf_evsel__init_augmented_syscall_tp_ret(struct evsel *evsel) +static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel) { struct syscall_tp *sc = __evsel__syscall_tp(evsel); return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap); } -static int perf_evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler) +static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler) { if (evsel__syscall_tp(evsel) != NULL) { if (perf_evsel__init_sc_tp_uint_field(evsel, id)) @@ -474,7 +470,7 @@ static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void * if (IS_ERR(evsel)) return NULL; - if (perf_evsel__init_raw_syscall_tp(evsel, handler)) + if (evsel__init_raw_syscall_tp(evsel, handler)) goto out_delete; return evsel; @@ -1801,7 +1797,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) return syscall__set_arg_fmts(sc); } -static int perf_evsel__init_tp_arg_scnprintf(struct evsel *evsel) +static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) { struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); @@ -2074,7 +2070,7 @@ static struct syscall *trace__syscall_info(struct trace *trace, if (verbose > 1) { static u64 n; fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", - id, perf_evsel__name(evsel), ++n); + id, evsel__name(evsel), ++n); } return NULL; } @@ -2206,7 +2202,7 @@ static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel, double ts = (double)sample->time / NSEC_PER_MSEC; printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n", - perf_evsel__name(evsel), ts, + evsel__name(evsel), ts, thread__comm_str(thread), sample->pid, sample->tid, sample->cpu); } @@ -2382,7 +2378,7 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam static const char *errno_to_name(struct evsel *evsel, int err) { - struct perf_env *env = perf_evsel__env(evsel); + struct perf_env *env = evsel__env(evsel); const char *arch_name = perf_env__arch(env); return 
arch_syscalls__strerrno(arch_name, err); @@ -2513,7 +2509,7 @@ errno_print: { if (callchain_ret > 0) trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) - pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); + pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel)); out: ttrace->entry_pending = false; err = 0; @@ -2531,7 +2527,7 @@ static int trace__vfs_getname(struct trace *trace, struct evsel *evsel, size_t filename_len, entry_str_len, to_move; ssize_t remaining_space; char *pos; - const char *filename = perf_evsel__rawptr(evsel, sample, "pathname"); + const char *filename = evsel__rawptr(evsel, sample, "pathname"); if (!thread) goto out; @@ -2587,7 +2583,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { - u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); + u64 runtime = evsel__intval(evsel, sample, "runtime"); double runtime_ms = (double)runtime / NSEC_PER_MSEC; struct thread *thread = machine__findnew_thread(trace->host, sample->pid, @@ -2606,10 +2602,10 @@ out_put: out_dump: fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n", evsel->name, - perf_evsel__strval(evsel, sample, "comm"), - (pid_t)perf_evsel__intval(evsel, sample, "pid"), + evsel__strval(evsel, sample, "comm"), + (pid_t)evsel__intval(evsel, sample, "pid"), runtime, - perf_evsel__intval(evsel, sample, "vruntime")); + evsel__intval(evsel, sample, "vruntime")); goto out_put; } @@ -2774,7 +2770,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, fprintf(trace->output, "%s(", evsel->name); - if (perf_evsel__is_bpf_output(evsel)) { + if (evsel__is_bpf_output(evsel)) { bpf_output__fprintf(trace, sample); } else if (evsel->tp_format) { if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || @@ -2795,7 +2791,7 @@ newline: if (callchain_ret > 0) trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) - pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); + pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel)); ++trace->nr_events_printed; @@ -2890,7 +2886,7 @@ static int trace__pgfault(struct trace *trace, if (callchain_ret > 0) trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) - pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); + pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel)); ++trace->nr_events_printed; out: @@ -3032,10 +3028,10 @@ static bool evlist__add_vfs_getname(struct evlist *evlist) } evlist__for_each_entry_safe(evlist, evsel, tmp) { - if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname")) + if (!strstarts(evsel__name(evsel), "probe:vfs_getname")) continue; - if (perf_evsel__field(evsel, "pathname")) { + if (evsel__field(evsel, "pathname")) { evsel->handler = trace__vfs_getname; found = true; continue; @@ -3093,7 +3089,7 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT && sample->raw_data == NULL) { fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", - perf_evsel__name(evsel), sample->tid, + evsel__name(evsel), sample->tid, sample->cpu, sample->raw_size); } else { tracepoint_handler handler = evsel->handler; @@ -3124,8 +3120,8 @@ static int trace__add_syscall_newtp(struct trace *trace) if 
(perf_evsel__init_sc_tp_uint_field(sys_exit, ret)) goto out_delete_sys_exit; - perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param); - perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param); + evsel__config_callchain(sys_enter, &trace->opts, &callchain_param); + evsel__config_callchain(sys_exit, &trace->opts, &callchain_param); evlist__add(evlist, sys_enter); evlist__add(evlist, sys_exit); @@ -3164,10 +3160,9 @@ static int trace__set_ev_qualifier_tp_filter(struct trace *trace) if (filter == NULL) goto out_enomem; - if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter, - filter)) { + if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) { sys_exit = trace->syscalls.events.sys_exit; - err = perf_evsel__append_tp_filter(sys_exit, filter); + err = evsel__append_tp_filter(sys_exit, filter); } free(filter); @@ -3695,7 +3690,7 @@ static int ordered_events__deliver_event(struct ordered_events *oe, return __trace__deliver_event(trace, event->event); } -static struct syscall_arg_fmt *perf_evsel__syscall_arg_fmt(struct evsel *evsel, char *arg) +static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg) { struct tep_format_field *field; struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel); @@ -3750,7 +3745,7 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel scnprintf(arg, sizeof(arg), "%.*s", left_size, left); - fmt = perf_evsel__syscall_arg_fmt(evsel, arg); + fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg); if (fmt == NULL) { pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n", arg, evsel->name, evsel->filter); @@ -3801,7 +3796,7 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel if (new_filter != evsel->filter) { pr_debug("New filter for %s: %s\n", evsel->name, new_filter); - perf_evsel__set_filter(evsel, new_filter); + evsel__set_filter(evsel, new_filter); free(new_filter); } @@ -3849,7 +3844,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); if (pgfault_maj == NULL) goto out_error_mem; - perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); + evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); evlist__add(evlist, pgfault_maj); } @@ -3857,7 +3852,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); if (pgfault_min == NULL) goto out_error_mem; - perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); + evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); evlist__add(evlist, pgfault_min); } @@ -4108,7 +4103,7 @@ out_error: out_error_apply_filters: fprintf(trace->output, "Failed to set filter \"%s\" on event %s with %d (%s)\n", - evsel->filter, perf_evsel__name(evsel), errno, + evsel->filter, evsel__name(evsel), errno, str_error_r(errno, errbuf, sizeof(errbuf))); goto out_delete_evlist; } @@ -4179,7 +4174,7 @@ static int trace__replay(struct trace *trace) "syscalls:sys_enter"); if (evsel && - (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 || + (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 || perf_evsel__init_sc_tp_ptr_field(evsel, args))) { pr_err("Error during initialize raw_syscalls:sys_enter event\n"); goto out; @@ -4191,7 +4186,7 @@ static int trace__replay(struct trace *trace) evsel = 
perf_evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit"); if (evsel && - (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 || + (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 || perf_evsel__init_sc_tp_uint_field(evsel, ret))) { pr_err("Error during initialize raw_syscalls:sys_exit event\n"); goto out; @@ -4471,11 +4466,11 @@ static int evlist__set_syscall_tp_fields(struct evlist *evlist) continue; if (strcmp(evsel->tp_format->system, "syscalls")) { - perf_evsel__init_tp_arg_scnprintf(evsel); + evsel__init_tp_arg_scnprintf(evsel); continue; } - if (perf_evsel__init_syscall_tp(evsel)) + if (evsel__init_syscall_tp(evsel)) return -1; if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) { @@ -4989,7 +4984,7 @@ int cmd_trace(int argc, const char **argv) */ if (trace.syscalls.events.augmented) { evlist__for_each_entry(trace.evlist, evsel) { - bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0; if (raw_syscalls_sys_exit) { trace.raw_augmented_syscalls = true; @@ -4997,10 +4992,10 @@ int cmd_trace(int argc, const char **argv) } if (trace.syscalls.events.augmented->priv == NULL && - strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) { + strstr(evsel__name(evsel), "syscalls:sys_enter")) { struct evsel *augmented = trace.syscalls.events.augmented; - if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) || - perf_evsel__init_augmented_syscall_tp_args(augmented)) + if (evsel__init_augmented_syscall_tp(augmented, evsel) || + evsel__init_augmented_syscall_tp_args(augmented)) goto out; /* * Augmented is __augmented_syscalls__ BPF_OUTPUT event @@ -5014,16 +5009,16 @@ int cmd_trace(int argc, const char **argv) * as not to filter it, then we'll handle it just like we would * for the BPF_OUTPUT one: */ - if (perf_evsel__init_augmented_syscall_tp(evsel, evsel) || - perf_evsel__init_augmented_syscall_tp_args(evsel)) + if (evsel__init_augmented_syscall_tp(evsel, evsel) || + evsel__init_augmented_syscall_tp_args(evsel)) goto out; evsel->handler = trace__sys_enter; } - if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) { + if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) { struct syscall_tp *sc; init_augmented_syscall_tp: - if (perf_evsel__init_augmented_syscall_tp(evsel, evsel)) + if (evsel__init_augmented_syscall_tp(evsel, evsel)) goto out; sc = __evsel__syscall_tp(evsel); /* @@ -5047,7 +5042,7 @@ init_augmented_syscall_tp: */ if (trace.raw_augmented_syscalls) trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset; - perf_evsel__init_augmented_syscall_tp_ret(evsel); + evsel__init_augmented_syscall_tp_ret(evsel); evsel->handler = trace__sys_exit; } } diff --git a/tools/perf/design.txt b/tools/perf/design.txt index 0453ba26cdbd..a42fab308ff6 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -258,7 +258,8 @@ gets schedule to. Per task counters can be created by any user, for their own tasks. A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts -all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. +all events on CPU-x. Per CPU counters need CAP_PERFMON or CAP_SYS_ADMIN +privilege. The 'flags' parameter is currently unused and must be zero. 
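To make the per-CPU case above concrete, here is a minimal userspace sketch (illustrative only, not taken from this patch series) that opens such a counter through the raw perf_event_open() syscall with pid == -1 and cpu == x, passing 0 for the unused 'flags' argument; the choice of cpu == 0 and a hardware cycles event are assumptions for the example. It presumes the caller holds CAP_PERFMON or CAP_SYS_ADMIN, or runs under a permissive perf_event_paranoid setting:

	/* Sketch: count cycles for all tasks on one CPU (pid == -1, cpu == x). */
	#include <linux/perf_event.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int fd, cpu = 0;	/* "x": the CPU to monitor (assumed 0 here) */

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.disabled = 1;

		/* pid == -1, cpu == x: a per-CPU counter counting all events on that CPU */
		fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
		if (fd < 0) {
			perror("perf_event_open (needs CAP_PERFMON or CAP_SYS_ADMIN)");
			return 1;
		}

		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
		sleep(1);
		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("cycles on CPU %d: %llu\n", cpu, (unsigned long long)count);

		close(fd);
		return 0;
	}

Swapping the arguments to pid == getpid() and cpu == -1 would give the per-task variant from the preceding paragraph, which any user may create for their own tasks without extra privilege.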
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json new file mode 100644 index 000000000000..c121e526442a --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json @@ -0,0 +1,19 @@ +[ + { + "MetricExpr": "(hv_24x7@PM_MCS01_128B_RD_DISP_PORT01\\,chip\\=?@ + hv_24x7@PM_MCS01_128B_RD_DISP_PORT23\\,chip\\=?@ + hv_24x7@PM_MCS23_128B_RD_DISP_PORT01\\,chip\\=?@ + hv_24x7@PM_MCS23_128B_RD_DISP_PORT23\\,chip\\=?@)", + "MetricName": "Memory_RD_BW_Chip", + "MetricGroup": "Memory_BW", + "ScaleUnit": "1.6e-2MB" + }, + { + "MetricExpr": "(hv_24x7@PM_MCS01_128B_WR_DISP_PORT01\\,chip\\=?@ + hv_24x7@PM_MCS01_128B_WR_DISP_PORT23\\,chip\\=?@ + hv_24x7@PM_MCS23_128B_WR_DISP_PORT01\\,chip\\=?@ + hv_24x7@PM_MCS23_128B_WR_DISP_PORT23\\,chip\\=?@ )", + "MetricName": "Memory_WR_BW_Chip", + "MetricGroup": "Memory_BW", + "ScaleUnit": "1.6e-2MB" + }, + { + "MetricExpr": "(hv_24x7@PM_PB_CYC\\,chip\\=?@ )", + "MetricName": "PowerBUS_Frequency", + "ScaleUnit": "2.5e-7GHz" + } +] diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index 53e76d5d5b37..c8f306b572f4 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -26,7 +26,7 @@ struct pmu_event { * Map a CPU to its table of PMU events. The CPU is identified by the * cpuid field, which is an arch-specific identifier for the CPU. * The identifier specified in tools/perf/pmu-events/arch/xxx/mapfile - * must match the get_cpustr() in tools/perf/arch/xxx/util/header.c) + * must match the get_cpuid_str() in tools/perf/arch/xxx/util/header.c) * * The cpuid can contain any character other than the comma. */ diff --git a/tools/perf/scripts/python/bin/flamegraph-record b/tools/perf/scripts/python/bin/flamegraph-record new file mode 100755 index 000000000000..7df5a19c0163 --- /dev/null +++ b/tools/perf/scripts/python/bin/flamegraph-record @@ -0,0 +1,2 @@ +#!/bin/bash +perf record -g "$@" diff --git a/tools/perf/scripts/python/bin/flamegraph-report b/tools/perf/scripts/python/bin/flamegraph-report new file mode 100755 index 000000000000..53c5dc90c87e --- /dev/null +++ b/tools/perf/scripts/python/bin/flamegraph-report @@ -0,0 +1,3 @@ +#!/bin/bash +# description: create flame graphs +perf script -s "$PERF_EXEC_PATH"/scripts/python/flamegraph.py -- "$@" diff --git a/tools/perf/scripts/python/flamegraph.py b/tools/perf/scripts/python/flamegraph.py new file mode 100755 index 000000000000..61f3be9add6b --- /dev/null +++ b/tools/perf/scripts/python/flamegraph.py @@ -0,0 +1,124 @@ +# flamegraph.py - create flame graphs from perf samples +# SPDX-License-Identifier: GPL-2.0 +# +# Usage: +# +# perf record -a -g -F 99 sleep 60 +# perf script report flamegraph +# +# Combined: +# +# perf script flamegraph -a -F 99 sleep 60 +# +# Written by Andreas Gerstmayr <agerstmayr@redhat.com> +# Flame Graphs invented by Brendan Gregg <bgregg@netflix.com> +# Works in tandem with d3-flame-graph by Martin Spier <mspier@netflix.com> + +from __future__ import print_function +import sys +import os +import argparse +import json + + +class Node: + def __init__(self, name, libtype=""): + self.name = name + self.libtype = libtype + self.value = 0 + self.children = [] + + def toJSON(self): + return { + "n": self.name, + "l": self.libtype, + "v": self.value, + "c": self.children + } + + +class FlameGraphCLI: + def __init__(self, args): + self.args = args + self.stack = Node("root") + + if self.args.format == "html" and \ + not 
os.path.isfile(self.args.template): + print("Flame Graph template {} does not exist. Please install " + "the js-d3-flame-graph (RPM) or libjs-d3-flame-graph (deb) " + "package, specify an existing flame graph template " + "(--template PATH) or another output format " + "(--format FORMAT).".format(self.args.template), + file=sys.stderr) + sys.exit(1) + + def find_or_create_node(self, node, name, dso): + libtype = "kernel" if dso == "[kernel.kallsyms]" else "" + if name is None: + name = "[unknown]" + + for child in node.children: + if child.name == name and child.libtype == libtype: + return child + + child = Node(name, libtype) + node.children.append(child) + return child + + def process_event(self, event): + node = self.find_or_create_node(self.stack, event["comm"], None) + if "callchain" in event: + for entry in reversed(event['callchain']): + node = self.find_or_create_node( + node, entry.get("sym", {}).get("name"), event.get("dso")) + else: + node = self.find_or_create_node( + node, event.get("symbol"), event.get("dso")) + node.value += 1 + + def trace_end(self): + json_str = json.dumps(self.stack, default=lambda x: x.toJSON()) + + if self.args.format == "html": + try: + with open(self.args.template) as f: + output_str = f.read().replace("/** @flamegraph_json **/", + json_str) + except IOError as e: + print("Error reading template file: {}".format(e), file=sys.stderr) + sys.exit(1) + output_fn = self.args.output or "flamegraph.html" + else: + output_str = json_str + output_fn = self.args.output or "stacks.json" + + if output_fn == "-": + sys.stdout.write(output_str) + else: + print("dumping data to {}".format(output_fn)) + try: + with open(output_fn, "w") as out: + out.write(output_str) + except IOError as e: + print("Error writing output file: {}".format(e), file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create flame graphs.") + parser.add_argument("-f", "--format", + default="html", choices=["json", "html"], + help="output file format") + parser.add_argument("-o", "--output", + help="output file name") + parser.add_argument("--template", + default="/usr/share/d3-flame-graph/d3-flamegraph-base.html", + help="path to flamegraph HTML template") + parser.add_argument("-i", "--input", + help=argparse.SUPPRESS) + + args = parser.parse_args() + cli = FlameGraphCLI(args) + + process_event = cli.process_event + trace_end = cli.trace_end diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index b3d1bf13ca07..c75557aeef0e 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -56,6 +56,7 @@ perf-y += mem2node.o perf-y += maps.o perf-y += time-utils-test.o perf-y += genelf.o +perf-y += api-io.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/api-io.c b/tools/perf/tests/api-io.c new file mode 100644 index 000000000000..2ada86ad6084 --- /dev/null +++ b/tools/perf/tests/api-io.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "debug.h" +#include "tests.h" +#include <api/io.h> +#include <linux/kernel.h> + +#define TEMPL "/tmp/perf-test-XXXXXX" + +#define EXPECT_EQUAL(val, expected) \ +do { \ + if (val != expected) { \ + pr_debug("%s:%d: %d != %d\n", \ + __FILE__, __LINE__, val, expected); \ + ret = -1; \ + } \ +} while (0) + +#define EXPECT_EQUAL64(val,
expected) \ +do { \ + if (val != expected) { \ + pr_debug("%s:%d: %lld != %lld\n", \ + __FILE__, __LINE__, val, expected); \ + ret = -1; \ + } \ +} while (0) + +static int make_test_file(char path[PATH_MAX], const char *contents) +{ + ssize_t contents_len = strlen(contents); + int fd; + + strcpy(path, TEMPL); + fd = mkstemp(path); + if (fd < 0) { + pr_debug("mkstemp failed"); + return -1; + } + if (write(fd, contents, contents_len) < contents_len) { + pr_debug("short write"); + close(fd); + unlink(path); + return -1; + } + close(fd); + return 0; +} + +static int setup_test(char path[PATH_MAX], const char *contents, + size_t buf_size, struct io *io) +{ + if (make_test_file(path, contents)) + return -1; + + io->fd = open(path, O_RDONLY); + if (io->fd < 0) { + pr_debug("Failed to open '%s'\n", path); + unlink(path); + return -1; + } + io->buf = malloc(buf_size); + if (io->buf == NULL) { + pr_debug("Failed to allocate memory"); + close(io->fd); + unlink(path); + return -1; + } + io__init(io, io->fd, io->buf, buf_size); + return 0; +} + +static void cleanup_test(char path[PATH_MAX], struct io *io) +{ + free(io->buf); + close(io->fd); + unlink(path); +} + +static int do_test_get_char(const char *test_string, size_t buf_size) +{ + char path[PATH_MAX]; + struct io io; + int ch, ret = 0; + size_t i; + + if (setup_test(path, test_string, buf_size, &io)) + return -1; + + for (i = 0; i < strlen(test_string); i++) { + ch = io__get_char(&io); + + EXPECT_EQUAL(ch, test_string[i]); + EXPECT_EQUAL(io.eof, false); + } + ch = io__get_char(&io); + EXPECT_EQUAL(ch, -1); + EXPECT_EQUAL(io.eof, true); + + cleanup_test(path, &io); + return ret; +} + +static int test_get_char(void) +{ + int i, ret = 0; + size_t j; + + static const char *const test_strings[] = { + "12345678abcdef90", + "a\nb\nc\nd\n", + "\a\b\t\v\f\r", + }; + for (i = 0; i <= 10; i++) { + for (j = 0; j < ARRAY_SIZE(test_strings); j++) { + if (do_test_get_char(test_strings[j], 1 << i)) + ret = -1; + } + } + return ret; +} + +static int do_test_get_hex(const char *test_string, + __u64 val1, int ch1, + __u64 val2, int ch2, + __u64 val3, int ch3, + bool end_eof) +{ + char path[PATH_MAX]; + struct io io; + int ch, ret = 0; + __u64 hex; + + if (setup_test(path, test_string, 4, &io)) + return -1; + + ch = io__get_hex(&io, &hex); + EXPECT_EQUAL64(hex, val1); + EXPECT_EQUAL(ch, ch1); + + ch = io__get_hex(&io, &hex); + EXPECT_EQUAL64(hex, val2); + EXPECT_EQUAL(ch, ch2); + + ch = io__get_hex(&io, &hex); + EXPECT_EQUAL64(hex, val3); + EXPECT_EQUAL(ch, ch3); + + EXPECT_EQUAL(io.eof, end_eof); + + cleanup_test(path, &io); + return ret; +} + +static int test_get_hex(void) +{ + int ret = 0; + + if (do_test_get_hex("12345678abcdef90", + 0x12345678abcdef90, -1, + 0, -1, + 0, -1, + true)) + ret = -1; + + if (do_test_get_hex("1\n2\n3\n", + 1, '\n', + 2, '\n', + 3, '\n', + false)) + ret = -1; + + if (do_test_get_hex("12345678ABCDEF90;a;b", + 0x12345678abcdef90, ';', + 0xa, ';', + 0xb, -1, + true)) + ret = -1; + + if (do_test_get_hex("0x1x2x", + 0, 'x', + 1, 'x', + 2, 'x', + false)) + ret = -1; + + if (do_test_get_hex("x1x", + 0, -2, + 1, 'x', + 0, -1, + true)) + ret = -1; + + if (do_test_get_hex("10000000000000000000000000000abcdefgh99i", + 0xabcdef, 'g', + 0, -2, + 0x99, 'i', + false)) + ret = -1; + + return ret; +} + +static int do_test_get_dec(const char *test_string, + __u64 val1, int ch1, + __u64 val2, int ch2, + __u64 val3, int ch3, + bool end_eof) +{ + char path[PATH_MAX]; + struct io io; + int ch, ret = 0; + __u64 dec; + + if (setup_test(path, test_string, 4, 
&io)) + return -1; + + ch = io__get_dec(&io, &dec); + EXPECT_EQUAL64(dec, val1); + EXPECT_EQUAL(ch, ch1); + + ch = io__get_dec(&io, &dec); + EXPECT_EQUAL64(dec, val2); + EXPECT_EQUAL(ch, ch2); + + ch = io__get_dec(&io, &dec); + EXPECT_EQUAL64(dec, val3); + EXPECT_EQUAL(ch, ch3); + + EXPECT_EQUAL(io.eof, end_eof); + + cleanup_test(path, &io); + return ret; +} + +static int test_get_dec(void) +{ + int ret = 0; + + if (do_test_get_dec("12345678abcdef90", + 12345678, 'a', + 0, -2, + 0, -2, + false)) + ret = -1; + + if (do_test_get_dec("1\n2\n3\n", + 1, '\n', + 2, '\n', + 3, '\n', + false)) + ret = -1; + + if (do_test_get_dec("12345678;1;2", + 12345678, ';', + 1, ';', + 2, -1, + true)) + ret = -1; + + if (do_test_get_dec("0x1x2x", + 0, 'x', + 1, 'x', + 2, 'x', + false)) + ret = -1; + + if (do_test_get_dec("x1x", + 0, -2, + 1, 'x', + 0, -1, + true)) + ret = -1; + + if (do_test_get_dec("10000000000000000000000000000000000000000000000000000000000123456789ab99c", + 123456789, 'a', + 0, -2, + 99, 'c', + false)) + ret = -1; + + return ret; +} + +int test__api_io(struct test *test __maybe_unused, + int subtest __maybe_unused) +{ + int ret = 0; + + if (test_get_char()) + ret = TEST_FAIL; + if (test_get_hex()) + ret = TEST_FAIL; + if (test_get_dec()) + ret = TEST_FAIL; + return ret; +} diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index b6322eb0f423..3471ec52ea11 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -310,6 +310,10 @@ static struct test generic_tests[] = { .func = test__jit_write_elf, }, { + .desc = "Test api io", + .func = test__api_io, + }, + { .desc = "maps__merge_in", .func = test__maps__merge_in, }, diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c index 1e8a9f5c356d..db68894a6f40 100644 --- a/tools/perf/tests/event-times.c +++ b/tools/perf/tests/event-times.c @@ -72,7 +72,7 @@ static int attach__current_disabled(struct evlist *evlist) evsel->core.attr.disabled = 1; - err = perf_evsel__open_per_thread(evsel, threads); + err = evsel__open_per_thread(evsel, threads); if (err) { pr_debug("Failed to open event cpu-clock:u\n"); return err; @@ -96,7 +96,7 @@ static int attach__current_enabled(struct evlist *evlist) return -1; } - err = perf_evsel__open_per_thread(evsel, threads); + err = evsel__open_per_thread(evsel, threads); perf_thread_map__put(threads); return err == 0 ? 
TEST_OK : TEST_FAIL; @@ -125,7 +125,7 @@ static int attach__cpu_disabled(struct evlist *evlist) evsel->core.attr.disabled = 1; - err = perf_evsel__open_per_cpu(evsel, cpus, -1); + err = evsel__open_per_cpu(evsel, cpus, -1); if (err) { if (err == -EACCES) return TEST_SKIP; @@ -152,7 +152,7 @@ static int attach__cpu_enabled(struct evlist *evlist) return -1; } - err = perf_evsel__open_per_cpu(evsel, cpus, -1); + err = evsel__open_per_cpu(evsel, cpus, -1); if (err == -EACCES) return TEST_SKIP; diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index c727379cf20e..bdcf032f8516 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -109,7 +109,7 @@ int test__event_update(struct test *test __maybe_unused, int subtest __maybe_unu TEST_ASSERT_VAL("failed to synthesize attr update scale", !perf_event__synthesize_event_update_scale(NULL, evsel, process_event_scale)); - tmp.name = perf_evsel__name(evsel); + tmp.name = evsel__name(evsel); TEST_ASSERT_VAL("failed to synthesize attr update name", !perf_event__synthesize_event_update_name(&tmp.tool, evsel, process_event_name)); diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index 956205bf9326..61ecd8e33a01 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -20,12 +20,11 @@ static int perf_evsel__roundtrip_cache_name_test(void) for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { /* skip invalid cache type */ - if (!perf_evsel__is_cache_op_valid(type, op)) + if (!evsel__is_cache_op_valid(type, op)) continue; for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - __perf_evsel__hw_cache_type_op_res_name(type, op, i, - name, sizeof(name)); + __evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); err = parse_events(evlist, name, NULL); if (err) ret = err; @@ -39,23 +38,22 @@ static int perf_evsel__roundtrip_cache_name_test(void) for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { /* skip invalid cache type */ - if (!perf_evsel__is_cache_op_valid(type, op)) + if (!evsel__is_cache_op_valid(type, op)) continue; for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - __perf_evsel__hw_cache_type_op_res_name(type, op, i, - name, sizeof(name)); + __evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); if (evsel->idx != idx) continue; ++idx; - if (strcmp(perf_evsel__name(evsel), name)) { - pr_debug("%s != %s\n", perf_evsel__name(evsel), name); + if (strcmp(evsel__name(evsel), name)) { + pr_debug("%s != %s\n", evsel__name(evsel), name); ret = -1; } - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); } } } @@ -84,9 +82,9 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names) err = 0; evlist__for_each_entry(evlist, evsel) { - if (strcmp(perf_evsel__name(evsel), names[evsel->idx])) { + if (strcmp(evsel__name(evsel), names[evsel->idx])) { --err; - pr_debug("%s != %s\n", perf_evsel__name(evsel), names[evsel->idx]); + pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx]); } } diff --git a/tools/perf/tests/evsel-tp-sched.c b/tools/perf/tests/evsel-tp-sched.c index 261e6eaaee99..ce8aa32bc3ee 100644 --- a/tools/perf/tests/evsel-tp-sched.c +++ b/tools/perf/tests/evsel-tp-sched.c @@ -8,7 +8,7 @@ static int perf_evsel__test_field(struct evsel *evsel, const char *name, int size, bool should_be_signed) { - struct tep_format_field *field = 
perf_evsel__field(evsel, name); + struct tep_format_field *field = evsel__field(evsel, name); int is_signed; int ret = 0; diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 28313e59d6f6..f9e8e5628836 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -6,11 +6,11 @@ #include <string.h> #include <linux/zalloc.h> -static int test(struct parse_ctx *ctx, const char *e, double val2) +static int test(struct expr_parse_ctx *ctx, const char *e, double val2) { double val; - if (expr__parse(&val, ctx, e)) + if (expr__parse(&val, ctx, e, 1)) TEST_ASSERT_VAL("parse test failed", 0); TEST_ASSERT_VAL("unexpected value", val == val2); return 0; @@ -22,7 +22,7 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) const char **other; double val; int i, ret; - struct parse_ctx ctx; + struct expr_parse_ctx ctx; int num_other; expr__ctx_init(&ctx); @@ -44,21 +44,29 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) return ret; p = "FOO/0"; - ret = expr__parse(&val, &ctx, p); + ret = expr__parse(&val, &ctx, p, 1); TEST_ASSERT_VAL("division by zero", ret == -1); p = "BAR/"; - ret = expr__parse(&val, &ctx, p); + ret = expr__parse(&val, &ctx, p, 1); TEST_ASSERT_VAL("missing operand", ret == -1); TEST_ASSERT_VAL("find other", - expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &other, &num_other) == 0); + expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &other, &num_other, 1) == 0); TEST_ASSERT_VAL("find other", num_other == 3); TEST_ASSERT_VAL("find other", !strcmp(other[0], "BAR")); TEST_ASSERT_VAL("find other", !strcmp(other[1], "BAZ")); TEST_ASSERT_VAL("find other", !strcmp(other[2], "BOZO")); TEST_ASSERT_VAL("find other", other[3] == NULL); + TEST_ASSERT_VAL("find other", + expr__find_other("EVENT1\\,param\\=?@ + EVENT2\\,param\\=?@", NULL, + &other, &num_other, 3) == 0); + TEST_ASSERT_VAL("find other", num_other == 2); + TEST_ASSERT_VAL("find other", !strcmp(other[0], "EVENT1,param=3/")); + TEST_ASSERT_VAL("find other", !strcmp(other[1], "EVENT2,param=3/")); + TEST_ASSERT_VAL("find other", other[2] == NULL); + for (i = 0; i < num_other; i++) zfree(&other[i]); free((void *)other); diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 6367c8f6ca22..7a542f1c1c78 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -280,7 +280,7 @@ static int test1(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = false; symbol_conf.cumulate_callchain = false; - perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + evsel__reset_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); callchain_register_param(&callchain_param); @@ -427,7 +427,7 @@ static int test2(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = true; symbol_conf.cumulate_callchain = false; - perf_evsel__set_sample_bit(evsel, CALLCHAIN); + evsel__set_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); callchain_register_param(&callchain_param); @@ -485,7 +485,7 @@ static int test3(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = false; symbol_conf.cumulate_callchain = true; - perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + evsel__reset_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); callchain_register_param(&callchain_param); @@ -669,7 +669,7 @@ static int test4(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = true; symbol_conf.cumulate_callchain = true; - perf_evsel__set_sample_bit(evsel, CALLCHAIN); + 
evsel__set_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 5f4c0dbb4715..d4b8eb6e337a 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -86,7 +86,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse } evsels[i]->core.attr.wakeup_events = 1; - perf_evsel__set_sample_id(evsels[i], false); + evsel__set_sample_id(evsels[i], false); evlist__add(evlist, evsels[i]); @@ -150,7 +150,7 @@ out_init: if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) { pr_debug("expected %d %s events, got %d\n", expected_nr_events[evsel->idx], - perf_evsel__name(evsel), nr_events[evsel->idx]); + evsel__name(evsel), nr_events[evsel->idx]); err = -1; goto out_delete_evlist; } diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index 93c176523e38..900934be22d2 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -103,15 +103,15 @@ int test__openat_syscall_event_on_all_cpus(struct test *test __maybe_unused, int if (cpus->map[cpu] >= CPU_SETSIZE) continue; - if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) { - pr_debug("perf_evsel__read_on_cpu\n"); + if (evsel__read_on_cpu(evsel, cpu, 0) < 0) { + pr_debug("evsel__read_on_cpu\n"); err = -1; break; } expected = nr_openat_calls + cpu; if (perf_counts(evsel->counts, cpu, 0)->val != expected) { - pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n", + pr_debug("evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n", expected, cpus->map[cpu], perf_counts(evsel->counts, cpu, 0)->val); err = -1; } diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index c6b2d7aab608..1dc2897d2df9 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -60,7 +60,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest goto out_delete_evlist; } - perf_evsel__config(evsel, &opts, NULL); + evsel__config(evsel, &opts, NULL); perf_thread_map__set_pid(evlist->core.threads, 0, getpid()); @@ -108,13 +108,13 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest continue; } - err = perf_evsel__parse_sample(evsel, event, &sample); + err = evsel__parse_sample(evsel, event, &sample); if (err) { pr_debug("Can't parse sample, err = %d\n", err); goto out_delete_evlist; } - tp_flags = perf_evsel__intval(evsel, &sample, "flags"); + tp_flags = evsel__intval(evsel, &sample, "flags"); if (flags != tp_flags) { pr_debug("%s: Expected flags=%#x, got %#x\n", diff --git a/tools/perf/tests/openat-syscall.c b/tools/perf/tests/openat-syscall.c index 5ebffae18605..db5d8bb8cd06 100644 --- a/tools/perf/tests/openat-syscall.c +++ b/tools/perf/tests/openat-syscall.c @@ -34,7 +34,7 @@ int test__openat_syscall_event(struct test *test __maybe_unused, int subtest __m goto out_thread_map_delete; } - if (perf_evsel__open_per_thread(evsel, threads) < 0) { + if (evsel__open_per_thread(evsel, threads) < 0) { pr_debug("failed to open counter: %s, " "tweak /proc/sys/kernel/perf_event_paranoid?\n", str_error_r(errno, sbuf, sizeof(sbuf))); @@ -46,13 +46,13 @@ int test__openat_syscall_event(struct test *test __maybe_unused, int subtest __m close(fd); } - if (perf_evsel__read_on_cpu(evsel, 0, 0) < 0) { - pr_debug("perf_evsel__read_on_cpu\n"); + if 
(evsel__read_on_cpu(evsel, 0, 0) < 0) { + pr_debug("evsel__read_on_cpu\n"); goto out_close_fd; } if (perf_counts(evsel->counts, 0, 0)->val != nr_openat_calls) { - pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %" PRIu64 "\n", + pr_debug("evsel__read_on_cpu: expected to intercept %d calls, got %" PRIu64 "\n", nr_openat_calls, perf_counts(evsel->counts, 0, 0)->val); goto out_close_fd; } diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 091c3aeccc27..895188b63f96 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -371,7 +371,7 @@ static int test__checkevent_breakpoint_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong name", - !strcmp(perf_evsel__name(evsel), "mem:0:u")); + !strcmp(evsel__name(evsel), "mem:0:u")); return test__checkevent_breakpoint(evlist); } @@ -385,7 +385,7 @@ static int test__checkevent_breakpoint_x_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong name", - !strcmp(perf_evsel__name(evsel), "mem:0:x:k")); + !strcmp(evsel__name(evsel), "mem:0:x:k")); return test__checkevent_breakpoint_x(evlist); } @@ -399,7 +399,7 @@ static int test__checkevent_breakpoint_r_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong name", - !strcmp(perf_evsel__name(evsel), "mem:0:r:hp")); + !strcmp(evsel__name(evsel), "mem:0:r:hp")); return test__checkevent_breakpoint_r(evlist); } @@ -413,7 +413,7 @@ static int test__checkevent_breakpoint_w_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong name", - !strcmp(perf_evsel__name(evsel), "mem:0:w:up")); + !strcmp(evsel__name(evsel), "mem:0:w:up")); return test__checkevent_breakpoint_w(evlist); } @@ -427,7 +427,7 @@ static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong name", - !strcmp(perf_evsel__name(evsel), "mem:0:rw:kp")); + !strcmp(evsel__name(evsel), "mem:0:rw:kp")); return test__checkevent_breakpoint_rw(evlist); } @@ -468,7 +468,7 @@ static int test__checkevent_list(struct evlist *evlist) TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); /* syscalls:sys_enter_openat:k */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->core.attr.type); TEST_ASSERT_VAL("wrong sample_type", PERF_TP_SAMPLE_TYPE == evsel->core.attr.sample_type); @@ -479,7 +479,7 @@ static int test__checkevent_list(struct evlist *evlist) TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); /* 1:1:hp */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); @@ -498,15 +498,15 @@ static int test__checkevent_pmu_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 
== evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); - TEST_ASSERT_VAL("wrong name", !strcmp(perf_evsel__name(evsel), "krava")); + TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "krava")); /* cpu/config=2/u" */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", 2 == evsel->core.attr.config); TEST_ASSERT_VAL("wrong name", - !strcmp(perf_evsel__name(evsel), "cpu/config=2/u")); + !strcmp(evsel__name(evsel), "cpu/config=2/u")); return 0; } @@ -529,7 +529,7 @@ static int test__checkevent_pmu_partial_time_callgraph(struct evlist *evlist) TEST_ASSERT_VAL("wrong time", !(PERF_SAMPLE_TIME & evsel->core.attr.sample_type)); /* cpu/config=2,call-graph=no,time=0,period=2000/ */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", 2 == evsel->core.attr.config); /* @@ -577,7 +577,7 @@ static int test__checkevent_pmu_events_mix(struct evlist *evlist) TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); /* cpu/pmu-event/u*/ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong exclude_user", @@ -652,13 +652,13 @@ static int test__group1(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* cycles:upp */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -670,7 +670,7 @@ static int test__group1(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 2); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); return 0; @@ -694,13 +694,13 @@ static int test__group2(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", 
perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* cache-references + :u modifier */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_REFERENCES == evsel->core.attr.config); @@ -711,11 +711,11 @@ static int test__group2(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* cycles:k */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -725,7 +725,7 @@ static int test__group2(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); return 0; @@ -750,15 +750,15 @@ static int test__group3(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong group name", !strcmp(leader->group_name, "group1")); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* group1 cycles:kppp */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -771,11 +771,11 @@ static int test__group3(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 3); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* group2 cycles + G modifier */ - evsel = leader = perf_evsel__next(evsel); + evsel = leader = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -785,15 +785,15 @@ static int test__group3(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", 
!evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong group name", !strcmp(leader->group_name, "group2")); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* group2 1:3 + G modifier */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", 3 == evsel->core.attr.config); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); @@ -803,11 +803,11 @@ static int test__group3(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* instructions:u */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); @@ -817,7 +817,7 @@ static int test__group3(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); return 0; @@ -843,13 +843,13 @@ static int test__group4(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 1); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* instructions:kp + p */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); @@ -861,7 +861,7 @@ static int test__group4(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 2); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); 
TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); return 0; @@ -886,13 +886,13 @@ static int test__group5(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* instructions + G */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); @@ -903,11 +903,11 @@ static int test__group5(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* cycles:G */ - evsel = leader = perf_evsel__next(evsel); + evsel = leader = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -918,13 +918,13 @@ static int test__group5(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); /* instructions:G */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); @@ -935,10 +935,10 @@ static int test__group5(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); /* cycles */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -948,7 +948,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", 
!evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); return 0; } @@ -972,12 +972,12 @@ static int test__group_gh1(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); /* cache-misses:G + :H group modifier */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); @@ -988,7 +988,7 @@ static int test__group_gh1(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); return 0; } @@ -1012,12 +1012,12 @@ static int test__group_gh2(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); /* cache-misses:H + :G group modifier */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); @@ -1028,7 +1028,7 @@ static int test__group_gh2(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); return 0; } @@ -1052,12 +1052,12 @@ static int test__group_gh3(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); /* cache-misses:H + :u group modifier */ - evsel = perf_evsel__next(evsel); 
+ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); @@ -1068,7 +1068,7 @@ static int test__group_gh3(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); return 0; } @@ -1092,12 +1092,12 @@ static int test__group_gh4(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); /* cache-misses:H + :uG group modifier */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); @@ -1108,7 +1108,7 @@ static int test__group_gh4(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); - TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); return 0; } @@ -1135,7 +1135,7 @@ static int test__leader_sample1(struct evlist *evlist) TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); /* cache-misses - not sampling */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); @@ -1149,7 +1149,7 @@ static int test__leader_sample1(struct evlist *evlist) TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); /* branch-misses - not sampling */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); @@ -1188,7 +1188,7 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); /* branch-misses - not sampling */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); @@ -1234,14 +1234,14 @@ static int test__pinned_group(struct evlist *evlist) TEST_ASSERT_VAL("wrong pinned", evsel->core.attr.pinned); /* cache-misses - can not be pinned, but will go on with the leader */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CACHE_MISSES == 
evsel->core.attr.config); TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); /* branch-misses - ditto */ - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); @@ -1356,6 +1356,16 @@ static int test__checkevent_complex_name(struct evlist *evlist) return 0; } +static int test__checkevent_raw_pmu(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + return 0; +} + static int test__sym_event_slash(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); @@ -1750,7 +1760,12 @@ static struct evlist_test test__events_pmu[] = { .name = "cpu/name='COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks',period=0x1,event=0x2/ukp", .check = test__checkevent_complex_name, .id = 3, - } + }, + { + .name = "software/r1a/", + .check = test__checkevent_raw_pmu, + .id = 4, + }, }; struct terms_test { diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 2195fc205e72..83adfd846ccd 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -106,9 +106,9 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus * Config the evsels, setting attr->comm on the first one, etc. */ evsel = evlist__first(evlist); - perf_evsel__set_sample_bit(evsel, CPU); - perf_evsel__set_sample_bit(evsel, TID); - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TID); + evsel__set_sample_bit(evsel, TIME); perf_evlist__config(evlist, &opts, NULL); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 61865699c3f4..a0bdaf390ac8 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -296,12 +296,12 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) goto out_free; } - evsel.sample_size = __perf_evsel__sample_size(sample_type); + evsel.sample_size = __evsel__sample_size(sample_type); - err = perf_evsel__parse_sample(&evsel, event, &sample_out); + err = evsel__parse_sample(&evsel, event, &sample_out); if (err) { pr_debug("%s failed for sample_type %#"PRIx64", error %d\n", - "perf_evsel__parse_sample", sample_type, err); + "evsel__parse_sample", sample_type, err); goto out_free; } diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index fcb0d03dba4e..db5e1f70053a 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -135,8 +135,8 @@ static int process_sample_event(struct evlist *evlist, evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == switch_tracking->switch_evsel) { - next_tid = perf_evsel__intval(evsel, &sample, "next_pid"); - prev_tid = perf_evsel__intval(evsel, &sample, "prev_pid"); + next_tid = evsel__intval(evsel, &sample, "next_pid"); + prev_tid = evsel__intval(evsel, &sample, "prev_pid"); cpu = sample.cpu; pr_debug3("sched_switch: cpu: %d prev_tid %d next_tid %d\n", cpu, prev_tid, next_tid); @@ -394,8 +394,8 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ switch_evsel = evlist__last(evlist); - 
perf_evsel__set_sample_bit(switch_evsel, CPU); - perf_evsel__set_sample_bit(switch_evsel, TIME); + evsel__set_sample_bit(switch_evsel, CPU); + evsel__set_sample_bit(switch_evsel, TIME); switch_evsel->core.system_wide = true; switch_evsel->no_aux_samples = true; @@ -412,8 +412,8 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out_err; } - perf_evsel__set_sample_bit(cycles_evsel, CPU); - perf_evsel__set_sample_bit(cycles_evsel, TIME); + evsel__set_sample_bit(cycles_evsel, CPU); + evsel__set_sample_bit(cycles_evsel, TIME); /* Fourth event */ err = parse_events(evlist, "dummy:u", NULL); @@ -429,7 +429,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ tracking_evsel->core.attr.freq = 0; tracking_evsel->core.attr.sample_period = 1; - perf_evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, TIME); /* Config events */ perf_evlist__config(evlist, &opts, NULL); diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 61a1ab032080..d6d4ac34eeb7 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -112,6 +112,7 @@ int test__mem2node(struct test *t, int subtest); int test__maps__merge_in(struct test *t, int subtest); int test__time_utils(struct test *t, int subtest); int test__jit_write_elf(struct test *test, int subtest); +int test__api_io(struct test *test, int subtest); bool test__bp_signal_is_supported(void); bool test__bp_account_is_supported(void); diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 4a800499d7c3..22daf2bdf5fa 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -33,10 +33,8 @@ static int session_write_header(char *path) { struct perf_session *session; struct perf_data data = { - .file = { - .path = path, - }, - .mode = PERF_DATA_MODE_WRITE, + .path = path, + .mode = PERF_DATA_MODE_WRITE, }; session = perf_session__new(&data, false, NULL); @@ -63,10 +61,8 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) { struct perf_session *session; struct perf_data data = { - .file = { - .path = path, - }, - .mode = PERF_DATA_MODE_READ, + .path = path, + .mode = PERF_DATA_MODE_READ, }; int i; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 487e54ef56a9..f98a118dfc49 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3416,7 +3416,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, struct hists *hists = evsel__hists(evsel); bool current_entry = ui_browser__is_current_entry(browser, row); unsigned long nr_events = hists->stats.nr_events[PERF_RECORD_SAMPLE]; - const char *ev_name = perf_evsel__name(evsel); + const char *ev_name = evsel__name(evsel); char bf[256], unit; const char *warn = " "; size_t printed; @@ -3424,10 +3424,10 @@ static void perf_evsel_menu__write(struct ui_browser *browser, ui_browser__set_color(browser, current_entry ? 
HE_COLORSET_SELECTED : HE_COLORSET_NORMAL); - if (perf_evsel__is_group_event(evsel)) { + if (evsel__is_group_event(evsel)) { struct evsel *pos; - ev_name = perf_evsel__group_name(evsel); + ev_name = evsel__group_name(evsel); for_each_group_member(pos, evsel) { struct hists *pos_hists = evsel__hists(pos); @@ -3512,13 +3512,13 @@ browse_hists: if (pos->core.node.next == &evlist->core.entries) pos = evlist__first(evlist); else - pos = perf_evsel__next(pos); + pos = evsel__next(pos); goto browse_hists; case K_UNTAB: if (pos->core.node.prev == &evlist->core.entries) pos = evlist__last(evlist); else - pos = perf_evsel__prev(pos); + pos = evsel__prev(pos); goto browse_hists; case K_SWITCH_INPUT_DATA: case K_RELOAD: @@ -3554,7 +3554,7 @@ static bool filter_group_entries(struct ui_browser *browser __maybe_unused, { struct evsel *evsel = list_entry(entry, struct evsel, core.node); - if (symbol_conf.event_group && !perf_evsel__is_group_leader(evsel)) + if (symbol_conf.event_group && !evsel__is_group_leader(evsel)) return true; return false; @@ -3587,7 +3587,7 @@ static int __perf_evlist__tui_browse_hists(struct evlist *evlist, ui_helpline__push("Press ESC to exit"); evlist__for_each_entry(evlist, pos) { - const char *ev_name = perf_evsel__name(pos); + const char *ev_name = evsel__name(pos); size_t line_len = strlen(ev_name) + 7; if (menu.b.width < line_len) @@ -3622,7 +3622,7 @@ single_entry: nr_entries = 0; evlist__for_each_entry(evlist, pos) { - if (perf_evsel__is_group_leader(pos)) + if (evsel__is_group_leader(pos)) nr_entries++; } @@ -3640,7 +3640,7 @@ static int block_hists_browser__title(struct hist_browser *browser, char *bf, size_t size) { struct hists *hists = evsel__hists(browser->block_evsel); - const char *evname = perf_evsel__name(browser->block_evsel); + const char *evname = evsel__name(browser->block_evsel); unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE]; int ret; diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c index 35f9641bf670..a7dff77f2018 100644 --- a/tools/perf/ui/gtk/annotate.c +++ b/tools/perf/ui/gtk/annotate.c @@ -130,7 +130,7 @@ static int perf_gtk__annotate_symbol(GtkWidget *window, struct map_symbol *ms, gtk_list_store_append(store, &iter); - if (perf_evsel__is_group_event(evsel)) { + if (evsel__is_group_event(evsel)) { for (i = 0; i < evsel->core.nr_members; i++) { ret += perf_gtk__get_percent(s + ret, sizeof(s) - ret, diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index ed1a97b2c4b0..53ef71a1b15d 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -635,18 +635,18 @@ int perf_evlist__gtk_browse_hists(struct evlist *evlist, evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); - const char *evname = perf_evsel__name(pos); + const char *evname = evsel__name(pos); GtkWidget *scrolled_window; GtkWidget *tab_label; char buf[512]; size_t size = sizeof(buf); if (symbol_conf.event_group) { - if (!perf_evsel__is_group_leader(pos)) + if (!evsel__is_group_leader(pos)) continue; if (pos->core.nr_members > 1) { - perf_evsel__group_desc(pos, buf, size); + evsel__group_desc(pos, buf, size); evname = buf; } } diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 025f4c7f96bf..c1f24d004852 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -43,12 +43,12 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, } else ret = hpp__call_print_fn(hpp, print_fn, fmt, len, get_field(he)); - if (perf_evsel__is_group_event(evsel)) { + if 
(evsel__is_group_event(evsel)) { int prev_idx, idx_delta; struct hist_entry *pair; int nr_members = evsel->core.nr_members; - prev_idx = perf_evsel__group_idx(evsel); + prev_idx = evsel__group_idx(evsel); list_for_each_entry(pair, &he->pairs.head, pairs.node) { u64 period = get_field(pair); @@ -58,7 +58,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, continue; evsel = hists_to_evsel(pair->hists); - idx_delta = perf_evsel__group_idx(evsel) - prev_idx - 1; + idx_delta = evsel__group_idx(evsel) - prev_idx - 1; while (idx_delta--) { /* @@ -82,7 +82,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, len, period); } - prev_idx = perf_evsel__group_idx(evsel); + prev_idx = evsel__group_idx(evsel); } idx_delta = nr_members - prev_idx - 1; @@ -164,12 +164,12 @@ static int hist_entry__new_pair(struct hist_entry *a, struct hist_entry *b, list_for_each_entry(pair, &a->pairs.head, pairs.node) { struct evsel *evsel = hists_to_evsel(pair->hists); - fa[perf_evsel__group_idx(evsel)] = get_field(pair); + fa[evsel__group_idx(evsel)] = get_field(pair); } list_for_each_entry(pair, &b->pairs.head, pairs.node) { struct evsel *evsel = hists_to_evsel(pair->hists); - fb[perf_evsel__group_idx(evsel)] = get_field(pair); + fb[evsel__group_idx(evsel)] = get_field(pair); } *fields_a = fa; @@ -190,7 +190,7 @@ static int __hpp__group_sort_idx(struct hist_entry *a, struct hist_entry *b, int cmp, nr_members, ret, i; cmp = field_cmp(get_field(a), get_field(b)); - if (!perf_evsel__is_group_event(evsel)) + if (!evsel__is_group_event(evsel)) return cmp; nr_members = evsel->core.nr_members; @@ -240,7 +240,7 @@ static int __hpp__sort(struct hist_entry *a, struct hist_entry *b, return ret; evsel = hists_to_evsel(a->hists); - if (!perf_evsel__is_group_event(evsel)) + if (!evsel__is_group_event(evsel)) return ret; nr_members = evsel->core.nr_members; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index c0cf8dff694e..ca07a162d602 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -10,6 +10,7 @@ perf-y += db-export.o perf-y += env.o perf-y += event.o perf-y += evlist.o +perf-y += sideband_evlist.o perf-y += evsel.o perf-y += evsel_fprintf.o perf-y += perf_event_attr_fprintf.o @@ -88,6 +89,7 @@ perf-y += counts.o perf-y += stat.o perf-y += stat-shadow.o perf-y += stat-display.o +perf-y += perf_api_probe.o perf-y += record.o perf-y += srcline.o perf-y += srccode.o diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index f1ea0d61eb5b..d828c2d2edee 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1191,7 +1191,7 @@ static struct disasm_line *disasm_line__new(struct annotate_args *args) struct disasm_line *dl = NULL; int nr = 1; - if (perf_evsel__is_group_event(args->evsel)) + if (evsel__is_group_event(args->evsel)) nr = args->evsel->core.nr_members; dl = zalloc(disasm_line_size(nr)); @@ -1437,7 +1437,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start if (queue) return -1; - if (perf_evsel__is_group_event(evsel)) + if (evsel__is_group_event(evsel)) width *= evsel->core.nr_members; if (!*al->line) @@ -1821,6 +1821,24 @@ static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, } #endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +static int +symbol__disassemble_bpf_image(struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + + args->offset = -1; + args->line = strdup("to be 
implemented"); + args->line_nr = 0; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + free(args->line); + return 0; +} + /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. If @@ -1920,6 +1938,8 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { return symbol__disassemble_bpf(sym, args); + } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { + return symbol__disassemble_bpf_image(sym, args); } else if (dso__is_kcore(dso)) { kce.kcore_filename = symfs_filename; kce.addr = map__rip_2objdump(map, sym->start); @@ -2136,7 +2156,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, .evsel = evsel, .options = options, }; - struct perf_env *env = perf_evsel__env(evsel); + struct perf_env *env = evsel__env(evsel); const char *arch_name = perf_env__arch(env); struct arch *arch; int err; @@ -2324,7 +2344,7 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel, struct dso *dso = map->dso; char *filename; const char *d_filename; - const char *evsel_name = perf_evsel__name(evsel); + const char *evsel_name = evsel__name(evsel); struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evsel->idx); struct annotation_line *pos, *queue = NULL; @@ -2348,9 +2368,9 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel, len = symbol__size(sym); - if (perf_evsel__is_group_event(evsel)) { + if (evsel__is_group_event(evsel)) { width *= evsel->core.nr_members; - perf_evsel__group_desc(evsel, buf, sizeof(buf)); + evsel__group_desc(evsel, buf, sizeof(buf)); evsel_name = buf; } @@ -2485,7 +2505,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp, int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, struct annotation_options *opts) { - const char *ev_name = perf_evsel__name(evsel); + const char *ev_name = evsel__name(evsel); char buf[1024]; char *filename; int err = -1; @@ -2498,8 +2518,8 @@ int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, if (fp == NULL) goto out_free_filename; - if (perf_evsel__is_group_event(evsel)) { - perf_evsel__group_desc(evsel, buf, sizeof(buf)); + if (evsel__is_group_event(evsel)) { + evsel__group_desc(evsel, buf, sizeof(buf)); ev_name = buf; } @@ -3044,7 +3064,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, if (notes->offsets == NULL) return ENOMEM; - if (perf_evsel__is_group_event(evsel)) + if (evsel__is_group_event(evsel)) nr_pcnt = evsel->core.nr_members; err = symbol__annotate(ms, evsel, options, parch); diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 53be12b23ff4..875a0dd540e5 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -176,6 +176,14 @@ static void arm_spe_free(struct perf_session *session) free(spe); } +static bool arm_spe_evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); + + return evsel->core.attr.type == spe->pmu_type; +} + static const char * const arm_spe_info_fmts[] = { [ARM_SPE_PMU_TYPE] = " PMU Type %"PRId64"\n", }; @@ -218,6 +226,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event, spe->auxtrace.flush_events = arm_spe_flush; spe->auxtrace.free_events = arm_spe_free_events; 
spe->auxtrace.free = arm_spe_free; + spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace; session->auxtrace = &spe->auxtrace; arm_spe_print_info(&auxtrace_info->priv[0]); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 3571ce72ca28..749487a41cc7 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -33,6 +33,7 @@ #include "evsel.h" #include "evsel_config.h" #include "symbol.h" +#include "util/perf_api_probe.h" #include "util/synthetic-events.h" #include "thread_map.h" #include "asm/bug.h" @@ -58,25 +59,6 @@ #include "symbol/kallsyms.h" #include <internal/lib.h> -static struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) -{ - struct perf_pmu *pmu = NULL; - - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (pmu->type == evsel->core.attr.type) - break; - } - - return pmu; -} - -static bool perf_evsel__is_aux_event(struct evsel *evsel) -{ - struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); - - return pmu && pmu->auxtrace; -} - /* * Make a group from 'leader' to 'last', requiring that the events were not * already grouped to a different leader. @@ -88,7 +70,7 @@ static int perf_evlist__regroup(struct evlist *evlist, struct evsel *evsel; bool grp; - if (!perf_evsel__is_group_leader(leader)) + if (!evsel__is_group_leader(leader)) return -EINVAL; grp = false; @@ -703,8 +685,8 @@ static int auxtrace_validate_aux_sample_size(struct evlist *evlist, evlist__for_each_entry(evlist, evsel) { sz = evsel->core.attr.aux_sample_size; - if (perf_evsel__is_group_leader(evsel)) { - has_aux_leader = perf_evsel__is_aux_event(evsel); + if (evsel__is_group_leader(evsel)) { + has_aux_leader = evsel__is_aux_event(evsel); if (sz) { if (has_aux_leader) pr_err("Cannot add AUX area sampling to an AUX area event\n"); @@ -723,10 +705,10 @@ static int auxtrace_validate_aux_sample_size(struct evlist *evlist, pr_err("Cannot add AUX area sampling because group leader is not an AUX area event\n"); return -EINVAL; } - perf_evsel__set_sample_bit(evsel, AUX); + evsel__set_sample_bit(evsel, AUX); opts->auxtrace_sample_mode = true; } else { - perf_evsel__reset_sample_bit(evsel, AUX); + evsel__reset_sample_bit(evsel, AUX); } } @@ -777,8 +759,8 @@ int auxtrace_parse_sample_options(struct auxtrace_record *itr, /* Set aux_sample_size based on --aux-sample option */ evlist__for_each_entry(evlist, evsel) { - if (perf_evsel__is_group_leader(evsel)) { - has_aux_leader = perf_evsel__is_aux_event(evsel); + if (evsel__is_group_leader(evsel)) { + has_aux_leader = evsel__is_aux_event(evsel); } else if (has_aux_leader) { evsel->core.attr.aux_sample_size = sz; } @@ -787,7 +769,7 @@ no_opt: aux_evsel = NULL; /* Override with aux_sample_size from config term */ evlist__for_each_entry(evlist, evsel) { - if (perf_evsel__is_aux_event(evsel)) + if (evsel__is_aux_event(evsel)) aux_evsel = evsel; term = perf_evsel__get_config_term(evsel, AUX_SAMPLE_SIZE); if (term) { @@ -1234,29 +1216,79 @@ out_free: return err; } +static void unleader_evsel(struct evlist *evlist, struct evsel *leader) +{ + struct evsel *new_leader = NULL; + struct evsel *evsel; + + /* Find new leader for the group */ + evlist__for_each_entry(evlist, evsel) { + if (evsel->leader != leader || evsel == leader) + continue; + if (!new_leader) + new_leader = evsel; + evsel->leader = new_leader; + } + + /* Update group information */ + if (new_leader) { + zfree(&new_leader->group_name); + new_leader->group_name = leader->group_name; + leader->group_name = NULL; + + new_leader->core.nr_members = leader->core.nr_members - 1; + 
leader->core.nr_members = 1; + } +} + +static void unleader_auxtrace(struct perf_session *session) +{ + struct evsel *evsel; + + evlist__for_each_entry(session->evlist, evsel) { + if (auxtrace__evsel_is_auxtrace(session, evsel) && + evsel__is_group_leader(evsel)) { + unleader_evsel(session->evlist, evsel); + } + } +} + int perf_event__process_auxtrace_info(struct perf_session *session, union perf_event *event) { enum auxtrace_type type = event->auxtrace_info.type; + int err; if (dump_trace) fprintf(stdout, " type: %u\n", type); switch (type) { case PERF_AUXTRACE_INTEL_PT: - return intel_pt_process_auxtrace_info(event, session); + err = intel_pt_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_INTEL_BTS: - return intel_bts_process_auxtrace_info(event, session); + err = intel_bts_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_ARM_SPE: - return arm_spe_process_auxtrace_info(event, session); + err = arm_spe_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_CS_ETM: - return cs_etm__process_auxtrace_info(event, session); + err = cs_etm__process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_S390_CPUMSF: - return s390_cpumsf_process_auxtrace_info(event, session); + err = s390_cpumsf_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; } + + if (err) + return err; + + unleader_auxtrace(session); + + return 0; } s64 perf_event__process_auxtrace(struct perf_session *session, @@ -1412,8 +1444,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, synth_opts->branches = true; synth_opts->returns = true; break; + case 'G': case 'g': - synth_opts->callchain = true; + if (p[-1] == 'G') + synth_opts->add_callchain = true; + else + synth_opts->callchain = true; synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; while (*p == ' ' || *p == ',') @@ -1428,8 +1464,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, synth_opts->callchain_sz = val; } break; + case 'L': case 'l': - synth_opts->last_branch = true; + if (p[-1] == 'L') + synth_opts->add_last_branch = true; + else + synth_opts->last_branch = true; synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ; while (*p == ' ' || *p == ',') @@ -2482,7 +2522,7 @@ static int parse_addr_filter(struct evsel *evsel, const char *filter, goto out_exit; } - if (perf_evsel__append_addr_filter(evsel, new_filter)) { + if (evsel__append_addr_filter(evsel, new_filter)) { err = -ENOMEM; goto out_exit; } @@ -2500,9 +2540,9 @@ out_exit: return err; } -static int perf_evsel__nr_addr_filter(struct evsel *evsel) +static int evsel__nr_addr_filter(struct evsel *evsel) { - struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); + struct perf_pmu *pmu = evsel__find_pmu(evsel); int nr_addr_filters = 0; if (!pmu) @@ -2521,7 +2561,7 @@ int auxtrace_parse_filters(struct evlist *evlist) evlist__for_each_entry(evlist, evsel) { filter = evsel->filter; - max_nr = perf_evsel__nr_addr_filter(evsel); + max_nr = evsel__nr_addr_filter(evsel); if (!filter || !max_nr) continue; evsel->filter = NULL; @@ -2577,3 +2617,12 @@ void auxtrace__free(struct perf_session *session) return session->auxtrace->free(session); } + +bool auxtrace__evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + if (!session->auxtrace || !session->auxtrace->evsel_is_auxtrace) + return false; + + return session->auxtrace->evsel_is_auxtrace(session, evsel); +} diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h 
index e58ef160b599..0220a2e86c16 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -21,6 +21,7 @@ union perf_event; struct perf_session; struct evlist; +struct evsel; struct perf_tool; struct mmap; struct perf_sample; @@ -73,8 +74,10 @@ enum itrace_period_type { * @calls: limit branch samples to calls (can be combined with @returns) * @returns: limit branch samples to returns (can be combined with @calls) * @callchain: add callchain to 'instructions' events + * @add_callchain: add callchain to existing event records * @thread_stack: feed branches to the thread_stack * @last_branch: add branch context to 'instruction' events + * @add_last_branch: add branch context to existing event records * @callchain_sz: maximum callchain size * @last_branch_sz: branch context size * @period: 'instructions' events period @@ -100,8 +103,10 @@ struct itrace_synth_opts { bool calls; bool returns; bool callchain; + bool add_callchain; bool thread_stack; bool last_branch; + bool add_last_branch; unsigned int callchain_sz; unsigned int last_branch_sz; unsigned long long period; @@ -166,6 +171,8 @@ struct auxtrace { struct perf_tool *tool); void (*free_events)(struct perf_session *session); void (*free)(struct perf_session *session); + bool (*evsel_is_auxtrace)(struct perf_session *session, + struct evsel *evsel); }; /** @@ -584,6 +591,8 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session, int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool); void auxtrace__free_events(struct perf_session *session); void auxtrace__free(struct perf_session *session); +bool auxtrace__evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel); #define ITRACE_HELP \ " i: synthesize instructions events\n" \ @@ -750,6 +759,13 @@ void auxtrace_index__free(struct list_head *head __maybe_unused) } static inline +bool auxtrace__evsel_is_auxtrace(struct perf_session *session __maybe_unused, + struct evsel *evsel __maybe_unused) +{ + return false; +} + +static inline int auxtrace_parse_filters(struct evlist *evlist __maybe_unused) { return 0; diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index a3207d900339..3742511a08d1 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -6,6 +6,9 @@ #include <bpf/libbpf.h> #include <linux/btf.h> #include <linux/err.h> +#include <linux/string.h> +#include <internal/lib.h> +#include <symbol/kallsyms.h> #include "bpf-event.h" #include "debug.h" #include "dso.h" @@ -290,11 +293,82 @@ out: return err ? 
-1 : 0; } +struct kallsyms_parse { + union perf_event *event; + perf_event__handler_t process; + struct machine *machine; + struct perf_tool *tool; +}; + +static int +process_bpf_image(char *name, u64 addr, struct kallsyms_parse *data) +{ + struct machine *machine = data->machine; + union perf_event *event = data->event; + struct perf_record_ksymbol *ksymbol; + int len; + + ksymbol = &event->ksymbol; + + *ksymbol = (struct perf_record_ksymbol) { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = offsetof(struct perf_record_ksymbol, name), + }, + .addr = addr, + .len = page_size, + .ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF, + .flags = 0, + }; + + len = scnprintf(ksymbol->name, KSYM_NAME_LEN, "%s", name); + ksymbol->header.size += PERF_ALIGN(len + 1, sizeof(u64)); + memset((void *) event + event->header.size, 0, machine->id_hdr_size); + event->header.size += machine->id_hdr_size; + + return perf_tool__process_synth_event(data->tool, event, machine, + data->process); +} + +static int +kallsyms_process_symbol(void *data, const char *_name, + char type __maybe_unused, u64 start) +{ + char disp[KSYM_NAME_LEN]; + char *module, *name; + unsigned long id; + int err = 0; + + module = strchr(_name, '\t'); + if (!module) + return 0; + + /* We are going after [bpf] module ... */ + if (strcmp(module + 1, "[bpf]")) + return 0; + + name = memdup(_name, (module - _name) + 1); + if (!name) + return -ENOMEM; + + name[module - _name] = 0; + + /* .. and only for trampolines and dispatchers */ + if ((sscanf(name, "bpf_trampoline_%lu", &id) == 1) || + (sscanf(name, "bpf_dispatcher_%s", disp) == 1)) + err = process_bpf_image(name, start, data); + + free(name); + return err; +} + int perf_event__synthesize_bpf_events(struct perf_session *session, perf_event__handler_t process, struct machine *machine, struct record_opts *opts) { + const char *kallsyms_filename = "/proc/kallsyms"; + struct kallsyms_parse arg; union perf_event *event; __u32 id = 0; int err; @@ -303,6 +377,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size); if (!event) return -1; + + /* Synthesize all the bpf programs in system. */ while (true) { err = bpf_prog_get_next_id(id, &id); if (err) { @@ -335,6 +411,23 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, break; } } + + /* Synthesize all the bpf images - trampolines/dispatchers. 
*/ + if (symbol_conf.kallsyms_name != NULL) + kallsyms_filename = symbol_conf.kallsyms_name; + + arg = (struct kallsyms_parse) { + .event = event, + .process = process, + .machine = machine, + .tool = session->tool, + }; + + if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) { + pr_err("%s: failed to synthesize bpf images: %s\n", + __func__, strerror(errno)); + } + free(event); return err; } @@ -416,8 +509,7 @@ static int bpf_event__sb_cb(union perf_event *event, void *data) return 0; } -int bpf_event__add_sb_event(struct evlist **evlist, - struct perf_env *env) +int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env) { struct perf_event_attr attr = { .type = PERF_TYPE_SOFTWARE, diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 81fdc88e6c1a..68f315c3df5b 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -33,8 +33,7 @@ struct btf_node { #ifdef HAVE_LIBBPF_SUPPORT int machine__process_bpf(struct machine *machine, union perf_event *event, struct perf_sample *sample); -int bpf_event__add_sb_event(struct evlist **evlist, - struct perf_env *env); +int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env); void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, struct perf_env *env, FILE *fp); @@ -46,8 +45,8 @@ static inline int machine__process_bpf(struct machine *machine __maybe_unused, return 0; } -static inline int bpf_event__add_sb_event(struct evlist **evlist __maybe_unused, - struct perf_env *env __maybe_unused) +static inline int evlist__add_bpf_sb_event(struct evlist *evlist __maybe_unused, + struct perf_env *env __maybe_unused) { return 0; } diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 10c187b8b8ea..83bfb8768235 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1430,7 +1430,7 @@ apply_config_evsel_for_key(const char *name, int map_fd, void *pkey, return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH; } - if (perf_evsel__is_bpf_output(evsel)) + if (evsel__is_bpf_output(evsel)) check_pass = true; if (attr->type == PERF_TYPE_RAW) check_pass = true; diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 154a05cd03af..4d3f02fa223d 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -15,13 +15,18 @@ #include "event.h" struct branch_flags { - u64 mispred:1; - u64 predicted:1; - u64 in_tx:1; - u64 abort:1; - u64 cycles:16; - u64 type:4; - u64 reserved:40; + union { + u64 value; + struct { + u64 mispred:1; + u64 predicted:1; + u64 in_tx:1; + u64 abort:1; + u64 cycles:16; + u64 type:4; + u64 reserved:40; + }; + }; }; struct branch_info { diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 706bb7bbe1e1..8f668ee29f25 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -143,6 +143,9 @@ struct callchain_cursor_node { u64 ip; struct map_symbol ms; const char *srcline; + /* Indicate valid cursor node for LBR stitch */ + bool valid; + bool branch; struct branch_flags branch_flags; u64 branch_from; @@ -151,6 +154,11 @@ struct callchain_cursor_node { struct callchain_cursor_node *next; }; +struct stitch_list { + struct list_head node; + struct callchain_cursor_node cursor; +}; + struct callchain_cursor { u64 nr; struct callchain_cursor_node *first; diff --git a/tools/perf/util/cap.h b/tools/perf/util/cap.h index 051dc590ceee..ae52878c0b2e 100644 --- a/tools/perf/util/cap.h +++ b/tools/perf/util/cap.h @@ -29,4 +29,8 @@ static inline bool 
perf_cap__capable(int cap __maybe_unused) #define CAP_SYSLOG 34 #endif +#ifndef CAP_PERFMON +#define CAP_PERFMON 38 +#endif + #endif /* __PERF_CAP_H */ diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index b73fb7823048..050dea9f1e88 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -107,7 +107,8 @@ found: static void cgroup__delete(struct cgroup *cgroup) { - close(cgroup->fd); + if (cgroup->fd >= 0) + close(cgroup->fd); zfree(&cgroup->name); free(cgroup); } diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c index a12872f2856a..6b3988a7aba8 100644 --- a/tools/perf/util/cloexec.c +++ b/tools/perf/util/cloexec.c @@ -28,7 +28,7 @@ int __weak sched_getcpu(void) static int perf_flag_probe(void) { - /* use 'safest' configuration as used in perf_evsel__fallback() */ + /* use 'safest' configuration as used in evsel__fallback() */ struct perf_event_attr attr = { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index cd92a99eb89d..cd007cc9c283 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -564,6 +564,8 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( resp = cs_etm_decoder__set_tid(etmq, packet_queue, elem, trace_chan_id); break; + /* Unused packet types */ + case OCSD_GEN_TRC_ELEM_I_RANGE_NOPATH: case OCSD_GEN_TRC_ELEM_ADDR_NACC: case OCSD_GEN_TRC_ELEM_CYCLE_COUNT: case OCSD_GEN_TRC_ELEM_ADDR_UNKNOWN: diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 62d2f9b9ce1b..c283223fb31f 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -94,6 +94,9 @@ struct cs_etm_queue { struct cs_etm_traceid_queue **traceid_queues; }; +/* RB tree for quick conversion between traceID and metadata pointers */ +static struct intlist *traceid_list; + static int cs_etm__update_queues(struct cs_etm_auxtrace *etm); static int cs_etm__process_queues(struct cs_etm_auxtrace *etm); static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, @@ -631,6 +634,16 @@ static void cs_etm__free(struct perf_session *session) zfree(&aux); } +static bool cs_etm__evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct cs_etm_auxtrace *aux = container_of(session->auxtrace, + struct cs_etm_auxtrace, + auxtrace); + + return evsel->core.attr.type == aux->pmu_type; +} + static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) { struct machine *machine; @@ -2618,6 +2631,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event, etm->auxtrace.flush_events = cs_etm__flush_events; etm->auxtrace.free_events = cs_etm__free_events; etm->auxtrace.free = cs_etm__free; + etm->auxtrace.evsel_is_auxtrace = cs_etm__evsel_is_auxtrace; session->auxtrace = &etm->auxtrace; etm->unknown_thread = thread__new(999999999, 999999999); diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 650ecc2a6349..4ad925d6d799 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -114,9 +114,6 @@ enum cs_etm_isa { CS_ETM_ISA_T32, }; -/* RB tree for quick conversion between traceID and metadata pointers */ -struct intlist *traceid_list; - struct cs_etm_queue; struct cs_etm_packet { diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index dbc772bfb04e..5f36fc6a5578 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -835,7 
+835,7 @@ static int process_sample_event(struct perf_tool *tool, return -1; } - if (perf_evsel__is_bpf_output(evsel)) { + if (evsel__is_bpf_output(evsel)) { ret = add_bpf_output_values(event_class, event, sample); if (ret) return -1; @@ -1155,7 +1155,7 @@ static int add_event(struct ctf_writer *cw, struct evsel *evsel) { struct bt_ctf_event_class *event_class; struct evsel_priv *priv; - const char *name = perf_evsel__name(evsel); + const char *name = evsel__name(evsel); int ret; pr("Adding event '%s' (type %d)\n", name, evsel->core.attr.type); @@ -1174,7 +1174,7 @@ static int add_event(struct ctf_writer *cw, struct evsel *evsel) goto err; } - if (perf_evsel__is_bpf_output(evsel)) { + if (evsel__is_bpf_output(evsel)) { ret = add_bpf_output_types(cw, event_class); if (ret) goto err; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 91f21239608b..f338990e0fe6 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -191,6 +191,7 @@ int dso__read_binary_type_filename(const struct dso *dso, case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__JAVA_JIT: case DSO_BINARY_TYPE__BPF_PROG_INFO: + case DSO_BINARY_TYPE__BPF_IMAGE: case DSO_BINARY_TYPE__NOT_FOUND: ret = -1; break; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 2db64b79617a..9553a1fd9e8a 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -40,6 +40,7 @@ enum dso_binary_type { DSO_BINARY_TYPE__GUEST_KCORE, DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO, DSO_BINARY_TYPE__BPF_PROG_INFO, + DSO_BINARY_TYPE__BPF_IMAGE, DSO_BINARY_TYPE__NOT_FOUND, }; diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 7632075a8792..1ab2682d5d2b 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -48,6 +48,7 @@ struct perf_env { char *cpuid; unsigned long long total_mem; unsigned int msr_pmu_type; + unsigned int max_branches; int nr_cmdline; int nr_sibling_cores; @@ -57,12 +58,14 @@ struct perf_env { int nr_memory_nodes; int nr_pmu_mappings; int nr_groups; + int nr_cpu_pmu_caps; char *cmdline; const char **cmdline_argv; char *sibling_cores; char *sibling_dies; char *sibling_threads; char *pmu_mappings; + char *cpu_pmu_caps; struct cpu_topology_map *cpu; struct cpu_cache_level *caches; int caches_cnt; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index dc0e11214ae1..f581550a3015 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -626,7 +626,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al, ret = strlist__has_entry(symbol_conf.sym_list, al->sym->name); } - if (!(ret && al->sym)) { + if (!ret && al->sym) { snprintf(al_addr_str, sz, "0x%"PRIx64, al->map->unmap_ip(al->map, al->sym->start)); ret = strlist__has_entry(symbol_conf.sym_list, diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1548237b6558..0a0b760d6948 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -23,6 +23,7 @@ #include "asm/bug.h" #include "bpf-event.h" #include "util/string2.h" +#include "util/perf_api_probe.h" #include <signal.h> #include <unistd.h> #include <sched.h> @@ -118,7 +119,7 @@ static void perf_evlist__update_id_pos(struct evlist *evlist) struct evsel *evsel; evlist__for_each_entry(evlist, evsel) - perf_evsel__calc_id_pos(evsel); + evsel__calc_id_pos(evsel); perf_evlist__set_id_pos(evlist); } @@ -390,14 +391,14 @@ void evlist__disable(struct evlist *evlist) evlist__for_each_entry(evlist, pos) { if (evsel__cpu_iter_skip(pos, cpu)) continue; - if (pos->disabled || !perf_evsel__is_group_leader(pos) || 
!pos->core.fd) + if (pos->disabled || !evsel__is_group_leader(pos) || !pos->core.fd) continue; evsel__disable_cpu(pos, pos->cpu_iter - 1); } } affinity__cleanup(&affinity); evlist__for_each_entry(evlist, pos) { - if (!perf_evsel__is_group_leader(pos) || !pos->core.fd) + if (!evsel__is_group_leader(pos) || !pos->core.fd) continue; pos->disabled = true; } @@ -420,14 +421,14 @@ void evlist__enable(struct evlist *evlist) evlist__for_each_entry(evlist, pos) { if (evsel__cpu_iter_skip(pos, cpu)) continue; - if (!perf_evsel__is_group_leader(pos) || !pos->core.fd) + if (!evsel__is_group_leader(pos) || !pos->core.fd) continue; evsel__enable_cpu(pos, pos->cpu_iter - 1); } } affinity__cleanup(&affinity); evlist__for_each_entry(evlist, pos) { - if (!perf_evsel__is_group_leader(pos) || !pos->core.fd) + if (!evsel__is_group_leader(pos) || !pos->core.fd) continue; pos->disabled = false; } @@ -947,7 +948,7 @@ void __perf_evlist__set_sample_bit(struct evlist *evlist, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) - __perf_evsel__set_sample_bit(evsel, bit); + __evsel__set_sample_bit(evsel, bit); } void __perf_evlist__reset_sample_bit(struct evlist *evlist, @@ -956,7 +957,7 @@ void __perf_evlist__reset_sample_bit(struct evlist *evlist, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) - __perf_evsel__reset_sample_bit(evsel, bit); + __evsel__reset_sample_bit(evsel, bit); } int perf_evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) @@ -994,7 +995,7 @@ int perf_evlist__set_tp_filter(struct evlist *evlist, const char *filter) if (evsel->core.attr.type != PERF_TYPE_TRACEPOINT) continue; - err = perf_evsel__set_filter(evsel, filter); + err = evsel__set_filter(evsel, filter); if (err) break; } @@ -1014,7 +1015,7 @@ int perf_evlist__append_tp_filter(struct evlist *evlist, const char *filter) if (evsel->core.attr.type != PERF_TYPE_TRACEPOINT) continue; - err = perf_evsel__append_tp_filter(evsel, filter); + err = evsel__append_tp_filter(evsel, filter); if (err) break; } @@ -1131,8 +1132,10 @@ bool perf_evlist__valid_read_format(struct evlist *evlist) u64 sample_type = first->core.attr.sample_type; evlist__for_each_entry(evlist, pos) { - if (read_format != pos->core.attr.read_format) - return false; + if (read_format != pos->core.attr.read_format) { + pr_debug("Read format differs %#" PRIx64 " vs %#" PRIx64 "\n", + read_format, (u64)pos->core.attr.read_format); + } } /* PERF_SAMPLE_READ imples PERF_FORMAT_ID. 
*/ @@ -1436,7 +1439,7 @@ int perf_evlist__parse_sample(struct evlist *evlist, union perf_event *event, if (!evsel) return -EFAULT; - return perf_evsel__parse_sample(evsel, event, sample); + return evsel__parse_sample(evsel, event, sample); } int perf_evlist__parse_sample_timestamp(struct evlist *evlist, @@ -1447,7 +1450,7 @@ int perf_evlist__parse_sample_timestamp(struct evlist *evlist, if (!evsel) return -EFAULT; - return perf_evsel__parse_sample_timestamp(evsel, event, timestamp); + return evsel__parse_sample_timestamp(evsel, event, timestamp); } int perf_evlist__strerror_open(struct evlist *evlist, @@ -1701,133 +1704,3 @@ struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list, } return leader; } - -int perf_evlist__add_sb_event(struct evlist **evlist, - struct perf_event_attr *attr, - perf_evsel__sb_cb_t cb, - void *data) -{ - struct evsel *evsel; - bool new_evlist = (*evlist) == NULL; - - if (*evlist == NULL) - *evlist = evlist__new(); - if (*evlist == NULL) - return -1; - - if (!attr->sample_id_all) { - pr_warning("enabling sample_id_all for all side band events\n"); - attr->sample_id_all = 1; - } - - evsel = perf_evsel__new_idx(attr, (*evlist)->core.nr_entries); - if (!evsel) - goto out_err; - - evsel->side_band.cb = cb; - evsel->side_band.data = data; - evlist__add(*evlist, evsel); - return 0; - -out_err: - if (new_evlist) { - evlist__delete(*evlist); - *evlist = NULL; - } - return -1; -} - -static void *perf_evlist__poll_thread(void *arg) -{ - struct evlist *evlist = arg; - bool draining = false; - int i, done = 0; - /* - * In order to read symbols from other namespaces perf to needs to call - * setns(2). This isn't permitted if the struct_fs has multiple users. - * unshare(2) the fs so that we may continue to setns into namespaces - * that we're observing when, for instance, reading the build-ids at - * the end of a 'perf record' session. 
- */ - unshare(CLONE_FS); - - while (!done) { - bool got_data = false; - - if (evlist->thread.done) - draining = true; - - if (!draining) - evlist__poll(evlist, 1000); - - for (i = 0; i < evlist->core.nr_mmaps; i++) { - struct mmap *map = &evlist->mmap[i]; - union perf_event *event; - - if (perf_mmap__read_init(&map->core)) - continue; - while ((event = perf_mmap__read_event(&map->core)) != NULL) { - struct evsel *evsel = perf_evlist__event2evsel(evlist, event); - - if (evsel && evsel->side_band.cb) - evsel->side_band.cb(event, evsel->side_band.data); - else - pr_warning("cannot locate proper evsel for the side band event\n"); - - perf_mmap__consume(&map->core); - got_data = true; - } - perf_mmap__read_done(&map->core); - } - - if (draining && !got_data) - break; - } - return NULL; -} - -int perf_evlist__start_sb_thread(struct evlist *evlist, - struct target *target) -{ - struct evsel *counter; - - if (!evlist) - return 0; - - if (perf_evlist__create_maps(evlist, target)) - goto out_delete_evlist; - - evlist__for_each_entry(evlist, counter) { - if (evsel__open(counter, evlist->core.cpus, - evlist->core.threads) < 0) - goto out_delete_evlist; - } - - if (evlist__mmap(evlist, UINT_MAX)) - goto out_delete_evlist; - - evlist__for_each_entry(evlist, counter) { - if (evsel__enable(counter)) - goto out_delete_evlist; - } - - evlist->thread.done = 0; - if (pthread_create(&evlist->thread.th, NULL, perf_evlist__poll_thread, evlist)) - goto out_delete_evlist; - - return 0; - -out_delete_evlist: - evlist__delete(evlist); - evlist = NULL; - return -1; -} - -void perf_evlist__stop_sb_thread(struct evlist *evlist) -{ - if (!evlist) - return; - evlist->thread.done = 1; - pthread_join(evlist->thread.th, NULL); - evlist__delete(evlist); -} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index f5bd5c386df1..b6f325dfb4d2 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -107,10 +107,11 @@ int __perf_evlist__add_default_attrs(struct evlist *evlist, int perf_evlist__add_dummy(struct evlist *evlist); -int perf_evlist__add_sb_event(struct evlist **evlist, +int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, - perf_evsel__sb_cb_t cb, + evsel__sb_cb_t cb, void *data); +void evlist__set_cb(struct evlist *evlist, evsel__sb_cb_t cb, void *data); int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target); void perf_evlist__stop_sb_thread(struct evlist *evlist); @@ -173,10 +174,6 @@ void evlist__close(struct evlist *evlist); struct callchain_param; void perf_evlist__set_id_pos(struct evlist *evlist); -bool perf_can_sample_identifier(void); -bool perf_can_record_switch_events(void); -bool perf_can_record_cpu_wide(void); -bool perf_can_aux_sample(void); void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, struct callchain_param *callchain); int record_opts__config(struct record_opts *opts); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index eb880efbce16..f3e60c45d59a 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -102,7 +102,7 @@ set_methods: #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y)) -int __perf_evsel__sample_size(u64 sample_type) +int __evsel__sample_size(u64 sample_type) { u64 mask = sample_type & PERF_SAMPLE_MASK; int size = 0; @@ -178,53 +178,53 @@ static int __perf_evsel__calc_is_pos(u64 sample_type) return idx; } -void perf_evsel__calc_id_pos(struct evsel *evsel) +void evsel__calc_id_pos(struct evsel *evsel) { evsel->id_pos = 
__perf_evsel__calc_id_pos(evsel->core.attr.sample_type); evsel->is_pos = __perf_evsel__calc_is_pos(evsel->core.attr.sample_type); } -void __perf_evsel__set_sample_bit(struct evsel *evsel, +void __evsel__set_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit) { if (!(evsel->core.attr.sample_type & bit)) { evsel->core.attr.sample_type |= bit; evsel->sample_size += sizeof(u64); - perf_evsel__calc_id_pos(evsel); + evsel__calc_id_pos(evsel); } } -void __perf_evsel__reset_sample_bit(struct evsel *evsel, +void __evsel__reset_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit) { if (evsel->core.attr.sample_type & bit) { evsel->core.attr.sample_type &= ~bit; evsel->sample_size -= sizeof(u64); - perf_evsel__calc_id_pos(evsel); + evsel__calc_id_pos(evsel); } } -void perf_evsel__set_sample_id(struct evsel *evsel, +void evsel__set_sample_id(struct evsel *evsel, bool can_sample_identifier) { if (can_sample_identifier) { - perf_evsel__reset_sample_bit(evsel, ID); - perf_evsel__set_sample_bit(evsel, IDENTIFIER); + evsel__reset_sample_bit(evsel, ID); + evsel__set_sample_bit(evsel, IDENTIFIER); } else { - perf_evsel__set_sample_bit(evsel, ID); + evsel__set_sample_bit(evsel, ID); } evsel->core.attr.read_format |= PERF_FORMAT_ID; } /** - * perf_evsel__is_function_event - Return whether given evsel is a function + * evsel__is_function_event - Return whether given evsel is a function * trace event * * @evsel - evsel selector to be tested * * Return %true if event is function trace event */ -bool perf_evsel__is_function_event(struct evsel *evsel) +bool evsel__is_function_event(struct evsel *evsel) { #define FUNCTION_EVENT "ftrace:function" @@ -249,8 +249,8 @@ void evsel__init(struct evsel *evsel, evsel->bpf_fd = -1; INIT_LIST_HEAD(&evsel->config_terms); perf_evsel__object.init(evsel); - evsel->sample_size = __perf_evsel__sample_size(attr->sample_type); - perf_evsel__calc_id_pos(evsel); + evsel->sample_size = __evsel__sample_size(attr->sample_type); + evsel__calc_id_pos(evsel); evsel->cmdline_group_boundary = false; evsel->metric_expr = NULL; evsel->metric_name = NULL; @@ -267,13 +267,13 @@ struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) return NULL; evsel__init(evsel, attr, idx); - if (perf_evsel__is_bpf_output(evsel)) { + if (evsel__is_bpf_output(evsel)) { evsel->core.attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD), evsel->core.attr.sample_period = 1; } - if (perf_evsel__is_clock(evsel)) { + if (evsel__is_clock(evsel)) { /* * The evsel->unit points to static alias->unit * so it's ok to use static string in here. 
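
The evsel__{set,reset}_sample_bit() helpers renamed in the hunks above maintain a simple invariant: every PERF_SAMPLE_* bit toggled in attr.sample_type also grows or shrinks evsel->sample_size by one u64, and the id/is positions are recomputed via evsel__calc_id_pos(). A minimal standalone sketch of that bookkeeping (toy type and names, not the perf sources; only the invariant mirrors the real helpers)::

    /* Toy model of the sample-bit bookkeeping: each bit set in
     * sample_type accounts for one u64 in every sample record. */
    #include <stdint.h>
    #include <stdio.h>

    #define SAMPLE_IP   (1ULL << 0)
    #define SAMPLE_TID  (1ULL << 1)

    struct toy_evsel {
            uint64_t sample_type;
            int sample_size;
    };

    static void toy_evsel__set_sample_bit(struct toy_evsel *e, uint64_t bit)
    {
            if (!(e->sample_type & bit)) {  /* idempotent, like the original */
                    e->sample_type |= bit;
                    e->sample_size += sizeof(uint64_t);
            }
    }

    static void toy_evsel__reset_sample_bit(struct toy_evsel *e, uint64_t bit)
    {
            if (e->sample_type & bit) {
                    e->sample_type &= ~bit;
                    e->sample_size -= sizeof(uint64_t);
            }
    }

    int main(void)
    {
            struct toy_evsel e = { 0, 0 };

            toy_evsel__set_sample_bit(&e, SAMPLE_IP);
            toy_evsel__set_sample_bit(&e, SAMPLE_TID);
            toy_evsel__set_sample_bit(&e, SAMPLE_TID); /* no-op: already set */
            toy_evsel__reset_sample_bit(&e, SAMPLE_IP);
            printf("sample_size = %d\n", e.sample_size); /* 8: one u64 left */
            return 0;
    }
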
@@ -385,7 +385,7 @@ const char *perf_evsel__hw_names[PERF_COUNT_HW_MAX] = { "ref-cycles", }; -static const char *__perf_evsel__hw_name(u64 config) +static const char *__evsel__hw_name(u64 config) { if (config < PERF_COUNT_HW_MAX && perf_evsel__hw_names[config]) return perf_evsel__hw_names[config]; @@ -429,9 +429,9 @@ static int perf_evsel__add_modifiers(struct evsel *evsel, char *bf, size_t size) return r; } -static int perf_evsel__hw_name(struct evsel *evsel, char *bf, size_t size) +static int evsel__hw_name(struct evsel *evsel, char *bf, size_t size) { - int r = scnprintf(bf, size, "%s", __perf_evsel__hw_name(evsel->core.attr.config)); + int r = scnprintf(bf, size, "%s", __evsel__hw_name(evsel->core.attr.config)); return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); } @@ -448,20 +448,20 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = { "dummy", }; -static const char *__perf_evsel__sw_name(u64 config) +static const char *__evsel__sw_name(u64 config) { if (config < PERF_COUNT_SW_MAX && perf_evsel__sw_names[config]) return perf_evsel__sw_names[config]; return "unknown-software"; } -static int perf_evsel__sw_name(struct evsel *evsel, char *bf, size_t size) +static int evsel__sw_name(struct evsel *evsel, char *bf, size_t size) { - int r = scnprintf(bf, size, "%s", __perf_evsel__sw_name(evsel->core.attr.config)); + int r = scnprintf(bf, size, "%s", __evsel__sw_name(evsel->core.attr.config)); return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); } -static int __perf_evsel__bp_name(char *bf, size_t size, u64 addr, u64 type) +static int __evsel__bp_name(char *bf, size_t size, u64 addr, u64 type) { int r; @@ -479,10 +479,10 @@ static int __perf_evsel__bp_name(char *bf, size_t size, u64 addr, u64 type) return r; } -static int perf_evsel__bp_name(struct evsel *evsel, char *bf, size_t size) +static int evsel__bp_name(struct evsel *evsel, char *bf, size_t size) { struct perf_event_attr *attr = &evsel->core.attr; - int r = __perf_evsel__bp_name(bf, size, attr->bp_addr, attr->bp_type); + int r = __evsel__bp_name(bf, size, attr->bp_addr, attr->bp_type); return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); } @@ -531,7 +531,7 @@ static unsigned long perf_evsel__hw_cache_stat[C(MAX)] = { [C(NODE)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), }; -bool perf_evsel__is_cache_op_valid(u8 type, u8 op) +bool evsel__is_cache_op_valid(u8 type, u8 op) { if (perf_evsel__hw_cache_stat[type] & COP(op)) return true; /* valid */ @@ -539,8 +539,7 @@ bool perf_evsel__is_cache_op_valid(u8 type, u8 op) return false; /* invalid */ } -int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, - char *bf, size_t size) +int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size) { if (result) { return scnprintf(bf, size, "%s-%s-%s", perf_evsel__hw_cache[type][0], @@ -552,7 +551,7 @@ int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, perf_evsel__hw_cache_op[op][1]); } -static int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size) +static int __evsel__hw_cache_name(u64 config, char *bf, size_t size) { u8 op, result, type = (config >> 0) & 0xff; const char *err = "unknown-ext-hardware-cache-type"; @@ -571,33 +570,33 @@ static int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size) goto out_err; err = "invalid-cache"; - if (!perf_evsel__is_cache_op_valid(type, op)) + if (!evsel__is_cache_op_valid(type, op)) goto out_err; - return __perf_evsel__hw_cache_type_op_res_name(type, op, result, bf, size); + return 
__evsel__hw_cache_type_op_res_name(type, op, result, bf, size); out_err: return scnprintf(bf, size, "%s", err); } -static int perf_evsel__hw_cache_name(struct evsel *evsel, char *bf, size_t size) +static int evsel__hw_cache_name(struct evsel *evsel, char *bf, size_t size) { - int ret = __perf_evsel__hw_cache_name(evsel->core.attr.config, bf, size); + int ret = __evsel__hw_cache_name(evsel->core.attr.config, bf, size); return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); } -static int perf_evsel__raw_name(struct evsel *evsel, char *bf, size_t size) +static int evsel__raw_name(struct evsel *evsel, char *bf, size_t size) { int ret = scnprintf(bf, size, "raw 0x%" PRIx64, evsel->core.attr.config); return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); } -static int perf_evsel__tool_name(char *bf, size_t size) +static int evsel__tool_name(char *bf, size_t size) { int ret = scnprintf(bf, size, "duration_time"); return ret; } -const char *perf_evsel__name(struct evsel *evsel) +const char *evsel__name(struct evsel *evsel) { char bf[128]; @@ -609,22 +608,22 @@ const char *perf_evsel__name(struct evsel *evsel) switch (evsel->core.attr.type) { case PERF_TYPE_RAW: - perf_evsel__raw_name(evsel, bf, sizeof(bf)); + evsel__raw_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_HARDWARE: - perf_evsel__hw_name(evsel, bf, sizeof(bf)); + evsel__hw_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_HW_CACHE: - perf_evsel__hw_cache_name(evsel, bf, sizeof(bf)); + evsel__hw_cache_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_SOFTWARE: if (evsel->tool_event) - perf_evsel__tool_name(bf, sizeof(bf)); + evsel__tool_name(bf, sizeof(bf)); else - perf_evsel__sw_name(evsel, bf, sizeof(bf)); + evsel__sw_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_TRACEPOINT: @@ -632,7 +631,7 @@ const char *perf_evsel__name(struct evsel *evsel) break; case PERF_TYPE_BREAKPOINT: - perf_evsel__bp_name(evsel, bf, sizeof(bf)); + evsel__bp_name(evsel, bf, sizeof(bf)); break; default: @@ -649,7 +648,7 @@ out_unknown: return "unknown"; } -const char *perf_evsel__group_name(struct evsel *evsel) +const char *evsel__group_name(struct evsel *evsel) { return evsel->group_name ?: "anon group"; } @@ -664,21 +663,19 @@ const char *perf_evsel__group_name(struct evsel *evsel) * For record -e 'cycles,instructions' and report --group * 'cycles:u, instructions:u' */ -int perf_evsel__group_desc(struct evsel *evsel, char *buf, size_t size) +int evsel__group_desc(struct evsel *evsel, char *buf, size_t size) { int ret = 0; struct evsel *pos; - const char *group_name = perf_evsel__group_name(evsel); + const char *group_name = evsel__group_name(evsel); if (!evsel->forced_leader) ret = scnprintf(buf, size, "%s { ", group_name); - ret += scnprintf(buf + ret, size - ret, "%s", - perf_evsel__name(evsel)); + ret += scnprintf(buf + ret, size - ret, "%s", evsel__name(evsel)); for_each_group_member(pos, evsel) - ret += scnprintf(buf + ret, size - ret, ", %s", - perf_evsel__name(pos)); + ret += scnprintf(buf + ret, size - ret, ", %s", evsel__name(pos)); if (!evsel->forced_leader) ret += scnprintf(buf + ret, size - ret, " }"); @@ -686,14 +683,13 @@ int perf_evsel__group_desc(struct evsel *evsel, char *buf, size_t size) return ret; } -static void __perf_evsel__config_callchain(struct evsel *evsel, - struct record_opts *opts, - struct callchain_param *param) +static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts, + struct callchain_param *param) { - bool function = perf_evsel__is_function_event(evsel); + 
bool function = evsel__is_function_event(evsel); struct perf_event_attr *attr = &evsel->core.attr; - perf_evsel__set_sample_bit(evsel, CALLCHAIN); + evsel__set_sample_bit(evsel, CALLCHAIN); attr->sample_max_stack = param->max_stack; @@ -708,7 +704,7 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, "to get user callchain information. " "Falling back to framepointers.\n"); } else { - perf_evsel__set_sample_bit(evsel, BRANCH_STACK); + evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_NO_CYCLES | @@ -722,8 +718,8 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, if (param->record_mode == CALLCHAIN_DWARF) { if (!function) { - perf_evsel__set_sample_bit(evsel, REGS_USER); - perf_evsel__set_sample_bit(evsel, STACK_USER); + evsel__set_sample_bit(evsel, REGS_USER); + evsel__set_sample_bit(evsel, STACK_USER); if (opts->sample_user_regs && DWARF_MINIMAL_REGS != PERF_REGS_MASK) { attr->sample_regs_user |= DWARF_MINIMAL_REGS; pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, " @@ -746,12 +742,11 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, } } -void perf_evsel__config_callchain(struct evsel *evsel, - struct record_opts *opts, - struct callchain_param *param) +void evsel__config_callchain(struct evsel *evsel, struct record_opts *opts, + struct callchain_param *param) { if (param->enabled) - return __perf_evsel__config_callchain(evsel, opts, param); + return __evsel__config_callchain(evsel, opts, param); } static void @@ -760,16 +755,16 @@ perf_evsel__reset_callgraph(struct evsel *evsel, { struct perf_event_attr *attr = &evsel->core.attr; - perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + evsel__reset_sample_bit(evsel, CALLCHAIN); if (param->record_mode == CALLCHAIN_LBR) { - perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + evsel__reset_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_HW_INDEX); } if (param->record_mode == CALLCHAIN_DWARF) { - perf_evsel__reset_sample_bit(evsel, REGS_USER); - perf_evsel__reset_sample_bit(evsel, STACK_USER); + evsel__reset_sample_bit(evsel, REGS_USER); + evsel__reset_sample_bit(evsel, STACK_USER); } } @@ -793,32 +788,32 @@ static void apply_config_terms(struct evsel *evsel, if (!(term->weak && opts->user_interval != ULLONG_MAX)) { attr->sample_period = term->val.period; attr->freq = 0; - perf_evsel__reset_sample_bit(evsel, PERIOD); + evsel__reset_sample_bit(evsel, PERIOD); } break; case PERF_EVSEL__CONFIG_TERM_FREQ: if (!(term->weak && opts->user_freq != UINT_MAX)) { attr->sample_freq = term->val.freq; attr->freq = 1; - perf_evsel__set_sample_bit(evsel, PERIOD); + evsel__set_sample_bit(evsel, PERIOD); } break; case PERF_EVSEL__CONFIG_TERM_TIME: if (term->val.time) - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, TIME); else - perf_evsel__reset_sample_bit(evsel, TIME); + evsel__reset_sample_bit(evsel, TIME); break; case PERF_EVSEL__CONFIG_TERM_CALLGRAPH: callgraph_buf = term->val.str; break; case PERF_EVSEL__CONFIG_TERM_BRANCH: if (term->val.str && strcmp(term->val.str, "no")) { - perf_evsel__set_sample_bit(evsel, BRANCH_STACK); + evsel__set_sample_bit(evsel, BRANCH_STACK); parse_branch_str(term->val.str, &attr->branch_sample_type); } else - perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + evsel__reset_sample_bit(evsel, BRANCH_STACK); break; case 
PERF_EVSEL__CONFIG_TERM_STACK_USER: dump_size = term->val.stack_user; @@ -832,7 +827,7 @@ static void apply_config_terms(struct evsel *evsel, case PERF_EVSEL__CONFIG_TERM_INHERIT: /* * attr->inherit should has already been set by - * perf_evsel__config. If user explicitly set + * evsel__config. If user explicitly set * inherit using config terms, override global * opt->no_inherit setting. */ @@ -897,11 +892,11 @@ static void apply_config_terms(struct evsel *evsel, /* set perf-event callgraph */ if (param.enabled) { if (sample_address) { - perf_evsel__set_sample_bit(evsel, ADDR); - perf_evsel__set_sample_bit(evsel, DATA_SRC); + evsel__set_sample_bit(evsel, ADDR); + evsel__set_sample_bit(evsel, DATA_SRC); evsel->core.attr.mmap_data = track; } - perf_evsel__config_callchain(evsel, opts, &param); + evsel__config_callchain(evsel, opts, &param); } } } @@ -953,8 +948,8 @@ struct perf_evsel_config_term *__perf_evsel__get_config_term(struct evsel *evsel * enable/disable events specifically, as there's no * initial traced exec call. */ -void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, - struct callchain_param *callchain) +void evsel__config(struct evsel *evsel, struct record_opts *opts, + struct callchain_param *callchain) { struct evsel *leader = evsel->leader; struct perf_event_attr *attr = &evsel->core.attr; @@ -965,17 +960,17 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, attr->inherit = !opts->no_inherit; attr->write_backward = opts->overwrite ? 1 : 0; - perf_evsel__set_sample_bit(evsel, IP); - perf_evsel__set_sample_bit(evsel, TID); + evsel__set_sample_bit(evsel, IP); + evsel__set_sample_bit(evsel, TID); if (evsel->sample_read) { - perf_evsel__set_sample_bit(evsel, READ); + evsel__set_sample_bit(evsel, READ); /* * We need ID even in case of single event, because * PERF_SAMPLE_READ process ID specific data. */ - perf_evsel__set_sample_id(evsel, false); + evsel__set_sample_id(evsel, false); /* * Apply group format only if we belong to group @@ -994,7 +989,7 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (!attr->sample_period || (opts->user_freq != UINT_MAX || opts->user_interval != ULLONG_MAX)) { if (opts->freq) { - perf_evsel__set_sample_bit(evsel, PERIOD); + evsel__set_sample_bit(evsel, PERIOD); attr->freq = 1; attr->sample_freq = opts->freq; } else { @@ -1002,25 +997,6 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, attr->sample_freq = 0; } } - /* - * Disable sampling for all group members other - * than leader in case leader 'leads' the sampling. - */ - if ((leader != evsel) && leader->sample_read) { - attr->freq = 0; - attr->sample_freq = 0; - attr->sample_period = 0; - attr->write_backward = 0; - - /* - * We don't get sample for slave events, we make them - * when delivering group leader sample. Set the slave - * event to follow the master sample_type to ease up - * report. - */ - attr->sample_type = leader->core.attr.sample_type; - } - if (opts->no_samples) attr->sample_freq = 0; @@ -1033,7 +1009,7 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, } if (opts->sample_address) { - perf_evsel__set_sample_bit(evsel, ADDR); + evsel__set_sample_bit(evsel, ADDR); attr->mmap_data = track; }
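
The evsel__config() hunks above show the frequency/period decision: in frequency mode attr.freq is set, the kernel auto-tunes the period towards the requested rate, and the PERIOD sample bit is added so readers can recover the period actually used; with a fixed interval, attr.sample_period is taken verbatim. A minimal direct perf_event_open(2) sketch of the same two attr layouts (plain syscall outside the perf tool; the 4000 Hz rate is an illustrative value)::

    /* Open a self-monitoring CPU-clock event in frequency mode,
     * mirroring the attr fields evsel__config() fills in. */
    #include <linux/perf_event.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_CPU_CLOCK;

            /* Frequency mode: the kernel adjusts the period to aim
             * at ~4000 samples/s; PERF_SAMPLE_PERIOD lets a reader
             * see the period that was actually in effect. */
            attr.freq = 1;
            attr.sample_freq = 4000;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
                               PERF_SAMPLE_PERIOD;

            /* Fixed-period mode would instead set:
             *      attr.freq = 0;
             *      attr.sample_period = 100000;
             */

            fd = syscall(SYS_perf_event_open, &attr, 0 /* self */,
                         -1 /* any cpu */, -1 /* no group */, 0);
            if (fd < 0) {
                    perror("perf_event_open");
                    return 1;
            }
            close(fd);
            return 0;
    }
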
@@ -1042,24 +1018,24 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, * event, due to issues with page faults while tracing page * fault handler and its overall trickiness nature. */ - if (perf_evsel__is_function_event(evsel)) + if (evsel__is_function_event(evsel)) evsel->core.attr.exclude_callchain_user = 1; if (callchain && callchain->enabled && !evsel->no_aux_samples) - perf_evsel__config_callchain(evsel, opts, callchain); + evsel__config_callchain(evsel, opts, callchain); if (opts->sample_intr_regs) { attr->sample_regs_intr = opts->sample_intr_regs; - perf_evsel__set_sample_bit(evsel, REGS_INTR); + evsel__set_sample_bit(evsel, REGS_INTR); } if (opts->sample_user_regs) { attr->sample_regs_user |= opts->sample_user_regs; - perf_evsel__set_sample_bit(evsel, REGS_USER); + evsel__set_sample_bit(evsel, REGS_USER); } if (target__has_cpu(&opts->target) || opts->sample_cpu) - perf_evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, CPU); /* * When the user explicitly disabled time don't force it here. @@ -1068,31 +1044,31 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, (!perf_missing_features.sample_id_all && (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu || opts->sample_time_set))) - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, TIME); if (opts->raw_samples && !evsel->no_aux_samples) { - perf_evsel__set_sample_bit(evsel, TIME); - perf_evsel__set_sample_bit(evsel, RAW); - perf_evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, RAW); + evsel__set_sample_bit(evsel, CPU); } if (opts->sample_address) - perf_evsel__set_sample_bit(evsel, DATA_SRC); + evsel__set_sample_bit(evsel, DATA_SRC); if (opts->sample_phys_addr) - perf_evsel__set_sample_bit(evsel, PHYS_ADDR); + evsel__set_sample_bit(evsel, PHYS_ADDR); if (opts->no_buffering) { attr->watermark = 0; attr->wakeup_events = 1; } if (opts->branch_stack && !evsel->no_aux_samples) { - perf_evsel__set_sample_bit(evsel, BRANCH_STACK); + evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = opts->branch_stack; } if (opts->sample_weight) - perf_evsel__set_sample_bit(evsel, WEIGHT); + evsel__set_sample_bit(evsel, WEIGHT); attr->task = track; attr->mmap = track; @@ -1106,14 +1082,14 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (opts->record_cgroup) { attr->cgroup = track && !perf_missing_features.cgroup; - perf_evsel__set_sample_bit(evsel, CGROUP); + evsel__set_sample_bit(evsel, CGROUP); } if (opts->record_switch_events) attr->context_switch = track; if (opts->sample_transaction) - perf_evsel__set_sample_bit(evsel, TRANSACTION); + evsel__set_sample_bit(evsel, TRANSACTION); if (opts->running_time) { evsel->core.attr.read_format |= @@ -1127,15 +1103,15 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, * Disabling only independent events or group leaders, * keeping group members enabled. */ - if (perf_evsel__is_group_leader(evsel)) + if (evsel__is_group_leader(evsel)) attr->disabled = 1; /* * Setting enable_on_exec for independent events and * group leaders for traced executed by perf. */ - if (target__none(&opts->target) && perf_evsel__is_group_leader(evsel) && - !opts->initial_delay) + if (target__none(&opts->target) && evsel__is_group_leader(evsel) && + !opts->initial_delay) attr->enable_on_exec = 1; if (evsel->immediate) { @@ -1176,9 +1152,9 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, /* The --period option takes the precedence. 
*/ if (opts->period_set) { if (opts->period) - perf_evsel__set_sample_bit(evsel, PERIOD); + evsel__set_sample_bit(evsel, PERIOD); else - perf_evsel__reset_sample_bit(evsel, PERIOD); + evsel__reset_sample_bit(evsel, PERIOD); } /* @@ -1187,10 +1163,10 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, * if BRANCH_STACK bit is set. */ if (opts->initial_delay && is_dummy_event(evsel)) - perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + evsel__reset_sample_bit(evsel, BRANCH_STACK); } -int perf_evsel__set_filter(struct evsel *evsel, const char *filter) +int evsel__set_filter(struct evsel *evsel, const char *filter) { char *new_filter = strdup(filter); @@ -1203,13 +1179,12 @@ int perf_evsel__set_filter(struct evsel *evsel, const char *filter) return -1; } -static int perf_evsel__append_filter(struct evsel *evsel, - const char *fmt, const char *filter) +static int evsel__append_filter(struct evsel *evsel, const char *fmt, const char *filter) { char *new_filter; if (evsel->filter == NULL) - return perf_evsel__set_filter(evsel, filter); + return evsel__set_filter(evsel, filter); if (asprintf(&new_filter, fmt, evsel->filter, filter) > 0) { free(evsel->filter); @@ -1220,14 +1195,14 @@ static int perf_evsel__append_filter(struct evsel *evsel, return -1; } -int perf_evsel__append_tp_filter(struct evsel *evsel, const char *filter) +int evsel__append_tp_filter(struct evsel *evsel, const char *filter) { - return perf_evsel__append_filter(evsel, "(%s) && (%s)", filter); + return evsel__append_filter(evsel, "(%s) && (%s)", filter); } -int perf_evsel__append_addr_filter(struct evsel *evsel, const char *filter) +int evsel__append_addr_filter(struct evsel *evsel, const char *filter) { - return perf_evsel__append_filter(evsel, "%s,%s", filter); + return evsel__append_filter(evsel, "%s,%s", filter); } /* Caller has to clear disabled after going through all CPUs. 
*/ @@ -1278,7 +1253,7 @@ static void perf_evsel__free_config_terms(struct evsel *evsel) } } -void perf_evsel__exit(struct evsel *evsel) +void evsel__exit(struct evsel *evsel) { assert(list_empty(&evsel->core.node)); assert(evsel->evlist == NULL); @@ -1298,12 +1273,12 @@ void perf_evsel__exit(struct evsel *evsel) void evsel__delete(struct evsel *evsel) { - perf_evsel__exit(evsel); + evsel__exit(evsel); free(evsel); } -void perf_evsel__compute_deltas(struct evsel *evsel, int cpu, int thread, - struct perf_counts_values *count) +void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread, + struct perf_counts_values *count) { struct perf_counts_values tmp; @@ -1342,8 +1317,7 @@ void perf_counts_values__scale(struct perf_counts_values *count, *pscaled = scaled; } -static int -perf_evsel__read_one(struct evsel *evsel, int cpu, int thread) +static int evsel__read_one(struct evsel *evsel, int cpu, int thread) { struct perf_counts_values *count = perf_counts(evsel->counts, cpu, thread); @@ -1403,8 +1377,7 @@ perf_evsel__process_group_data(struct evsel *leader, return 0; } -static int -perf_evsel__read_group(struct evsel *leader, int cpu, int thread) +static int evsel__read_group(struct evsel *leader, int cpu, int thread) { struct perf_stat_evsel *ps = leader->stats; u64 read_format = leader->core.attr.read_format; @@ -1414,7 +1387,7 @@ perf_evsel__read_group(struct evsel *leader, int cpu, int thread) if (!(read_format & PERF_FORMAT_ID)) return -EINVAL; - if (!perf_evsel__is_group_leader(leader)) + if (!evsel__is_group_leader(leader)) return -EINVAL; if (!data) { @@ -1434,18 +1407,17 @@ perf_evsel__read_group(struct evsel *leader, int cpu, int thread) return perf_evsel__process_group_data(leader, cpu, thread, data); } -int perf_evsel__read_counter(struct evsel *evsel, int cpu, int thread) +int evsel__read_counter(struct evsel *evsel, int cpu, int thread) { u64 read_format = evsel->core.attr.read_format; if (read_format & PERF_FORMAT_GROUP) - return perf_evsel__read_group(evsel, cpu, thread); - else - return perf_evsel__read_one(evsel, cpu, thread); + return evsel__read_group(evsel, cpu, thread); + + return evsel__read_one(evsel, cpu, thread); } -int __perf_evsel__read_on_cpu(struct evsel *evsel, - int cpu, int thread, bool scale) +int __evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread, bool scale) { struct perf_counts_values count; size_t nv = scale ? 3 : 1; @@ -1459,7 +1431,7 @@ int __perf_evsel__read_on_cpu(struct evsel *evsel, if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) <= 0) return -errno; - perf_evsel__compute_deltas(evsel, cpu, thread, &count); + evsel__compute_deltas(evsel, cpu, thread, &count); perf_counts_values__scale(&count, scale, NULL); *perf_counts(evsel->counts, cpu, thread) = count; return 0; @@ -1470,7 +1442,7 @@ static int get_group_fd(struct evsel *evsel, int cpu, int thread) struct evsel *leader = evsel->leader; int fd; - if (perf_evsel__is_group_leader(evsel)) + if (evsel__is_group_leader(evsel)) return -1; /* @@ -1749,8 +1721,7 @@ retry_open: /* * If we succeeded but had to kill clockid, fail and - * have perf_evsel__open_strerror() print us a nice - * error. + * have evsel__open_strerror() print us a nice error. 
*/ if (perf_missing_features.clockid || perf_missing_features.clockid_wrong) { @@ -1854,7 +1825,7 @@ try_fallback: } else if (!perf_missing_features.group_read && evsel->core.attr.inherit && (evsel->core.attr.read_format & PERF_FORMAT_GROUP) && - perf_evsel__is_group_leader(evsel)) { + evsel__is_group_leader(evsel)) { perf_missing_features.group_read = true; pr_debug2_peo("switching off group read\n"); goto fallback_missing_features; @@ -1888,9 +1859,7 @@ void evsel__close(struct evsel *evsel) perf_evsel__free_id(&evsel->core); } -int perf_evsel__open_per_cpu(struct evsel *evsel, - struct perf_cpu_map *cpus, - int cpu) +int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu) { if (cpu == -1) return evsel__open_cpu(evsel, cpus, NULL, 0, @@ -1899,8 +1868,7 @@ int perf_evsel__open_per_cpu(struct evsel *evsel, return evsel__open_cpu(evsel, cpus, NULL, cpu, cpu + 1); } -int perf_evsel__open_per_thread(struct evsel *evsel, - struct perf_thread_map *threads) +int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads) { return evsel__open(evsel, NULL, threads); } @@ -1995,8 +1963,8 @@ perf_event__check_size(union perf_event *event, unsigned int sample_size) return 0; } -int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, - struct perf_sample *data) +int evsel__parse_sample(struct evsel *evsel, union perf_event *event, + struct perf_sample *data) { u64 type = evsel->core.attr.sample_type; bool swapped = evsel->needs_swap; @@ -2136,7 +2104,7 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, } } - if (evsel__has_callchain(evsel)) { + if (type & PERF_SAMPLE_CALLCHAIN) { const u64 max_callchain_nr = UINT64_MAX / sizeof(u64); OVERFLOW_CHECK_u64(array); @@ -2190,7 +2158,7 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, return -EFAULT; sz = data->branch_stack->nr * sizeof(struct branch_entry); - if (perf_evsel__has_branch_hw_idx(evsel)) + if (evsel__has_branch_hw_idx(evsel)) sz += sizeof(u64); else data->no_hw_idx = true; @@ -2298,9 +2266,8 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, return 0; } -int perf_evsel__parse_sample_timestamp(struct evsel *evsel, - union perf_event *event, - u64 *timestamp) +int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, + u64 *timestamp) { u64 type = evsel->core.attr.sample_type; const __u64 *array; @@ -2342,15 +2309,14 @@ int perf_evsel__parse_sample_timestamp(struct evsel *evsel, return 0; } -struct tep_format_field *perf_evsel__field(struct evsel *evsel, const char *name) +struct tep_format_field *evsel__field(struct evsel *evsel, const char *name) { return tep_find_field(evsel->tp_format, name); } -void *perf_evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, - const char *name) +void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char *name) { - struct tep_format_field *field = perf_evsel__field(evsel, name); + struct tep_format_field *field = evsel__field(evsel, name); int offset; if (!field) @@ -2405,10 +2371,9 @@ u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sam return 0; } -u64 perf_evsel__intval(struct evsel *evsel, struct perf_sample *sample, - const char *name) +u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *name) { - struct tep_format_field *field = perf_evsel__field(evsel, name); + struct tep_format_field *field = evsel__field(evsel, name); if (!field) return 0; @@ -2416,8 
+2381,7 @@ u64 perf_evsel__intval(struct evsel *evsel, struct perf_sample *sample, return field ? format_field__intval(field, sample, evsel->needs_swap) : 0; } -bool perf_evsel__fallback(struct evsel *evsel, int err, - char *msg, size_t msgsize) +bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize) { int paranoid; @@ -2442,10 +2406,14 @@ bool perf_evsel__fallback(struct evsel *evsel, int err, return true; } else if (err == EACCES && !evsel->core.attr.exclude_kernel && (paranoid = perf_event_paranoid()) > 1) { - const char *name = perf_evsel__name(evsel); + const char *name = evsel__name(evsel); char *new_name; const char *sep = ":"; + /* If event has exclude user then don't exclude kernel. */ + if (evsel->core.attr.exclude_user) + return false; + /* Is there already the separator in the name. */ if (strchr(name, '/') || strchr(name, ':')) @@ -2505,8 +2473,8 @@ static bool find_process(const char *name) return ret ? false : true; } -int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, - int err, char *msg, size_t size) +int evsel__open_strerror(struct evsel *evsel, struct target *target, + int err, char *msg, size_t size) { char sbuf[STRERR_BUFSIZE]; int printed = 0; @@ -2516,28 +2484,26 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, case EACCES: if (err == EPERM) printed = scnprintf(msg, size, - "No permission to enable %s event.\n\n", - perf_evsel__name(evsel)); + "No permission to enable %s event.\n\n", evsel__name(evsel)); return scnprintf(msg + printed, size - printed, "You may not have permission to collect %sstats.\n\n" "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" "which controls use of the performance events system by\n" - "unprivileged users (without CAP_SYS_ADMIN).\n\n" + "unprivileged users (without CAP_PERFMON or CAP_SYS_ADMIN).\n\n" "The current value is %d:\n\n" " -1: Allow use of (almost) all events by all users\n" " Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n" - ">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN\n" - " Disallow raw tracepoint access by users without CAP_SYS_ADMIN\n" - ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" - ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN\n\n" + ">= 0: Disallow ftrace function tracepoint by users without CAP_PERFMON or CAP_SYS_ADMIN\n" + " Disallow raw tracepoint access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n" + ">= 1: Disallow CPU event access by users without CAP_PERFMON or CAP_SYS_ADMIN\n" + ">= 2: Disallow kernel profiling by users without CAP_PERFMON or CAP_SYS_ADMIN\n\n" "To make this setting permanent, edit /etc/sysctl.conf too, e.g.:\n\n" " kernel.perf_event_paranoid = -1\n" , target->system_wide ? "system-wide " : "", perf_event_paranoid()); case ENOENT: - return scnprintf(msg, size, "The %s event is not supported.", - perf_evsel__name(evsel)); + return scnprintf(msg, size, "The %s event is not supported.", evsel__name(evsel)); case EMFILE: return scnprintf(msg, size, "%s", "Too many events are opened.\n" @@ -2561,7 +2527,7 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, if (evsel->core.attr.sample_period != 0) return scnprintf(msg, size, "%s: PMU Hardware doesn't support sampling/overflow-interrupts. Try 'perf stat'", - perf_evsel__name(evsel)); + evsel__name(evsel)); if (evsel->core.attr.precise_ip) return scnprintf(msg, size, "%s", "\'precise\' request may not be supported. 
Try removing 'p' modifier."); @@ -2594,11 +2560,10 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, return scnprintf(msg, size, "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n" "/bin/dmesg | grep -i perf may provide additional information.\n", - err, str_error_r(err, sbuf, sizeof(sbuf)), - perf_evsel__name(evsel)); + err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel)); } -struct perf_env *perf_evsel__env(struct evsel *evsel) +struct perf_env *evsel__env(struct evsel *evsel) { if (evsel && evsel->evlist) return evsel->evlist->env; @@ -2623,7 +2588,7 @@ static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist) return 0; } -int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist) +int evsel__store_ids(struct evsel *evsel, struct evlist *evlist) { struct perf_cpu_map *cpus = evsel->core.cpus; struct perf_thread_map *threads = evsel->core.threads; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 53187c501ee8..351c0aaf2a11 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -18,7 +18,7 @@ struct perf_counts; struct perf_stat_evsel; union perf_event; -typedef int (perf_evsel__sb_cb_t)(union perf_event *event, void *data); +typedef int (evsel__sb_cb_t)(union perf_event *event, void *data); enum perf_tool_event { PERF_TOOL_NONE = 0, @@ -101,9 +101,17 @@ struct evsel { int cpu_iter; const char *pmu_name; struct { - perf_evsel__sb_cb_t *cb; - void *data; + evsel__sb_cb_t *cb; + void *data; } side_band; + /* + * For reporting purposes, an evsel sample can have a callchain + * synthesized from AUX area data. Keep track of synthesized sample + * types here. Note, the recorded sample_type cannot be changed because + * it is needed to continue to parse events. + * See also evsel__has_callchain(). 
+ */ + __u64 synth_sample_type; }; struct perf_missing_features { @@ -135,7 +143,7 @@ static inline struct perf_cpu_map *evsel__cpus(struct evsel *evsel) return perf_evsel__cpus(&evsel->core); } -static inline int perf_evsel__nr_cpus(struct evsel *evsel) +static inline int evsel__nr_cpus(struct evsel *evsel) { return evsel__cpus(evsel)->nr; } @@ -143,13 +151,16 @@ static inline int perf_evsel__nr_cpus(struct evsel *evsel) void perf_counts_values__scale(struct perf_counts_values *count, bool scale, s8 *pscaled); -void perf_evsel__compute_deltas(struct evsel *evsel, int cpu, int thread, - struct perf_counts_values *count); +void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread, + struct perf_counts_values *count); int perf_evsel__object_config(size_t object_size, int (*init)(struct evsel *evsel), void (*fini)(struct evsel *evsel)); +struct perf_pmu *evsel__find_pmu(struct evsel *evsel); +bool evsel__is_aux_event(struct evsel *evsel); + struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx); static inline struct evsel *evsel__new(struct perf_event_attr *attr) @@ -172,22 +183,20 @@ struct evsel *perf_evsel__new_cycles(bool precise); struct tep_event *event_format__new(const char *sys, const char *name); void evsel__init(struct evsel *evsel, struct perf_event_attr *attr, int idx); -void perf_evsel__exit(struct evsel *evsel); +void evsel__exit(struct evsel *evsel); void evsel__delete(struct evsel *evsel); struct callchain_param; -void perf_evsel__config(struct evsel *evsel, - struct record_opts *opts, - struct callchain_param *callchain); -void perf_evsel__config_callchain(struct evsel *evsel, - struct record_opts *opts, - struct callchain_param *callchain); +void evsel__config(struct evsel *evsel, struct record_opts *opts, + struct callchain_param *callchain); +void evsel__config_callchain(struct evsel *evsel, struct record_opts *opts, + struct callchain_param *callchain); -int __perf_evsel__sample_size(u64 sample_type); -void perf_evsel__calc_id_pos(struct evsel *evsel); +int __evsel__sample_size(u64 sample_type); +void evsel__calc_id_pos(struct evsel *evsel); -bool perf_evsel__is_cache_op_valid(u8 type, u8 op); +bool evsel__is_cache_op_valid(u8 type, u8 op); #define PERF_EVSEL__MAX_ALIASES 8 @@ -199,177 +208,153 @@ extern const char *perf_evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX] [PERF_EVSEL__MAX_ALIASES]; extern const char *perf_evsel__hw_names[PERF_COUNT_HW_MAX]; extern const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX]; -int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, - char *bf, size_t size); -const char *perf_evsel__name(struct evsel *evsel); +int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size); +const char *evsel__name(struct evsel *evsel); -const char *perf_evsel__group_name(struct evsel *evsel); -int perf_evsel__group_desc(struct evsel *evsel, char *buf, size_t size); +const char *evsel__group_name(struct evsel *evsel); +int evsel__group_desc(struct evsel *evsel, char *buf, size_t size); -void __perf_evsel__set_sample_bit(struct evsel *evsel, - enum perf_event_sample_format bit); -void __perf_evsel__reset_sample_bit(struct evsel *evsel, - enum perf_event_sample_format bit); +void __evsel__set_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit); +void __evsel__reset_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit); -#define perf_evsel__set_sample_bit(evsel, bit) \ - __perf_evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit) +#define 
evsel__set_sample_bit(evsel, bit) \
+	__evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit)

-#define perf_evsel__reset_sample_bit(evsel, bit) \
-	__perf_evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit)
+#define evsel__reset_sample_bit(evsel, bit) \
+	__evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit)

-void perf_evsel__set_sample_id(struct evsel *evsel,
-			       bool use_sample_identifier);
+void evsel__set_sample_id(struct evsel *evsel, bool use_sample_identifier);

-int perf_evsel__set_filter(struct evsel *evsel, const char *filter);
-int perf_evsel__append_tp_filter(struct evsel *evsel, const char *filter);
-int perf_evsel__append_addr_filter(struct evsel *evsel,
-				   const char *filter);
+int evsel__set_filter(struct evsel *evsel, const char *filter);
+int evsel__append_tp_filter(struct evsel *evsel, const char *filter);
+int evsel__append_addr_filter(struct evsel *evsel, const char *filter);
 int evsel__enable_cpu(struct evsel *evsel, int cpu);
 int evsel__enable(struct evsel *evsel);
 int evsel__disable(struct evsel *evsel);
 int evsel__disable_cpu(struct evsel *evsel, int cpu);

-int perf_evsel__open_per_cpu(struct evsel *evsel,
-			     struct perf_cpu_map *cpus,
-			     int cpu);
-int perf_evsel__open_per_thread(struct evsel *evsel,
-				struct perf_thread_map *threads);
+int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu);
+int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads);
 int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
 		struct perf_thread_map *threads);
 void evsel__close(struct evsel *evsel);

 struct perf_sample;

-void *perf_evsel__rawptr(struct evsel *evsel, struct perf_sample *sample,
-			 const char *name);
-u64 perf_evsel__intval(struct evsel *evsel, struct perf_sample *sample,
-		       const char *name);
+void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char *name);
+u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *name);

-static inline char *perf_evsel__strval(struct evsel *evsel,
-				       struct perf_sample *sample,
-				       const char *name)
+static inline char *evsel__strval(struct evsel *evsel, struct perf_sample *sample, const char *name)
 {
-	return perf_evsel__rawptr(evsel, sample, name);
+	return evsel__rawptr(evsel, sample, name);
 }

 struct tep_format_field;

 u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sample, bool needs_swap);

-struct tep_format_field *perf_evsel__field(struct evsel *evsel, const char *name);
+struct tep_format_field *evsel__field(struct evsel *evsel, const char *name);

-#define perf_evsel__match(evsel, t, c)			\
+#define evsel__match(evsel, t, c)			\
 	(evsel->core.attr.type == PERF_TYPE_##t &&	\
 	 evsel->core.attr.config == PERF_COUNT_##c)

-static inline bool perf_evsel__match2(struct evsel *e1,
-				      struct evsel *e2)
+static inline bool evsel__match2(struct evsel *e1, struct evsel *e2)
 {
 	return (e1->core.attr.type == e2->core.attr.type) &&
 	       (e1->core.attr.config == e2->core.attr.config);
 }

-#define perf_evsel__cmp(a, b)			\
-	((a) &&					\
-	 (b) &&					\
-	 (a)->core.attr.type == (b)->core.attr.type &&	\
-	 (a)->core.attr.config == (b)->core.attr.config)
-
-int perf_evsel__read_counter(struct evsel *evsel, int cpu, int thread);
+int evsel__read_counter(struct evsel *evsel, int cpu, int thread);

-int __perf_evsel__read_on_cpu(struct evsel *evsel,
-			      int cpu, int thread, bool scale);
+int __evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread, bool scale);

 /**
- * perf_evsel__read_on_cpu - Read out the results on a CPU and thread
+ * evsel__read_on_cpu - Read out the results on a CPU and thread
 *
 * @evsel - event selector to read value
 * @cpu - CPU of interest
 * @thread - thread of interest
 */
-static inline int perf_evsel__read_on_cpu(struct evsel *evsel,
-					  int cpu, int thread)
+static inline int evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread)
 {
-	return __perf_evsel__read_on_cpu(evsel, cpu, thread, false);
+	return __evsel__read_on_cpu(evsel, cpu, thread, false);
 }

 /**
- * perf_evsel__read_on_cpu_scaled - Read out the results on a CPU and thread, scaled
+ * evsel__read_on_cpu_scaled - Read out the results on a CPU and thread, scaled
 *
 * @evsel - event selector to read value
 * @cpu - CPU of interest
 * @thread - thread of interest
 */
-static inline int perf_evsel__read_on_cpu_scaled(struct evsel *evsel,
-						 int cpu, int thread)
+static inline int evsel__read_on_cpu_scaled(struct evsel *evsel, int cpu, int thread)
 {
-	return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
+	return __evsel__read_on_cpu(evsel, cpu, thread, true);
 }

-int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event,
-			     struct perf_sample *sample);
+int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
+			struct perf_sample *sample);

-int perf_evsel__parse_sample_timestamp(struct evsel *evsel,
-				       union perf_event *event,
-				       u64 *timestamp);
+int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event,
+				  u64 *timestamp);

-static inline struct evsel *perf_evsel__next(struct evsel *evsel)
+static inline struct evsel *evsel__next(struct evsel *evsel)
 {
 	return list_entry(evsel->core.node.next, struct evsel, core.node);
 }

-static inline struct evsel *perf_evsel__prev(struct evsel *evsel)
+static inline struct evsel *evsel__prev(struct evsel *evsel)
 {
 	return list_entry(evsel->core.node.prev, struct evsel, core.node);
 }

 /**
- * perf_evsel__is_group_leader - Return whether given evsel is a leader event
+ * evsel__is_group_leader - Return whether given evsel is a leader event
 *
 * @evsel - evsel selector to be tested
 *
 * Return %true if @evsel is a group leader or a stand-alone event
 */
-static inline bool perf_evsel__is_group_leader(const struct evsel *evsel)
+static inline bool evsel__is_group_leader(const struct evsel *evsel)
 {
 	return evsel->leader == evsel;
 }

 /**
- * perf_evsel__is_group_event - Return whether given evsel is a group event
+ * evsel__is_group_event - Return whether given evsel is a group event
 *
 * @evsel - evsel selector to be tested
 *
 * Return %true iff event group view is enabled and @evsel is an actual group
 * leader which has other members in the group
 */
-static inline bool perf_evsel__is_group_event(struct evsel *evsel)
+static inline bool evsel__is_group_event(struct evsel *evsel)
 {
 	if (!symbol_conf.event_group)
 		return false;

-	return perf_evsel__is_group_leader(evsel) && evsel->core.nr_members > 1;
+	return evsel__is_group_leader(evsel) && evsel->core.nr_members > 1;
 }

-bool perf_evsel__is_function_event(struct evsel *evsel);
+bool evsel__is_function_event(struct evsel *evsel);

-static inline bool perf_evsel__is_bpf_output(struct evsel *evsel)
+static inline bool evsel__is_bpf_output(struct evsel *evsel)
 {
-	return perf_evsel__match(evsel, SOFTWARE, SW_BPF_OUTPUT);
+	return evsel__match(evsel, SOFTWARE, SW_BPF_OUTPUT);
 }

-static inline bool perf_evsel__is_clock(struct evsel *evsel)
+static inline bool evsel__is_clock(struct evsel *evsel)
 {
-	return perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
-	       perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK);
+	return evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
+	       evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK);
 }

-bool perf_evsel__fallback(struct evsel *evsel, int err,
-			  char *msg, size_t msgsize);
-int perf_evsel__open_strerror(struct evsel *evsel, struct target *target,
-			      int err, char *msg, size_t size);
+bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize);
+int evsel__open_strerror(struct evsel *evsel, struct target *target,
+			 int err, char *msg, size_t size);

-static inline int perf_evsel__group_idx(struct evsel *evsel)
+static inline int evsel__group_idx(struct evsel *evsel)
 {
 	return evsel->idx - evsel->leader->idx;
 }

@@ -386,22 +371,37 @@ for ((_evsel) = _leader;					\
      (_evsel) && (_evsel)->leader == (_leader);			\
      (_evsel) = list_entry((_evsel)->core.node.next, struct evsel, core.node))

-static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel)
+static inline bool evsel__has_branch_callstack(const struct evsel *evsel)
 {
 	return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
 }

-static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel)
+static inline bool evsel__has_branch_hw_idx(const struct evsel *evsel)
 {
 	return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
 }

 static inline bool evsel__has_callchain(const struct evsel *evsel)
 {
-	return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0;
+	/*
+	 * For reporting purposes, an evsel sample can have a recorded callchain
+	 * or a callchain synthesized from AUX area data.
+	 */
+	return evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN ||
+	       evsel->synth_sample_type & PERF_SAMPLE_CALLCHAIN;
+}
+
+static inline bool evsel__has_br_stack(const struct evsel *evsel)
+{
+	/*
+	 * For reporting purposes, an evsel sample can have a recorded branch
+	 * stack or a branch stack synthesized from AUX area data.
+	 */
+	return evsel->core.attr.sample_type & PERF_SAMPLE_BRANCH_STACK ||
+	       evsel->synth_sample_type & PERF_SAMPLE_BRANCH_STACK;
 }

-struct perf_env *perf_evsel__env(struct evsel *evsel);
+struct perf_env *evsel__env(struct evsel *evsel);

-int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
+int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);

 #endif /* __PERF_EVSEL_H */
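A minimal standalone sketch of the token-pasting pattern behind the evsel__set_sample_bit() macro renamed above. Only <linux/perf_event.h> is assumed; struct fake_evsel and the macro name are hypothetical stand-ins, not perf's own types:

#include <linux/perf_event.h>
#include <stdio.h>

struct fake_evsel { struct perf_event_attr attr; };

/* "bit" is pasted onto PERF_SAMPLE_, so callers pass just CALLCHAIN etc. */
#define fake_evsel__set_sample_bit(evsel, bit) \
	((evsel)->attr.sample_type |= PERF_SAMPLE_##bit)

int main(void)
{
	struct fake_evsel ev = { 0 };

	fake_evsel__set_sample_bit(&ev, CALLCHAIN);    /* PERF_SAMPLE_CALLCHAIN */
	fake_evsel__set_sample_bit(&ev, BRANCH_STACK); /* PERF_SAMPLE_BRANCH_STACK */
	printf("sample_type = %#llx\n", (unsigned long long)ev.attr.sample_type);
	return 0;
}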
diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
index e026ab67b008..f8938916577c 100644
--- a/tools/perf/util/evsel_config.h
+++ b/tools/perf/util/evsel_config.h
@@ -7,7 +7,7 @@

 /*
  * The 'struct perf_evsel_config_term' is used to pass event
- * specific configuration data to perf_evsel__config routine.
+ * specific configuration data to evsel__config routine.
  * It is allocated within event parsing and attached to
  * perf_evsel::config_terms list head.
  */
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
index 3b4842840db0..99aed708bd5a 100644
--- a/tools/perf/util/evsel_fprintf.c
+++ b/tools/perf/util/evsel_fprintf.c
@@ -44,22 +44,22 @@ int perf_evsel__fprintf(struct evsel *evsel,
 	if (details->event_group) {
 		struct evsel *pos;

-		if (!perf_evsel__is_group_leader(evsel))
+		if (!evsel__is_group_leader(evsel))
 			return 0;

 		if (evsel->core.nr_members > 1)
 			printed += fprintf(fp, "%s{", evsel->group_name ?: "");

-		printed += fprintf(fp, "%s", perf_evsel__name(evsel));
+		printed += fprintf(fp, "%s", evsel__name(evsel));
 		for_each_group_member(pos, evsel)
-			printed += fprintf(fp, ",%s", perf_evsel__name(pos));
+			printed += fprintf(fp, ",%s", evsel__name(pos));

 		if (evsel->core.nr_members > 1)
 			printed += fprintf(fp, "}");
 		goto out;
 	}

-	printed += fprintf(fp, "%s", perf_evsel__name(evsel));
+	printed += fprintf(fp, "%s", evsel__name(evsel));

 	if (details->verbose) {
 		printed += perf_event_attr__fprintf(fp, &evsel->core.attr,
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index fd192ddf93c1..aa631e37ad1e 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -3,7 +3,6 @@
 #include <assert.h>
 #include "expr.h"
 #include "expr-bison.h"
-#define YY_EXTRA_TYPE int
 #include "expr-flex.h"

 #ifdef PARSER_DEBUG
@@ -11,7 +10,7 @@ extern int expr_debug;
 #endif

 /* Caller must make sure id is allocated */
-void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
+void expr__add_id(struct expr_parse_ctx *ctx, const char *name, double val)
 {
 	int idx;

@@ -21,20 +20,24 @@ void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
 	ctx->ids[idx].val = val;
 }

-void expr__ctx_init(struct parse_ctx *ctx)
+void expr__ctx_init(struct expr_parse_ctx *ctx)
 {
 	ctx->num_ids = 0;
 }

 static int
-__expr__parse(double *val, struct parse_ctx *ctx, const char *expr,
-	      int start)
+__expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr,
+	      int start, int runtime)
 {
+	struct expr_scanner_ctx scanner_ctx = {
+		.start_token = start,
+		.runtime = runtime,
+	};
 	YY_BUFFER_STATE buffer;
 	void *scanner;
 	int ret;

-	ret = expr_lex_init_extra(start, &scanner);
+	ret = expr_lex_init_extra(&scanner_ctx, &scanner);
 	if (ret)
 		return ret;

@@ -52,9 +55,9 @@ __expr__parse(double *val, struct parse_ctx *ctx, const char *expr,
 	return ret;
 }

-int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr)
+int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr, int runtime)
 {
-	return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? -1 : 0;
+	return __expr__parse(final_val, ctx, expr, EXPR_PARSE, runtime) ? -1 : 0;
 }

 static bool
@@ -72,13 +75,13 @@ already_seen(const char *val, const char *one, const char **other,
 }

 int expr__find_other(const char *expr, const char *one, const char ***other,
-		     int *num_other)
+		     int *num_other, int runtime)
 {
 	int err, i = 0, j = 0;
-	struct parse_ctx ctx;
+	struct expr_parse_ctx ctx;

 	expr__ctx_init(&ctx);
-	err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER);
+	err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER, runtime);
 	if (err)
 		return -1;

diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index 9377538f4097..87d627bb699b 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -5,20 +5,25 @@
 #define EXPR_MAX_OTHER 20
 #define MAX_PARSE_ID EXPR_MAX_OTHER

-struct parse_id {
+struct expr_parse_id {
 	const char *name;
 	double val;
 };

-struct parse_ctx {
+struct expr_parse_ctx {
 	int num_ids;
-	struct parse_id ids[MAX_PARSE_ID];
+	struct expr_parse_id ids[MAX_PARSE_ID];
 };

-void expr__ctx_init(struct parse_ctx *ctx);
-void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
-int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
+struct expr_scanner_ctx {
+	int start_token;
+	int runtime;
+};
+
+void expr__ctx_init(struct expr_parse_ctx *ctx);
+void expr__add_id(struct expr_parse_ctx *ctx, const char *id, double val);
+int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr, int runtime);
 int expr__find_other(const char *expr, const char *one, const char ***other,
-		     int *num_other);
+		     int *num_other, int runtime);

 #endif
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
index eaad29243c23..74b9b59b1aa5 100644
--- a/tools/perf/util/expr.l
+++ b/tools/perf/util/expr.l
@@ -35,7 +35,7 @@ static int value(yyscan_t scanner, int base)
  * Allow @ instead of / to be able to specify pmu/event/ without
  * conflicts with normal division.
  */
-static char *normalize(char *str)
+static char *normalize(char *str, int runtime)
 {
 	char *ret = str;
 	char *dst = str;
@@ -45,6 +45,19 @@ static char *normalize(char *str)
 			*dst++ = '/';
 		else if (*str == '\\')
 			*dst++ = *++str;
+		else if (*str == '?') {
+			char *paramval;
+			int i = 0;
+			int size = asprintf(&paramval, "%d", runtime);
+
+			if (size < 0)
+				*dst++ = '0';
+			else {
+				while (i < size)
+					*dst++ = paramval[i++];
+				free(paramval);
+			}
+		}
 		else
 			*dst++ = *str;
 		str++;
@@ -54,16 +67,16 @@ static char *normalize(char *str)
 	return ret;
 }

-static int str(yyscan_t scanner, int token)
+static int str(yyscan_t scanner, int token, int runtime)
 {
 	YYSTYPE *yylval = expr_get_lval(scanner);
 	char *text = expr_get_text(scanner);

-	yylval->str = normalize(strdup(text));
+	yylval->str = normalize(strdup(text), runtime);
 	if (!yylval->str)
 		return EXPR_ERROR;

-	yylval->str = normalize(yylval->str);
+	yylval->str = normalize(yylval->str, runtime);
 	return token;
 }
 %}
@@ -72,17 +85,17 @@ number		[0-9]+
 sch		[-,=]
 spec		\\{sch}
-sym		[0-9a-zA-Z_\.:@]+
-symbol		{spec}*{sym}*{spec}*{sym}*
+sym		[0-9a-zA-Z_\.:@?]+
+symbol		{spec}*{sym}*{spec}*{sym}*{spec}*{sym}

 %%
-	{
-		int start_token;
+	struct expr_scanner_ctx *sctx = expr_get_extra(yyscanner);

-		start_token = expr_get_extra(yyscanner);
+	{
+		int start_token = sctx->start_token;

 		if (start_token) {
-			expr_set_extra(NULL, yyscanner);
+			sctx->start_token = 0;
 			return start_token;
 		}
 	}
@@ -93,7 +106,7 @@ if		{ return IF; }
 else		{ return ELSE; }
 #smt_on		{ return SMT_ON; }
 {number}	{ return value(yyscanner, 10); }
-{symbol}	{ return str(yyscanner, ID); }
+{symbol}	{ return str(yyscanner, ID, sctx->runtime); }
 "|"		{ return '|'; }
 "^"		{ return '^'; }
 "&"		{ return '&'; }
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
index 4720cbe79357..cd17486c1c5d 100644
--- a/tools/perf/util/expr.y
+++ b/tools/perf/util/expr.y
@@ -15,7 +15,7 @@
 %define api.pure full

 %parse-param { double *final_val }
-%parse-param { struct parse_ctx *ctx }
+%parse-param { struct expr_parse_ctx *ctx }
 %parse-param {void *scanner}
 %lex-param {void* scanner}

@@ -39,14 +39,14 @@
 %{
 static void expr_error(double *final_val __maybe_unused,
-		       struct parse_ctx *ctx __maybe_unused,
+		       struct expr_parse_ctx *ctx __maybe_unused,
 		       void *scanner,
 		       const char *s)
 {
 	pr_debug("%s\n", s);
 }

-static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
+static int lookup_id(struct expr_parse_ctx *ctx, char *id, double *val)
 {
 	int i;

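The expr.l change above replaces every '?' in a metric id with the decimal runtime parameter. A hedged standalone sketch of that substitution, assuming nothing from perf itself (substitute_runtime and the hv_24x7-style event name are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Replace each '?' in id with the decimal form of runtime, as normalize()
 * in expr.l does; returns a newly allocated string. */
static char *substitute_runtime(const char *id, int runtime)
{
	char num[16];
	int nlen = snprintf(num, sizeof(num), "%d", runtime);
	size_t out_sz = strlen(id) + 1;
	const char *p;
	char *out, *dst;

	for (p = id; *p; p++)	/* over-allocate: every '?' may grow */
		if (*p == '?')
			out_sz += nlen;
	out = malloc(out_sz);
	if (!out)
		return NULL;
	for (dst = out; *id; id++) {
		if (*id == '?') {
			memcpy(dst, num, nlen);
			dst += nlen;
		} else {
			*dst++ = *id;
		}
	}
	*dst = '\0';
	return out;
}

int main(void)
{
	char *s = substitute_runtime("pm_pb_cyc,chip=?", 2);

	if (s) {
		printf("%s\n", s);	/* pm_pb_cyc,chip=2 */
		free(s);
	}
	return 0;
}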
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index acbd046bf95c..0ce47283a8a1 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -525,7 +525,7 @@ static int write_event_desc(struct feat_fd *ff,
 		/*
 		 * write event string as passed on cmdline
 		 */
-		ret = do_write_string(ff, perf_evsel__name(evsel));
+		ret = do_write_string(ff, evsel__name(evsel));
 		if (ret < 0)
 			return ret;
 		/*
@@ -783,8 +783,7 @@ static int write_group_desc(struct feat_fd *ff,
 		return ret;

 	evlist__for_each_entry(evlist, evsel) {
-		if (perf_evsel__is_group_leader(evsel) &&
-		    evsel->core.nr_members > 1) {
+		if (evsel__is_group_leader(evsel) && evsel->core.nr_members > 1) {
 			const char *name = evsel->group_name ?: "{anon_group}";
 			u32 leader_idx = evsel->idx;
 			u32 nr_members = evsel->core.nr_members;
@@ -1395,6 +1394,38 @@ static int write_compressed(struct feat_fd *ff __maybe_unused,
 	return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len));
 }

+static int write_cpu_pmu_caps(struct feat_fd *ff,
+			      struct evlist *evlist __maybe_unused)
+{
+	struct perf_pmu *cpu_pmu = perf_pmu__find("cpu");
+	struct perf_pmu_caps *caps = NULL;
+	int nr_caps;
+	int ret;
+
+	if (!cpu_pmu)
+		return -ENOENT;
+
+	nr_caps = perf_pmu__caps_parse(cpu_pmu);
+	if (nr_caps < 0)
+		return nr_caps;
+
+	ret = do_write(ff, &nr_caps, sizeof(nr_caps));
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry(caps, &cpu_pmu->caps, list) {
+		ret = do_write_string(ff, caps->name);
+		if (ret < 0)
+			return ret;
+
+		ret = do_write_string(ff, caps->value);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
 static void print_hostname(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
@@ -1809,6 +1840,27 @@ static void print_compressed(struct feat_fd *ff, FILE *fp)
 		ff->ph->env.comp_level, ff->ph->env.comp_ratio);
 }

+static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp)
+{
+	const char *delimiter = "# cpu pmu capabilities: ";
+	u32 nr_caps = ff->ph->env.nr_cpu_pmu_caps;
+	char *str;
+
+	if (!nr_caps) {
+		fprintf(fp, "# cpu pmu capabilities: not available\n");
+		return;
+	}
+
+	str = ff->ph->env.cpu_pmu_caps;
+	while (nr_caps--) {
+		fprintf(fp, "%s%s", delimiter, str);
+		delimiter = ", ";
+		str += strlen(str) + 1;
+	}
+
+	fprintf(fp, "\n");
+}
+
 static void print_pmu_mappings(struct feat_fd *ff, FILE *fp)
 {
 	const char *delimiter = "# pmu mappings: ";
@@ -1854,14 +1906,12 @@ static void print_group_desc(struct feat_fd *ff, FILE *fp)
 	session = container_of(ff->ph, struct perf_session, header);

 	evlist__for_each_entry(session->evlist, evsel) {
-		if (perf_evsel__is_group_leader(evsel) &&
-		    evsel->core.nr_members > 1) {
-			fprintf(fp, "# group: %s{%s", evsel->group_name ?: "",
-				perf_evsel__name(evsel));
+		if (evsel__is_group_leader(evsel) && evsel->core.nr_members > 1) {
+			fprintf(fp, "# group: %s{%s", evsel->group_name ?: "", evsel__name(evsel));

 			nr = evsel->core.nr_members - 1;
 		} else if (nr) {
-			fprintf(fp, ",%s", perf_evsel__name(evsel));
+			fprintf(fp, ",%s", evsel__name(evsel));

 			if (--nr == 0)
 				fprintf(fp, "}\n");
@@ -2846,6 +2896,60 @@ static int process_compressed(struct feat_fd *ff,
 	return 0;
 }

+static int process_cpu_pmu_caps(struct feat_fd *ff,
+				void *data __maybe_unused)
+{
+	char *name, *value;
+	struct strbuf sb;
+	u32 nr_caps;
+
+	if (do_read_u32(ff, &nr_caps))
+		return -1;
+
+	if (!nr_caps) {
+		pr_debug("cpu pmu capabilities not available\n");
+		return 0;
+	}
+
+	ff->ph->env.nr_cpu_pmu_caps = nr_caps;
+
+	if (strbuf_init(&sb, 128) < 0)
+		return -1;
+
+	while (nr_caps--) {
+		name = do_read_string(ff);
+		if (!name)
+			goto error;
+
+		value = do_read_string(ff);
+		if (!value)
+			goto free_name;
+
+		if (strbuf_addf(&sb, "%s=%s", name, value) < 0)
+			goto free_value;
+
+		/* include a NUL character at the end */
+		if (strbuf_add(&sb, "", 1) < 0)
+			goto free_value;
+
+		if (!strcmp(name, "branches"))
+			ff->ph->env.max_branches = atoi(value);
+
+		free(value);
+		free(name);
+	}
+	ff->ph->env.cpu_pmu_caps = strbuf_detach(&sb, NULL);
+	return 0;
+
+free_value:
+	free(value);
+free_name:
+	free(name);
+error:
+	strbuf_release(&sb);
+	return -1;
+}
+
 #define FEAT_OPR(n, func, __full_only) \
 	[HEADER_##n] = {					\
 		.name	    = __stringify(n),			\
@@ -2903,6 +3007,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPR(BPF_PROG_INFO, bpf_prog_info,  false),
 	FEAT_OPR(BPF_BTF,       bpf_btf,        false),
 	FEAT_OPR(COMPRESSED,	compressed,	false),
+	FEAT_OPR(CPU_PMU_CAPS,	cpu_pmu_caps,	false),
 };

 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 840f95cee349..650bd1c7a99b 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -43,6 +43,7 @@ enum {
 	HEADER_BPF_PROG_INFO,
 	HEADER_BPF_BTF,
 	HEADER_COMPRESSED,
+	HEADER_CPU_PMU_CAPS,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
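After decoding, process_cpu_pmu_caps() above leaves "name=value" records packed back to back, each NUL-terminated, and print_cpu_pmu_caps() walks them with str += strlen(str) + 1. A self-contained sketch of that walk (the capability names and values here are made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* packed buffer as built by process_cpu_pmu_caps(): records
	 * separated by embedded NULs */
	const char caps[] = "branches=32\0max_precise=3\0pmu_name=skylake";
	unsigned int nr_caps = 3;
	const char *str = caps;
	const char *delim = "# cpu pmu capabilities: ";

	while (nr_caps--) {
		printf("%s%s", delim, str);
		delim = ", ";
		str += strlen(str) + 1;	/* skip past the NUL to next record */
	}
	putchar('\n');
	return 0;
}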
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 283a69ff6a3d..12b65d00cf65 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1070,6 +1070,20 @@ iter_next_cumulative_entry(struct hist_entry_iter *iter,
 	return fill_callchain_info(al, node, iter->hide_unresolved);
 }

+static bool
+hist_entry__fast__sym_diff(struct hist_entry *left,
+			   struct hist_entry *right)
+{
+	struct symbol *sym_l = left->ms.sym;
+	struct symbol *sym_r = right->ms.sym;
+
+	if (!sym_l && !sym_r)
+		return left->ip != right->ip;
+
+	return !!_sort__sym_cmp(sym_l, sym_r);
+}
+
+
 static int
 iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 			       struct addr_location *al)
@@ -1096,6 +1110,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	};
 	int i;
 	struct callchain_cursor cursor;
+	bool fast = hists__has(he_tmp.hists, sym);

 	callchain_cursor_snapshot(&cursor, &callchain_cursor);

@@ -1106,6 +1121,14 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	 * It's possible that it has cycles or recursive calls.
 	 */
 	for (i = 0; i < iter->curr; i++) {
+		/*
+		 * For most cases, there are no duplicate entries in callchain.
+		 * The symbols are usually different. Do a quick check for
+		 * symbols first.
+		 */
+		if (fast && hist_entry__fast__sym_diff(he_cache[i], &he_tmp))
+			continue;
+
 		if (hist_entry__cmp(he_cache[i], &he_tmp) == 0) {
 			/* to avoid calling callback function */
 			iter->he = NULL;
@@ -2637,7 +2660,7 @@ size_t perf_evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp)
 	size_t ret = 0;

 	evlist__for_each_entry(evlist, pos) {
-		ret += fprintf(fp, "%s stats:\n", perf_evsel__name(pos));
+		ret += fprintf(fp, "%s stats:\n", evsel__name(pos));
 		ret += events_stats__fprintf(&evsel__hists(pos)->stats, fp);
 	}

@@ -2661,7 +2684,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh
 	unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
 	u64 nr_events = hists->stats.total_period;
 	struct evsel *evsel = hists_to_evsel(hists);
-	const char *ev_name = perf_evsel__name(evsel);
+	const char *ev_name = evsel__name(evsel);
 	char buf[512], sample_freq_str[64] = "";
 	size_t buflen = sizeof(buf);
 	char ref[30] = " show reference callgraph, ";
@@ -2672,10 +2695,10 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh
 		nr_events = hists->stats.total_non_filtered_period;
 	}

-	if (perf_evsel__is_group_event(evsel)) {
+	if (evsel__is_group_event(evsel)) {
 		struct evsel *pos;

-		perf_evsel__group_desc(evsel, buf, buflen);
+		evsel__group_desc(evsel, buf, buflen);
 		ev_name = buf;

 		for_each_group_member(pos, evsel) {
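A minimal sketch of the fast-path idea used by hist_entry__fast__sym_diff() above: a cheap test that can only prove inequality, so a false result still falls through to the expensive full comparison. Types are hypothetical stand-ins, not perf's hist_entry:

#include <stdbool.h>
#include <stdio.h>

struct sym { unsigned long start; };
struct entry { struct sym *sym; unsigned long ip; };

/* Returns true only when the entries definitely differ. */
static bool fast_sym_diff(const struct entry *l, const struct entry *r)
{
	if (!l->sym && !r->sym)
		return l->ip != r->ip;
	if (!l->sym || !r->sym)
		return true;
	return l->sym->start != r->sym->start;
}

int main(void)
{
	struct sym a = { 0x1000 }, b = { 0x2000 };
	struct entry e1 = { &a, 0x1004 }, e2 = { &b, 0x2008 };

	printf("definitely different: %d\n", fast_sym_diff(&e1, &e2));
	return 0;
}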
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index 34cb380d19a3..af1e78d76228 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -432,7 +432,7 @@ static int intel_bts_process_buffer(struct intel_bts_queue *btsq,
 					     le64_to_cpu(branch->from),
 					     le64_to_cpu(branch->to),
 					     btsq->intel_pt_insn.length,
-					     buffer->buffer_nr + 1);
+					     buffer->buffer_nr + 1, true, 0, 0);
 		if (filter && !(filter & btsq->sample_flags))
 			continue;
 		err = intel_bts_synth_branch_sample(btsq, branch);
@@ -728,6 +728,15 @@ static void intel_bts_free(struct perf_session *session)
 	free(bts);
 }

+static bool intel_bts_evsel_is_auxtrace(struct perf_session *session,
+					struct evsel *evsel)
+{
+	struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts,
+					     auxtrace);
+
+	return evsel->core.attr.type == bts->pmu_type;
+}
+
 struct intel_bts_synth {
 	struct perf_tool dummy_tool;
 	struct perf_session *session;
@@ -816,10 +825,10 @@ static int intel_bts_synth_events(struct intel_bts *bts,
 		bts->branches_id = id;
 		/*
 		 * We only use sample types from PERF_SAMPLE_MASK so we can use
-		 * __perf_evsel__sample_size() here.
+		 * __evsel__sample_size() here.
 		 */
 		bts->branches_event_size = sizeof(struct perf_record_sample) +
-					   __perf_evsel__sample_size(attr.sample_type);
+					   __evsel__sample_size(attr.sample_type);
 	}

 	return 0;
@@ -883,6 +892,7 @@ int intel_bts_process_auxtrace_info(union perf_event *event,
 	bts->auxtrace.flush_events = intel_bts_flush;
 	bts->auxtrace.free_events = intel_bts_free_events;
 	bts->auxtrace.free = intel_bts_free;
+	bts->auxtrace.evsel_is_auxtrace = intel_bts_evsel_is_auxtrace;
 	session->auxtrace = &bts->auxtrace;

 	intel_bts_print_info(&auxtrace_info->priv[0], INTEL_BTS_PMU_TYPE,
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
index 0ccf10a0bf44..4ce109993e74 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
@@ -552,7 +552,7 @@ static int intel_pt_do_get_packet(const unsigned char *buf, size_t len,
 		break;
 	default:
 		break;
-	};
+	}

 	if (!(byte & BIT(0))) {
 		if (byte == 0)
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 23c8289c2472..f17b1e769ae4 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -33,6 +33,7 @@
 #include "tsc.h"
 #include "intel-pt.h"
 #include "config.h"
+#include "util/perf_api_probe.h"
 #include "util/synthetic-events.h"
 #include "time-utils.h"

@@ -68,6 +69,10 @@ struct intel_pt {
 	bool est_tsc;
 	bool sync_switch;
 	bool mispred_all;
+	bool use_thread_stack;
+	bool callstack;
+	unsigned int br_stack_sz;
+	unsigned int br_stack_sz_plus;
 	int have_sched_switch;
 	u32 pmu_type;
 	u64 kernel_start;
@@ -124,6 +129,9 @@ struct intel_pt {

 	struct range *time_ranges;
 	unsigned int range_cnt;
+
+	struct ip_callchain *chain;
+	struct branch_stack *br_stack;
 };

 enum switch_state {
@@ -143,8 +151,6 @@ struct intel_pt_queue {
 	const struct intel_pt_state *state;
 	struct ip_callchain *chain;
 	struct branch_stack *last_branch;
-	struct branch_stack *last_branch_rb;
-	size_t last_branch_pos;
 	union perf_event *event_buf;
 	bool on_heap;
 	bool stop;
@@ -868,6 +874,83 @@ static u64 intel_pt_ns_to_ticks(const struct intel_pt *pt, u64 ns)
 		pt->tc.time_mult;
 }

+static struct ip_callchain *intel_pt_alloc_chain(struct intel_pt *pt)
+{
+	size_t sz = sizeof(struct ip_callchain);
+
+	/* Add 1 to callchain_sz for callchain context */
+	sz += (pt->synth_opts.callchain_sz + 1) * sizeof(u64);
+	return zalloc(sz);
+}
+
+static int intel_pt_callchain_init(struct intel_pt *pt)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(pt->session->evlist, evsel) {
+		if (!(evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+			evsel->synth_sample_type |= PERF_SAMPLE_CALLCHAIN;
+	}
+
+	pt->chain = intel_pt_alloc_chain(pt);
+	if (!pt->chain)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void intel_pt_add_callchain(struct intel_pt *pt,
+				   struct perf_sample *sample)
+{
+	struct thread *thread = machine__findnew_thread(pt->machine,
+							sample->pid,
+							sample->tid);
+
+	thread_stack__sample_late(thread, sample->cpu, pt->chain,
+				  pt->synth_opts.callchain_sz + 1, sample->ip,
+				  pt->kernel_start);
+
+	sample->callchain = pt->chain;
+}
+
+static struct branch_stack *intel_pt_alloc_br_stack(struct intel_pt *pt)
+{
+	size_t sz = sizeof(struct branch_stack);
+
+	sz += pt->br_stack_sz * sizeof(struct branch_entry);
+	return zalloc(sz);
+}
+
+static int intel_pt_br_stack_init(struct intel_pt *pt)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(pt->session->evlist, evsel) {
+		if (!(evsel->core.attr.sample_type & PERF_SAMPLE_BRANCH_STACK))
+			evsel->synth_sample_type |= PERF_SAMPLE_BRANCH_STACK;
+	}
+
+	pt->br_stack = intel_pt_alloc_br_stack(pt);
+	if (!pt->br_stack)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void intel_pt_add_br_stack(struct intel_pt *pt,
+				  struct perf_sample *sample)
+{
+	struct thread *thread = machine__findnew_thread(pt->machine,
+							sample->pid,
+							sample->tid);
+
+	thread_stack__br_sample_late(thread, sample->cpu, pt->br_stack,
+				     pt->br_stack_sz, sample->ip,
+				     pt->kernel_start);
+
+	sample->branch_stack = pt->br_stack;
+}
+
 static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
 						   unsigned int queue_nr)
 {
@@ -880,26 +963,15 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
 		return NULL;

 	if (pt->synth_opts.callchain) {
-		size_t sz = sizeof(struct ip_callchain);
-
-		/* Add 1 to callchain_sz for callchain context */
-		sz += (pt->synth_opts.callchain_sz + 1) * sizeof(u64);
-		ptq->chain = zalloc(sz);
+		ptq->chain = intel_pt_alloc_chain(pt);
 		if (!ptq->chain)
 			goto out_free;
 	}

 	if (pt->synth_opts.last_branch) {
-		size_t sz = sizeof(struct branch_stack);
-
-		sz += pt->synth_opts.last_branch_sz *
-		      sizeof(struct branch_entry);
-		ptq->last_branch = zalloc(sz);
+		ptq->last_branch = intel_pt_alloc_br_stack(pt);
 		if (!ptq->last_branch)
 			goto out_free;
-		ptq->last_branch_rb = zalloc(sz);
-		if (!ptq->last_branch_rb)
-			goto out_free;
 	}

 	ptq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
@@ -968,7 +1040,6 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
 out_free:
 	zfree(&ptq->event_buf);
 	zfree(&ptq->last_branch);
-	zfree(&ptq->last_branch_rb);
 	zfree(&ptq->chain);
 	free(ptq);
 	return NULL;
@@ -984,7 +1055,6 @@ static void intel_pt_free_queue(void *priv)
 	intel_pt_decoder_free(ptq->decoder);
 	zfree(&ptq->event_buf);
 	zfree(&ptq->last_branch);
-	zfree(&ptq->last_branch_rb);
 	zfree(&ptq->chain);
 	free(ptq);
 }
@@ -1152,58 +1222,6 @@ static int intel_pt_setup_queues(struct intel_pt *pt)
 	return 0;
 }

-static inline void intel_pt_copy_last_branch_rb(struct intel_pt_queue *ptq)
-{
-	struct branch_stack *bs_src = ptq->last_branch_rb;
-	struct branch_stack *bs_dst = ptq->last_branch;
-	size_t nr = 0;
-
-	bs_dst->nr = bs_src->nr;
-
-	if (!bs_src->nr)
-		return;
-
-	nr = ptq->pt->synth_opts.last_branch_sz - ptq->last_branch_pos;
-	memcpy(&bs_dst->entries[0],
-	       &bs_src->entries[ptq->last_branch_pos],
-	       sizeof(struct branch_entry) * nr);
-
-	if (bs_src->nr >= ptq->pt->synth_opts.last_branch_sz) {
-		memcpy(&bs_dst->entries[nr],
-		       &bs_src->entries[0],
-		       sizeof(struct branch_entry) * ptq->last_branch_pos);
-	}
-}
-
-static inline void intel_pt_reset_last_branch_rb(struct intel_pt_queue *ptq)
-{
-	ptq->last_branch_pos = 0;
-	ptq->last_branch_rb->nr = 0;
-}
-
-static void intel_pt_update_last_branch_rb(struct intel_pt_queue *ptq)
-{
-	const struct intel_pt_state *state = ptq->state;
-	struct branch_stack *bs = ptq->last_branch_rb;
-	struct branch_entry *be;
-
-	if (!ptq->last_branch_pos)
-		ptq->last_branch_pos = ptq->pt->synth_opts.last_branch_sz;
-
-	ptq->last_branch_pos -= 1;
-
-	be = &bs->entries[ptq->last_branch_pos];
-	be->from = state->from_ip;
-	be->to = state->to_ip;
-	be->flags.abort = !!(state->flags & INTEL_PT_ABORT_TX);
-	be->flags.in_tx = !!(state->flags & INTEL_PT_IN_TX);
-	/* No support for mispredict */
-	be->flags.mispred = ptq->pt->mispred_all;
-
-	if (bs->nr < ptq->pt->synth_opts.last_branch_sz)
-		bs->nr += 1;
-}
-
 static inline bool intel_pt_skip_event(struct intel_pt *pt)
 {
 	return pt->synth_opts.initial_skip &&
@@ -1271,9 +1289,9 @@ static inline int intel_pt_opt_inject(struct intel_pt *pt,
 	return intel_pt_inject_event(event, sample, type);
 }

-static int intel_pt_deliver_synth_b_event(struct intel_pt *pt,
-					  union perf_event *event,
-					  struct perf_sample *sample, u64 type)
+static int intel_pt_deliver_synth_event(struct intel_pt *pt,
+					union perf_event *event,
+					struct perf_sample *sample, u64 type)
 {
 	int ret;

@@ -1333,8 +1351,8 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 		ptq->last_br_cyc_cnt = ptq->ipc_cyc_cnt;
 	}

-	return intel_pt_deliver_synth_b_event(pt, event, &sample,
-					      pt->branches_sample_type);
+	return intel_pt_deliver_synth_event(pt, event, &sample,
+					    pt->branches_sample_type);
 }

 static void intel_pt_prep_sample(struct intel_pt *pt,
@@ -1352,27 +1370,12 @@ static void intel_pt_prep_sample(struct intel_pt *pt,
 	}

 	if (pt->synth_opts.last_branch) {
-		intel_pt_copy_last_branch_rb(ptq);
+		thread_stack__br_sample(ptq->thread, ptq->cpu, ptq->last_branch,
+					pt->br_stack_sz);
 		sample->branch_stack = ptq->last_branch;
 	}
 }

-static inline int intel_pt_deliver_synth_event(struct intel_pt *pt,
-					       struct intel_pt_queue *ptq,
-					       union perf_event *event,
-					       struct perf_sample *sample,
-					       u64 type)
-{
-	int ret;
-
-	ret = intel_pt_deliver_synth_b_event(pt, event, sample, type);
-
-	if (pt->synth_opts.last_branch)
-		intel_pt_reset_last_branch_rb(ptq);
-
-	return ret;
-}
-
 static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
@@ -1397,7 +1400,7 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)

 	ptq->last_insn_cnt = ptq->state->tot_insn_cnt;

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->instructions_sample_type);
 }

@@ -1415,7 +1418,7 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
 	sample.id = ptq->pt->transactions_id;
 	sample.stream_id = ptq->pt->transactions_id;

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->transactions_sample_type);
 }

@@ -1456,7 +1459,7 @@ static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
 	sample.raw_size = perf_synth__raw_size(raw);
 	sample.raw_data = perf_synth__raw_data(&raw);

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->ptwrites_sample_type);
 }

@@ -1486,7 +1489,7 @@ static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
 	sample.raw_size = perf_synth__raw_size(raw);
 	sample.raw_data = perf_synth__raw_data(&raw);

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->pwr_events_sample_type);
 }

@@ -1511,7 +1514,7 @@ static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
 	sample.raw_size = perf_synth__raw_size(raw);
 	sample.raw_data = perf_synth__raw_data(&raw);

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->pwr_events_sample_type);
 }

@@ -1536,7 +1539,7 @@ static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
 	sample.raw_size = perf_synth__raw_size(raw);
 	sample.raw_data = perf_synth__raw_data(&raw);

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->pwr_events_sample_type);
 }

@@ -1561,7 +1564,7 @@ static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
 	sample.raw_size = perf_synth__raw_size(raw);
 	sample.raw_data = perf_synth__raw_data(&raw);

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->pwr_events_sample_type);
 }

@@ -1586,7 +1589,7 @@ static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
 	sample.raw_size = perf_synth__raw_size(raw);
 	sample.raw_data = perf_synth__raw_data(&raw);

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+	return intel_pt_deliver_synth_event(pt, event, &sample,
 					    pt->pwr_events_sample_type);
 }

@@ -1680,15 +1683,14 @@ static u64 intel_pt_lbr_flags(u64 info)
 	union {
 		struct branch_flags flags;
 		u64 result;
-	} u = {
-		.flags = {
-			.mispred	= !!(info & LBR_INFO_MISPRED),
-			.predicted	= !(info & LBR_INFO_MISPRED),
-			.in_tx		= !!(info & LBR_INFO_IN_TX),
-			.abort		= !!(info & LBR_INFO_ABORT),
-			.cycles		= info & LBR_INFO_CYCLES,
-		}
-	};
+	} u;
+
+	u.result = 0;
+	u.flags.mispred = !!(info & LBR_INFO_MISPRED);
+	u.flags.predicted = !(info & LBR_INFO_MISPRED);
+	u.flags.in_tx = !!(info & LBR_INFO_IN_TX);
+	u.flags.abort = !!(info & LBR_INFO_ABORT);
+	u.flags.cycles = info & LBR_INFO_CYCLES;

 	return u.result;
 }

@@ -1807,7 +1809,9 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq)
 		intel_pt_add_lbrs(&br.br_stack, items);
 		sample.branch_stack = &br.br_stack;
 	} else if (pt->synth_opts.last_branch) {
-		intel_pt_copy_last_branch_rb(ptq);
+		thread_stack__br_sample(ptq->thread, ptq->cpu,
+					ptq->last_branch,
+					pt->br_stack_sz);
 		sample.branch_stack = ptq->last_branch;
 	} else {
 		br.br_stack.nr = 0;
@@ -1842,7 +1846,7 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq)
 		sample.transaction = txn;
 	}

-	return intel_pt_deliver_synth_event(pt, ptq, event, &sample, sample_type);
+	return intel_pt_deliver_synth_event(pt, event, &sample, sample_type);
 }

 static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu,
@@ -1992,12 +1996,15 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
 	if (!(state->type & INTEL_PT_BRANCH))
 		return 0;

-	if (pt->synth_opts.callchain || pt->synth_opts.thread_stack)
-		thread_stack__event(ptq->thread, ptq->cpu, ptq->flags, state->from_ip,
-				    state->to_ip, ptq->insn_len,
-				    state->trace_nr);
-	else
+	if (pt->use_thread_stack) {
+		thread_stack__event(ptq->thread, ptq->cpu, ptq->flags,
+				    state->from_ip, state->to_ip, ptq->insn_len,
+				    state->trace_nr, pt->callstack,
+				    pt->br_stack_sz_plus,
+				    pt->mispred_all);
+	} else {
 		thread_stack__set_trace_nr(ptq->thread, ptq->cpu, state->trace_nr);
+	}

 	if (pt->sample_branches) {
 		err = intel_pt_synth_branch_sample(ptq);
@@ -2005,9 +2012,6 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
 			return err;
 	}

-	if (pt->synth_opts.last_branch)
-		intel_pt_update_last_branch_rb(ptq);
-
 	if (!ptq->sync_switch)
 		return 0;

@@ -2484,7 +2488,7 @@ static int intel_pt_process_switch(struct intel_pt *pt,
 	if (evsel != pt->switch_evsel)
 		return 0;

-	tid = perf_evsel__intval(evsel, sample, "next_pid");
+	tid = evsel__intval(evsel, sample, "next_pid");
 	cpu = sample->cpu;

 	intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
@@ -2639,6 +2643,13 @@ static int intel_pt_process_event(struct perf_session *session,
 	if (err)
 		return err;

+	if (event->header.type == PERF_RECORD_SAMPLE) {
+		if (pt->synth_opts.add_callchain && !sample->callchain)
+			intel_pt_add_callchain(pt, sample);
+		if (pt->synth_opts.add_last_branch && !sample->branch_stack)
+			intel_pt_add_br_stack(pt, sample);
+	}
+
 	if (event->header.type == PERF_RECORD_AUX &&
 	    (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) &&
 	    pt->synth_opts.errors) {
@@ -2710,11 +2721,21 @@ static void intel_pt_free(struct perf_session *session)
 	session->auxtrace = NULL;
 	thread__put(pt->unknown_thread);
 	addr_filters__exit(&pt->filts);
+	zfree(&pt->chain);
 	zfree(&pt->filter);
 	zfree(&pt->time_ranges);
 	free(pt);
 }

+static bool intel_pt_evsel_is_auxtrace(struct perf_session *session,
+				       struct evsel *evsel)
+{
+	struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt,
+					   auxtrace);
+
+	return evsel->core.attr.type == pt->pmu_type;
+}
+
 static int intel_pt_process_auxtrace_event(struct perf_session *session,
 					   union perf_event *event,
 					   struct perf_tool *tool __maybe_unused)
@@ -3016,7 +3037,7 @@ static struct evsel *intel_pt_find_sched_switch(struct evlist *evlist)
 	struct evsel *evsel;

 	evlist__for_each_entry_reverse(evlist, evsel) {
-		const char *name = perf_evsel__name(evsel);
+		const char *name = evsel__name(evsel);

 		if (!strcmp(name, "sched:sched_switch"))
 			return evsel;
@@ -3310,6 +3331,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 	pt->auxtrace.flush_events = intel_pt_flush;
 	pt->auxtrace.free_events = intel_pt_free_events;
 	pt->auxtrace.free = intel_pt_free;
+	pt->auxtrace.evsel_is_auxtrace = intel_pt_evsel_is_auxtrace;
 	session->auxtrace = &pt->auxtrace;

 	if (dump_trace)
@@ -3338,6 +3360,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 	    !session->itrace_synth_opts->inject) {
 		pt->synth_opts.branches = false;
 		pt->synth_opts.callchain = true;
+		pt->synth_opts.add_callchain = true;
 	}
 	pt->synth_opts.thread_stack =
 		session->itrace_synth_opts->thread_stack;
@@ -3370,14 +3393,54 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 			pt->branches_filter |= PERF_IP_FLAG_RETURN |
 					       PERF_IP_FLAG_TRACE_BEGIN;

-	if (pt->synth_opts.callchain && !symbol_conf.use_callchain) {
+	if ((pt->synth_opts.callchain || pt->synth_opts.add_callchain) &&
+	    !symbol_conf.use_callchain) {
 		symbol_conf.use_callchain = true;
 		if (callchain_register_param(&callchain_param) < 0) {
 			symbol_conf.use_callchain = false;
 			pt->synth_opts.callchain = false;
+			pt->synth_opts.add_callchain = false;
 		}
 	}

+	if (pt->synth_opts.add_callchain) {
+		err = intel_pt_callchain_init(pt);
+		if (err)
+			goto err_delete_thread;
+	}
+
+	if (pt->synth_opts.last_branch || pt->synth_opts.add_last_branch) {
+		pt->br_stack_sz = pt->synth_opts.last_branch_sz;
+		pt->br_stack_sz_plus = pt->br_stack_sz;
+	}
+
+	if (pt->synth_opts.add_last_branch) {
+		err = intel_pt_br_stack_init(pt);
+		if (err)
+			goto err_delete_thread;
+		/*
+		 * Additional branch stack size to cater for tracing from the
+		 * actual sample ip to where the sample time is recorded.
+		 * Measured at about 200 branches, but generously set to 1024.
+		 * If kernel space is not being traced, then add just 1 for the
+		 * branch to kernel space.
+		 */
+		if (intel_pt_tracing_kernel(pt))
+			pt->br_stack_sz_plus += 1024;
+		else
+			pt->br_stack_sz_plus += 1;
+	}
+
+	pt->use_thread_stack = pt->synth_opts.callchain ||
+			       pt->synth_opts.add_callchain ||
+			       pt->synth_opts.thread_stack ||
+			       pt->synth_opts.last_branch ||
+			       pt->synth_opts.add_last_branch;
+
+	pt->callstack = pt->synth_opts.callchain ||
+			pt->synth_opts.add_callchain ||
+			pt->synth_opts.thread_stack;
+
 	err = intel_pt_synth_events(pt, session);
 	if (err)
 		goto err_delete_thread;
@@ -3400,6 +3463,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 	return 0;

 err_delete_thread:
+	zfree(&pt->chain);
 	thread__zput(pt->unknown_thread);
 err_free_queues:
 	intel_pt_log_disable();
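A hedged sketch of the branch-stack sizing arithmetic above: a flexible-array struct sized for br_stack_sz entries, with the extra headroom (1024 when tracing kernel space) added for the walk from the sample ip to where the sample time is recorded. The struct names and sizes here are illustrative stand-ins for perf's branch_stack/branch_entry:

#include <stdio.h>
#include <stdlib.h>

struct branch_ent { unsigned long long from, to, flags; };
struct branch_stk {
	unsigned long long nr;
	struct branch_ent entries[];	/* sized at allocation time */
};

int main(void)
{
	unsigned int br_stack_sz = 64;			/* last_branch_sz */
	unsigned int plus = br_stack_sz + 1024;		/* kernel tracing case */
	size_t sz = sizeof(struct branch_stk) + plus * sizeof(struct branch_ent);
	struct branch_stk *bs = calloc(1, sz);

	if (!bs)
		return 1;
	printf("allocated %zu bytes for up to %u entries\n", sz, plus);
	free(bs);
	return 0;
}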
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 97142e9671be..8ed2135893bb 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -736,6 +736,12 @@ int machine__process_switch_event(struct machine *machine __maybe_unused,
 	return 0;
 }

+static int is_bpf_image(const char *name)
+{
+	return !strncmp(name, "bpf_trampoline_", sizeof("bpf_trampoline_") - 1) ||
+	       !strncmp(name, "bpf_dispatcher_", sizeof("bpf_dispatcher_") - 1);
+}
+
 static int machine__process_ksymbol_register(struct machine *machine,
 					     union perf_event *event,
 					     struct perf_sample *sample __maybe_unused)
@@ -759,6 +765,12 @@ static int machine__process_ksymbol_register(struct machine *machine,
 		map->start = event->ksymbol.addr;
 		map->end = map->start + event->ksymbol.len;
 		maps__insert(&machine->kmaps, map);
+		dso__set_loaded(dso);
+
+		if (is_bpf_image(event->ksymbol.name)) {
+			dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE;
+			dso__set_long_name(dso, "", false);
+		}
 	}

 	sym = symbol__new(map->map_ip(map, map->start),
@@ -2178,6 +2190,303 @@ static int remove_loops(struct branch_entry *l, int nr,
 	return nr;
 }

+static int lbr_callchain_add_kernel_ip(struct thread *thread,
+				       struct callchain_cursor *cursor,
+				       struct perf_sample *sample,
+				       struct symbol **parent,
+				       struct addr_location *root_al,
+				       u64 branch_from,
+				       bool callee, int end)
+{
+	struct ip_callchain *chain = sample->callchain;
+	u8 cpumode = PERF_RECORD_MISC_USER;
+	int err, i;
+
+	if (callee) {
+		for (i = 0; i < end + 1; i++) {
+			err = add_callchain_ip(thread, cursor, parent,
+					       root_al, &cpumode, chain->ips[i],
+					       false, NULL, NULL, branch_from);
+			if (err)
+				return err;
+		}
+		return 0;
+	}
+
+	for (i = end; i >= 0; i--) {
+		err = add_callchain_ip(thread, cursor, parent,
+				       root_al, &cpumode, chain->ips[i],
+				       false, NULL, NULL, branch_from);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void save_lbr_cursor_node(struct thread *thread,
+				 struct callchain_cursor *cursor,
+				 int idx)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+
+	if (!lbr_stitch)
+		return;
+
+	if (cursor->pos == cursor->nr) {
+		lbr_stitch->prev_lbr_cursor[idx].valid = false;
+		return;
+	}
+
+	if (!cursor->curr)
+		cursor->curr = cursor->first;
+	else
+		cursor->curr = cursor->curr->next;
+	memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr,
+	       sizeof(struct callchain_cursor_node));
+
+	lbr_stitch->prev_lbr_cursor[idx].valid = true;
+	cursor->pos++;
+}
+
+static int lbr_callchain_add_lbr_ip(struct thread *thread,
+				    struct callchain_cursor *cursor,
+				    struct perf_sample *sample,
+				    struct symbol **parent,
+				    struct addr_location *root_al,
+				    u64 *branch_from,
+				    bool callee)
+{
+	struct branch_stack *lbr_stack = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
+	u8 cpumode = PERF_RECORD_MISC_USER;
+	int lbr_nr = lbr_stack->nr;
+	struct branch_flags *flags;
+	int err, i;
+	u64 ip;
+
+	/*
+	 * curr and pos are not used in the writing session. They are cleared
+	 * in callchain_cursor_commit() when the writing session is closed.
+	 * Use curr and pos to track the current cursor node.
+	 */
+	if (thread->lbr_stitch) {
+		cursor->curr = NULL;
+		cursor->pos = cursor->nr;
+		if (cursor->nr) {
+			cursor->curr = cursor->first;
+			for (i = 0; i < (int)(cursor->nr - 1); i++)
+				cursor->curr = cursor->curr->next;
+		}
+	}
+
+	if (callee) {
+		/* Add LBR ip from first entries.to */
+		ip = entries[0].to;
+		flags = &entries[0].flags;
+		*branch_from = entries[0].from;
+		err = add_callchain_ip(thread, cursor, parent,
+				       root_al, &cpumode, ip,
+				       true, flags, NULL,
+				       *branch_from);
+		if (err)
+			return err;
+
+		/*
+		 * The number of cursor nodes increases. Move the current
+		 * cursor node, but there is no need to save it for entry 0:
+		 * it's impossible to stitch the whole LBRs of the previous
+		 * sample anyway.
+		 */
+		if (thread->lbr_stitch && (cursor->pos != cursor->nr)) {
+			if (!cursor->curr)
+				cursor->curr = cursor->first;
+			else
+				cursor->curr = cursor->curr->next;
+			cursor->pos++;
+		}
+
+		/* Add LBR ip from entries.from one by one. */
+		for (i = 0; i < lbr_nr; i++) {
+			ip = entries[i].from;
+			flags = &entries[i].flags;
+			err = add_callchain_ip(thread, cursor, parent,
+					       root_al, &cpumode, ip,
+					       true, flags, NULL,
+					       *branch_from);
+			if (err)
+				return err;
+			save_lbr_cursor_node(thread, cursor, i);
+		}
+		return 0;
+	}
+
+	/* Add LBR ip from entries.from one by one. */
+	for (i = lbr_nr - 1; i >= 0; i--) {
+		ip = entries[i].from;
+		flags = &entries[i].flags;
+		err = add_callchain_ip(thread, cursor, parent,
+				       root_al, &cpumode, ip,
+				       true, flags, NULL,
+				       *branch_from);
+		if (err)
+			return err;
+		save_lbr_cursor_node(thread, cursor, i);
+	}
+
+	/* Add LBR ip from first entries.to */
+	ip = entries[0].to;
+	flags = &entries[0].flags;
+	*branch_from = entries[0].from;
+	err = add_callchain_ip(thread, cursor, parent,
+			       root_al, &cpumode, ip,
+			       true, flags, NULL,
+			       *branch_from);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int lbr_callchain_add_stitched_lbr_ip(struct thread *thread,
+					     struct callchain_cursor *cursor)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	struct callchain_cursor_node *cnode;
+	struct stitch_list *stitch_node;
+	int err;
+
+	list_for_each_entry(stitch_node, &lbr_stitch->lists, node) {
+		cnode = &stitch_node->cursor;
+
+		err = callchain_cursor_append(cursor, cnode->ip,
+					      &cnode->ms,
+					      cnode->branch,
+					      &cnode->branch_flags,
+					      cnode->nr_loop_iter,
+					      cnode->iter_cycles,
+					      cnode->branch_from,
+					      cnode->srcline);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static struct stitch_list *get_stitch_node(struct thread *thread)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	struct stitch_list *stitch_node;
+
+	if (!list_empty(&lbr_stitch->free_lists)) {
+		stitch_node = list_first_entry(&lbr_stitch->free_lists,
+					       struct stitch_list, node);
+		list_del(&stitch_node->node);
+
+		return stitch_node;
+	}
+
+	return malloc(sizeof(struct stitch_list));
+}
+
+static bool has_stitched_lbr(struct thread *thread,
+			     struct perf_sample *cur,
+			     struct perf_sample *prev,
+			     unsigned int max_lbr,
+			     bool callee)
+{
+	struct branch_stack *cur_stack = cur->branch_stack;
+	struct branch_entry *cur_entries = perf_sample__branch_entries(cur);
+	struct branch_stack *prev_stack = prev->branch_stack;
+	struct branch_entry *prev_entries = perf_sample__branch_entries(prev);
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	int i, j, nr_identical_branches = 0;
+	struct stitch_list *stitch_node;
+	u64 cur_base, distance;
+
+	if (!cur_stack || !prev_stack)
+		return false;
+
+	/* Find the physical index of the base-of-stack for current sample. */
+	cur_base = max_lbr - cur_stack->nr + cur_stack->hw_idx + 1;
+
+	distance = (prev_stack->hw_idx > cur_base) ? (prev_stack->hw_idx - cur_base) :
+						     (max_lbr + prev_stack->hw_idx - cur_base);
+	/* Previous sample has shorter stack. Nothing can be stitched. */
+	if (distance + 1 > prev_stack->nr)
+		return false;
+
+	/*
+	 * Check if there are identical LBRs between two samples.
+	 * Identical LBRs must have the same from, to and flags values. Also,
+	 * they have to be saved in the same LBR registers (same physical
+	 * index).
+	 *
+	 * Starts from the base-of-stack of current sample.
+	 */
+	for (i = distance, j = cur_stack->nr - 1; (i >= 0) && (j >= 0); i--, j--) {
+		if ((prev_entries[i].from != cur_entries[j].from) ||
+		    (prev_entries[i].to != cur_entries[j].to) ||
+		    (prev_entries[i].flags.value != cur_entries[j].flags.value))
+			break;
+		nr_identical_branches++;
+	}
+
+	if (!nr_identical_branches)
+		return false;
+
+	/*
+	 * Save the LBRs between the base-of-stack of previous sample
+	 * and the base-of-stack of current sample into lbr_stitch->lists.
+	 * These LBRs will be stitched later.
+	 */
+	for (i = prev_stack->nr - 1; i > (int)distance; i--) {
+
+		if (!lbr_stitch->prev_lbr_cursor[i].valid)
+			continue;
+
+		stitch_node = get_stitch_node(thread);
+		if (!stitch_node)
+			return false;
+
+		memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i],
+		       sizeof(struct callchain_cursor_node));
+
+		if (callee)
+			list_add(&stitch_node->node, &lbr_stitch->lists);
+		else
+			list_add_tail(&stitch_node->node, &lbr_stitch->lists);
+	}
+
+	return true;
+}
+
+static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr)
+{
+	if (thread->lbr_stitch)
+		return true;
+
+	thread->lbr_stitch = zalloc(sizeof(*thread->lbr_stitch));
+	if (!thread->lbr_stitch)
+		goto err;
+
+	thread->lbr_stitch->prev_lbr_cursor = calloc(max_lbr + 1, sizeof(struct callchain_cursor_node));
+	if (!thread->lbr_stitch->prev_lbr_cursor)
+		goto free_lbr_stitch;
+
+	INIT_LIST_HEAD(&thread->lbr_stitch->lists);
+	INIT_LIST_HEAD(&thread->lbr_stitch->free_lists);
+
+	return true;
+
+free_lbr_stitch:
+	zfree(&thread->lbr_stitch);
+err:
+	pr_warning("Failed to allocate space for stitched LBRs. Disable LBR stitch\n");
+	thread->lbr_stitch_enable = false;
+	return false;
+}
+
 /*
  * Resolve LBR callstack chain sample
  * Return:
@@ -2190,12 +2499,16 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 					struct perf_sample *sample,
 					struct symbol **parent,
 					struct addr_location *root_al,
-					int max_stack)
+					int max_stack,
+					unsigned int max_lbr)
 {
+	bool callee = (callchain_param.order == ORDER_CALLEE);
 	struct ip_callchain *chain = sample->callchain;
 	int chain_nr = min(max_stack, (int)chain->nr), i;
-	u8 cpumode = PERF_RECORD_MISC_USER;
-	u64 ip, branch_from = 0;
+	struct lbr_stitch *lbr_stitch;
+	bool stitched_lbr = false;
+	u64 branch_from = 0;
+	int err;

 	for (i = 0; i < chain_nr; i++) {
 		if (chain->ips[i] == PERF_CONTEXT_USER)
@@ -2203,71 +2516,65 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 	}

 	/* LBR only affects the user callchain */
-	if (i != chain_nr) {
-		struct branch_stack *lbr_stack = sample->branch_stack;
-		struct branch_entry *entries = perf_sample__branch_entries(sample);
-		int lbr_nr = lbr_stack->nr, j, k;
-		bool branch;
-		struct branch_flags *flags;
-		/*
-		 * LBR callstack can only get user call chain.
-		 * The mix_chain_nr is kernel call chain
-		 * number plus LBR user call chain number.
-		 * i is kernel call chain number,
-		 * 1 is PERF_CONTEXT_USER,
-		 * lbr_nr + 1 is the user call chain number.
-		 * For details, please refer to the comments
-		 * in callchain__printf
-		 */
-		int mix_chain_nr = i + 1 + lbr_nr + 1;
+	if (i == chain_nr)
+		return 0;

-		for (j = 0; j < mix_chain_nr; j++) {
-			int err;
-			branch = false;
-			flags = NULL;
+	if (thread->lbr_stitch_enable && !sample->no_hw_idx &&
+	    (max_lbr > 0) && alloc_lbr_stitch(thread, max_lbr)) {
+		lbr_stitch = thread->lbr_stitch;

-			if (callchain_param.order == ORDER_CALLEE) {
-				if (j < i + 1)
-					ip = chain->ips[j];
-				else if (j > i + 1) {
-					k = j - i - 2;
-					ip = entries[k].from;
-					branch = true;
-					flags = &entries[k].flags;
-				} else {
-					ip = entries[0].to;
-					branch = true;
-					flags = &entries[0].flags;
-					branch_from = entries[0].from;
-				}
-			} else {
-				if (j < lbr_nr) {
-					k = lbr_nr - j - 1;
-					ip = entries[k].from;
-					branch = true;
-					flags = &entries[k].flags;
-				}
-				else if (j > lbr_nr)
-					ip = chain->ips[i + 1 - (j - lbr_nr)];
-				else {
-					ip = entries[0].to;
-					branch = true;
-					flags = &entries[0].flags;
-					branch_from = entries[0].from;
-				}
-			}
+		stitched_lbr = has_stitched_lbr(thread, sample,
+						&lbr_stitch->prev_sample,
+						max_lbr, callee);

-			err = add_callchain_ip(thread, cursor, parent,
-					       root_al, &cpumode, ip,
-					       branch, flags, NULL,
-					       branch_from);
+		if (!stitched_lbr && !list_empty(&lbr_stitch->lists)) {
+			list_replace_init(&lbr_stitch->lists,
+					  &lbr_stitch->free_lists);
+		}
+		memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample));
+	}
+
+	if (callee) {
+		/* Add kernel ip */
+		err = lbr_callchain_add_kernel_ip(thread, cursor, sample,
+						  parent, root_al, branch_from,
+						  true, i);
+		if (err)
+			goto error;
+
+		err = lbr_callchain_add_lbr_ip(thread, cursor, sample, parent,
+					       root_al, &branch_from, true);
+		if (err)
+			goto error;
+
+		if (stitched_lbr) {
+			err = lbr_callchain_add_stitched_lbr_ip(thread, cursor);
 			if (err)
-				return (err < 0) ? err : 0;
+				goto error;
 		}
-		return 1;
+
+	} else {
+		if (stitched_lbr) {
+			err = lbr_callchain_add_stitched_lbr_ip(thread, cursor);
+			if (err)
+				goto error;
+		}
+		err = lbr_callchain_add_lbr_ip(thread, cursor, sample, parent,
+					       root_al, &branch_from, false);
+		if (err)
+			goto error;
+
+		/* Add kernel ip */
+		err = lbr_callchain_add_kernel_ip(thread, cursor, sample,
+						  parent, root_al, branch_from,
+						  false, i);
+		if (err)
+			goto error;
 	}
+	return 1;

-	return 0;
+error:
+	return (err < 0) ? err : 0;
 }

 static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread,
@@ -2311,9 +2618,12 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	if (chain)
 		chain_nr = chain->nr;

-	if (perf_evsel__has_branch_callstack(evsel)) {
+	if (evsel__has_branch_callstack(evsel)) {
+		struct perf_env *env = evsel__env(evsel);
+
 		err = resolve_lbr_callchain_sample(thread, cursor, sample, parent,
-						   root_al, max_stack);
+						   root_al, max_stack,
+						   !env ? 0 : env->max_branches);
 		if (err)
 			return (err < 0) ? err : 0;
 	}
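A worked example of the ring-buffer index arithmetic in has_stitched_lbr() above, with illustrative numbers for a 32-deep LBR: the base-of-stack of the current sample is mapped to a physical register index, and distance is how far the previous sample's newest entry sits from that base, modulo the ring size.

#include <stdio.h>

int main(void)
{
	unsigned long long max_lbr = 32;		/* LBR depth */
	unsigned long long cur_nr = 32, cur_hw_idx = 10; /* current sample */
	unsigned long long prev_hw_idx = 5;		/* previous sample */

	/* physical index of the current sample's base-of-stack */
	unsigned long long cur_base = max_lbr - cur_nr + cur_hw_idx + 1; /* 11 */

	unsigned long long distance = (prev_hw_idx > cur_base) ?
		(prev_hw_idx - cur_base) :
		(max_lbr + prev_hw_idx - cur_base);	/* 32 + 5 - 11 = 26 */

	/* stitching requires prev_stack->nr >= distance + 1, i.e. >= 27 here */
	printf("cur_base=%llu distance=%llu\n", cur_base, distance);
	return 0;
}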
diff --git a/tools/perf/util/mem2node.c b/tools/perf/util/mem2node.c
index 797d86a1ab09..c84f5841c7ab 100644
--- a/tools/perf/util/mem2node.c
+++ b/tools/perf/util/mem2node.c
@@ -1,5 +1,6 @@
 #include <errno.h>
 #include <inttypes.h>
+#include <asm/bug.h>
 #include <linux/bitmap.h>
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
@@ -95,7 +96,7 @@ int mem2node__init(struct mem2node *map, struct perf_env *env)

 	/* Cut unused entries, due to merging. */
 	tmp_entries = realloc(entries, sizeof(*entries) * j);
-	if (tmp_entries)
+	if (tmp_entries || WARN_ON_ONCE(j == 0))
 		entries = tmp_entries;

 	for (i = 0; i < j; i++) {
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 926449a7cdbf..b071df373f8b 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -90,6 +90,7 @@ struct egroup {
 	const char *metric_name;
 	const char *metric_expr;
 	const char *metric_unit;
+	int runtime;
 };

 static struct evsel *find_evsel_group(struct evlist *perf_evlist,
@@ -202,6 +203,7 @@ static int metricgroup__setup_events(struct list_head *groups,
 		expr->metric_name = eg->metric_name;
 		expr->metric_unit = eg->metric_unit;
 		expr->metric_events = metric_events;
+		expr->runtime = eg->runtime;
 		list_add(&expr->nd, &me->head);
 	}

@@ -485,6 +487,45 @@ static bool metricgroup__has_constraint(struct pmu_event *pe)
 	return false;
 }

+int __weak arch_get_runtimeparam(void)
+{
+	return 1;
+}
+
+static int __metricgroup__add_metric(struct strbuf *events,
+		struct list_head *group_list, struct pmu_event *pe, int runtime)
+{
+
+	const char **ids;
+	int idnum;
+	struct egroup *eg;
+
+	if (expr__find_other(pe->metric_expr, NULL, &ids, &idnum, runtime) < 0)
+		return -EINVAL;
+
+	if (events->len > 0)
+		strbuf_addf(events, ",");
+
+	if (metricgroup__has_constraint(pe))
+		metricgroup__add_metric_non_group(events, ids, idnum);
+	else
+		metricgroup__add_metric_weak_group(events, ids, idnum);
+
+	eg = malloc(sizeof(*eg));
+	if (!eg)
+		return -ENOMEM;
+
+	eg->ids = ids;
+	eg->idnum = idnum;
+	eg->metric_name = pe->metric_name;
+	eg->metric_expr = pe->metric_expr;
+	eg->metric_unit = pe->unit;
+	eg->runtime = runtime;
+	list_add_tail(&eg->nd, group_list);
+
+	return 0;
+}
+
 static int metricgroup__add_metric(const char *metric, struct strbuf *events,
 				   struct list_head *group_list)
 {
@@ -504,35 +545,26 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events,
 			continue;
 		if (match_metric(pe->metric_group, metric) ||
 		    match_metric(pe->metric_name, metric)) {
-			const char **ids;
-			int idnum;
-			struct egroup *eg;

 			pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name);

-			if (expr__find_other(pe->metric_expr,
-					     NULL, &ids, &idnum) < 0)
-				continue;
-			if (events->len > 0)
-				strbuf_addf(events, ",");
+			if (!strstr(pe->metric_expr, "?")) {
+				ret = __metricgroup__add_metric(events, group_list, pe, 1);
+			} else {
+				int j, count;

-			if (metricgroup__has_constraint(pe))
-				metricgroup__add_metric_non_group(events, ids, idnum);
-			else
-				metricgroup__add_metric_weak_group(events, ids, idnum);
+				count = arch_get_runtimeparam();

-			eg = malloc(sizeof(struct egroup));
-			if (!eg) {
-				ret = -ENOMEM;
-				break;
+				/*
+				 * Create multiple events depending on the
+				 * count value and add them to group_list.
+				 */
+				for (j = 0; j < count; j++)
+					ret = __metricgroup__add_metric(events, group_list, pe, j);
 			}
-			eg->ids = ids;
-			eg->idnum = idnum;
-			eg->metric_name = pe->metric_name;
-			eg->metric_expr = pe->metric_expr;
-			eg->metric_unit = pe->unit;
-			list_add_tail(&eg->nd, group_list);
-			ret = 0;
+			if (ret == -ENOMEM)
+				break;
 		}
 	}
 	return ret;
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index 475c7f912864..6b09eb30b4ec 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -22,6 +22,7 @@ struct metric_expr {
 	const char *metric_name;
 	const char *metric_unit;
 	struct evsel **metric_events;
+	int runtime;
 };

 struct metric_event *metricgroup__lookup(struct rblist *metric_events,
@@ -34,4 +35,5 @@ int metricgroup__parse_groups(const struct option *opt,
 void metricgroup__print(bool metrics, bool groups, char *filter,
 			bool raw, bool details);
 bool metricgroup__has_metric(const char *metric);
+int arch_get_runtimeparam(void);
 #endif
diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
index 359db2b1fcef..48c8f609441b 100644
--- a/tools/perf/util/ordered-events.c
+++ b/tools/perf/util/ordered-events.c
@@ -314,7 +314,7 @@ static int __ordered_events__flush(struct ordered_events *oe, enum oe_flush how,
 	case OE_FLUSH__NONE:
 	default:
 		break;
-	};
+	}

 	pr_oe_time(oe->next_flush, "next_flush - ordered_events__flush PRE  %s, nr_events %u\n",
 		   str[how], oe->nr_events);
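A minimal sketch of the expansion loop in metricgroup__add_metric() above: when a metric expression contains '?', one event group is added per runtime parameter value. arch_get_runtimeparam() is stubbed and the expression is illustrative:

#include <stdio.h>

static int arch_get_runtimeparam(void) { return 4; } /* e.g. chips present */

/* stand-in for __metricgroup__add_metric() */
static int add_metric_for_runtime(const char *expr, int runtime)
{
	printf("adding %s with runtime=%d\n", expr, runtime);
	return 0;
}

int main(void)
{
	const char *expr = "pm_pb_cyc,chip=?";
	int j, count = arch_get_runtimeparam();
	int ret = 0;

	/* one egroup per runtime value, as in the loop above */
	for (j = 0; j < count; j++)
		ret = add_metric_for_runtime(expr, j);
	return ret;
}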
(*evlistp == NULL) { + fprintf(stderr, "Not enough memory to create evlist\n"); + return -1; + } + } + + ret = parse_events_option(opt, str, unset); + if (ret) { + evlist__delete(*evlistp); + *evlistp = NULL; + } + + return ret; +} + static int foreach_evsel_in_last_glob(struct evlist *evlist, int (*func)(struct evsel *evsel, @@ -2237,7 +2261,7 @@ static int set_filter(struct evsel *evsel, const void *arg) } if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT) { - if (perf_evsel__append_tp_filter(evsel, str) < 0) { + if (evsel__append_tp_filter(evsel, str) < 0) { fprintf(stderr, "not enough memory to hold filter string\n"); return -1; @@ -2262,7 +2286,7 @@ static int set_filter(struct evsel *evsel, const void *arg) return -1; } - if (perf_evsel__append_addr_filter(evsel, str) < 0) { + if (evsel__append_addr_filter(evsel, str) < 0) { fprintf(stderr, "not enough memory to hold filter string\n"); return -1; @@ -2293,7 +2317,7 @@ static int add_exclude_perf_filter(struct evsel *evsel, snprintf(new_filter, sizeof(new_filter), "common_pid != %d", getpid()); - if (perf_evsel__append_tp_filter(evsel, new_filter) < 0) { + if (evsel__append_tp_filter(evsel, new_filter) < 0) { fprintf(stderr, "not enough memory to hold filter string\n"); return -1; @@ -2603,12 +2627,11 @@ restart: for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { /* skip invalid cache type */ - if (!perf_evsel__is_cache_op_valid(type, op)) + if (!evsel__is_cache_op_valid(type, op)) continue; for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - __perf_evsel__hw_cache_type_op_res_name(type, op, i, - name, sizeof(name)); + __evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); if (event_glob != NULL && !strglobmatch(name, event_glob)) continue; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 27596cbd0ba0..6ead9661238c 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -31,6 +31,7 @@ bool have_tracepoints(struct list_head *evlist); const char *event_type(int type); int parse_events_option(const struct option *opt, const char *str, int unset); +int parse_events_option_new_evlist(const struct option *opt, const char *str, int unset); int parse_events(struct evlist *evlist, const char *str, struct parse_events_error *error); int parse_events_terms(struct list_head *terms, const char *str); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index baa48f28d57d..c589fc42f058 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -286,6 +286,7 @@ no-overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE); } percore { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); } aux-output { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } +r{num_raw_hex} { return raw(yyscanner); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 94f8bcd83582..c4ca932d092d 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -44,7 +44,7 @@ static void free_list_evsel(struct list_head* list_evsel) list_for_each_entry_safe(evsel, tmp, list_evsel, core.node) { list_del_init(&evsel->core.node); - perf_evsel__delete(evsel); + evsel__delete(evsel); } free(list_evsel); } @@ -326,6 +326,7 @@ PE_NAME 
opt_pmu_config } parse_events_terms__delete($2); parse_events_terms__delete(orig_terms); + free(pattern); free($1); $$ = list; #undef CLEANUP_YYABORT @@ -706,6 +707,15 @@ event_term } event_term: +PE_RAW +{ + struct parse_events_term *term; + + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_CONFIG, + NULL, $1, false, &@1, NULL)); + $$ = term; +} +| PE_NAME '=' PE_NAME { struct parse_events_term *term; diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c new file mode 100644 index 000000000000..1337965673d7 --- /dev/null +++ b/tools/perf/util/perf_api_probe.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "perf-sys.h" +#include "util/cloexec.h" +#include "util/evlist.h" +#include "util/evsel.h" +#include "util/parse-events.h" +#include "util/perf_api_probe.h" +#include <perf/cpumap.h> +#include <errno.h> + +typedef void (*setup_probe_fn_t)(struct evsel *evsel); + +static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str) +{ + struct evlist *evlist; + struct evsel *evsel; + unsigned long flags = perf_event_open_cloexec_flag(); + int err = -EAGAIN, fd; + static pid_t pid = -1; + + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + + if (parse_events(evlist, str, NULL)) + goto out_delete; + + evsel = evlist__first(evlist); + + while (1) { + fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); + if (fd < 0) { + if (pid == -1 && errno == EACCES) { + pid = 0; + continue; + } + goto out_delete; + } + break; + } + close(fd); + + fn(evsel); + + fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); + if (fd < 0) { + if (errno == EINVAL) + err = -EINVAL; + goto out_delete; + } + close(fd); + err = 0; + +out_delete: + evlist__delete(evlist); + return err; +} + +static bool perf_probe_api(setup_probe_fn_t fn) +{ + const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL}; + struct perf_cpu_map *cpus; + int cpu, ret, i = 0; + + cpus = perf_cpu_map__new(NULL); + if (!cpus) + return false; + cpu = cpus->map[0]; + perf_cpu_map__put(cpus); + + do { + ret = perf_do_probe_api(fn, cpu, try[i++]); + if (!ret) + return true; + } while (ret == -EAGAIN && try[i]); + + return false; +} + +static void perf_probe_sample_identifier(struct evsel *evsel) +{ + evsel->core.attr.sample_type |= PERF_SAMPLE_IDENTIFIER; +} + +static void perf_probe_comm_exec(struct evsel *evsel) +{ + evsel->core.attr.comm_exec = 1; +} + +static void perf_probe_context_switch(struct evsel *evsel) +{ + evsel->core.attr.context_switch = 1; +} + +bool perf_can_sample_identifier(void) +{ + return perf_probe_api(perf_probe_sample_identifier); +} + +bool perf_can_comm_exec(void) +{ + return perf_probe_api(perf_probe_comm_exec); +} + +bool perf_can_record_switch_events(void) +{ + return perf_probe_api(perf_probe_context_switch); +} + +bool perf_can_record_cpu_wide(void) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .exclude_kernel = 1, + }; + struct perf_cpu_map *cpus; + int cpu, fd; + + cpus = perf_cpu_map__new(NULL); + if (!cpus) + return false; + cpu = cpus->map[0]; + perf_cpu_map__put(cpus); + + fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); + if (fd < 0) + return false; + close(fd); + + return true; +} + +/* + * Architectures are expected to know if AUX area sampling is supported by the + * hardware. Here we check for kernel support. 
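+ *
+ * A minimal caller sketch (hypothetical; 'opts' stands in for a
+ * record_opts-style structure and is not part of this patch):
+ *
+ *	if (opts->auxtrace_sample_mode && !perf_can_aux_sample()) {
+ *		pr_err("AUX area sampling is not supported by kernel\n");
+ *		return -EINVAL;
+ *	}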
+ */ +bool perf_can_aux_sample(void) +{ + struct perf_event_attr attr = { + .size = sizeof(struct perf_event_attr), + .exclude_kernel = 1, + /* + * Non-zero value causes the kernel to calculate the effective + * attribute size up to that byte. + */ + .aux_sample_size = 1, + }; + int fd; + + fd = sys_perf_event_open(&attr, -1, 0, -1, 0); + /* + * If the kernel attribute is big enough to contain aux_sample_size + * then we assume that it is supported. We are relying on the kernel to + * validate the attribute size before anything else that could be wrong. + */ + if (fd < 0 && errno == E2BIG) + return false; + if (fd >= 0) + close(fd); + + return true; +} diff --git a/tools/perf/util/perf_api_probe.h b/tools/perf/util/perf_api_probe.h new file mode 100644 index 000000000000..706c3c6426e2 --- /dev/null +++ b/tools/perf/util/perf_api_probe.h @@ -0,0 +1,14 @@ + +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_API_PROBE_H +#define __PERF_API_PROBE_H + +#include <stdbool.h> + +bool perf_can_aux_sample(void); +bool perf_can_comm_exec(void); +bool perf_can_record_cpu_wide(void); +bool perf_can_record_switch_events(void); +bool perf_can_sample_identifier(void); + +#endif // __PERF_API_PROBE_H diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ef6a63f3d386..92bd7fafcce6 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -18,6 +18,7 @@ #include <regex.h> #include <perf/cpumap.h> #include "debug.h" +#include "evsel.h" #include "pmu.h" #include "parse-events.h" #include "header.h" @@ -849,6 +850,7 @@ static struct perf_pmu *pmu_lookup(const char *name) INIT_LIST_HEAD(&pmu->format); INIT_LIST_HEAD(&pmu->aliases); + INIT_LIST_HEAD(&pmu->caps); list_splice(&format, &pmu->format); list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); @@ -869,6 +871,17 @@ static struct perf_pmu *pmu_find(const char *name) return NULL; } +struct perf_pmu *perf_pmu__find_by_type(unsigned int type) +{ + struct perf_pmu *pmu; + + list_for_each_entry(pmu, &pmus, list) + if (pmu->type == type) + return pmu; + + return NULL; +} + struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu) { /* @@ -884,6 +897,25 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu) return NULL; } +struct perf_pmu *evsel__find_pmu(struct evsel *evsel) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (pmu->type == evsel->core.attr.type) + break; + } + + return pmu; +} + +bool evsel__is_aux_event(struct evsel *evsel) +{ + struct perf_pmu *pmu = evsel__find_pmu(evsel); + + return pmu && pmu->auxtrace; +} + struct perf_pmu *perf_pmu__find(const char *name) { struct perf_pmu *pmu; @@ -1574,3 +1606,84 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, va_end(args); return ret; } + +static int perf_pmu__new_caps(struct list_head *list, char *name, char *value) +{ + struct perf_pmu_caps *caps = zalloc(sizeof(*caps)); + + if (!caps) + return -ENOMEM; + + caps->name = strdup(name); + if (!caps->name) + goto free_caps; + caps->value = strndup(value, strlen(value) - 1); + if (!caps->value) + goto free_name; + list_add_tail(&caps->list, list); + return 0; + +free_name: + zfree(caps->name); +free_caps: + free(caps); + + return -ENOMEM; +} + +/* + * Reading/parsing the given pmu capabilities, which should be located at: + * /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes. 
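+ *
+ * A sketch of consuming the result (illustrative only; assumes the caller
+ * already holds a struct perf_pmu *pmu):
+ *
+ *	struct perf_pmu_caps *caps;
+ *
+ *	if (perf_pmu__caps_parse(pmu) > 0)
+ *		list_for_each_entry(caps, &pmu->caps, list)
+ *			pr_debug("%s/caps/%s: %s\n",
+ *				 pmu->name, caps->name, caps->value);
+ *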
+ * Return the number of capabilities + */ +int perf_pmu__caps_parse(struct perf_pmu *pmu) +{ + struct stat st; + char caps_path[PATH_MAX]; + const char *sysfs = sysfs__mountpoint(); + DIR *caps_dir; + struct dirent *evt_ent; + int nr_caps = 0; + + if (!sysfs) + return -1; + + snprintf(caps_path, PATH_MAX, + "%s" EVENT_SOURCE_DEVICE_PATH "%s/caps", sysfs, pmu->name); + + if (stat(caps_path, &st) < 0) + return 0; /* no error if caps does not exist */ + + caps_dir = opendir(caps_path); + if (!caps_dir) + return -EINVAL; + + while ((evt_ent = readdir(caps_dir)) != NULL) { + char path[PATH_MAX + NAME_MAX + 1]; + char *name = evt_ent->d_name; + char value[128]; + FILE *file; + + if (!strcmp(name, ".") || !strcmp(name, "..")) + continue; + + snprintf(path, sizeof(path), "%s/%s", caps_path, name); + + file = fopen(path, "r"); + if (!file) + continue; + + if (!fgets(value, sizeof(value), file) || + (perf_pmu__new_caps(&pmu->caps, name, value) < 0)) { + fclose(file); + continue; + } + + nr_caps++; + fclose(file); + } + + closedir(caps_dir); + + return nr_caps; +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 5fb3f16828df..cb6fbec50313 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -22,6 +22,12 @@ enum { struct perf_event_attr; +struct perf_pmu_caps { + char *name; + char *value; + struct list_head list; +}; + struct perf_pmu { char *name; __u32 type; @@ -33,6 +39,7 @@ struct perf_pmu { struct perf_cpu_map *cpus; struct list_head format; /* HEAD struct perf_pmu_format -> list */ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ + struct list_head caps; /* HEAD struct perf_pmu_caps -> list */ struct list_head list; /* ELEM */ }; @@ -65,6 +72,7 @@ struct perf_pmu_alias { }; struct perf_pmu *perf_pmu__find(const char *name); +struct perf_pmu *perf_pmu__find_by_type(unsigned int type); int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, struct list_head *head_terms, struct parse_events_error *error); @@ -107,4 +115,6 @@ bool pmu_uncore_alias_match(const char *pmu_name, const char *name); int perf_pmu__convert_scale(const char *scale, char **end, double *sval); +int perf_pmu__caps_parse(struct perf_pmu *pmu); + #endif /* __PMU_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 83212c65848b..75a9b1d62bba 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -801,7 +801,7 @@ static int pyrf_evsel__init(struct pyrf_evsel *pevsel, static void pyrf_evsel__delete(struct pyrf_evsel *pevsel) { - perf_evsel__exit(&pevsel->evsel); + evsel__exit(&pevsel->evsel); Py_TYPE(pevsel)->tp_free((PyObject*)pevsel); } @@ -1044,7 +1044,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, pevent->evsel = evsel; - err = perf_evsel__parse_sample(evsel, event, &pevent->sample); + err = evsel__parse_sample(evsel, event, &pevent->sample); /* Consume the even only after we parsed it out. */ perf_mmap__consume(&md->core); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 7def66168503..a4cc11592f6b 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -10,161 +10,64 @@ #include <subcmd/parse-options.h> #include <perf/cpumap.h> #include "cloexec.h" +#include "util/perf_api_probe.h" #include "record.h" #include "../perf-sys.h" -typedef void (*setup_probe_fn_t)(struct evsel *evsel); - -static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str) +/* + * evsel__config_leader_sampling() uses special rules for leader sampling. 
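+ * Leader sampling means only the group leader is sampled and the other
+ * group members are read from it, e.g.:
+ *
+ *	perf record -e '{cycles,instructions}:S' -- workload
+ *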
+ * However, if the leader is an AUX area event, then assume the event to sample + * is the next event. + */ +static struct evsel *evsel__read_sampler(struct evsel *evsel, struct evlist *evlist) { - struct evlist *evlist; - struct evsel *evsel; - unsigned long flags = perf_event_open_cloexec_flag(); - int err = -EAGAIN, fd; - static pid_t pid = -1; - - evlist = evlist__new(); - if (!evlist) - return -ENOMEM; - - if (parse_events(evlist, str, NULL)) - goto out_delete; - - evsel = evlist__first(evlist); + struct evsel *leader = evsel->leader; - while (1) { - fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); - if (fd < 0) { - if (pid == -1 && errno == EACCES) { - pid = 0; - continue; - } - goto out_delete; + if (evsel__is_aux_event(leader)) { + evlist__for_each_entry(evlist, evsel) { + if (evsel->leader == leader && evsel != evsel->leader) + return evsel; } - break; - } - close(fd); - - fn(evsel); - - fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); - if (fd < 0) { - if (errno == EINVAL) - err = -EINVAL; - goto out_delete; } - close(fd); - err = 0; - -out_delete: - evlist__delete(evlist); - return err; -} - -static bool perf_probe_api(setup_probe_fn_t fn) -{ - const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL}; - struct perf_cpu_map *cpus; - int cpu, ret, i = 0; - - cpus = perf_cpu_map__new(NULL); - if (!cpus) - return false; - cpu = cpus->map[0]; - perf_cpu_map__put(cpus); - - do { - ret = perf_do_probe_api(fn, cpu, try[i++]); - if (!ret) - return true; - } while (ret == -EAGAIN && try[i]); - return false; + return leader; } -static void perf_probe_sample_identifier(struct evsel *evsel) +static void evsel__config_leader_sampling(struct evsel *evsel, struct evlist *evlist) { - evsel->core.attr.sample_type |= PERF_SAMPLE_IDENTIFIER; -} - -static void perf_probe_comm_exec(struct evsel *evsel) -{ - evsel->core.attr.comm_exec = 1; -} - -static void perf_probe_context_switch(struct evsel *evsel) -{ - evsel->core.attr.context_switch = 1; -} - -bool perf_can_sample_identifier(void) -{ - return perf_probe_api(perf_probe_sample_identifier); -} + struct perf_event_attr *attr = &evsel->core.attr; + struct evsel *leader = evsel->leader; + struct evsel *read_sampler; -static bool perf_can_comm_exec(void) -{ - return perf_probe_api(perf_probe_comm_exec); -} + if (!leader->sample_read) + return; -bool perf_can_record_switch_events(void) -{ - return perf_probe_api(perf_probe_context_switch); -} + read_sampler = evsel__read_sampler(evsel, evlist); -bool perf_can_record_cpu_wide(void) -{ - struct perf_event_attr attr = { - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_CPU_CLOCK, - .exclude_kernel = 1, - }; - struct perf_cpu_map *cpus; - int cpu, fd; - - cpus = perf_cpu_map__new(NULL); - if (!cpus) - return false; - cpu = cpus->map[0]; - perf_cpu_map__put(cpus); + if (evsel == read_sampler) + return; - fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); - if (fd < 0) - return false; - close(fd); - - return true; -} - -/* - * Architectures are expected to know if AUX area sampling is supported by the - * hardware. Here we check for kernel support. - */ -bool perf_can_aux_sample(void) -{ - struct perf_event_attr attr = { - .size = sizeof(struct perf_event_attr), - .exclude_kernel = 1, - /* - * Non-zero value causes the kernel to calculate the effective - * attribute size up to that byte. 
- */ - .aux_sample_size = 1, - }; - int fd; - - fd = sys_perf_event_open(&attr, -1, 0, -1, 0); /* - * If the kernel attribute is big enough to contain aux_sample_size - * then we assume that it is supported. We are relying on the kernel to - * validate the attribute size before anything else that could be wrong. + * Disable sampling for all group members other than the leader in + * case the leader 'leads' the sampling, except when the leader is an + * AUX area event, in which case the 2nd event in the group is the one + * that 'leads' the sampling. */ - if (fd < 0 && errno == E2BIG) - return false; - if (fd >= 0) - close(fd); + attr->freq = 0; + attr->sample_freq = 0; + attr->sample_period = 0; + attr->write_backward = 0; - return true; + /* + * We don't get a sample for slave events, we make them when delivering + * the group leader sample. Set the slave event to follow the master + * sample_type to ease up reporting. + * An AUX area event also has sample_type requirements, so also include + * the sample type bits from the leader's sample_type to cover that + * case. + */ + attr->sample_type = read_sampler->core.attr.sample_type | + leader->core.attr.sample_type; } void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, @@ -188,11 +91,15 @@ void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, use_comm_exec = perf_can_comm_exec(); evlist__for_each_entry(evlist, evsel) { - perf_evsel__config(evsel, opts, callchain); + evsel__config(evsel, opts, callchain); if (evsel->tracking && use_comm_exec) evsel->core.attr.comm_exec = 1; } + /* Configure leader sampling here now that the sample type is known */ + evlist__for_each_entry(evlist, evsel) + evsel__config_leader_sampling(evsel, evlist); + if (opts->full_auxtrace) { /* * Need to be able to synthesize and parse selected events with @@ -215,7 +122,7 @@ void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, if (sample_id) { evlist__for_each_entry(evlist, evsel) - perf_evsel__set_sample_id(evsel, use_sample_identifier); + evsel__set_sample_id(evsel, use_sample_identifier); } perf_evlist__set_id_pos(evlist); diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h index 24316458be20..923565c3b155 100644 --- a/tools/perf/util/record.h +++ b/tools/perf/util/record.h @@ -68,6 +68,7 @@ struct record_opts { int affinity; int mmap_flush; unsigned int comp_level; + unsigned int nr_threads_synthesize; }; extern const char * const *record_usage; diff --git a/tools/perf/util/s390-cpumcf-kernel.h b/tools/perf/util/s390-cpumcf-kernel.h index d4356030b504..f55ca07f3ca1 100644 --- a/tools/perf/util/s390-cpumcf-kernel.h +++ b/tools/perf/util/s390-cpumcf-kernel.h @@ -11,6 +11,7 @@ #define S390_CPUMCF_DIAG_DEF 0xfeef /* Counter diagnostic entry ID */ #define PERF_EVENT_CPUM_CF_DIAG 0xBC000 /* Event: Counter sets */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000 /* Event: Combined-sampling */ struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ unsigned int def:16; /* 0-15 Data Entry Format */ diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 6785cd87aa4d..f8861998e5bd 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -1047,6 +1047,14 @@ static void s390_cpumsf_free(struct perf_session *session) free(sf); } +static bool +s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused, + struct evsel *evsel) +{ + return evsel->core.attr.type == PERF_TYPE_RAW && + evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG; +} + 
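+/*
+ * s390_cpumsf_evsel_is_auxtrace() is wired up below as
+ * sf->auxtrace.evsel_is_auxtrace, letting generic session code query
+ * whether a given evsel carries s390 sampling (AUX area) data.
+ */
+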
static int s390_cpumsf_get_type(const char *cpuid) { int ret, family = 0; @@ -1071,7 +1079,8 @@ static bool check_auxtrace_itrace(struct itrace_synth_opts *itops) itops->pwr_events || itops->errors || itops->dont_decode || itops->calls || itops->returns || itops->callchain || itops->thread_stack || - itops->last_branch; + itops->last_branch || itops->add_callchain || + itops->add_last_branch; if (!ison) return true; pr_err("Unsupported --itrace options specified\n"); @@ -1142,6 +1151,7 @@ int s390_cpumsf_process_auxtrace_info(union perf_event *event, sf->auxtrace.flush_events = s390_cpumsf_flush; sf->auxtrace.free_events = s390_cpumsf_free_events; sf->auxtrace.free = s390_cpumsf_free; + sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace; session->auxtrace = &sf->auxtrace; if (dump_trace) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 2c372cf5495e..739516fdf6e3 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -741,7 +741,7 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, if (!dict_sample) Py_FatalError("couldn't create Python dictionary"); - pydict_set_item_string_decref(dict, "ev_name", _PyUnicode_FromString(perf_evsel__name(evsel))); + pydict_set_item_string_decref(dict, "ev_name", _PyUnicode_FromString(evsel__name(evsel))); pydict_set_item_string_decref(dict, "attr", _PyBytes_FromStringAndSize((const char *)&evsel->core.attr, sizeof(evsel->core.attr))); pydict_set_item_string_decref(dict_sample, "pid", @@ -968,7 +968,7 @@ static int python_export_evsel(struct db_export *dbe, struct evsel *evsel) t = tuple_new(2); tuple_set_u64(t, 0, evsel->db_id); - tuple_set_string(t, 1, perf_evsel__name(evsel)); + tuple_set_string(t, 1, evsel__name(evsel)); call_object(tables->evsel_handler, t, "evsel_table"); @@ -1349,7 +1349,7 @@ static void get_handler_name(char *str, size_t size, { char *p = str; - scnprintf(str, size, "stat__%s", perf_evsel__name(evsel)); + scnprintf(str, size, "stat__%s", evsel__name(evsel)); while ((p = strchr(p, ':'))) { *p = '_'; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0b0bfe5bef17..c11d89e0ee55 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1059,7 +1059,7 @@ static void callchain__printf(struct evsel *evsel, unsigned int i; struct ip_callchain *callchain = sample->callchain; - if (perf_evsel__has_branch_callstack(evsel)) + if (evsel__has_branch_callstack(evsel)) callchain__lbr_callstack_printf(sample); printf("... 
FP chain: nr:%" PRIu64 "\n", callchain->nr); @@ -1243,8 +1243,8 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (evsel__has_callchain(evsel)) callchain__printf(evsel, sample); - if (sample_type & PERF_SAMPLE_BRANCH_STACK) - branch_stack__printf(sample, perf_evsel__has_branch_callstack(evsel)); + if (evsel__has_br_stack(evsel)) + branch_stack__printf(sample, evsel__has_branch_callstack(evsel)); if (sample_type & PERF_SAMPLE_REGS_USER) regs_user__printf(sample); @@ -1280,8 +1280,7 @@ static void dump_read(struct evsel *evsel, union perf_event *event) return; printf(": %d %d %s %" PRI_lu64 "\n", event->read.pid, event->read.tid, - perf_evsel__name(evsel), - event->read.value); + evsel__name(evsel), event->read.value); if (!evsel) return; diff --git a/tools/perf/util/sideband_evlist.c b/tools/perf/util/sideband_evlist.c new file mode 100644 index 000000000000..1580a3cbec2d --- /dev/null +++ b/tools/perf/util/sideband_evlist.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "util/debug.h" +#include "util/evlist.h" +#include "util/evsel.h" +#include "util/mmap.h" +#include "util/perf_api_probe.h" +#include <perf/mmap.h> +#include <linux/perf_event.h> +#include <limits.h> +#include <pthread.h> +#include <sched.h> +#include <stdbool.h> + +int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, + evsel__sb_cb_t cb, void *data) +{ + struct evsel *evsel; + + if (!attr->sample_id_all) { + pr_warning("enabling sample_id_all for all side band events\n"); + attr->sample_id_all = 1; + } + + evsel = perf_evsel__new_idx(attr, evlist->core.nr_entries); + if (!evsel) + return -1; + + evsel->side_band.cb = cb; + evsel->side_band.data = data; + evlist__add(evlist, evsel); + return 0; +} + +static void *perf_evlist__poll_thread(void *arg) +{ + struct evlist *evlist = arg; + bool draining = false; + int i, done = 0; + /* + * In order to read symbols from other namespaces perf to needs to call + * setns(2). This isn't permitted if the struct_fs has multiple users. + * unshare(2) the fs so that we may continue to setns into namespaces + * that we're observing when, for instance, reading the build-ids at + * the end of a 'perf record' session. 
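+ * Without the unshare(2), the subsequent setns(2) calls on mount
+ * namespaces would fail with EINVAL because the fs structure is shared.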
+ */ + unshare(CLONE_FS); + + while (!done) { + bool got_data = false; + + if (evlist->thread.done) + draining = true; + + if (!draining) + evlist__poll(evlist, 1000); + + for (i = 0; i < evlist->core.nr_mmaps; i++) { + struct mmap *map = &evlist->mmap[i]; + union perf_event *event; + + if (perf_mmap__read_init(&map->core)) + continue; + while ((event = perf_mmap__read_event(&map->core)) != NULL) { + struct evsel *evsel = perf_evlist__event2evsel(evlist, event); + + if (evsel && evsel->side_band.cb) + evsel->side_band.cb(event, evsel->side_band.data); + else + pr_warning("cannot locate proper evsel for the side band event\n"); + + perf_mmap__consume(&map->core); + got_data = true; + } + perf_mmap__read_done(&map->core); + } + + if (draining && !got_data) + break; + } + return NULL; +} + +void evlist__set_cb(struct evlist *evlist, evsel__sb_cb_t cb, void *data) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + evsel->core.attr.sample_id_all = 1; + evsel->core.attr.watermark = 1; + evsel->core.attr.wakeup_watermark = 1; + evsel->side_band.cb = cb; + evsel->side_band.data = data; + } +} + +int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target) +{ + struct evsel *counter; + + if (!evlist) + return 0; + + if (perf_evlist__create_maps(evlist, target)) + goto out_delete_evlist; + + if (evlist->core.nr_entries > 1) { + bool can_sample_identifier = perf_can_sample_identifier(); + + evlist__for_each_entry(evlist, counter) + evsel__set_sample_id(counter, can_sample_identifier); + + perf_evlist__set_id_pos(evlist); + } + + evlist__for_each_entry(evlist, counter) { + if (evsel__open(counter, evlist->core.cpus, evlist->core.threads) < 0) + goto out_delete_evlist; + } + + if (evlist__mmap(evlist, UINT_MAX)) + goto out_delete_evlist; + + evlist__for_each_entry(evlist, counter) { + if (evsel__enable(counter)) + goto out_delete_evlist; + } + + evlist->thread.done = 0; + if (pthread_create(&evlist->thread.th, NULL, perf_evlist__poll_thread, evlist)) + goto out_delete_evlist; + + return 0; + +out_delete_evlist: + evlist__delete(evlist); + evlist = NULL; + return -1; +} + +void perf_evlist__stop_sb_thread(struct evlist *evlist) +{ + if (!evlist) + return; + evlist->thread.done = 1; + pthread_join(evlist->thread.th, NULL); + evlist__delete(evlist); +} diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c index 3b791ef2cd50..20bacd5972ad 100644 --- a/tools/perf/util/smt.c +++ b/tools/perf/util/smt.c @@ -15,6 +15,9 @@ int smt_on(void) if (cached) return cached_result; + if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) > 0) + goto done; + ncpu = sysconf(_SC_NPROCESSORS_CONF); for (cpu = 0; cpu < ncpu; cpu++) { unsigned long long siblings; @@ -24,13 +27,13 @@ int smt_on(void) snprintf(fn, sizeof fn, "devices/system/cpu/cpu%d/topology/core_cpus", cpu); - if (access(fn, F_OK) == -1) { + if (sysfs__read_str(fn, &str, &strlen) < 0) { snprintf(fn, sizeof fn, "devices/system/cpu/cpu%d/topology/thread_siblings", cpu); + if (sysfs__read_str(fn, &str, &strlen) < 0) + continue; } - if (sysfs__read_str(fn, &str, &strlen) < 0) - continue; /* Entry is hex, but does not have 0x, so need custom parser */ siblings = strtoull(str, NULL, 16); free(str); @@ -42,6 +45,7 @@ int smt_on(void) } if (!cached) { cached_result = 0; +done: cached = true; } return cached_result; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f14cc728c358..c1f8879f92cc 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -237,7 +237,7 @@ static int64_t 
_sort__addr_cmp(u64 left_ip, u64 right_ip) return (int64_t)(right_ip - left_ip); } -static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r) +int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r) { if (!sym_l || !sym_r) return cmp_null(sym_l, sym_r); @@ -300,8 +300,14 @@ static int _hist_entry__sym_snprintf(struct map_symbol *ms, if (verbose > 0) { char o = map ? dso__symtab_origin(map->dso) : '!'; + u64 rip = ip; + + if (map && map->dso && map->dso->kernel + && map->dso->adjust_symbols) + rip = map->unmap_ip(map, ip); + ret += repsep_snprintf(bf, size, "%-#*llx %c ", - BITS_PER_LONG / 4 + 2, ip, o); + BITS_PER_LONG / 4 + 2, rip, o); } ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", level); @@ -2354,7 +2360,7 @@ static struct evsel *find_evsel(struct evlist *evlist, char *event_name) evsel = evlist__first(evlist); while (--nr > 0) - evsel = perf_evsel__next(evsel); + evsel = evsel__next(evsel); return evsel; } diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index cfa6ac6f7d06..66d39c4cfe2b 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -311,5 +311,7 @@ int64_t sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right); int64_t sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right); +int64_t +_sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r); char *hist_entry__srcline(struct hist_entry *he); #endif /* __PERF_SORT_H */ diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 9e757d18d713..3c6976f7574c 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -237,8 +237,6 @@ static bool valid_only_metric(const char *unit) if (!unit) return false; if (strstr(unit, "/sec") || - strstr(unit, "hz") || - strstr(unit, "Hz") || strstr(unit, "CPUs utilized")) return false; return true; @@ -248,7 +246,7 @@ static const char *fixunit(char *buf, struct evsel *evsel, const char *unit) { if (!strncmp(unit, "of all", 6)) { - snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel), + snprintf(buf, 1024, "%s %s", evsel__name(evsel), unit); return buf; } @@ -335,7 +333,7 @@ static int first_shadow_cpu(struct perf_stat_config *config, if (config->aggr_mode == AGGR_GLOBAL) return 0; - for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) { + for (i = 0; i < evsel__nr_cpus(evsel); i++) { int cpu2 = evsel__cpus(evsel)->map[i]; if (config->aggr_get_id(config, evlist->core.cpus, cpu2) == id) @@ -369,7 +367,7 @@ static void abs_printout(struct perf_stat_config *config, config->csv_output ? 0 : config->unit_width, evsel->unit, config->csv_sep); - fprintf(output, "%-*s", config->csv_output ? 0 : 25, perf_evsel__name(evsel)); + fprintf(output, "%-*s", config->csv_output ? 0 : 25, evsel__name(evsel)); print_cgroup(config, evsel); } @@ -463,8 +461,7 @@ static void printout(struct perf_stat_config *config, int id, int nr, counter->unit, config->csv_sep); fprintf(config->output, "%*s", - config->csv_output ? 0 : -25, - perf_evsel__name(counter)); + config->csv_output ? 
0 : -25, evsel__name(counter)); print_cgroup(config, counter); @@ -510,7 +507,7 @@ static void aggr_update_shadow(struct perf_stat_config *config, id = config->aggr_map->map[s]; evlist__for_each_entry(evlist, counter) { val = 0; - for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { s2 = config->aggr_get_id(config, evlist->core.cpus, cpu); if (s2 != id) continue; @@ -561,11 +558,11 @@ static void collect_all_aliases(struct perf_stat_config *config, struct evsel *c alias = list_prepare_entry(counter, &(evlist->core.entries), core.node); list_for_each_entry_continue (alias, &evlist->core.entries, core.node) { - if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) || + if (strcmp(evsel__name(alias), evsel__name(counter)) || alias->scale != counter->scale || alias->cgrp != counter->cgrp || strcmp(alias->unit, counter->unit) || - perf_evsel__is_clock(alias) != perf_evsel__is_clock(counter) || + evsel__is_clock(alias) != evsel__is_clock(counter) || !strcmp(alias->pmu_name, counter->pmu_name)) break; alias->merged_stat = true; @@ -601,7 +598,7 @@ static void aggr_cb(struct perf_stat_config *config, struct aggr_data *ad = data; int cpu, s2; - for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { struct perf_counts_values *counts; s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu); @@ -849,7 +846,7 @@ static void print_counter(struct perf_stat_config *config, double uval; int cpu; - for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { struct aggr_data ad = { .cpu = cpu }; if (!collect_data(config, counter, counter_cb, &ad)) @@ -1150,7 +1147,7 @@ static void print_percore_thread(struct perf_stat_config *config, int s, s2, id; bool first = true; - for (int i = 0; i < perf_evsel__nr_cpus(counter); i++) { + for (int i = 0; i < evsel__nr_cpus(counter); i++) { s2 = config->aggr_get_id(config, evsel__cpus(counter), i); for (s = 0; s < config->aggr_map->nr; s++) { id = config->aggr_map->map[s]; diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 03ecb8cd0eec..129b8c5f2538 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -216,9 +216,9 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, count *= counter->scale; - if (perf_evsel__is_clock(counter)) + if (evsel__is_clock(counter)) update_runtime_stat(st, STAT_NSECS, 0, cpu, count_ns); - else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + else if (evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count); else if (perf_stat_evsel__is(counter, CYCLES_IN_TX)) update_runtime_stat(st, STAT_CYCLES_IN_TX, ctx, cpu, count); @@ -241,25 +241,25 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES)) update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES, ctx, cpu, count); - else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) + else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT, ctx, cpu, count); - else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) + else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_BACK, ctx, cpu, count); - else if (perf_evsel__match(counter, HARDWARE, 
HW_BRANCH_INSTRUCTIONS)) + else if (evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_runtime_stat(st, STAT_BRANCHES, ctx, cpu, count); - else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + else if (evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) update_runtime_stat(st, STAT_CACHEREFS, ctx, cpu, count); - else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) + else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) update_runtime_stat(st, STAT_L1_DCACHE, ctx, cpu, count); - else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) + else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) update_runtime_stat(st, STAT_L1_ICACHE, ctx, cpu, count); - else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) + else if (evsel__match(counter, HW_CACHE, HW_CACHE_LL)) update_runtime_stat(st, STAT_LL_CACHE, ctx, cpu, count); - else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) + else if (evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) update_runtime_stat(st, STAT_DTLB_CACHE, ctx, cpu, count); - else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) + else if (evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) update_runtime_stat(st, STAT_ITLB_CACHE, ctx, cpu, count); else if (perf_stat_evsel__is(counter, SMI_NUM)) update_runtime_stat(st, STAT_SMI_NUM, ctx, cpu, count); @@ -336,7 +336,7 @@ void perf_stat__collect_metric_expr(struct evlist *evsel_list) metric_events = counter->metric_events; if (!metric_events) { if (expr__find_other(counter->metric_expr, counter->name, - &metric_names, &num_metric_names) < 0) + &metric_names, &num_metric_names, 1) < 0) continue; metric_events = calloc(sizeof(struct evsel *), @@ -723,13 +723,14 @@ static void generic_metric(struct perf_stat_config *config, char *name, const char *metric_name, const char *metric_unit, + int runtime, double avg, int cpu, struct perf_stat_output_ctx *out, struct runtime_stat *st) { print_metric_t print_metric = out->print_metric; - struct parse_ctx pctx; + struct expr_parse_ctx pctx; double ratio, scale; int i; void *ctxp = out->ctx; @@ -777,7 +778,7 @@ static void generic_metric(struct perf_stat_config *config, } if (!metric_events[i]) { - if (expr__parse(&ratio, &pctx, metric_expr) == 0) { + if (expr__parse(&ratio, &pctx, metric_expr, runtime) == 0) { char *unit; char metric_bf[64]; @@ -786,9 +787,13 @@ static void generic_metric(struct perf_stat_config *config, &unit, &scale) >= 0) { ratio *= scale; } - - scnprintf(metric_bf, sizeof(metric_bf), + if (strstr(metric_expr, "?")) + scnprintf(metric_bf, sizeof(metric_bf), + "%s %s_%d", unit, metric_name, runtime); + else + scnprintf(metric_bf, sizeof(metric_bf), "%s %s", unit, metric_name); + print_metric(config, ctxp, NULL, "%8.1f", metric_bf, ratio); } else { @@ -828,7 +833,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, struct metric_event *me; int num = 1; - if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { + if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); if (total) { @@ -853,7 +858,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, "stalled cycles per insn", ratio); } - } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { + } else if (evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0) print_branch_misses(config, cpu, evsel, avg, out, st); else @@ -908,7 +913,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, 
print_ll_cache_misses(config, cpu, evsel, avg, out, st); else print_metric(config, ctxp, NULL, NULL, "of all LL-cache hits", 0); - } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { + } else if (evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu); if (total) @@ -919,11 +924,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, "of all cache refs", ratio); else print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { + } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { + } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { print_stalled_cycles_backend(config, cpu, evsel, avg, out, st); - } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { + } else if (evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = runtime_stat_avg(st, STAT_NSECS, 0, cpu); if (total) { @@ -974,7 +979,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ratio = total / avg; print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio); - } else if (perf_evsel__is_clock(evsel)) { + } else if (evsel__is_clock(evsel)) { if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0) print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized", avg / (ratio * evsel->scale)); @@ -1022,7 +1027,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, name, 0); } else if (evsel->metric_expr) { generic_metric(config, evsel->metric_expr, evsel->metric_events, evsel->name, - evsel->metric_name, NULL, avg, cpu, out, st); + evsel->metric_name, NULL, 1, avg, cpu, out, st); } else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) { char unit = 'M'; char unit_buf[10]; @@ -1051,7 +1056,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, out->new_line(config, ctxp); generic_metric(config, mexp->metric_expr, mexp->metric_events, evsel->name, mexp->metric_name, - mexp->metric_unit, avg, cpu, out, st); + mexp->metric_unit, mexp->runtime, avg, cpu, out, st); } } if (num == 0) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 5f26137b8d60..774468341851 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -108,7 +108,7 @@ static void perf_stat_evsel_id_init(struct evsel *evsel) /* ps->id is 0 hence PERF_STAT_EVSEL_ID__NONE by default */ for (i = 0; i < PERF_STAT_EVSEL_ID__MAX; i++) { - if (!strcmp(perf_evsel__name(evsel), id_str[i])) { + if (!strcmp(evsel__name(evsel), id_str[i])) { ps->id = i; break; } @@ -173,7 +173,7 @@ static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel) static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw) { - int ncpus = perf_evsel__nr_cpus(evsel); + int ncpus = evsel__nr_cpus(evsel); int nthreads = perf_thread_map__nr(evsel->core.threads); if (perf_evsel__alloc_stat_priv(evsel) < 0 || @@ -302,7 +302,7 @@ process_counter_values(struct perf_stat_config *config, struct evsel *evsel, case AGGR_NODE: case AGGR_NONE: if (!evsel->snapshot) - perf_evsel__compute_deltas(evsel, cpu, thread, count); + evsel__compute_deltas(evsel, cpu, thread, count); perf_counts_values__scale(count, config->scale, NULL); if ((config->aggr_mode == AGGR_NONE) && (!evsel->percore)) { perf_stat__update_shadow_stats(evsel, count->val, @@ 
-334,7 +334,7 @@ static int process_counter_maps(struct perf_stat_config *config, struct evsel *counter) { int nthreads = perf_thread_map__nr(counter->core.threads); - int ncpus = perf_evsel__nr_cpus(counter); + int ncpus = evsel__nr_cpus(counter); int cpu, thread; if (counter->core.system_wide) @@ -368,8 +368,10 @@ int perf_stat_process_counter(struct perf_stat_config *config, * interval mode, otherwise overall avg running * averages will be shown for each interval. */ - if (config->interval) - init_stats(ps->res_stats); + if (config->interval) { + for (i = 0; i < 3; i++) + init_stats(&ps->res_stats[i]); + } if (counter->per_pkg) zero_per_pkg(counter); @@ -382,7 +384,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, return 0; if (!counter->snapshot) - perf_evsel__compute_deltas(counter, -1, -1, aggr); + evsel__compute_deltas(counter, -1, -1, aggr); perf_counts_values__scale(aggr, config->scale, &counter->counts->scaled); for (i = 0; i < 3; i++) @@ -390,7 +392,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, if (verbose > 0) { fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - perf_evsel__name(counter), count[0], count[1], count[2]); + evsel__name(counter), count[0], count[1], count[2]); } /* @@ -507,7 +509,7 @@ int create_perf_stat_counter(struct evsel *evsel, * either manually by us or by kernel via enable_on_exec * set later. */ - if (perf_evsel__is_group_leader(evsel)) { + if (evsel__is_group_leader(evsel)) { attr->disabled = 1; /* @@ -519,7 +521,7 @@ int create_perf_stat_counter(struct evsel *evsel, } if (target__has_cpu(target) && !target__has_per_thread(target)) - return perf_evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu); + return evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu); - return perf_evsel__open_per_thread(evsel, evsel->core.threads); + return evsel__open_per_thread(evsel, evsel->core.threads); } diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 26bc6a0096ce..381da6b39f89 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -566,6 +566,20 @@ void dso__sort_by_name(struct dso *dso) return symbols__sort_by_name(&dso->symbol_names, &dso->symbols); } +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. 
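+ *
+ * For example, hex2u64("3fff f000", &v) sets v to 0x3fff and returns 4.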
+ */ +static int hex2u64(const char *ptr, u64 *long_val) +{ + char *p; + + *long_val = strtoull(ptr, &p, 16); + + return p - ptr; +} + + int modules__parse(const char *filename, void *arg, int (*process_module)(void *arg, const char *name, u64 start, u64 size)) @@ -1544,6 +1558,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, return true; case DSO_BINARY_TYPE__BPF_PROG_INFO: + case DSO_BINARY_TYPE__BPF_IMAGE: case DSO_BINARY_TYPE__NOT_FOUND: default: return false; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index a661b122d9d8..89b390623b63 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -37,6 +37,7 @@ #include <string.h> #include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */ #include <api/fs/fs.h> +#include <api/io.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> @@ -71,7 +72,6 @@ int perf_tool__process_synth_event(struct perf_tool *tool, static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len, pid_t *tgid, pid_t *ppid) { - char filename[PATH_MAX]; char bf[4096]; int fd; size_t size = 0; @@ -81,11 +81,11 @@ static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len, *tgid = -1; *ppid = -1; - snprintf(filename, sizeof(filename), "/proc/%d/status", pid); + snprintf(bf, sizeof(bf), "/proc/%d/status", pid); - fd = open(filename, O_RDONLY); + fd = open(bf, O_RDONLY); if (fd < 0) { - pr_debug("couldn't open %s\n", filename); + pr_debug("couldn't open %s\n", bf); return -1; } @@ -274,6 +274,79 @@ static int perf_event__synthesize_fork(struct perf_tool *tool, return 0; } +static bool read_proc_maps_line(struct io *io, __u64 *start, __u64 *end, + u32 *prot, u32 *flags, __u64 *offset, + u32 *maj, u32 *min, + __u64 *inode, + ssize_t pathname_size, char *pathname) +{ + __u64 temp; + int ch; + char *start_pathname = pathname; + + if (io__get_hex(io, start) != '-') + return false; + if (io__get_hex(io, end) != ' ') + return false; + + /* map protection and flags bits */ + *prot = 0; + ch = io__get_char(io); + if (ch == 'r') + *prot |= PROT_READ; + else if (ch != '-') + return false; + ch = io__get_char(io); + if (ch == 'w') + *prot |= PROT_WRITE; + else if (ch != '-') + return false; + ch = io__get_char(io); + if (ch == 'x') + *prot |= PROT_EXEC; + else if (ch != '-') + return false; + ch = io__get_char(io); + if (ch == 's') + *flags = MAP_SHARED; + else if (ch == 'p') + *flags = MAP_PRIVATE; + else + return false; + if (io__get_char(io) != ' ') + return false; + + if (io__get_hex(io, offset) != ' ') + return false; + + if (io__get_hex(io, &temp) != ':') + return false; + *maj = temp; + if (io__get_hex(io, &temp) != ' ') + return false; + *min = temp; + + ch = io__get_dec(io, inode); + if (ch != ' ') { + *pathname = '\0'; + return ch == '\n'; + } + do { + ch = io__get_char(io); + } while (ch == ' '); + while (true) { + if (ch < 0) + return false; + if (ch == '\0' || ch == '\n' || + (pathname + 1 - start_pathname) >= pathname_size) { + *pathname = '\0'; + return true; + } + *pathname++ = ch; + ch = io__get_char(io); + } +} + int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, @@ -281,9 +354,9 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, struct machine *machine, bool mmap_data) { - char filename[PATH_MAX]; - FILE *fp; unsigned long long t; + char bf[BUFSIZ]; + struct io io; bool truncation = false; unsigned long long timeout = 
proc_map_timeout * 1000000ULL; int rc = 0; @@ -293,59 +366,52 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, if (machine__is_default_guest(machine)) return 0; - snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps", - machine->root_dir, pid, pid); + snprintf(bf, sizeof(bf), "%s/proc/%d/task/%d/maps", + machine->root_dir, pid, pid); - fp = fopen(filename, "r"); - if (fp == NULL) { + io.fd = open(bf, O_RDONLY, 0); + if (io.fd < 0) { /* * We raced with a task exiting - just return: */ - pr_debug("couldn't open %s\n", filename); + pr_debug("couldn't open %s\n", bf); return -1; } + io__init(&io, io.fd, bf, sizeof(bf)); event->header.type = PERF_RECORD_MMAP2; t = rdclock(); - while (1) { - char bf[BUFSIZ]; - char prot[5]; - char execname[PATH_MAX]; - char anonstr[] = "//anon"; - unsigned int ino; + while (!io.eof) { + static const char anonstr[] = "//anon"; size_t size; - ssize_t n; - if (fgets(bf, sizeof(bf), fp) == NULL) - break; + /* ensure null termination since stack will be reused. */ + event->mmap2.filename[0] = '\0'; + + /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ + if (!read_proc_maps_line(&io, + &event->mmap2.start, + &event->mmap2.len, + &event->mmap2.prot, + &event->mmap2.flags, + &event->mmap2.pgoff, + &event->mmap2.maj, + &event->mmap2.min, + &event->mmap2.ino, + sizeof(event->mmap2.filename), + event->mmap2.filename)) + continue; if ((rdclock() - t) > timeout) { - pr_warning("Reading %s time out. " + pr_warning("Reading %s/proc/%d/task/%d/maps time out. " "You may want to increase " "the time limit by --proc-map-timeout\n", - filename); + machine->root_dir, pid, pid); truncation = true; goto out; } - /* ensure null termination since stack will be reused. */ - strcpy(execname, ""); - - /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRI_lx64"-%"PRI_lx64" %s %"PRI_lx64" %x:%x %u %[^\n]\n", - &event->mmap2.start, &event->mmap2.len, prot, - &event->mmap2.pgoff, &event->mmap2.maj, - &event->mmap2.min, - &ino, execname); - - /* - * Anon maps don't have the execname. 
- */ - if (n < 7) - continue; - - event->mmap2.ino = (u64)ino; event->mmap2.ino_generation = 0; /* @@ -356,23 +422,8 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, else event->header.misc = PERF_RECORD_MISC_GUEST_USER; - /* map protection and flags bits */ - event->mmap2.prot = 0; - event->mmap2.flags = 0; - if (prot[0] == 'r') - event->mmap2.prot |= PROT_READ; - if (prot[1] == 'w') - event->mmap2.prot |= PROT_WRITE; - if (prot[2] == 'x') - event->mmap2.prot |= PROT_EXEC; - - if (prot[3] == 's') - event->mmap2.flags |= MAP_SHARED; - else - event->mmap2.flags |= MAP_PRIVATE; - - if (prot[2] != 'x') { - if (!mmap_data || prot[0] != 'r') + if ((event->mmap2.prot & PROT_EXEC) == 0) { + if (!mmap_data || (event->mmap2.prot & PROT_READ) == 0) continue; event->header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -382,17 +433,17 @@ out: if (truncation) event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT; - if (!strcmp(execname, "")) - strcpy(execname, anonstr); + if (!strcmp(event->mmap2.filename, "")) + strcpy(event->mmap2.filename, anonstr); if (hugetlbfs_mnt_len && - !strncmp(execname, hugetlbfs_mnt, hugetlbfs_mnt_len)) { - strcpy(execname, anonstr); + !strncmp(event->mmap2.filename, hugetlbfs_mnt, + hugetlbfs_mnt_len)) { + strcpy(event->mmap2.filename, anonstr); event->mmap2.flags |= MAP_HUGETLB; } - size = strlen(execname) + 1; - memcpy(event->mmap2.filename, execname, size); + size = strlen(event->mmap2.filename) + 1; size = PERF_ALIGN(size, sizeof(u64)); event->mmap2.len -= event->mmap.start; event->mmap2.header.size = (sizeof(event->mmap2) - @@ -411,7 +462,7 @@ out: break; } - fclose(fp); + close(io.fd); return rc; } @@ -1130,7 +1181,7 @@ void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf synthesize_mask((struct perf_record_record_cpu_map *)data->data, map, max); default: break; - }; + } } static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map) diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 0885967d5bc3..1b992bbba4e8 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -80,6 +80,10 @@ struct thread_stack_entry { * @comm: current comm * @arr_sz: size of array if this is the first element of an array * @rstate: used to detect retpolines + * @br_stack_rb: branch stack (ring buffer) + * @br_stack_sz: maximum branch stack size + * @br_stack_pos: current position in @br_stack_rb + * @mispred_all: mark all branches as mispredicted */ struct thread_stack { struct thread_stack_entry *stack; @@ -95,6 +99,10 @@ struct thread_stack { struct comm *comm; unsigned int arr_sz; enum retpoline_state_t rstate; + struct branch_stack *br_stack_rb; + unsigned int br_stack_sz; + unsigned int br_stack_pos; + bool mispred_all; }; /* @@ -126,13 +134,26 @@ static int thread_stack__grow(struct thread_stack *ts) } static int thread_stack__init(struct thread_stack *ts, struct thread *thread, - struct call_return_processor *crp) + struct call_return_processor *crp, + bool callstack, unsigned int br_stack_sz) { int err; - err = thread_stack__grow(ts); - if (err) - return err; + if (callstack) { + err = thread_stack__grow(ts); + if (err) + return err; + } + + if (br_stack_sz) { + size_t sz = sizeof(struct branch_stack); + + sz += br_stack_sz * sizeof(struct branch_entry); + ts->br_stack_rb = zalloc(sz); + if (!ts->br_stack_rb) + return -ENOMEM; + ts->br_stack_sz = br_stack_sz; + } if (thread->maps && thread->maps->machine) { struct machine *machine = thread->maps->machine; @@ 
-150,7 +171,9 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread, } static struct thread_stack *thread_stack__new(struct thread *thread, int cpu, - struct call_return_processor *crp) + struct call_return_processor *crp, + bool callstack, + unsigned int br_stack_sz) { struct thread_stack *ts = thread->ts, *new_ts; unsigned int old_sz = ts ? ts->arr_sz : 0; @@ -176,7 +199,7 @@ static struct thread_stack *thread_stack__new(struct thread *thread, int cpu, ts += cpu; if (!ts->stack && - thread_stack__init(ts, thread, crp)) + thread_stack__init(ts, thread, crp, callstack, br_stack_sz)) return NULL; return ts; @@ -319,6 +342,9 @@ static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts) if (!crp) { ts->cnt = 0; + ts->br_stack_pos = 0; + if (ts->br_stack_rb) + ts->br_stack_rb->nr = 0; return 0; } @@ -353,8 +379,33 @@ int thread_stack__flush(struct thread *thread) return err; } +static void thread_stack__update_br_stack(struct thread_stack *ts, u32 flags, + u64 from_ip, u64 to_ip) +{ + struct branch_stack *bs = ts->br_stack_rb; + struct branch_entry *be; + + if (!ts->br_stack_pos) + ts->br_stack_pos = ts->br_stack_sz; + + ts->br_stack_pos -= 1; + + be = &bs->entries[ts->br_stack_pos]; + be->from = from_ip; + be->to = to_ip; + be->flags.value = 0; + be->flags.abort = !!(flags & PERF_IP_FLAG_TX_ABORT); + be->flags.in_tx = !!(flags & PERF_IP_FLAG_IN_TX); + /* No support for mispredict */ + be->flags.mispred = ts->mispred_all; + + if (bs->nr < ts->br_stack_sz) + bs->nr += 1; +} + int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip, - u64 to_ip, u16 insn_len, u64 trace_nr) + u64 to_ip, u16 insn_len, u64 trace_nr, bool callstack, + unsigned int br_stack_sz, bool mispred_all) { struct thread_stack *ts = thread__stack(thread, cpu); @@ -362,12 +413,13 @@ int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip, return -EINVAL; if (!ts) { - ts = thread_stack__new(thread, cpu, NULL); + ts = thread_stack__new(thread, cpu, NULL, callstack, br_stack_sz); if (!ts) { pr_warning("Out of memory: no thread stack\n"); return -ENOMEM; } ts->trace_nr = trace_nr; + ts->mispred_all = mispred_all; } /* @@ -381,8 +433,14 @@ int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip, ts->trace_nr = trace_nr; } - /* Stop here if thread_stack__process() is in use */ - if (ts->crp) + if (br_stack_sz) + thread_stack__update_br_stack(ts, flags, from_ip, to_ip); + + /* + * Stop here if thread_stack__process() is in use, or not recording call + * stack. + */ + if (ts->crp || !callstack) return 0; if (flags & PERF_IP_FLAG_CALL) { @@ -430,6 +488,7 @@ static void __thread_stack__free(struct thread *thread, struct thread_stack *ts) { __thread_stack__flush(thread, ts); zfree(&ts->stack); + zfree(&ts->br_stack_rb); } static void thread_stack__reset(struct thread *thread, struct thread_stack *ts) @@ -497,6 +556,199 @@ void thread_stack__sample(struct thread *thread, int cpu, chain->nr = i; } +/* + * Hardware sample records, created some time after the event occurred, need to + * have subsequent addresses removed from the call chain. 
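+ *
+ * "Late" means the sample is synthesized at decode time, so the recorded
+ * stack can already include kernel-side entries from after the event; the
+ * loop below skips them before copying the chain.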
@@ -497,6 +556,199 @@ void thread_stack__sample(struct thread *thread, int cpu,
 	chain->nr = i;
 }
 
+/*
+ * Hardware sample records, created some time after the event occurred, need to
+ * have subsequent addresses removed from the call chain.
+ */
+void thread_stack__sample_late(struct thread *thread, int cpu,
+			       struct ip_callchain *chain, size_t sz,
+			       u64 sample_ip, u64 kernel_start)
+{
+	struct thread_stack *ts = thread__stack(thread, cpu);
+	u64 sample_context = callchain_context(sample_ip, kernel_start);
+	u64 last_context, context, ip;
+	size_t nr = 0, j;
+
+	if (sz < 2) {
+		chain->nr = 0;
+		return;
+	}
+
+	if (!ts)
+		goto out;
+
+	/*
+	 * When tracing kernel space, kernel addresses occur at the top of the
+	 * call chain after the event occurred but before tracing stopped.
+	 * Skip them.
+	 */
+	for (j = 1; j <= ts->cnt; j++) {
+		ip = ts->stack[ts->cnt - j].ret_addr;
+		context = callchain_context(ip, kernel_start);
+		if (context == PERF_CONTEXT_USER ||
+		    (context == sample_context && ip == sample_ip))
+			break;
+	}
+
+	last_context = sample_ip; /* Use sample_ip as an invalid context */
+
+	for (; nr < sz && j <= ts->cnt; nr++, j++) {
+		ip = ts->stack[ts->cnt - j].ret_addr;
+		context = callchain_context(ip, kernel_start);
+		if (context != last_context) {
+			if (nr >= sz - 1)
+				break;
+			chain->ips[nr++] = context;
+			last_context = context;
+		}
+		chain->ips[nr] = ip;
+	}
+out:
+	if (nr) {
+		chain->nr = nr;
+	} else {
+		chain->ips[0] = sample_context;
+		chain->ips[1] = sample_ip;
+		chain->nr = 2;
+	}
+}
+
+void thread_stack__br_sample(struct thread *thread, int cpu,
+			     struct branch_stack *dst, unsigned int sz)
+{
+	struct thread_stack *ts = thread__stack(thread, cpu);
+	const size_t bsz = sizeof(struct branch_entry);
+	struct branch_stack *src;
+	struct branch_entry *be;
+	unsigned int nr;
+
+	dst->nr = 0;
+
+	if (!ts)
+		return;
+
+	src = ts->br_stack_rb;
+	if (!src->nr)
+		return;
+
+	dst->nr = min((unsigned int)src->nr, sz);
+
+	be = &dst->entries[0];
+	nr = min(ts->br_stack_sz - ts->br_stack_pos, (unsigned int)dst->nr);
+	memcpy(be, &src->entries[ts->br_stack_pos], bsz * nr);
+
+	if (src->nr >= ts->br_stack_sz) {
+		sz -= nr;
+		be = &dst->entries[nr];
+		nr = min(ts->br_stack_pos, sz);
+		memcpy(be, &src->entries[0], bsz * ts->br_stack_pos);
+	}
+}
+
+/* Start of user space branch entries */
+static bool us_start(struct branch_entry *be, u64 kernel_start, bool *start)
+{
+	if (!*start)
+		*start = be->to && be->to < kernel_start;
+
+	return *start;
+}
+
+/*
+ * Start of branch entries after the ip fell in between 2 branches, or user
+ * space branch entries.
+ */
+static bool ks_start(struct branch_entry *be, u64 sample_ip, u64 kernel_start,
+		     bool *start, struct branch_entry *nb)
+{
+	if (!*start) {
+		*start = (nb && sample_ip >= be->to && sample_ip <= nb->from) ||
+			 be->from < kernel_start ||
+			 (be->to && be->to < kernel_start);
+	}
+
+	return *start;
+}
+
+/*
+ * Hardware sample records, created some time after the event occurred, need to
+ * have subsequent addresses removed from the branch stack.
+ */
+void thread_stack__br_sample_late(struct thread *thread, int cpu,
+				  struct branch_stack *dst, unsigned int sz,
+				  u64 ip, u64 kernel_start)
+{
+	struct thread_stack *ts = thread__stack(thread, cpu);
+	struct branch_entry *d, *s, *spos, *ssz;
+	struct branch_stack *src;
+	unsigned int nr = 0;
+	bool start = false;
+
+	dst->nr = 0;
+
+	if (!ts)
+		return;
+
+	src = ts->br_stack_rb;
+	if (!src->nr)
+		return;
+
+	spos = &src->entries[ts->br_stack_pos];
+	ssz = &src->entries[ts->br_stack_sz];
+
+	d = &dst->entries[0];
+	s = spos;
+
+	if (ip < kernel_start) {
+		/*
+		 * User space sample: start copying branch entries when the
+		 * branch is in user space.
+		 */
+		for (s = spos; s < ssz && nr < sz; s++) {
+			if (us_start(s, kernel_start, &start)) {
+				*d++ = *s;
+				nr += 1;
+			}
+		}
+
+		if (src->nr >= ts->br_stack_sz) {
+			for (s = &src->entries[0]; s < spos && nr < sz; s++) {
+				if (us_start(s, kernel_start, &start)) {
+					*d++ = *s;
+					nr += 1;
+				}
+			}
+		}
+	} else {
+		struct branch_entry *nb = NULL;
+
+		/*
+		 * Kernel space sample: start copying branch entries when the ip
+		 * falls in between 2 branches (or the branch is in user space
+		 * because then the start must have been missed).
+		 */
+		for (s = spos; s < ssz && nr < sz; s++) {
+			if (ks_start(s, ip, kernel_start, &start, nb)) {
+				*d++ = *s;
+				nr += 1;
+			}
+			nb = s;
+		}
+
+		if (src->nr >= ts->br_stack_sz) {
+			for (s = &src->entries[0]; s < spos && nr < sz; s++) {
+				if (ks_start(s, ip, kernel_start, &start, nb)) {
+					*d++ = *s;
+					nr += 1;
+				}
+				nb = s;
+			}
+		}
+	}
+
+	dst->nr = nr;
+}
+
 struct call_return_processor *
 call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
 			   void *data)
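
thread_stack__br_sample() drains the ring with at most two memcpy() calls: first the run from the newest entry to the end of the array, then, if the buffer has wrapped, the older run that resumes at index 0. The equivalent copy-out for the ring_push() sketch shown earlier:

    #include <string.h>

    /* Copy out newest-first; pairs with the earlier ring_push() sketch. */
    static unsigned int ring_copy(const struct ring *r,
                                  unsigned long long *dst, unsigned int sz)
    {
            unsigned int n = r->nr < sz ? r->nr : sz;
            unsigned int run = r->sz - r->pos;  /* entries before the wrap */

            if (run > n)
                    run = n;
            memcpy(dst, &r->entry[r->pos], run * sizeof(*dst));
            if (n > run)    /* wrapped: older entries resume at index 0 */
                    memcpy(dst + run, &r->entry[0], (n - run) * sizeof(*dst));
            return n;
    }
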
@@ -864,7 +1116,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
 	}
 
 	if (!ts) {
-		ts = thread_stack__new(thread, sample->cpu, crp);
+		ts = thread_stack__new(thread, sample->cpu, crp, true, 0);
 		if (!ts)
 			return -ENOMEM;
 		ts->comm = comm;
diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h
index e1ec5a58f1b2..3bc47a42af8e 100644
--- a/tools/perf/util/thread-stack.h
+++ b/tools/perf/util/thread-stack.h
@@ -81,10 +81,19 @@ struct call_return_processor {
 };
 
 int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
-			u64 to_ip, u16 insn_len, u64 trace_nr);
+			u64 to_ip, u16 insn_len, u64 trace_nr, bool callstack,
+			unsigned int br_stack_sz, bool mispred_all);
 void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr);
 void thread_stack__sample(struct thread *thread, int cpu, struct ip_callchain *chain,
 			  size_t sz, u64 ip, u64 kernel_start);
+void thread_stack__sample_late(struct thread *thread, int cpu,
+			       struct ip_callchain *chain, size_t sz, u64 ip,
+			       u64 kernel_start);
+void thread_stack__br_sample(struct thread *thread, int cpu,
+			     struct branch_stack *dst, unsigned int sz);
+void thread_stack__br_sample_late(struct thread *thread, int cpu,
+				  struct branch_stack *dst, unsigned int sz,
+				  u64 sample_ip, u64 kernel_start);
 int thread_stack__flush(struct thread *thread);
 void thread_stack__free(struct thread *thread);
 size_t thread_stack__depth(struct thread *thread, int cpu);
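
With the widened prototype above, each caller decides per trace whether to maintain the call stack, the branch ring buffer, or both. A hypothetical caller, purely for illustration (the names and the size 64 are invented):

    static int feed_branch(struct thread *thread, int cpu, u32 flags,
                           u64 from_ip, u64 to_ip, u64 trace_nr)
    {
            bool callstack = true;          /* also track calls/returns */
            unsigned int br_stack_sz = 64;  /* keep the last 64 branches */
            bool mispred_all = false;       /* no misprediction data */

            return thread_stack__event(thread, cpu, flags, from_ip, to_ip,
                                       0 /* insn_len */, trace_nr,
                                       callstack, br_stack_sz, mispred_all);
    }
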
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 28b719388028..665e5c0618ed 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -47,6 +47,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
 		thread->tid = tid;
 		thread->ppid = -1;
 		thread->cpu = -1;
+		thread->lbr_stitch_enable = false;
 		INIT_LIST_HEAD(&thread->namespaces_list);
 		INIT_LIST_HEAD(&thread->comm_list);
 		init_rwsem(&thread->namespaces_lock);
@@ -110,6 +111,7 @@ void thread__delete(struct thread *thread)
 	exit_rwsem(&thread->namespaces_lock);
 	exit_rwsem(&thread->comm_lock);
 
+	thread__free_stitch_list(thread);
 	free(thread);
 }
 
@@ -452,3 +454,25 @@ int thread__memcpy(struct thread *thread, struct machine *machine,
 
 	return dso__data_read_offset(al.map->dso, machine, offset, buf, len);
 }
+
+void thread__free_stitch_list(struct thread *thread)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	struct stitch_list *pos, *tmp;
+
+	if (!lbr_stitch)
+		return;
+
+	list_for_each_entry_safe(pos, tmp, &lbr_stitch->lists, node) {
+		list_del_init(&pos->node);
+		free(pos);
+	}
+
+	list_for_each_entry_safe(pos, tmp, &lbr_stitch->free_lists, node) {
+		list_del_init(&pos->node);
+		free(pos);
+	}
+
+	zfree(&lbr_stitch->prev_lbr_cursor);
+	zfree(&thread->lbr_stitch);
+}
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 20b96b5d1f15..b066fb30d203 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -13,6 +13,8 @@
 #include <strlist.h>
 #include <intlist.h>
 #include "rwsem.h"
+#include "event.h"
+#include "callchain.h"
 
 struct addr_location;
 struct map;
@@ -20,6 +22,13 @@ struct perf_record_namespaces;
 struct thread_stack;
 struct unwind_libunwind_ops;
 
+struct lbr_stitch {
+	struct list_head		lists;
+	struct list_head		free_lists;
+	struct perf_sample		prev_sample;
+	struct callchain_cursor_node	*prev_lbr_cursor;
+};
+
 struct thread {
 	union {
 		struct rb_node	 rb_node;
@@ -46,6 +55,10 @@ struct thread {
 	struct srccode_state	srccode_state;
 	bool			filter;
 	int			filter_entry_depth;
+
+	/* LBR call stack stitch */
+	bool			lbr_stitch_enable;
+	struct lbr_stitch	*lbr_stitch;
 };
 
 struct machine;
@@ -142,4 +155,6 @@ static inline bool thread__is_filtered(struct thread *thread)
 	return false;
 }
 
+void thread__free_stitch_list(struct thread *thread);
+
 #endif /* __PERF_THREAD_H */
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 3dce2de9d005..27945eeb0cb5 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -77,7 +77,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 				opts->freq ? "Hz" : "");
 	}
 
-	ret += SNPRINTF(bf + ret, size - ret, "%s", perf_evsel__name(top->sym_evsel));
+	ret += SNPRINTF(bf + ret, size - ret, "%s", evsel__name(top->sym_evsel));
 
 	ret += SNPRINTF(bf + ret, size - ret, "], ");
 
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index f117d4f4821e..ff8391208ecd 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -18,7 +18,7 @@ struct perf_session;
 
 struct perf_top {
 	struct perf_tool   tool;
-	struct evlist *evlist;
+	struct evlist *evlist, *sb_evlist;
 	struct record_opts record_opts;
 	struct annotation_options annotation_opts;
 	struct evswitch	   evswitch;
@@ -36,6 +36,7 @@ struct perf_top {
 	bool		   use_tui, use_stdio;
 	bool		   vmlinux_warned;
 	bool		   dump_symtab;
+	bool		   stitch_lbr;
 	struct hist_entry  *sym_filter_entry;
 	struct evsel	   *sym_evsel;
 	struct perf_session *session;
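
thread__free_stitch_list() walks both lists with list_for_each_entry_safe() because every node is freed mid-iteration. The free_lists member implies a recycling scheme: nodes from a finished stitch sequence are parked there rather than freed, so the next sequence can reuse them. A sketch of that idea, assuming (the allocation side is not part of these hunks) a helper along these lines:

    /* Hypothetical helper: reuse a parked node, else allocate one. */
    static struct stitch_list *stitch_node_get(struct lbr_stitch *lbr_stitch)
    {
            struct stitch_list *node;

            if (!list_empty(&lbr_stitch->free_lists)) {
                    node = list_first_entry(&lbr_stitch->free_lists,
                                            struct stitch_list, node);
                    list_del_init(&node->node);
                    return node;
            }
            return zalloc(sizeof(*node));   /* fall back to allocating */
    }
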
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 8593d3c200c6..f507dff713c9 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -75,7 +75,7 @@ static void skip(int size)
 		r = size > BUFSIZ ? BUFSIZ : size;
 		do_read(buf, r);
 		size -= r;
-	};
+	}
 }
 
 static unsigned int read4(struct tep_handle *pevent)
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index d707c9624dd9..37a9492edb3e 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -290,6 +290,7 @@ int perf_event_paranoid(void)
 bool perf_event_paranoid_check(int max_level)
 {
 	return perf_cap__capable(CAP_SYS_ADMIN) ||
+		perf_cap__capable(CAP_PERFMON) ||
 		perf_event_paranoid() <= max_level;
 }
 
diff --git a/tools/power/pm-graph/config/custom-timeline-functions.cfg b/tools/power/pm-graph/config/custom-timeline-functions.cfg
index 4f80ad7d7275..962e5768681c 100644
--- a/tools/power/pm-graph/config/custom-timeline-functions.cfg
+++ b/tools/power/pm-graph/config/custom-timeline-functions.cfg
@@ -125,7 +125,7 @@ acpi_suspend_begin:
 suspend_console:
 acpi_pm_prepare:
 syscore_suspend:
-arch_enable_nonboot_cpus_end:
+arch_thaw_secondary_cpus_end:
 syscore_resume:
 acpi_pm_finish:
 resume_console:
diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py
index 9b0404d10768..602e64b68ba7 100755
--- a/tools/power/pm-graph/sleepgraph.py
+++ b/tools/power/pm-graph/sleepgraph.py
@@ -198,7 +198,7 @@ class SystemValues:
 		'suspend_console': {},
 		'acpi_pm_prepare': {},
 		'syscore_suspend': {},
-		'arch_enable_nonboot_cpus_end': {},
+		'arch_thaw_secondary_cpus_end': {},
 		'syscore_resume': {},
 		'acpi_pm_finish': {},
 		'resume_console': {},
diff --git a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
new file mode 100755
index 000000000000..e5cc6b2f195e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# If this was a KCSAN run, collapse the reports in the various console.log
+# files onto pairs of functions.
+#
+# Usage: kcsan-collapse.sh resultsdir
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+if test -z "$TORTURE_KCONFIG_KCSAN_ARG"
+then
+	exit 0
+fi
+cat $1/*/console.log |
+	grep "BUG: KCSAN: " |
+	sed -e 's/^\[[^]]*] //' |
+	sort |
+	uniq -c |
+	sort -k1nr > $1/kcsan.sum
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
index 9d9a41625dd9..1706cd4466b4 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -41,7 +41,21 @@ else
 		title="$title ($ngpsps/s)"
 	fi
 	echo $title $stopstate $fwdprog
-	nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'`
+	nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | \
+		awk -v sum=0 '
+		{
+			for (i = 0; i <= NF; i++) {
+				sum += $i;
+				if ($i ~ /Batch:/) {
+					sum = 0;
+					i = i + 2;
+				}
+			}
+		}
+
+		END {
+			print sum
+		}'`
 	if test -z "$nclosecalls"
 	then
 		exit 0
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 0326f4a5ff9c..736f04749b90 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -70,6 +70,15 @@ do
 			fi
 		fi
 	done
+	if test -f "$rd/kcsan.sum"
+	then
+		if test -s "$rd/kcsan.sum"
+		then
+			echo KCSAN summary in $rd/kcsan.sum
+		else
+			echo Clean KCSAN run in $rd
+		fi
+	fi
 done
 EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1
 ret=$?
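
The kvm-recheck-rcu.sh rewrite above replaces a brittle fixed window over the last nine fields with a scan of the whole line: the running sum is reset at the "Batch:" token and the two fields right after it are skipped, so only the close-call counts that follow are totaled no matter where the line starts. A quick illustration with a made-up line (not real rcutorture output):

    # Prints 9: everything up to "Batch:" is discarded and the two
    # fields after it ("X Y") are skipped, leaving 2+3+4.
    echo 'noise 7 8 Batch: X Y 2 3 4' | awk -v sum=0 '
    {
            for (i = 0; i <= NF; i++) {
                    sum += $i;
                    if ($i ~ /Batch:/) {
                            sum = 0;
                            i = i + 2;
                    }
            }
    }
    END { print sum }'
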
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index e0352304b98b..6ff611c630d1 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -44,30 +44,32 @@ then
 fi
 echo ' ---' `date`: Starting build
 echo ' ---' Kconfig fragment at: $config_template >> $resdir/log
-touch $resdir/ConfigFragment.input $resdir/ConfigFragment
-if test -r "$config_dir/CFcommon"
-then
-	echo " --- $config_dir/CFcommon" >> $resdir/ConfigFragment.input
-	cat < $config_dir/CFcommon >> $resdir/ConfigFragment.input
-	config_override.sh $config_dir/CFcommon $config_template > $T/Kc1
-	grep '#CHECK#' $config_dir/CFcommon >> $resdir/ConfigFragment
-else
-	cp $config_template $T/Kc1
-fi
-echo " --- $config_template" >> $resdir/ConfigFragment.input
-cat $config_template >> $resdir/ConfigFragment.input
-grep '#CHECK#' $config_template >> $resdir/ConfigFragment
-if test -n "$TORTURE_KCONFIG_ARG"
-then
-	echo $TORTURE_KCONFIG_ARG | tr -s " " "\012" > $T/cmdline
-	echo " --- --kconfig argument" >> $resdir/ConfigFragment.input
-	cat $T/cmdline >> $resdir/ConfigFragment.input
-	config_override.sh $T/Kc1 $T/cmdline > $T/Kc2
-	# Note that "#CHECK#" is not permitted on commandline.
-else
-	cp $T/Kc1 $T/Kc2
-fi
-cat $T/Kc2 >> $resdir/ConfigFragment
+touch $resdir/ConfigFragment.input
+
+# Combine additional Kconfig options into an existing set such that
+# newer options win.  The first argument is the Kconfig source ID, the
+# second the to-be-updated file within $T, and the third and final the
+# list of additional Kconfig options.  Note that a $2.tmp file is
+# created when doing the update.
+config_override_param () {
+	if test -n "$3"
+	then
+		echo $3 | sed -e 's/^ *//' -e 's/ *$//' | tr -s " " "\012" > $T/Kconfig_args
+		echo " --- $1" >> $resdir/ConfigFragment.input
+		cat $T/Kconfig_args >> $resdir/ConfigFragment.input
+		config_override.sh $T/$2 $T/Kconfig_args > $T/$2.tmp
+		mv $T/$2.tmp $T/$2
+		# Note that "#CHECK#" is not permitted on commandline.
+	fi
+}
+
+echo > $T/KcList
+config_override_param "$config_dir/CFcommon" KcList "`cat $config_dir/CFcommon 2> /dev/null`"
+config_override_param "$config_template" KcList "`cat $config_template 2> /dev/null`"
+config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG"
+config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG"
+config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG"
+cp $T/KcList $resdir/ConfigFragment
 
 base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'`
 if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux
@@ -80,7 +82,7 @@ then
 	ln -s $base_resdir/.config $resdir  # for kvm-recheck.sh
 	# Arch-independent indicator
 	touch $resdir/builtkernel
-elif kvm-build.sh $T/Kc2 $resdir
+elif kvm-build.sh $T/KcList $resdir
 then
 	# Had to build a kernel for this test.
 	QEMU="`identify_qemu vmlinux`"
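
config_override_param() folds each Kconfig source into $T/KcList in turn, so later sources override earlier ones. A hypothetical call sequence (assuming $T and $resdir are set up and config_override.sh is on the PATH, as in the script):

    echo > $T/KcList
    config_override_param "scenario fragment" KcList "CONFIG_NR_CPUS=4 CONFIG_RCU_EXPERT=y"
    config_override_param "--kconfig argument" KcList "CONFIG_NR_CPUS=8"
    cat $T/KcList   # CONFIG_NR_CPUS=8 wins; CONFIG_RCU_EXPERT=y survives
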
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 2315e2ec12d6..c279cf9cb010 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -31,6 +31,8 @@ TORTURE_DEFCONFIG=defconfig
 TORTURE_BOOT_IMAGE=""
 TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
 TORTURE_KCONFIG_ARG=""
+TORTURE_KCONFIG_KASAN_ARG=""
+TORTURE_KCONFIG_KCSAN_ARG=""
 TORTURE_KMAKE_ARG=""
 TORTURE_QEMU_MEM=512
 TORTURE_SHUTDOWN_GRACE=180
@@ -133,6 +135,12 @@ do
 		TORTURE_KCONFIG_ARG="$2"
 		shift
 		;;
+	--kasan)
+		TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
+		;;
+	--kcsan)
+		TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG
+		;;
 	--kmake-arg)
 		checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
 		TORTURE_KMAKE_ARG="$2"
@@ -310,6 +318,8 @@ TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY
 TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG
 TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD
 TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG
+TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG
+TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG
 TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG
 TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
 TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
@@ -464,6 +474,7 @@ echo
 echo
 echo " --- `date` Test summary:"
 echo Results directory: $resdir/$ds
+kcsan-collapse.sh $resdir/$ds
 kvm-recheck.sh $resdir/$ds
 ___EOF___
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
index c3c1fb5a9e1f..f2b20db9e296 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
@@ -14,3 +14,6 @@ TINY02
 TASKS01
 TASKS02
 TASKS03
+RUDE01
+TRACE01
+TRACE02
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01
new file mode 100644
index 000000000000..bafe94cbd739
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01
@@ -0,0 +1,10 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
new file mode 100644
index 000000000000..9363708c9075
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks-rude
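
The new --kasan and --kcsan switches are shorthand for a fixed --kconfig fragment, exported so that kvm-test-1-run.sh folds it into each scenario's KcList and, in the --kcsan case, so that kcsan-collapse.sh knows it has reports to summarize. A representative invocation (result path illustrative):

    tools/testing/selftests/rcutorture/bin/kvm.sh --kcsan --configs TRACE01
    # After the run, collapsed KCSAN reports (if any) appear in:
    #   tools/testing/selftests/rcutorture/res/<date>/kcsan.sum
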
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
new file mode 100644
index 000000000000..12e7661b86f5
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
@@ -0,0 +1,11 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_TASKS_TRACE_RCU_READ_MB=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
new file mode 100644
index 000000000000..9675ad632dcc
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks-tracing
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
new file mode 100644
index 000000000000..b69ed6673c41
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
@@ -0,0 +1,11 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+#CHECK#CONFIG_PROVE_RCU=n
+CONFIG_TASKS_TRACE_RCU_READ_MB=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
new file mode 100644
index 000000000000..9675ad632dcc
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks-tracing
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
index 2debe7891aeb..7311f84a5876 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
@@ -1,5 +1,5 @@
 CONFIG_SMP=y
-CONFIG_NR_CPUS=100
+CONFIG_NR_CPUS=56
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config
index 9803dbb54181..b50c2085c1ac 100644
--- a/tools/testing/selftests/wireguard/qemu/debug.config
+++ b/tools/testing/selftests/wireguard/qemu/debug.config
@@ -57,7 +57,6 @@ CONFIG_RCU_EQS_DEBUG=y
 CONFIG_USER_STACKTRACE_SUPPORT=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
-CONFIG_DOUBLEFAULT=y
 CONFIG_X86_DEBUG_FPU=y
 CONFIG_DEBUG_SECTION_MISMATCH=y
 CONFIG_DEBUG_PAGEALLOC=y
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 48d0ec44ad77..53b3ba9173ba 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1387,9 +1387,7 @@ static inline void hyp_cpu_pm_exit(void)
 
 static int init_common_resources(void)
 {
-	kvm_set_ipa_limit();
-
-	return 0;
+	return kvm_set_ipa_limit();
 }
 
 static int init_subsystems(void)
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 89a14ec8b33b..d2339a2b9fb9 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -302,7 +302,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
 	 * pending state of interrupt is latched in pending_latch variable.
 	 * Userspace will save and restore pending state and line_level
 	 * separately.
-	 * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt
+	 * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
 	 * for handling of ISPENDR and ICPENDR.
 	 */
 	for (i = 0; i < len * 8; i++) {
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 769e4802645e..64fcd7511110 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -42,7 +42,7 @@
 		VGIC_AFFINITY_LEVEL(val, 3))
 
 /*
- * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt,
+ * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst,
  * below macros are defined for CPUREG encoding.
  */
 #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000
@@ -63,7 +63,7 @@
 	 KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
 
 /*
- * As per Documentation/virt/kvm/devices/arm-vgic-its.txt,
+ * As per Documentation/virt/kvm/devices/arm-vgic-its.rst,
  * below macros are defined for ITS table entry encoding.
  */
 #define KVM_ITS_CTE_VALID_SHIFT		63