diff options
567 files changed, 20414 insertions, 4862 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8384c681a4b2..476722b7b636 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1879,10 +1879,8 @@ following two functions. wbc_init_bio(@wbc, @bio) Should be called for each bio carrying writeback data and - associates the bio with the inode's owner cgroup and the - corresponding request queue. This must be called after - a queue (device) has been associated with the bio and - before submission. + associates the bio with the inode's owner cgroup. Can be + called anytime between bio allocation and submission. wbc_account_io(@wbc, @page, @bytes) Should be called for each data segment being written out. @@ -1901,7 +1899,7 @@ the configuration, the bio may be executed at a lower priority and if the writeback session is holding shared resources, e.g. a journal entry, may lead to priority inversion. There is no one easy solution for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_create_blkg() +cases by skipping wbc_init_bio() or using bio_associate_blkcg() directly. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b90fe3b6bc6c..81d1d5a74728 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1068,7 +1068,7 @@ earlyprintk=serial[,0x...[,baudrate]] earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] - earlyprintk=pciserial,bus:device.function[,baudrate] + earlyprintk=pciserial[,force],bus:device.function[,baudrate] earlyprintk=xdbc[xhciController#] earlyprintk is useful when the kernel crashes before @@ -1100,6 +1100,10 @@ The sclp output can only be used on s390. + The optional "force" to "pciserial" enables use of a + PCI device even when its classcode is not of the + UART class. 
+ edac_report= [HW,EDAC] Control how to report EDAC event Format: {"on" | "off" | "force"} on: enable EDAC to report H/W event. May be overridden diff --git a/Documentation/crypto/asymmetric-keys.txt b/Documentation/crypto/asymmetric-keys.txt index 5969bf42562a..8763866b11cf 100644 --- a/Documentation/crypto/asymmetric-keys.txt +++ b/Documentation/crypto/asymmetric-keys.txt @@ -183,6 +183,10 @@ and looks like the following: void (*describe)(const struct key *key, struct seq_file *m); void (*destroy)(void *payload); + int (*query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); + int (*eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); int (*verify_signature)(const struct key *key, const struct public_key_signature *sig); }; @@ -207,12 +211,22 @@ There are a number of operations defined by the subtype: asymmetric key will look after freeing the fingerprint and releasing the reference on the subtype module. - (3) verify_signature(). + (3) query(). - Optional. These are the entry points for the key usage operations. - Currently there is only the one defined. If not set, the caller will be - given -ENOTSUPP. The subtype may do anything it likes to implement an - operation, including offloading to hardware. + Mandatory. This is a function for querying the capabilities of a key. + + (4) eds_op(). + + Optional. This is the entry point for the encryption, decryption and + signature creation operations (which are distinguished by the operation ID + in the parameter struct). The subtype may do anything it likes to + implement an operation, including offloading to hardware. + + (5) verify_signature(). + + Optional. This is the entry point for signature verification. The + subtype may do anything it likes to implement an operation, including + offloading to hardware. ========================== @@ -234,6 +248,8 @@ Examples of blob formats for which parsers could be implemented include: - X.509 ASN.1 stream. - Pointer to TPM key. 
- Pointer to UEFI key. + - PKCS#8 private key [RFC 5208]. + - PKCS#5 encrypted private key [RFC 2898]. During key instantiation each parser in the list is tried until one doesn't return -EBADMSG. diff --git a/Documentation/devicetree/bindings/arm/cpu-capacity.txt b/Documentation/devicetree/bindings/arm/cpu-capacity.txt index 9b5685a1d15d..84262cdb8d29 100644 --- a/Documentation/devicetree/bindings/arm/cpu-capacity.txt +++ b/Documentation/devicetree/bindings/arm/cpu-capacity.txt @@ -59,9 +59,11 @@ mhz values (normalized w.r.t. the highest value found while parsing the DT). =========================================== Example 1 (ARM 64-bit, 6-cpu system, two clusters): -capacities-dmips-mhz are scaled w.r.t. 1024 (cpu@0 and cpu@1) -supposing cluster0@max-freq=1100 and custer1@max-freq=850, -final capacities are 1024 for cluster0 and 446 for cluster1 +The capacities-dmips-mhz or DMIPS/MHz values (scaled to 1024) +are 1024 and 578 for cluster0 and cluster1. Further normalization +is done by the operating system based on cluster0@max-freq=1100 and +cluster1@max-freq=850, final capacities are 1024 for cluster0 and +446 for cluster1 (578*850/1100). cpus { #address-cells = <2>; diff --git a/Documentation/devicetree/bindings/display/panel/innolux,tv123wam.txt b/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.txt index a9b35265fa13..513f03466aba 100644 --- a/Documentation/devicetree/bindings/display/panel/innolux,tv123wam.txt +++ b/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.txt @@ -1,20 +1,22 @@ -Innolux TV123WAM 12.3 inch eDP 2K display panel +Innolux P120ZDG-BF1 12.02 inch eDP 2K display panel This binding is compatible with the simple-panel binding, which is specified in simple-panel.txt in this directory. 
Required properties: -- compatible: should be "innolux,tv123wam" +- compatible: should be "innolux,p120zdg-bf1" - power-supply: regulator to provide the supply voltage Optional properties: - enable-gpios: GPIO pin to enable or disable the panel - backlight: phandle of the backlight device attached to the panel +- no-hpd: If HPD isn't hooked up; add this property. Example: panel_edp: panel-edp { - compatible = "innolux,tv123wam"; + compatible = "innolux,p120zdg-bf1"; enable-gpios = <&msmgpio 31 GPIO_ACTIVE_LOW>; power-supply = <&pm8916_l2>; backlight = <&backlight>; + no-hpd; }; diff --git a/Documentation/devicetree/bindings/display/panel/simple-panel.txt b/Documentation/devicetree/bindings/display/panel/simple-panel.txt index 45a457ad38f0..b2b872c710f2 100644 --- a/Documentation/devicetree/bindings/display/panel/simple-panel.txt +++ b/Documentation/devicetree/bindings/display/panel/simple-panel.txt @@ -11,6 +11,9 @@ Optional properties: - ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing - enable-gpios: GPIO pin to enable or disable the panel - backlight: phandle of the backlight device attached to the panel +- no-hpd: This panel is supposed to communicate that it's ready via HPD + (hot plug detect) signal, but the signal isn't hooked up so we should + hardcode the max delay from the panel spec when powering up the panel. Example: diff --git a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt index 06a363d9ccef..b9a1d7402128 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt @@ -7,6 +7,7 @@ Required properties: for da850 - compatible = "ti,da850-ecap", "ti,am3352-ecap", "ti,am33xx-ecap"; for dra746 - compatible = "ti,dra746-ecap", "ti,am3352-ecap"; for 66ak2g - compatible = "ti,k2g-ecap", "ti,am3352-ecap"; + for am654 - compatible = "ti,am654-ecap", "ti,am3352-ecap"; - #pwm-cells: should be 3. 
See pwm.txt in this directory for a description of the cells format. The PWM channel index ranges from 0 to 4. The only third cell flag supported by this binding is PWM_POLARITY_INVERTED. diff --git a/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt b/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt index e1ef6afbe3a7..7f31fe7e2093 100644 --- a/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt +++ b/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt @@ -3,7 +3,9 @@ Required Properties: - compatible: should be "renesas,pwm-rcar" and one of the following. - "renesas,pwm-r8a7743": for RZ/G1M + - "renesas,pwm-r8a7744": for RZ/G1N - "renesas,pwm-r8a7745": for RZ/G1E + - "renesas,pwm-r8a774a1": for RZ/G2M - "renesas,pwm-r8a7778": for R-Car M1A - "renesas,pwm-r8a7779": for R-Car H1 - "renesas,pwm-r8a7790": for R-Car H2 @@ -12,6 +14,8 @@ Required Properties: - "renesas,pwm-r8a7795": for R-Car H3 - "renesas,pwm-r8a7796": for R-Car M3-W - "renesas,pwm-r8a77965": for R-Car M3-N + - "renesas,pwm-r8a77970": for R-Car V3M + - "renesas,pwm-r8a77980": for R-Car V3H - "renesas,pwm-r8a77990": for R-Car E3 - "renesas,pwm-r8a77995": for R-Car D3 - reg: base address and length of the registers block for the PWM. diff --git a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt index d53a16715da6..848a92b53d81 100644 --- a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt +++ b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt @@ -2,13 +2,19 @@ Required Properties: - - compatible: should be one of the following. + - compatible: must contain one or more of the following: - "renesas,tpu-r8a73a4": for R8A73A4 (R-Mobile APE6) compatible PWM controller. - "renesas,tpu-r8a7740": for R8A7740 (R-Mobile A1) compatible PWM controller. - "renesas,tpu-r8a7743": for R8A7743 (RZ/G1M) compatible PWM controller. + - "renesas,tpu-r8a7744": for R8A7744 (RZ/G1N) compatible PWM controller. 
- "renesas,tpu-r8a7745": for R8A7745 (RZ/G1E) compatible PWM controller. - "renesas,tpu-r8a7790": for R8A7790 (R-Car H2) compatible PWM controller. - - "renesas,tpu": for generic R-Car and RZ/G1 TPU PWM controller. + - "renesas,tpu-r8a77970": for R8A77970 (R-Car V3M) compatible PWM + controller. + - "renesas,tpu-r8a77980": for R8A77980 (R-Car V3H) compatible PWM + controller. + - "renesas,tpu": for the generic TPU PWM controller; this is a fallback for + the entries listed above. - reg: Base address and length of each memory resource used by the PWM controller hardware module. diff --git a/Documentation/devicetree/bindings/timer/csky,gx6605s-timer.txt b/Documentation/devicetree/bindings/timer/csky,gx6605s-timer.txt new file mode 100644 index 000000000000..6b04344f4bea --- /dev/null +++ b/Documentation/devicetree/bindings/timer/csky,gx6605s-timer.txt @@ -0,0 +1,42 @@ +================= +gx6605s SOC Timer +================= + +The timer is used in gx6605s soc as system timer and the driver +contain clk event and clk source. 
+ +============================== +timer node bindings definition +============================== + + Description: Describes gx6605s SOC timer + + PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: must be "csky,gx6605s-timer" + - reg + Usage: required + Value type: <u32 u32> + Definition: <phyaddr size> in soc from cpu view + - clocks + Usage: required + Value type: phandle + clock specifier cells + Definition: must be input clk node + - interrupts + Usage: required + Value type: <u32> + Definition: must be timer irq num defined by soc + +Examples: +--------- + + timer0: timer@20a000 { + compatible = "csky,gx6605s-timer"; + reg = <0x0020a000 0x400>; + clocks = <&dummy_apb_clk>; + interrupts = <10>; + interrupt-parent = <&intc>; + }; diff --git a/Documentation/devicetree/bindings/timer/csky,mptimer.txt b/Documentation/devicetree/bindings/timer/csky,mptimer.txt new file mode 100644 index 000000000000..15cfec08fbb8 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/csky,mptimer.txt @@ -0,0 +1,42 @@ +============================ +C-SKY Multi-processors Timer +============================ + +C-SKY multi-processors timer is designed for C-SKY SMP system and the +regs are accessed by cpu co-processor 4 registers with mtcr/mfcr. + + - PTIM_CTLR "cr<0, 14>" Control reg to start reset timer. + - PTIM_TSR "cr<1, 14>" Interrupt cleanup status reg. + - PTIM_CCVR "cr<3, 14>" Current counter value reg. + - PTIM_LVR "cr<6, 14>" Window value reg to trigger next event. 
+ +============================== +timer node bindings definition +============================== + + Description: Describes SMP timer + + PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: must be "csky,mptimer" + - clocks + Usage: required + Value type: <node> + Definition: must be input clk node + - interrupts + Usage: required + Value type: <u32> + Definition: must be timer irq num defined by soc + +Examples: +--------- + + timer: timer { + compatible = "csky,mptimer"; + clocks = <&dummy_apb_clk>; + interrupts = <16>; + interrupt-parent = <&intc>; + }; diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 51c136c821bf..eef7d9d259e8 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -286,6 +286,12 @@ pointed by REDIRECT. This should not be possible on local system as setting "trusted." xattrs will require CAP_SYS_ADMIN. But it should be possible for untrusted layers like from a pen drive. +Note: redirect_dir={off|nofollow|follow(*)} conflicts with metacopy=on, and +results in an error. + +(*) redirect_dir=follow only conflicts with metacopy=on if upperdir=... is +given. + Sharing and copying layers -------------------------- diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 321d74b73937..cf43bc4dbf31 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -623,6 +623,11 @@ in your dentry operations instead. On success you get a new struct file sharing the mount/dentry with the original, on failure - ERR_PTR(). -- +[mandatory] + ->clone_file_range() and ->dedupe_file_range have been replaced with + ->remap_file_range(). See Documentation/filesystems/vfs.txt for more + information. 
+-- [recommended] ->lookup() instances doing an equivalent of if (IS_ERR(inode)) diff --git a/Documentation/filesystems/ubifs-authentication.md b/Documentation/filesystems/ubifs-authentication.md new file mode 100644 index 000000000000..028b3e2e25f9 --- /dev/null +++ b/Documentation/filesystems/ubifs-authentication.md @@ -0,0 +1,426 @@ +% UBIFS Authentication +% sigma star gmbh +% 2018 + +# Introduction + +UBIFS utilizes the fscrypt framework to provide confidentiality for file +contents and file names. This prevents attacks where an attacker is able to +read contents of the filesystem on a single point in time. A classic example +is a lost smartphone where the attacker is unable to read personal data stored +on the device without the filesystem decryption key. + +At the current state, UBIFS encryption however does not prevent attacks where +the attacker is able to modify the filesystem contents and the user uses the +device afterwards. In such a scenario an attacker can modify filesystem +contents arbitrarily without the user noticing. One example is to modify a +binary to perform a malicious action when executed [DMC-CBC-ATTACK]. Since +most of the filesystem metadata of UBIFS is stored in plain, this makes it +fairly easy to swap files and replace their contents. + +Other full disk encryption systems like dm-crypt cover all filesystem metadata, +which makes such kinds of attacks more complicated, but not impossible. +Especially, if the attacker is given access to the device multiple points in +time. For dm-crypt and other filesystems that build upon the Linux block IO +layer, the dm-integrity or dm-verity subsystems [DM-INTEGRITY, DM-VERITY] +can be used to get full data authentication at the block layer. +These can also be combined with dm-crypt [CRYPTSETUP2]. + +This document describes an approach to get file contents _and_ full metadata +authentication for UBIFS. 
Since UBIFS uses fscrypt for file contents and file +name encryption, the authentication system could be tied into fscrypt such that +existing features like key derivation can be utilized. It should however also +be possible to use UBIFS authentication without using encryption. + + +## MTD, UBI & UBIFS + +On Linux, the MTD (Memory Technology Devices) subsystem provides a uniform +interface to access raw flash devices. One of the more prominent subsystems that +work on top of MTD is UBI (Unsorted Block Images). It provides volume management +for flash devices and is thus somewhat similar to LVM for block devices. In +addition, it deals with flash-specific wear-leveling and transparent I/O error +handling. UBI offers logical erase blocks (LEBs) to the layers on top of it +and maps them transparently to physical erase blocks (PEBs) on the flash. + +UBIFS is a filesystem for raw flash which operates on top of UBI. Thus, wear +leveling and some flash specifics are left to UBI, while UBIFS focuses on +scalability, performance and recoverability. + + + + +------------+ +*******+ +-----------+ +-----+ + | | * UBIFS * | UBI-BLOCK | | ... | + | JFFS/JFFS2 | +*******+ +-----------+ +-----+ + | | +-----------------------------+ +-----------+ +-----+ + | | | UBI | | MTD-BLOCK | | ... | + +------------+ +-----------------------------+ +-----------+ +-----+ + +------------------------------------------------------------------+ + | MEMORY TECHNOLOGY DEVICES (MTD) | + +------------------------------------------------------------------+ + +-----------------------------+ +--------------------------+ +-----+ + | NAND DRIVERS | | NOR DRIVERS | | ... 
| + +-----------------------------+ +--------------------------+ +-----+ + + Figure 1: Linux kernel subsystems for dealing with raw flash + + + +Internally, UBIFS maintains multiple data structures which are persisted on +the flash: + +- *Index*: an on-flash B+ tree where the leaf nodes contain filesystem data +- *Journal*: an additional data structure to collect FS changes before updating + the on-flash index and reduce flash wear. +- *Tree Node Cache (TNC)*: an in-memory B+ tree that reflects the current FS + state to avoid frequent flash reads. It is basically the in-memory + representation of the index, but contains additional attributes. +- *LEB property tree (LPT)*: an on-flash B+ tree for free space accounting per + UBI LEB. + +In the remainder of this section we will cover the on-flash UBIFS data +structures in more detail. The TNC is of less importance here since it is never +persisted onto the flash directly. More details on UBIFS can also be found in +[UBIFS-WP]. + + +### UBIFS Index & Tree Node Cache + +Basic on-flash UBIFS entities are called *nodes*. UBIFS knows different types +of nodes. Eg. data nodes (`struct ubifs_data_node`) which store chunks of file +contents or inode nodes (`struct ubifs_ino_node`) which represent VFS inodes. +Almost all types of nodes share a common header (`ubifs_ch`) containing basic +information like node type, node length, a sequence number, etc. (see +`fs/ubifs/ubifs-media.h`in kernel source). Exceptions are entries of the LPT +and some less important node types like padding nodes which are used to pad +unusable content at the end of LEBs. + +To avoid re-writing the whole B+ tree on every single change, it is implemented +as *wandering tree*, where only the changed nodes are re-written and previous +versions of them are obsoleted without erasing them right away. 
As a result, +the index is not stored in a single place on the flash, but *wanders* around +and there are obsolete parts on the flash as long as the LEB containing them is +not reused by UBIFS. To find the most recent version of the index, UBIFS stores +a special node called *master node* into UBI LEB 1 which always points to the +most recent root node of the UBIFS index. For recoverability, the master node +is additionally duplicated to LEB 2. Mounting UBIFS is thus a simple read of +LEB 1 and 2 to get the current master node and from there get the location of +the most recent on-flash index. + +The TNC is the in-memory representation of the on-flash index. It contains some +additional runtime attributes per node which are not persisted. One of these is +a dirty-flag which marks nodes that have to be persisted the next time the +index is written onto the flash. The TNC acts as a write-back cache and all +modifications of the on-flash index are done through the TNC. Like other caches, +the TNC does not have to mirror the full index into memory, but reads parts of +it from flash whenever needed. A *commit* is the UBIFS operation of updating the +on-flash filesystem structures like the index. On every commit, the TNC nodes +marked as dirty are written to the flash to update the persisted index. + + +### Journal + +To avoid wearing out the flash, the index is only persisted (*committed*) when +certain conditions are met (eg. `fsync(2)`). The journal is used to record +any changes (in form of inode nodes, data nodes etc.) between commits +of the index. During mount, the journal is read from the flash and replayed +onto the TNC (which will be created on-demand from the on-flash index). + +UBIFS reserves a bunch of LEBs just for the journal called *log area*. The +amount of log area LEBs is configured on filesystem creation (using +`mkfs.ubifs`) and stored in the superblock node. The log area contains only +two types of nodes: *reference nodes* and *commit start nodes*. 
A commit start +node is written whenever an index commit is performed. Reference nodes are +written on every journal update. Each reference node points to the position of +other nodes (inode nodes, data nodes etc.) on the flash that are part of this +journal entry. These nodes are called *buds* and describe the actual filesystem +changes including their data. + +The log area is maintained as a ring. Whenever the journal is almost full, +a commit is initiated. This also writes a commit start node so that during +mount, UBIFS will seek for the most recent commit start node and just replay +every reference node after that. Every reference node before the commit start +node will be ignored as they are already part of the on-flash index. + +When writing a journal entry, UBIFS first ensures that enough space is +available to write the reference node and buds part of this entry. Then, the +reference node is written and afterwards the buds describing the file changes. +On replay, UBIFS will record every reference node and inspect the location of +the referenced LEBs to discover the buds. If these are corrupt or missing, +UBIFS will attempt to recover them by re-reading the LEB. This is however only +done for the last referenced LEB of the journal. Only this can become corrupt +because of a power cut. If the recovery fails, UBIFS will not mount. An error +for every other LEB will directly cause UBIFS to fail the mount operation. 
+ + + | ---- LOG AREA ---- | ---------- MAIN AREA ------------ | + + -----+------+-----+--------+---- ------+-----+-----+--------------- + \ | | | | / / | | | \ + / CS | REF | REF | | \ \ DENT | INO | INO | / + \ | | | | / / | | | \ + ----+------+-----+--------+--- -------+-----+-----+---------------- + | | ^ ^ + | | | | + +------------------------+ | + | | + +-------------------------------+ + + + Figure 2: UBIFS flash layout of log area with commit start nodes + (CS) and reference nodes (REF) pointing to main area + containing their buds + + +### LEB Property Tree/Table + +The LEB property tree is used to store per-LEB information. This includes the +LEB type and amount of free and *dirty* (old, obsolete content) space [1] on +the LEB. The type is important, because UBIFS never mixes index nodes with data +nodes on a single LEB and thus each LEB has a specific purpose. This again is +useful for free space calculations. See [UBIFS-WP] for more details. + +The LEB property tree again is a B+ tree, but it is much smaller than the +index. Due to its smaller size it is always written as one chunk on every +commit. Thus, saving the LPT is an atomic operation. + + +[1] Since LEBs can only be appended and never overwritten, there is a +difference between free space ie. the remaining space left on the LEB to be +written to without erasing it and previously written content that is obsolete +but can't be overwritten without erasing the full LEB. + + +# UBIFS Authentication + +This chapter introduces UBIFS authentication which enables UBIFS to verify +the authenticity and integrity of metadata and file contents stored on flash. + + +## Threat Model + +UBIFS authentication enables detection of offline data modification. While it +does not prevent it, it enables (trusted) code to check the integrity and +authenticity of on-flash file contents and filesystem metadata. This covers +attacks where file contents are swapped. 
+ +UBIFS authentication will not protect against rollback of full flash contents. +Ie. an attacker can still dump the flash and restore it at a later time without +detection. It will also not protect against partial rollback of individual +index commits. That means that an attacker is able to partially undo changes. +This is possible because UBIFS does not immediately overwrite obsolete +versions of the index tree or the journal, but instead marks them as obsolete +and garbage collection erases them at a later time. An attacker can use this by +erasing parts of the current tree and restoring old versions that are still on +the flash and have not yet been erased. This is possible, because every commit +will always write a new version of the index root node and the master node +without overwriting the previous version. This is further helped by the +wear-leveling operations of UBI which copy contents from one physical +eraseblock to another and does not atomically erase the first eraseblock. + +UBIFS authentication does not cover attacks where an attacker is able to +execute code on the device after the authentication key was provided. +Additional measures like secure boot and trusted boot have to be taken to +ensure that only trusted code is executed on a device. + + +## Authentication + +To be able to fully trust data read from flash, all UBIFS data structures +stored on flash are authenticated. That is: + +- The index which includes file contents, file metadata like extended + attributes, file length etc. +- The journal which also contains file contents and metadata by recording changes + to the filesystem +- The LPT which stores UBI LEB metadata which UBIFS uses for free space accounting + + +### Index Authentication + +Through UBIFS' concept of a wandering tree, it already takes care of only +updating and persisting changed parts from leaf node up to the root node +of the full B+ tree. 
This enables us to augment the index nodes of the tree +with a hash over each node's child nodes. As a result, the index is basically also +a Merkle tree. Since the leaf nodes of the index contain the actual filesystem +data, the hashes of their parent index nodes thus cover all the file contents +and file metadata. When a file changes, the UBIFS index is updated accordingly +from the leaf nodes up to the root node including the master node. This process +can be hooked to recompute the hash only for each changed node at the same time. +Whenever a file is read, UBIFS can verify the hashes from each leaf node up to +the root node to ensure the node's integrity. + +To ensure the authenticity of the whole index, the UBIFS master node stores a +keyed hash (HMAC) over its own contents and a hash of the root node of the index +tree. As mentioned above, the master node is always written to the flash whenever +the index is persisted (ie. on index commit). + +Using this approach only UBIFS index nodes and the master node are changed to +include a hash. All other types of nodes will remain unchanged. This reduces +the storage overhead which is precious for users of UBIFS (ie. embedded +devices). + + + +---------------+ + | Master Node | + | (hash) | + +---------------+ + | + v + +-------------------+ + | Index Node #1 | + | | + | branch0 branchn | + | (hash) (hash) | + +-------------------+ + | ... | (fanout: 8) + | | + +-------+ +------+ + | | + v v + +-------------------+ +-------------------+ + | Index Node #2 | | Index Node #3 | + | | | | + | branch0 branchn | | branch0 branchn | + | (hash) (hash) | | (hash) (hash) | + +-------------------+ +-------------------+ + | ... | ... 
| + v v v + +-----------+ +----------+ +-----------+ + | Data Node | | INO Node | | DENT Node | + +-----------+ +----------+ +-----------+ + + + Figure 3: Coverage areas of index node hash and master node HMAC + + + +The most important part for robustness and power-cut safety is to atomically +persist the hash and file contents. Here the existing UBIFS logic for how +changed nodes are persisted is already designed for this purpose such that +UBIFS can safely recover if a power-cut occurs while persisting. Adding +hashes to index nodes does not change this since each hash will be persisted +atomically together with its respective node. + + +### Journal Authentication + +The journal is authenticated too. Since the journal is continuously written +it is necessary to also add authentication information frequently to the +journal so that in case of a powercut not too much data is left unauthenticated. +This is done by creating a continuous hash beginning from the commit start node +over the previous reference nodes, the current reference node, and the bud +nodes. From time to time whenever it is suitable authentication nodes are added +between the bud nodes. This new node type contains a HMAC over the current state +of the hash chain. That way a journal can be authenticated up to the last +authentication node. The tail of the journal which may not have an authentication +node cannot be authenticated and is skipped during journal replay. + +We get this picture for journal authentication: + + ,,,,,,,, + ,......,........................................... + ,. CS , hash1.----. hash2.----. + ,. | , . |hmac . |hmac + ,. v , . v . v + ,.REF#0,-> bud -> bud -> bud.-> auth -> bud -> bud.-> auth ... + ,..|...,........................................... + , | , + , | ,,,,,,,,,,,,,,, + . | hash3,----. + , | , |hmac + , v , v + , REF#1 -> bud -> bud,-> auth ... + ,,,|,,,,,,,,,,,,,,,,,, + v + REF#2 -> ... + | + V + ... 
+ +Since the hash also includes the reference nodes an attacker cannot reorder or +skip any journal heads for replay. An attacker can only remove bud nodes or +reference nodes from the end of the journal, effectively rewinding the +filesystem at maximum back to the last commit. + +The location of the log area is stored in the master node. Since the master +node is authenticated with a HMAC as described above, it is not possible to +tamper with that without detection. The size of the log area is specified when +the filesystem is created using `mkfs.ubifs` and stored in the superblock node. +To avoid tampering with this and other values stored there, a HMAC is added to +the superblock struct. The superblock node is stored in LEB 0 and is only +modified on feature flag or similar changes, but never on file changes. + + +### LPT Authentication + +The location of the LPT root node on the flash is stored in the UBIFS master +node. Since the LPT is written and read atomically on every commit, there is +no need to authenticate individual nodes of the tree. It suffices to +protect the integrity of the full LPT by a simple hash stored in the master +node. Since the master node itself is authenticated, the LPT's authenticity can +be verified by verifying the authenticity of the master node and comparing the +LPT hash stored there with the hash computed from the read on-flash LPT. + + +## Key Management + +For simplicity, UBIFS authentication uses a single key to compute the HMACs +of superblock, master, commit start and reference nodes. This key has to be +available on creation of the filesystem (`mkfs.ubifs`) to authenticate the +superblock node. Further, it has to be available on mount of the filesystem +to verify authenticated nodes and generate new HMACs for changes. + +UBIFS authentication is intended to operate side-by-side with UBIFS encryption +(fscrypt) to provide confidentiality and authenticity. 
Since UBIFS encryption +has a different approach of encryption policies per directory, there can be +multiple fscrypt master keys and there might be folders without encryption. +UBIFS authentication on the other hand has an all-or-nothing approach in the +sense that it either authenticates everything of the filesystem or nothing. +Because of this and because UBIFS authentication should also be usable without +encryption, it does not share the same master key with fscrypt, but manages +a dedicated authentication key. + +The API for providing the authentication key has yet to be defined, but the +key can e.g. be provided by userspace through a keyring similar to the way it +is currently done in fscrypt. It should however be noted that the current +fscrypt approach has shown its flaws and the userspace API will eventually +change [FSCRYPT-POLICY2]. + +Nevertheless, it will be possible for a user to provide a single passphrase +or key in userspace that covers UBIFS authentication and encryption. This can +be solved by the corresponding userspace tools which derive a second key for +authentication in addition to the derived fscrypt master key used for +encryption. + +To be able to check if the proper key is available on mount, the UBIFS +superblock node will additionally store a hash of the authentication key. This +approach is similar to the approach proposed for fscrypt encryption policy v2 +[FSCRYPT-POLICY2]. + + +# Future Extensions + +In certain cases where a vendor wants to provide an authenticated filesystem +image to customers, it should be possible to do so without sharing the secret +UBIFS authentication key. Instead, in addition to each HMAC a digital +signature could be stored where the vendor shares the public key alongside the +filesystem image. In case this filesystem has to be modified afterwards, +UBIFS can exchange all digital signatures with HMACs on first mount similar +to the way the IMA/EVM subsystem deals with such situations.
The HMAC key +will then have to be provided beforehand in the normal way. + + +# References + +[CRYPTSETUP2] http://www.saout.de/pipermail/dm-crypt/2017-November/005745.html + +[DMC-CBC-ATTACK] http://www.jakoblell.com/blog/2013/12/22/practical-malleability-attack-against-cbc-encrypted-luks-partitions/ + +[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.txt + +[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.txt + +[FSCRYPT-POLICY2] https://www.spinics.net/lists/linux-ext4/msg58710.html + +[UBIFS-WP] http://www.linux-mtd.infradead.org/doc/ubifs_whitepaper.pdf diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index a0a61d2f389f..acc80442a3bb 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -91,6 +91,13 @@ chk_data_crc do not skip checking CRCs on data nodes compr=none override default compressor and set it to "none" compr=lzo override default compressor and set it to "lzo" compr=zlib override default compressor and set it to "zlib" +auth_key= specify the key used for authenticating the filesystem. + Passing this option makes authentication mandatory. + The passed key must be present in the kernel keyring + and must be of type 'logon' +auth_hash_name= The hash algorithm used for authentication. Used for + both hashing and for creating HMACs. 
Typical values + include "sha256" or "sha512" Quick usage instructions diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index a6c6a8af48a2..5f71a252e2e0 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -883,8 +883,9 @@ struct file_operations { unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); }; @@ -960,11 +961,18 @@ otherwise noted. copy_file_range: called by the copy_file_range(2) system call. - clone_file_range: called by the ioctl(2) system call for FICLONERANGE and - FICLONE commands. - - dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE - command. + remap_file_range: called by the ioctl(2) system call for FICLONERANGE and + FICLONE and FIDEDUPERANGE commands to remap file ranges. An + implementation should remap len bytes at pos_in of the source file into + the dest file at pos_out. Implementations must handle callers passing + in len == 0; this means "remap to the end of the source file". The + return value should be the number of bytes remapped, or the usual + negative error code if errors occurred before any bytes were remapped. + The remap_flags parameter accepts REMAP_FILE_* flags. If + REMAP_FILE_DEDUP is set then the implementation must only remap if the + requested file ranges have identical contents. If REMAP_FILE_CAN_SHORTEN is + set, the caller is ok with the implementation shortening the request + length to satisfy alignment or EOF requirements (or any other reason).
fadvise: possibly called by the fadvise64() system call. diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 7b6a2b2bdc98..8da26c6dd886 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -537,21 +537,6 @@ more details, with real examples. The third parameter may be a text as in this example, but it may also be an expanded variable or a macro. - cc-fullversion - cc-fullversion is useful when the exact version of gcc is needed. - One typical use-case is when a specific GCC version is broken. - cc-fullversion points out a more specific version than cc-version does. - - Example: - #arch/powerpc/Makefile - $(Q)if test "$(cc-fullversion)" = "040200" ; then \ - echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \ - false ; \ - fi - - In this example for a specific GCC version the build will error out - explaining to the user why it stops. - cc-cross-prefix cc-cross-prefix is used to check if there exists a $(CC) in path with one of the listed prefixes. The first prefix where there exist a diff --git a/Documentation/networking/ice.rst b/Documentation/networking/ice.rst index 1e4948c9e989..4d118b827bbb 100644 --- a/Documentation/networking/ice.rst +++ b/Documentation/networking/ice.rst @@ -20,7 +20,7 @@ Enabling the driver The driver is enabled via the standard kernel configuration system, using the make command:: - make oldconfig/silentoldconfig/menuconfig/etc. + make oldconfig/menuconfig/etc. The driver is located in the menu structure at: diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index 757808526d9a..878ebfda7eef 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -25,6 +25,7 @@ Below are the essential guides that every developer should read. 
code-of-conduct-interpretation development-process submitting-patches + programming-language coding-style maintainer-pgp-guide email-clients diff --git a/Documentation/process/programming-language.rst b/Documentation/process/programming-language.rst new file mode 100644 index 000000000000..e5f5f065dc24 --- /dev/null +++ b/Documentation/process/programming-language.rst @@ -0,0 +1,45 @@ +.. _programming_language: + +Programming Language +==================== + +The kernel is written in the C programming language [c-language]_. +More precisely, the kernel is typically compiled with ``gcc`` [gcc]_ +under ``-std=gnu89`` [gcc-c-dialect-options]_: the GNU dialect of ISO C90 +(including some C99 features). + +This dialect contains many extensions to the language [gnu-extensions]_, +and many of them are used within the kernel as a matter of course. + +There is some support for compiling the kernel with ``clang`` [clang]_ +and ``icc`` [icc]_ for several of the architectures, although at the time +of writing it is not completed, requiring third-party patches. + +Attributes +---------- + +One of the common extensions used throughout the kernel are attributes +[gcc-attribute-syntax]_. Attributes allow to introduce +implementation-defined semantics to language entities (like variables, +functions or types) without having to make significant syntactic changes +to the language (e.g. adding a new keyword) [n2049]_. + +In some cases, attributes are optional (i.e. a compiler not supporting them +should still produce proper code, even if it is slower or does not perform +as many compile-time checks/diagnostics). + +The kernel defines pseudo-keywords (e.g. ``__pure``) instead of using +directly the GNU attribute syntax (e.g. ``__attribute__((__pure__))``) +in order to feature detect which ones can be used and/or to shorten the code. + +Please refer to ``include/linux/compiler_attributes.h`` for more information. + +.. 
.. [c-language] http://www.open-std.org/jtc1/sc22/wg14/www/standards +.. [gcc] https://gcc.gnu.org +.. [clang] https://clang.llvm.org +.. [icc] https://software.intel.com/en-us/c-compilers +.. [gcc-c-dialect-options] https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html +.. [gnu-extensions] https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html +.. [gcc-attribute-syntax] https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html +.. [n2049] http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2049.pdf + diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 9ce7256c6edb..9521c4207f01 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -859,6 +859,7 @@ The keyctl syscall functions are: and either the buffer length or the OtherInfo length exceeds the allowed length. + * Restrict keyring linkage:: long keyctl(KEYCTL_RESTRICT_KEYRING, key_serial_t keyring, @@ -890,6 +891,116 @@ The keyctl syscall functions are: applicable to the asymmetric key type. + * Query an asymmetric key:: + + long keyctl(KEYCTL_PKEY_QUERY, + key_serial_t key_id, unsigned long reserved, + struct keyctl_pkey_query *info); + + Get information about an asymmetric key. The information is returned in + the keyctl_pkey_query struct:: + + __u32 supported_ops; + __u32 key_size; + __u16 max_data_size; + __u16 max_sig_size; + __u16 max_enc_size; + __u16 max_dec_size; + __u32 __spare[10]; + + ``supported_ops`` contains a bit mask of flags indicating which ops are + supported. This is constructed from a bitwise-OR of:: + + KEYCTL_SUPPORTS_{ENCRYPT,DECRYPT,SIGN,VERIFY} + + ``key_size`` indicates the size of the key in bits. + + ``max_*_size`` indicate the maximum sizes in bytes of a blob of data to be + signed, a signature blob, a blob to be encrypted and a blob to be + decrypted. + + ``__spare[]`` must be set to 0. This is intended for future use to hand + over one or more passphrases needed to unlock a key.
+ + If successful, 0 is returned. If the key is not an asymmetric key, + EOPNOTSUPP is returned. + + + * Encrypt, decrypt, sign or verify a blob using an asymmetric key:: + + long keyctl(KEYCTL_PKEY_ENCRYPT, + const struct keyctl_pkey_params *params, + const char *info, + const void *in, + void *out); + + long keyctl(KEYCTL_PKEY_DECRYPT, + const struct keyctl_pkey_params *params, + const char *info, + const void *in, + void *out); + + long keyctl(KEYCTL_PKEY_SIGN, + const struct keyctl_pkey_params *params, + const char *info, + const void *in, + void *out); + + long keyctl(KEYCTL_PKEY_VERIFY, + const struct keyctl_pkey_params *params, + const char *info, + const void *in, + const void *in2); + + Use an asymmetric key to perform a public-key cryptographic operation on a + blob of data. For encryption and verification, the asymmetric key may + only need the public parts to be available, but for decryption and signing + the private parts are required also. + + The parameter block pointed to by params contains a number of integer + values:: + + __s32 key_id; + __u32 in_len; + __u32 out_len; + __u32 in2_len; + + ``key_id`` is the ID of the asymmetric key to be used. ``in_len`` and + ``in2_len`` indicate the amount of data in the in and in2 buffers and + ``out_len`` indicates the size of the out buffer as appropriate for the + above operations. + + For a given operation, the in and out buffers are used as follows:: + + Operation ID in,in_len out,out_len in2,in2_len + ======================= =============== =============== =============== + KEYCTL_PKEY_ENCRYPT Raw data Encrypted data - + KEYCTL_PKEY_DECRYPT Encrypted data Raw data - + KEYCTL_PKEY_SIGN Raw data Signature - + KEYCTL_PKEY_VERIFY Raw data - Signature + + ``info`` is a string of key=value pairs that supply supplementary + information. These include: + + ``enc=<encoding>`` The encoding of the encrypted/signature blob.
This + can be "pkcs1" for RSASSA-PKCS1-v1.5 or + RSAES-PKCS1-v1.5; "pss" for "RSASSA-PSS"; "oaep" for + "RSAES-OAEP". If omitted or is "raw", the raw output + of the encryption function is specified. + + ``hash=<algo>`` If the data buffer contains the output of a hash + function and the encoding includes some indication of + which hash function was used, the hash function can be + specified with this, eg. "hash=sha256". + + The ``__spare[]`` space in the parameter block must be set to 0. This is + intended, amongst other things, to allow the passing of passphrases + required to unlock a key. + + If successful, encrypt, decrypt and sign all return the amount of data + written into the output buffer. Verification returns 0 on success. + + Kernel Services =============== @@ -1483,6 +1594,112 @@ The structure has a number of fields, some of which are mandatory: attempted key link operation. If there is no match, -EINVAL is returned. + * ``int (*asym_eds_op)(struct kernel_pkey_params *params, + const void *in, void *out);`` + ``int (*asym_verify_signature)(struct kernel_pkey_params *params, + const void *in, const void *in2);`` + + These methods are optional. If provided the first allows a key to be + used to encrypt, decrypt or sign a blob of data, and the second allows a + key to verify a signature. 
+ + In all cases, the following information is provided in the params block:: + + struct kernel_pkey_params { + struct key *key; + const char *encoding; + const char *hash_algo; + char *info; + __u32 in_len; + union { + __u32 out_len; + __u32 in2_len; + }; + enum kernel_pkey_operation op : 8; + }; + + This includes the key to be used; a string indicating the encoding to use + (for instance, "pkcs1" may be used with an RSA key to indicate + RSASSA-PKCS1-v1.5 or RSAES-PKCS1-v1.5 encoding or "raw" if no encoding); + the name of the hash algorithm used to generate the data for a signature + (if appropriate); the sizes of the input and output (or second input) + buffers; and the ID of the operation to be performed. + + For a given operation ID, the input and output buffers are used as + follows:: + + Operation ID in,in_len out,out_len in2,in2_len + ======================= =============== =============== =============== + kernel_pkey_encrypt Raw data Encrypted data - + kernel_pkey_decrypt Encrypted data Raw data - + kernel_pkey_sign Raw data Signature - + kernel_pkey_verify Raw data - Signature + + asym_eds_op() deals with encryption, decryption and signature creation as + specified by params->op. Note that params->op is also set for + asym_verify_signature(). + + Encrypting and signature creation both take raw data in the input buffer + and return the encrypted result in the output buffer. Padding may have + been added if an encoding was set. In the case of signature creation, + depending on the encoding, the padding created may need to indicate the + digest algorithm - the name of which should be supplied in hash_algo. + + Decryption takes encrypted data in the input buffer and returns the raw + data in the output buffer. Padding will get checked and stripped off if + an encoding was set. + + Verification takes raw data in the input buffer and the signature in the + second input buffer and checks that the one matches the other. Padding + will be validated. 
Depending on the encoding, the digest algorithm used + to generate the raw data may need to be indicated in hash_algo. + + If successful, asym_eds_op() should return the number of bytes written + into the output buffer. asym_verify_signature() should return 0. + + A variety of errors may be returned, including EOPNOTSUPP if the operation + is not supported; EKEYREJECTED if verification fails; ENOPKG if the + required crypto isn't available. + + + * ``int (*asym_query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info);`` + + This method is optional. If provided it allows information about the + public or asymmetric key held in the key to be determined. + + The parameter block is as for asym_eds_op() and co. but in_len and out_len + are unused. The encoding and hash_algo fields should be used to reduce + the returned buffer/data sizes as appropriate. + + If successful, the following information is filled in:: + + struct kernel_pkey_query { + __u32 supported_ops; + __u32 key_size; + __u16 max_data_size; + __u16 max_sig_size; + __u16 max_enc_size; + __u16 max_dec_size; + }; + + The supported_ops field will contain a bitmask indicating what operations + are supported by the key, including encryption of a blob, decryption of a + blob, signing a blob and verifying the signature on a blob. The following + constants are defined for this:: + + KEYCTL_SUPPORTS_{ENCRYPT,DECRYPT,SIGN,VERIFY} + + The key_size field is the size of the key in bits. max_data_size and + max_sig_size are the maximum raw data and signature sizes for creation and + verification of a signature; max_enc_size and max_dec_size are the maximum + raw data and signature sizes for encryption and decryption. The + max_*_size fields are measured in bytes. + + If successful, 0 will be returned. If the key doesn't support this, + EOPNOTSUPP will be returned. 
+ + Request-Key Callback Service ============================ diff --git a/Documentation/security/self-protection.rst b/Documentation/security/self-protection.rst index e1ca698e0006..f584fb74b4ff 100644 --- a/Documentation/security/self-protection.rst +++ b/Documentation/security/self-protection.rst @@ -302,11 +302,11 @@ sure structure holes are cleared. Memory poisoning ---------------- -When releasing memory, it is best to poison the contents (clear stack on -syscall return, wipe heap memory on a free), to avoid reuse attacks that -rely on the old contents of memory. This frustrates many uninitialized -variable attacks, stack content exposures, heap content exposures, and -use-after-free attacks. +When releasing memory, it is best to poison the contents, to avoid reuse +attacks that rely on the old contents of memory. E.g., clear stack on a +syscall return (``CONFIG_GCC_PLUGIN_STACKLEAK``), wipe heap memory on a +free. This frustrates many uninitialized variable attacks, stack content +exposures, heap content exposures, and use-after-free attacks. Destination tracking -------------------- diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 37a679501ddc..1b8775298cf7 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -89,6 +89,7 @@ show up in /proc/sys/kernel: - shmmni - softlockup_all_cpu_backtrace - soft_watchdog +- stack_erasing - stop-a [ SPARC only ] - sysrq ==> Documentation/admin-guide/sysrq.rst - sysctl_writes_strict @@ -987,6 +988,23 @@ detect a hard lockup condition. ============================================================== +stack_erasing + +This parameter can be used to control kernel stack erasing at the end +of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. + +That erasing reduces the information which kernel stack leak bugs +can reveal and blocks some uninitialized stack variable attacks. 
+The tradeoff is the performance impact: on a single CPU system kernel +compilation sees a 1% slowdown, other systems and workloads may vary. + + 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated. + + 1: kernel stack erasing is enabled (default), it is performed before + returning to the userspace at the end of syscalls. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which can be diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 702898633b00..73aaaa3da436 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -146,3 +146,6 @@ Their order is preserved but their base will be offset early at boot time. Be very careful vs. KASLR when changing anything here. The KASLR address range must not overlap with anything except the KASAN shadow area, which is correct as KASAN disables KASLR. + +For both 4- and 5-level layouts, the STACKLEAK_POISON value in the last 2MB +hole: ffffffffffff4111 diff --git a/MAINTAINERS b/MAINTAINERS index 690c2f68a401..f4855974f325 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3737,6 +3737,11 @@ L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/compal-laptop.c +COMPILER ATTRIBUTES +M: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com> +S: Maintained +F: include/linux/compiler_attributes.h + CONEXANT ACCESSRUNNER USB DRIVER L: accessrunner-general@lists.sourceforge.net W: http://accessrunner.sourceforge.net/ @@ -15858,7 +15863,6 @@ F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c -F: drivers/vhost/vsock.h F: tools/testing/vsock/ VIRTIO CONSOLE DRIVER @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 -PATCHLEVEL = 19 +PATCHLEVEL = 20 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = "People's Front" # *DOCUMENTATION* @@ -485,7 +485,7 @@ ifneq 
($(KBUILD_SRC),) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkmakefile $(srctree) endif -ifeq ($(cc-name),clang) +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) ifneq ($(CROSS_COMPILE),) CLANG_TARGET := --target=$(notdir $(CROSS_COMPILE:%-=%)) GCC_TOOLCHAIN_DIR := $(dir $(shell which $(LD))) @@ -702,7 +702,7 @@ stackp-flags-$(CONFIG_STACKPROTECTOR_STRONG) := -fstack-protector-strong KBUILD_CFLAGS += $(stackp-flags-y) -ifeq ($(cc-name),clang) +ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/arch/Kconfig b/arch/Kconfig index ed27fd262627..e1e540ffa979 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -429,6 +429,13 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +config HAVE_ARCH_STACKLEAK + bool + help + An architecture should select this if it has the code which + fills the used part of the kernel stack with the STACKLEAK_POISON + value before returning from system calls. 
+ config HAVE_STACKPROTECTOR bool help diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi index c50c36baba75..8bf1c17f8cef 100644 --- a/arch/arm/boot/dts/stm32mp157c.dtsi +++ b/arch/arm/boot/dts/stm32mp157c.dtsi @@ -923,7 +923,7 @@ interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>; clocks = <&rcc HASH1>; resets = <&rcc HASH1_R>; - dmas = <&mdma1 31 0x10 0x1000A02 0x0 0x0 0x0>; + dmas = <&mdma1 31 0x10 0x1000A02 0x0 0x0>; dma-names = "in"; dma-maxburst = <2>; status = "disabled"; diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 63af6234c1b6..1c7616815a86 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1,6 +1,7 @@ CONFIG_SYSVIPC=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_PREEMPT=y CONFIG_CGROUPS=y CONFIG_BLK_DEV_INITRD=y CONFIG_EMBEDDED=y diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c index af318d958fd2..3d191fd52910 100644 --- a/arch/arm/mach-omap1/board-ams-delta.c +++ b/arch/arm/mach-omap1/board-ams-delta.c @@ -773,7 +773,7 @@ static struct plat_serial8250_port ams_delta_modem_ports[] = { { .membase = IOMEM(MODEM_VIRT), .mapbase = MODEM_PHYS, - .irq = -EINVAL, /* changed later */ + .irq = IRQ_NOTCONNECTED, /* changed later */ .flags = UPF_BOOT_AUTOCONF, .irqflags = IRQF_TRIGGER_RISING, .iotype = UPIO_MEM, @@ -864,8 +864,7 @@ static int __init modem_nreset_init(void) /* - * This function expects MODEM IRQ number already assigned to the port - * and fails if it's not. + * This function expects MODEM IRQ number already assigned to the port. * The MODEM device requires its RESET# pin kept high during probe. 
* That requirement can be fulfilled in several ways: * - with a descriptor of already functional modem_nreset regulator @@ -888,9 +887,6 @@ static int __init ams_delta_modem_init(void) if (!machine_is_ams_delta()) return -ENODEV; - if (ams_delta_modem_ports[0].irq < 0) - return ams_delta_modem_ports[0].irq; - omap_cfg_reg(M14_1510_GPIO2); /* Initialize the modem_nreset regulator consumer before use */ diff --git a/arch/arm/plat-orion/mpp.c b/arch/arm/plat-orion/mpp.c index 5b4ff9373c89..8a6880d528b6 100644 --- a/arch/arm/plat-orion/mpp.c +++ b/arch/arm/plat-orion/mpp.c @@ -28,10 +28,15 @@ void __init orion_mpp_conf(unsigned int *mpp_list, unsigned int variant_mask, unsigned int mpp_max, void __iomem *dev_bus) { unsigned int mpp_nr_regs = (1 + mpp_max/8); - u32 mpp_ctrl[mpp_nr_regs]; + u32 mpp_ctrl[8]; int i; printk(KERN_DEBUG "initial MPP regs:"); + if (mpp_nr_regs > ARRAY_SIZE(mpp_ctrl)) { + printk(KERN_ERR "orion_mpp_conf: invalid mpp_max\n"); + return; + } + for (i = 0; i < mpp_nr_regs; i++) { mpp_ctrl[i] = readl(mpp_ctrl_addr(i, dev_bus)); printk(" %08x", mpp_ctrl[i]); diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index b4e994cd3a42..6cb9fc7e9382 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -134,6 +134,7 @@ vdso_install: archclean: $(Q)$(MAKE) $(clean)=$(boot) +ifeq ($(KBUILD_EXTMOD),) # We need to generate vdso-offsets.h before compiling certain files in kernel/. 
# In order to do that, we should use the archprepare target, but we can't since # asm-offsets.h is included in some files used to generate vdso-offsets.h, and @@ -143,6 +144,7 @@ archclean: prepare: vdso_prepare vdso_prepare: prepare0 $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso include/generated/vdso-offsets.h +endif define archhelp echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)' diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 3cb995606e60..c9a57d11330b 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -308,6 +308,9 @@ CONFIG_SERIAL_XILINX_PS_UART_CONSOLE=y CONFIG_SERIAL_MVEBU_UART=y CONFIG_SERIAL_DEV_BUS=y CONFIG_VIRTIO_CONSOLE=y +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_TCG_TPM=y CONFIG_TCG_TIS_I2C_INFINEON=y CONFIG_I2C_CHARDEV=y diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index f46d57c31443..6b5037ed15b2 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -58,7 +58,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, /** * elfcorehdr_read - read from ELF core header * @buf: buffer where the data is placed - * @csize: number of bytes to read + * @count: number of bytes to read * @ppos: address in the memory * * This function reads @count bytes from elf core header which exists diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 9b65132e789a..2a5b338b2542 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -23,7 +23,9 @@ #include <linux/slab.h> #include <linux/stop_machine.h> #include <linux/sched/debug.h> +#include <linux/set_memory.h> #include <linux/stringify.h> +#include <linux/vmalloc.h> #include <asm/traps.h> #include <asm/ptrace.h> #include <asm/cacheflush.h> @@ -42,10 +44,21 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes post_kprobe_handler(struct 
kprobe_ctlblk *, struct pt_regs *); +static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) +{ + void *addrs[1]; + u32 insns[1]; + + addrs[0] = addr; + insns[0] = opcode; + + return aarch64_insn_patch_text(addrs, insns, 1); +} + static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { /* prepare insn slot */ - p->ainsn.api.insn[0] = cpu_to_le32(p->opcode); + patch_text(p->ainsn.api.insn, p->opcode); flush_icache_range((uintptr_t) (p->ainsn.api.insn), (uintptr_t) (p->ainsn.api.insn) + @@ -118,15 +131,15 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) +void *alloc_insn_page(void) { - void *addrs[1]; - u32 insns[1]; + void *page; - addrs[0] = (void *)addr; - insns[0] = (u32)opcode; + page = vmalloc_exec(PAGE_SIZE); + if (page) + set_memory_ro((unsigned long)page, 1); - return aarch64_insn_patch_text(addrs, insns, 1); + return page; } /* arm kprobe: install breakpoint in text */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index ce99c58cd1f1..d9a4c2d6dd8b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -497,25 +497,3 @@ void arch_setup_new_exec(void) { current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; } - -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK -void __used stackleak_check_alloca(unsigned long size) -{ - unsigned long stack_left; - unsigned long current_sp = current_stack_pointer; - struct stack_info info; - - BUG_ON(!on_accessible_stack(current, current_sp, &info)); - - stack_left = current_sp - info.low; - - /* - * There's a good chance we're almost out of stack space if this - * is true. Using panic() over BUG() is more likely to give - * reliable debugging output. 
- */ - if (size >= stack_left) - panic("alloca() over the kernel stack boundary\n"); -} -EXPORT_SYMBOL(stackleak_check_alloca); -#endif diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3a703e5d4e32..a3ac26284845 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -160,6 +160,7 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, __dma_unmap_area(phys_to_virt(paddr), size, dir); } +#ifdef CONFIG_IOMMU_DMA static int __swiotlb_get_sgtable_page(struct sg_table *sgt, struct page *page, size_t size) { @@ -188,6 +189,7 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, return ret; } +#endif /* CONFIG_IOMMU_DMA */ static int __init atomic_pool_init(void) { diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 15a84cfd0719..68410490e12f 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -128,7 +128,7 @@ cflags-y += -ffreestanding # clang's output will be based upon the build machine. So for clang we simply # unconditionally specify -EB or -EL as appropriate. # -ifeq ($(cc-name),clang) +ifdef CONFIG_CC_IS_CLANG cflags-$(CONFIG_CPU_BIG_ENDIAN) += -EB cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -EL else diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 34605ca21498..58a0315ad743 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -10,7 +10,7 @@ ccflags-vdso := \ $(filter -march=%,$(KBUILD_CFLAGS)) \ -D__VDSO__ -ifeq ($(cc-name),clang) +ifdef CONFIG_CC_IS_CLANG ccflags-vdso += $(filter --target=%,$(KBUILD_CFLAGS)) endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d51b2bd4aa1..8be31261aec8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -930,10 +930,6 @@ config FSL_GTM help Freescale General-purpose Timers support -# Yes MCA RS/6000s exist but Linux-PPC does not currently support any -config MCA - bool - # Platforms that what PCI turned unconditionally just do select PCI # in their config node. 
Platforms that want to choose at config # time should select PPC_PCI_CHOICE @@ -944,7 +940,6 @@ config PCI bool "PCI support" if PPC_PCI_CHOICE default y if !40x && !CPM2 && !PPC_8xx && !PPC_83xx \ && !PPC_85xx && !PPC_86xx && !GAMECUBE_COMMON - default PCI_QSPAN if PPC_8xx select GENERIC_PCI_IOMAP help Find out whether your system includes a PCI bus. PCI is the name of @@ -958,14 +953,6 @@ config PCI_DOMAINS config PCI_SYSCALL def_bool PCI -config PCI_QSPAN - bool "QSpan PCI" - depends on PPC_8xx - select PPC_I8259 - help - Say Y here if you have a system based on a Motorola 8xx-series - embedded processor with a QSPAN PCI interface, otherwise say N. - config PCI_8260 bool depends on PCI && 8260 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 17be664dafa2..8a2ce14d68d0 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -96,7 +96,7 @@ aflags-$(CONFIG_CPU_BIG_ENDIAN) += $(call cc-option,-mabi=elfv1) aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mabi=elfv2 endif -ifneq ($(cc-name),clang) +ifndef CONFIG_CC_IS_CLANG cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mno-strict-align endif @@ -175,7 +175,7 @@ endif # Work around gcc code-gen bugs with -pg / -fno-omit-frame-pointer in gcc <= 4.8 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44199 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52828 -ifneq ($(cc-name),clang) +ifndef CONFIG_CC_IS_CLANG CC_FLAGS_FTRACE += $(call cc-ifversion, -lt, 0409, -mno-sched-epilog) endif endif diff --git a/arch/powerpc/boot/dts/fsl/t2080rdb.dts b/arch/powerpc/boot/dts/fsl/t2080rdb.dts index 55c0210a771d..092a400740f8 100644 --- a/arch/powerpc/boot/dts/fsl/t2080rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t2080rdb.dts @@ -77,12 +77,12 @@ }; ethernet@f0000 { - phy-handle = <&xg_cs4315_phy1>; + phy-handle = <&xg_cs4315_phy2>; phy-connection-type = "xgmii"; }; ethernet@f2000 { - phy-handle = <&xg_cs4315_phy2>; + phy-handle = <&xg_cs4315_phy1>; phy-connection-type = "xgmii"; }; diff --git a/arch/powerpc/boot/dts/mpc885ads.dts 
b/arch/powerpc/boot/dts/mpc885ads.dts index 5b037f51741d..3aa300afbbca 100644 --- a/arch/powerpc/boot/dts/mpc885ads.dts +++ b/arch/powerpc/boot/dts/mpc885ads.dts @@ -72,7 +72,7 @@ #address-cells = <1>; #size-cells = <1>; device_type = "soc"; - ranges = <0x0 0xff000000 0x4000>; + ranges = <0x0 0xff000000 0x28000>; bus-frequency = <0>; // Temporary -- will go away once kernel uses ranges for get_immrbase(). @@ -224,6 +224,17 @@ #size-cells = <0>; }; }; + + crypto@20000 { + compatible = "fsl,sec1.2", "fsl,sec1.0"; + reg = <0x20000 0x8000>; + interrupts = <1 1>; + interrupt-parent = <&PIC>; + fsl,num-channels = <1>; + fsl,channel-fifo-len = <24>; + fsl,exec-units-mask = <0x4c>; + fsl,descriptor-types-mask = <0x05000154>; + }; }; chosen { diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 31733a95bbd0..3d5acd2b113a 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -36,6 +36,11 @@ int raw_patch_instruction(unsigned int *addr, unsigned int instr); int patch_instruction_site(s32 *addr, unsigned int instr); int patch_branch_site(s32 *site, unsigned long target, int flags); +static inline unsigned long patch_site_addr(s32 *site) +{ + return (unsigned long)site + *site; +} + int instr_is_relative_branch(unsigned int instr); int instr_is_relative_link_branch(unsigned int instr); int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr); diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h index 4f547752ae79..fa05aa566ece 100644 --- a/arch/powerpc/include/asm/mmu-8xx.h +++ b/arch/powerpc/include/asm/mmu-8xx.h @@ -34,20 +34,12 @@ * respectively NA for All or X for Supervisor and no access for User. 
* Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) - * We also use the 2nd APG bit for _PAGE_ACCESSED when having SWAP: - * When that bit is not set access is done iaw "all user" - * which means no access iaw page rules. - * Therefore, we define 4 APG groups. lsb is _PMD_USER, 2nd is _PAGE_ACCESSED - * 0x => No access => 11 (all accesses performed as user iaw page definition) - * 10 => No user => 01 (all accesses performed according to page definition) - * 11 => User => 00 (all accesses performed as supervisor iaw page definition) + * Therefore, we define 2 APG groups. lsb is _PMD_USER + * 0 => No user => 01 (all accesses performed according to page definition) + * 1 => User => 00 (all accesses performed as supervisor iaw page definition) * We define all 16 groups so that all other bits of APG can take any value */ -#ifdef CONFIG_SWAP -#define MI_APG_INIT 0xf4f4f4f4 -#else #define MI_APG_INIT 0x44444444 -#endif /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in @@ -115,20 +107,12 @@ * Supervisor and no access for user and NA for ALL. * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) - * We also use the 2nd APG bit for _PAGE_ACCESSED when having SWAP: - * When that bit is not set access is done iaw "all user" - * which means no access iaw page rules. - * Therefore, we define 4 APG groups. lsb is _PMD_USER, 2nd is _PAGE_ACCESSED - * 0x => No access => 11 (all accesses performed as user iaw page definition) - * 10 => No user => 01 (all accesses performed according to page definition) - * 11 => User => 00 (all accesses performed as supervisor iaw page definition) + * Therefore, we define 2 APG groups. 
lsb is _PMD_USER + * 0 => No user => 01 (all accesses performed according to page definition) + * 1 => User => 00 (all accesses performed as supervisor iaw page definition) * We define all 16 groups so that all other bits of APG can take any value */ -#ifdef CONFIG_SWAP -#define MD_APG_INIT 0xf4f4f4f4 -#else #define MD_APG_INIT 0x44444444 -#endif /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in @@ -180,12 +164,6 @@ */ #define SPRN_M_TW 799 -/* APGs */ -#define M_APG0 0x00000000 -#define M_APG1 0x00000020 -#define M_APG2 0x00000040 -#define M_APG3 0x00000060 - #ifdef CONFIG_PPC_MM_SLICES #include <asm/nohash/32/slice.h> #define SLICE_ARRAY_SIZE (1 << (32 - SLICE_LOW_SHIFT - 1)) @@ -251,6 +229,15 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) BUG(); } +/* patch sites */ +extern s32 patch__itlbmiss_linmem_top; +extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp; +extern s32 patch__fixupdar_linmem_top; + +extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2; +extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, patch__dtlbmiss_exit_3; +extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf; + #endif /* !__ASSEMBLY__ */ #if defined(CONFIG_PPC_4K_PAGES) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index bb38dd67d47d..1b06add4f092 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -5,6 +5,7 @@ #include <linux/spinlock.h> #include <asm/page.h> #include <linux/time.h> +#include <linux/cpumask.h> /* * Definitions for talking to the RTAS on CHRP machines. 
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 134a573a9f2d..3b67b9533c82 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -31,6 +31,7 @@ #include <asm/asm-offsets.h> #include <asm/ptrace.h> #include <asm/export.h> +#include <asm/code-patching-asm.h> #if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 /* By simply checking Address >= 0x80000000, we know if its a kernel address */ @@ -318,8 +319,8 @@ InstructionTLBMiss: cmpli cr0, r11, PAGE_OFFSET@h #ifndef CONFIG_PIN_TLB_TEXT /* It is assumed that kernel code fits into the first 8M page */ -_ENTRY(ITLBMiss_cmp) - cmpli cr7, r11, (PAGE_OFFSET + 0x0800000)@h +0: cmpli cr7, r11, (PAGE_OFFSET + 0x0800000)@h + patch_site 0b, patch__itlbmiss_linmem_top #endif #endif #endif @@ -353,13 +354,14 @@ _ENTRY(ITLBMiss_cmp) #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtcr r12 #endif - -#ifdef CONFIG_SWAP - rlwinm r11, r10, 31, _PAGE_ACCESSED >> 1 -#endif /* Load the MI_TWC with the attributes for this "segment." */ mtspr SPRN_MI_TWC, r11 /* Set segment attributes */ +#ifdef CONFIG_SWAP + rlwinm r11, r10, 32-5, _PAGE_PRESENT + and r11, r11, r10 + rlwimi r10, r11, 0, _PAGE_PRESENT +#endif li r11, RPN_PATTERN | 0x200 /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 20 and 23 must be clear. 
@@ -372,16 +374,17 @@ _ENTRY(ITLBMiss_cmp) mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ /* Restore registers */ -_ENTRY(itlb_miss_exit_1) - mfspr r10, SPRN_SPRG_SCRATCH0 +0: mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfspr r12, SPRN_SPRG_SCRATCH2 #endif rfi + patch_site 0b, patch__itlbmiss_exit_1 + #ifdef CONFIG_PERF_EVENTS -_ENTRY(itlb_miss_perf) - lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha + patch_site 0f, patch__itlbmiss_perf +0: lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha lwz r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10) addi r11, r11, 1 stw r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10) @@ -435,11 +438,11 @@ DataStoreTLBMiss: #ifndef CONFIG_PIN_TLB_IMMR cmpli cr0, r11, VIRT_IMMR_BASE@h #endif -_ENTRY(DTLBMiss_cmp) - cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h +0: cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h + patch_site 0b, patch__dtlbmiss_linmem_top #ifndef CONFIG_PIN_TLB_IMMR -_ENTRY(DTLBMiss_jmp) - beq- DTLBMissIMMR +0: beq- DTLBMissIMMR + patch_site 0b, patch__dtlbmiss_immr_jmp #endif blt cr7, DTLBMissLinear lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha @@ -470,14 +473,22 @@ _ENTRY(DTLBMiss_jmp) * above. */ rlwimi r11, r10, 0, _PAGE_GUARDED -#ifdef CONFIG_SWAP - /* _PAGE_ACCESSED has to be set. We use second APG bit for that, 0 - * on that bit will represent a Non Access group - */ - rlwinm r11, r10, 31, _PAGE_ACCESSED >> 1 -#endif mtspr SPRN_MD_TWC, r11 + /* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set. + * We also need to know if the insn is a load/store, so: + * Clear _PAGE_PRESENT and load that which will + * trap into DTLB Error with store bit set accordinly. + */ + /* PRESENT=0x1, ACCESSED=0x20 + * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); + * r10 = (r10 & ~PRESENT) | r11; + */ +#ifdef CONFIG_SWAP + rlwinm r11, r10, 32-5, _PAGE_PRESENT + and r11, r11, r10 + rlwimi r10, r11, 0, _PAGE_PRESENT +#endif /* The Linux PTE won't go exactly into the MMU TLB. 
* Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior @@ -489,14 +500,16 @@ _ENTRY(DTLBMiss_jmp) /* Restore registers */ mtspr SPRN_DAR, r11 /* Tag DAR */ -_ENTRY(dtlb_miss_exit_1) - mfspr r10, SPRN_SPRG_SCRATCH0 + +0: mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 mfspr r12, SPRN_SPRG_SCRATCH2 rfi + patch_site 0b, patch__dtlbmiss_exit_1 + #ifdef CONFIG_PERF_EVENTS -_ENTRY(dtlb_miss_perf) - lis r10, (dtlb_miss_counter - PAGE_OFFSET)@ha + patch_site 0f, patch__dtlbmiss_perf +0: lis r10, (dtlb_miss_counter - PAGE_OFFSET)@ha lwz r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10) addi r11, r11, 1 stw r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10) @@ -637,8 +650,8 @@ InstructionBreakpoint: */ DTLBMissIMMR: mtcr r12 - /* Set 512k byte guarded page and mark it valid and accessed */ - li r10, MD_PS512K | MD_GUARDED | MD_SVALID | M_APG2 + /* Set 512k byte guarded page and mark it valid */ + li r10, MD_PS512K | MD_GUARDED | MD_SVALID mtspr SPRN_MD_TWC, r10 mfspr r10, SPRN_IMMR /* Get current IMMR */ rlwinm r10, r10, 0, 0xfff80000 /* Get 512 kbytes boundary */ @@ -648,16 +661,17 @@ DTLBMissIMMR: li r11, RPN_PATTERN mtspr SPRN_DAR, r11 /* Tag DAR */ -_ENTRY(dtlb_miss_exit_2) - mfspr r10, SPRN_SPRG_SCRATCH0 + +0: mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 mfspr r12, SPRN_SPRG_SCRATCH2 rfi + patch_site 0b, patch__dtlbmiss_exit_2 DTLBMissLinear: mtcr r12 - /* Set 8M byte page and mark it valid and accessed */ - li r11, MD_PS8MEG | MD_SVALID | M_APG2 + /* Set 8M byte page and mark it valid */ + li r11, MD_PS8MEG | MD_SVALID mtspr SPRN_MD_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ @@ -666,28 +680,29 @@ DTLBMissLinear: li r11, RPN_PATTERN mtspr SPRN_DAR, r11 /* Tag DAR */ -_ENTRY(dtlb_miss_exit_3) - mfspr r10, SPRN_SPRG_SCRATCH0 + +0: mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 mfspr r12, 
SPRN_SPRG_SCRATCH2 rfi + patch_site 0b, patch__dtlbmiss_exit_3 #ifndef CONFIG_PIN_TLB_TEXT ITLBMissLinear: mtcr r12 - /* Set 8M byte page and mark it valid,accessed */ - li r11, MI_PS8MEG | MI_SVALID | M_APG2 + /* Set 8M byte page and mark it valid */ + li r11, MI_PS8MEG | MI_SVALID mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ -_ENTRY(itlb_miss_exit_2) - mfspr r10, SPRN_SPRG_SCRATCH0 +0: mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 mfspr r12, SPRN_SPRG_SCRATCH2 rfi + patch_site 0b, patch__itlbmiss_exit_2 #endif /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions @@ -705,8 +720,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ mfspr r11, SPRN_M_TW /* Get level 1 table */ blt+ 3f rlwinm r11, r10, 16, 0xfff8 -_ENTRY(FixupDAR_cmp) - cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h + +0: cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h + patch_site 0b, patch__fixupdar_linmem_top + /* create physical page address from effective address */ tophys(r11, r10) blt- cr7, 201f @@ -960,7 +977,7 @@ initial_mmu: ori r8, r8, MI_EVALID /* Mark it valid */ mtspr SPRN_MI_EPN, r8 li r8, MI_PS8MEG /* Set 8M byte page */ - ori r8, r8, MI_SVALID | M_APG2 /* Make it valid, APG 2 */ + ori r8, r8, MI_SVALID /* Make it valid */ mtspr SPRN_MI_TWC, r8 li r8, MI_BOOTINIT /* Create RPN for address 0 */ mtspr SPRN_MI_RPN, r8 /* Store TLB entry */ @@ -987,7 +1004,7 @@ initial_mmu: ori r8, r8, MD_EVALID /* Mark it valid */ mtspr SPRN_MD_EPN, r8 li r8, MD_PS512K | MD_GUARDED /* Set 512k byte page */ - ori r8, r8, MD_SVALID | M_APG2 /* Make it valid and accessed */ + ori r8, r8, MD_SVALID /* Make it valid */ mtspr SPRN_MD_TWC, r8 mr r8, r9 /* Create paddr for TLB */ ori r8, r8, MI_BOOTINIT|0x2 /* Inhibit cache -- Cort */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 
4d5322cfad25..96f34730010f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -590,12 +590,11 @@ void flush_all_to_thread(struct task_struct *tsk) if (tsk->thread.regs) { preempt_disable(); BUG_ON(tsk != current); - save_all(tsk); - #ifdef CONFIG_SPE if (tsk->thread.regs->msr & MSR_SPE) tsk->thread.spefscr = mfspr(SPRN_SPEFSCR); #endif + save_all(tsk); preempt_enable(); } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bf8def2159c3..d65b961661fb 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2337,8 +2337,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) kvmppc_core_prepare_to_enter(vcpu); return; } - dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC - / tb_ticks_per_sec; + dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now); hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL); vcpu->arch.timer_running = 1; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index fa888bfc347e..9f5b8c01c4e1 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -61,11 +61,10 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) dec_time = vcpu->arch.dec; /* - * Guest timebase ticks at the same frequency as host decrementer. - * So use the host decrementer calculations for decrementer emulation. + * Guest timebase ticks at the same frequency as host timebase. + * So use the host timebase calculations for decrementer emulation. 
*/ - dec_time = dec_time << decrementer_clockevent.shift; - do_div(dec_time, decrementer_clockevent.mult); + dec_time = tb_to_ns(dec_time); dec_nsec = do_div(dec_time, NSEC_PER_SEC); hrtimer_start(&vcpu->arch.dec_timer, ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 36484a2ef915..01b7f5107c3a 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -13,6 +13,7 @@ */ #include <linux/memblock.h> +#include <linux/mmu_context.h> #include <asm/fixmap.h> #include <asm/code-patching.h> @@ -79,7 +80,7 @@ void __init MMU_init_hw(void) for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { mtspr(SPRN_MD_CTR, ctr | (i << 8)); mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID | M_APG2); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); addr += LARGE_PAGE_SIZE_8M; mem -= LARGE_PAGE_SIZE_8M; @@ -97,22 +98,13 @@ static void __init mmu_mapin_immr(void) map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } -/* Address of instructions to patch */ -#ifndef CONFIG_PIN_TLB_IMMR -extern unsigned int DTLBMiss_jmp; -#endif -extern unsigned int DTLBMiss_cmp, FixupDAR_cmp; -#ifndef CONFIG_PIN_TLB_TEXT -extern unsigned int ITLBMiss_cmp; -#endif - -static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) +static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) { - unsigned int instr = *addr; + unsigned int instr = *(unsigned int *)patch_site_addr(site); instr &= 0xffff0000; instr |= (unsigned long)__va(mapped) >> 16; - patch_instruction(addr, instr); + patch_instruction_site(site, instr); } unsigned long __init mmu_mapin_ram(unsigned long top) @@ -123,17 +115,17 @@ unsigned long __init mmu_mapin_ram(unsigned long top) mapped = 0; mmu_mapin_immr(); #ifndef CONFIG_PIN_TLB_IMMR - patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP); + 
patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); #endif #ifndef CONFIG_PIN_TLB_TEXT - mmu_patch_cmp_limit(&ITLBMiss_cmp, 0); + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); #endif } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); } - mmu_patch_cmp_limit(&DTLBMiss_cmp, mapped); - mmu_patch_cmp_limit(&FixupDAR_cmp, mapped); + mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); + mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped); /* If the size of RAM is not an exact power of two, we may not * have covered RAM in its entirety with 8 MiB diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c index 6c0020d1c561..e38f74e9e7a4 100644 --- a/arch/powerpc/perf/8xx-pmu.c +++ b/arch/powerpc/perf/8xx-pmu.c @@ -31,9 +31,6 @@ extern unsigned long itlb_miss_counter, dtlb_miss_counter; extern atomic_t instruction_counter; -extern unsigned int itlb_miss_perf, dtlb_miss_perf; -extern unsigned int itlb_miss_exit_1, itlb_miss_exit_2; -extern unsigned int dtlb_miss_exit_1, dtlb_miss_exit_2, dtlb_miss_exit_3; static atomic_t insn_ctr_ref; static atomic_t itlb_miss_ref; @@ -103,22 +100,22 @@ static int mpc8xx_pmu_add(struct perf_event *event, int flags) break; case PERF_8xx_ID_ITLB_LOAD_MISS: if (atomic_inc_return(&itlb_miss_ref) == 1) { - unsigned long target = (unsigned long)&itlb_miss_perf; + unsigned long target = patch_site_addr(&patch__itlbmiss_perf); - patch_branch(&itlb_miss_exit_1, target, 0); + patch_branch_site(&patch__itlbmiss_exit_1, target, 0); #ifndef CONFIG_PIN_TLB_TEXT - patch_branch(&itlb_miss_exit_2, target, 0); + patch_branch_site(&patch__itlbmiss_exit_2, target, 0); #endif } val = itlb_miss_counter; break; case PERF_8xx_ID_DTLB_LOAD_MISS: if (atomic_inc_return(&dtlb_miss_ref) == 1) { - unsigned long target = (unsigned long)&dtlb_miss_perf; + unsigned long target = patch_site_addr(&patch__dtlbmiss_perf); - patch_branch(&dtlb_miss_exit_1, target, 0); - patch_branch(&dtlb_miss_exit_2, target, 0); - 
patch_branch(&dtlb_miss_exit_3, target, 0); + patch_branch_site(&patch__dtlbmiss_exit_1, target, 0); + patch_branch_site(&patch__dtlbmiss_exit_2, target, 0); + patch_branch_site(&patch__dtlbmiss_exit_3, target, 0); } val = dtlb_miss_counter; break; @@ -180,17 +177,17 @@ static void mpc8xx_pmu_del(struct perf_event *event, int flags) break; case PERF_8xx_ID_ITLB_LOAD_MISS: if (atomic_dec_return(&itlb_miss_ref) == 0) { - patch_instruction(&itlb_miss_exit_1, insn); + patch_instruction_site(&patch__itlbmiss_exit_1, insn); #ifndef CONFIG_PIN_TLB_TEXT - patch_instruction(&itlb_miss_exit_2, insn); + patch_instruction_site(&patch__itlbmiss_exit_2, insn); #endif } break; case PERF_8xx_ID_DTLB_LOAD_MISS: if (atomic_dec_return(&dtlb_miss_ref) == 0) { - patch_instruction(&dtlb_miss_exit_1, insn); - patch_instruction(&dtlb_miss_exit_2, insn); - patch_instruction(&dtlb_miss_exit_3, insn); + patch_instruction_site(&patch__dtlbmiss_exit_1, insn); + patch_instruction_site(&patch__dtlbmiss_exit_2, insn); + patch_instruction_site(&patch__dtlbmiss_exit_3, insn); } break; } diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index 2a9d66254ffc..5326ece36120 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -29,6 +29,7 @@ config KILAUEA select 405EX select PPC40x_SIMPLE select PPC4xx_PCI_EXPRESS + select PCI select PCI_MSI select PPC4xx_MSI help diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index f024efd5a4c2..9a85d350b1b6 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -21,6 +21,7 @@ config BLUESTONE depends on 44x select PPC44x_SIMPLE select APM821xx + select PCI select PCI_MSI select PPC4xx_MSI select PPC4xx_PCI_EXPRESS @@ -200,6 +201,7 @@ config AKEBONO select SWIOTLB select 476FPE select PPC4xx_PCI_EXPRESS + select PCI select PCI_MSI select PPC4xx_HSTA_MSI select I2C diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
b/arch/powerpc/platforms/pseries/lparcfg.c index 8bd590af488a..794487313cc8 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -26,6 +26,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/hugetlb.h> #include <asm/lppaca.h> #include <asm/hvcall.h> #include <asm/firmware.h> @@ -36,6 +37,7 @@ #include <asm/vio.h> #include <asm/mmu.h> #include <asm/machdep.h> +#include <asm/drmem.h> #include "pseries.h" @@ -433,6 +435,16 @@ static void parse_em_data(struct seq_file *m) seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); } +static void maxmem_data(struct seq_file *m) +{ + unsigned long maxmem = 0; + + maxmem += drmem_info->n_lmbs * drmem_info->lmb_size; + maxmem += hugetlb_total_pages() * PAGE_SIZE; + + seq_printf(m, "MaxMem=%ld\n", maxmem); +} + static int pseries_lparcfg_data(struct seq_file *m, void *v) { int partition_potential_processors; @@ -491,6 +503,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); + maxmem_data(m); return 0; } diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 69e7fb47bcaa..878f9c1d3615 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -11,6 +11,12 @@ UBSAN_SANITIZE := n ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) +ifdef CONFIG_CC_IS_CLANG +# clang stores addresses on the stack causing the frame size to blow +# out. 
See https://github.com/ClangBuiltLinux/linux/issues/252 +KBUILD_CFLAGS += -Wframe-larger-than=4096 +endif + ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y += xmon.o nonstdio.o spr_access.o diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 36473d7dbaac..07fa9ea75fea 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -1,6 +1,3 @@ -CONFIG_SMP=y -CONFIG_PCI=y -CONFIG_PCIE_XILINX=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_IKCONFIG=y @@ -11,10 +8,15 @@ CONFIG_CFS_BANDWIDTH=y CONFIG_CGROUP_BPF=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_CHECKPOINT_RESTORE=y CONFIG_BPF_SYSCALL=y +CONFIG_SMP=y +CONFIG_PCI=y +CONFIG_PCIE_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -59,6 +61,7 @@ CONFIG_USB_OHCI_HCD_PLATFORM=y CONFIG_USB_STORAGE=y CONFIG_USB_UAS=y CONFIG_VIRTIO_MMIO=y +CONFIG_SIFIVE_PLIC=y CONFIG_RAS=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y @@ -72,8 +75,5 @@ CONFIG_NFS_V4=y CONFIG_NFS_V4_1=y CONFIG_NFS_V4_2=y CONFIG_ROOT_NFS=y -# CONFIG_RCU_TRACE is not set CONFIG_CRYPTO_USER_API_HASH=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_SIFIVE_PLIC=y +# CONFIG_RCU_TRACE is not set diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 302795c47c06..81038ab357ce 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -236,7 +236,7 @@ static inline unsigned long current_stack_pointer(void) return sp; } -static __no_sanitize_address_or_inline unsigned short stap(void) +static __no_kasan_or_inline unsigned short stap(void) { unsigned short cpu_address; @@ -330,7 +330,7 @@ static inline void __load_psw(psw_t psw) * Set PSW mask to specified value, while leaving the * PSW addr pointing to the next instruction. 
*/ -static __no_sanitize_address_or_inline void __load_psw_mask(unsigned long mask) +static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) { unsigned long addr; psw_t psw; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c51c989c19c0..ba7e3464ee92 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -129,6 +129,7 @@ config X86 select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_THREAD_STRUCT_WHITELIST + select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 8f0c4c9fc904..51079fc9298f 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -113,7 +113,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) { int err; - memset(&cpu.flags, 0, sizeof cpu.flags); + memset(&cpu.flags, 0, sizeof(cpu.flags)); cpu.level = 3; if (has_eflag(X86_EFLAGS_AC)) diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index b25c53527a94..023bf1c3de8b 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -50,7 +50,7 @@ static void parse_earlyprintk(void) int pos = 0; int port = 0; - if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + if (cmdline_find_option("earlyprintk", arg, sizeof(arg)) > 0) { char *e; if (!strncmp(arg, "serial", 6)) { @@ -124,7 +124,7 @@ static void parse_console_uart8250(void) * console=uart8250,io,0x3f8,115200n8 * need to make sure it is last one console ! 
*/ - if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + if (cmdline_find_option("console", optstr, sizeof(optstr)) <= 0) return; options = optstr; diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 223e42527077..6c176b6a42ad 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -76,7 +76,7 @@ static int get_edd_info(u8 devno, struct edd_info *ei) { struct biosregs ireg, oreg; - memset(ei, 0, sizeof *ei); + memset(ei, 0, sizeof(*ei)); /* Check Extensions Present */ @@ -133,7 +133,7 @@ void query_edd(void) struct edd_info ei, *edp; u32 *mbrptr; - if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) { + if (cmdline_find_option("edd", eddarg, sizeof(eddarg)) > 0) { if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip")) { do_edd = 1; do_mbr = 0; @@ -166,7 +166,7 @@ void query_edd(void) */ if (!get_edd_info(devno, &ei) && boot_params.eddbuf_entries < EDDMAXNR) { - memcpy(edp, &ei, sizeof ei); + memcpy(edp, &ei, sizeof(ei)); edp++; boot_params.eddbuf_entries++; } diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 9bcea386db65..73532543d689 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -36,8 +36,8 @@ static void copy_boot_params(void) const struct old_cmdline * const oldcmd = (const struct old_cmdline *)OLD_CL_ADDRESS; - BUILD_BUG_ON(sizeof boot_params != 4096); - memcpy(&boot_params.hdr, &hdr, sizeof hdr); + BUILD_BUG_ON(sizeof(boot_params) != 4096); + memcpy(&boot_params.hdr, &hdr, sizeof(hdr)); if (!boot_params.hdr.cmd_line_ptr && oldcmd->cl_magic == OLD_CL_MAGIC) { diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index d9c28c87e477..7df2b28207be 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -26,7 +26,7 @@ static int detect_memory_e820(void) initregs(&ireg); ireg.ax = 0xe820; - ireg.cx = sizeof buf; + ireg.cx = sizeof(buf); ireg.edx = SMAP; ireg.di = (size_t)&buf; diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c index c0fb356a3092..2fe3616ba161 100644 --- 
a/arch/x86/boot/regs.c +++ b/arch/x86/boot/regs.c @@ -21,7 +21,7 @@ void initregs(struct biosregs *reg) { - memset(reg, 0, sizeof *reg); + memset(reg, 0, sizeof(*reg)); reg->eflags |= X86_EFLAGS_CF; reg->ds = ds(); reg->es = ds(); diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index ba3e100654db..3ecc11a9c440 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -62,7 +62,7 @@ static int vesa_probe(void) if (mode & ~0x1ff) continue; - memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ + memset(&vminfo, 0, sizeof(vminfo)); /* Just in case... */ ireg.ax = 0x4f01; ireg.cx = mode; @@ -109,7 +109,7 @@ static int vesa_set_mode(struct mode_info *mode) int is_graphic; u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; - memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ + memset(&vminfo, 0, sizeof(vminfo)); /* Just in case... */ initregs(&ireg); ireg.ax = 0x4f01; @@ -241,7 +241,7 @@ void vesa_store_edid(void) struct biosregs ireg, oreg; /* Apparently used as a nonsense token... 
*/ - memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); + memset(&boot_params.edid_info, 0x13, sizeof(boot_params.edid_info)); if (vginfo.version < 0x0200) return; /* EDID requires VBE 2.0+ */ diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 77780e386e9b..ac89b6624a40 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -115,7 +115,7 @@ static unsigned int get_entry(void) } else if ((key >= '0' && key <= '9') || (key >= 'A' && key <= 'Z') || (key >= 'a' && key <= 'z')) { - if (len < sizeof entry_buf) { + if (len < sizeof(entry_buf)) { entry_buf[len++] = key; putchar(key); } diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 708b46a54578..25e5a6bda8c3 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -329,8 +329,22 @@ For 32-bit we have the following conventions - kernel is built with #endif +.macro STACKLEAK_ERASE_NOCLOBBER +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + PUSH_AND_CLEAR_REGS + call stackleak_erase + POP_REGS +#endif +.endm + #endif /* CONFIG_X86_64 */ +.macro STACKLEAK_ERASE +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + call stackleak_erase +#endif +.endm + /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 687e47f8a796..d309f30cf7af 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -46,6 +46,8 @@ #include <asm/frame.h> #include <asm/nospec-branch.h> +#include "calling.h" + .section .entry.text, "ax" /* @@ -712,6 +714,7 @@ ENTRY(ret_from_fork) /* When we fork, we trace the syscall return in the child, too. 
*/ movl %esp, %eax call syscall_return_slowpath + STACKLEAK_ERASE jmp restore_all /* kernel thread */ @@ -886,6 +889,8 @@ ENTRY(entry_SYSENTER_32) ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + STACKLEAK_ERASE + /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ @@ -997,6 +1002,8 @@ ENTRY(entry_INT80_32) call do_int80_syscall_32 .Lsyscall_32_done: + STACKLEAK_ERASE + restore_all: TRACE_IRQS_IRET SWITCH_TO_ENTRY_STACK diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4d7a2d9d44cf..ce25d84023c0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -266,6 +266,8 @@ syscall_return_via_sysret: * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ + STACKLEAK_ERASE_NOCLOBBER + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi popq %rdi @@ -625,6 +627,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ + STACKLEAK_ERASE_NOCLOBBER SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 7d0df78db727..8eaf8952c408 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -261,6 +261,11 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) /* Opportunistic SYSRET */ sysret32_from_system_call: + /* + * We are not going to return to userspace from the trampoline + * stack. So let's erase the thread stack right now. + */ + STACKLEAK_ERASE TRACE_IRQS_ON /* User mode traces as IRQs on. 
*/ movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0fb8659b20d8..273c62e81546 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4535,7 +4535,7 @@ __init int intel_pmu_init(void) } } - snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name); + snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name); if (version >= 2 && extra_attr) { x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index fab4df16a3c4..22c4dfe65992 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -217,11 +217,18 @@ static inline bool in_x32_syscall(void) return false; } -static inline bool in_compat_syscall(void) +static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } + +#ifdef CONFIG_COMPAT +static inline bool in_compat_syscall(void) +{ + return in_32bit_syscall(); +} #define in_compat_syscall in_compat_syscall /* override the generic impl */ +#endif struct compat_siginfo; int __copy_siginfo_to_user32(struct compat_siginfo __user *to, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 89a048c2faec..28c4a502b419 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -331,6 +331,8 @@ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ +#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ +#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 
c18ed65287d5..cf350639e76d 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -76,9 +76,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) { - if (in_compat_syscall()) - return true; - return false; + return in_32bit_syscall(); } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ #endif /* !COMPILE_OFFSETS */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index fba54ca23b2a..26942ad63830 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -361,7 +361,6 @@ extern struct paravirt_patch_template pv_ops; __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); unsigned paravirt_patch_default(u8 type, void *insnbuf, unsigned long addr, unsigned len); @@ -651,7 +650,6 @@ void paravirt_leave_lazy_mmu(void); void paravirt_flush_lazy_mmu(void); void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); u64 _paravirt_ident_64(u64); #define paravirt_nop ((void *)_paravirt_nop) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 323a313947e0..d760611cfc35 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -453,6 +453,12 @@ static inline void __native_flush_tlb_one_user(unsigned long addr) */ static inline void __flush_tlb_all(void) { + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). 
+ */ + VM_WARN_ON_ONCE(preemptible()); + if (boot_cpu_has(X86_FEATURE_PGE)) { __flush_tlb_global(); } else { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cbbd57ae06ee..ffb181f959d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1074,7 +1074,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) #endif c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof c->x86_capability); + memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; if (!have_cpuid_p()) @@ -1317,7 +1317,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof c->x86_capability); + memset(&c->x86_capability, 0, sizeof(c->x86_capability)); generic_identify(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8cb3c02980cf..8c66d2fc8f81 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2215,7 +2215,7 @@ static int mce_device_create(unsigned int cpu) if (dev) return 0; - dev = kzalloc(sizeof *dev, GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; dev->id = cpu; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b9bc8a1a584e..2637ff09d6a0 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -666,8 +666,8 @@ static ssize_t pf_show(struct device *dev, } static DEVICE_ATTR_WO(reload); -static DEVICE_ATTR(version, 0400, version_show, NULL); -static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(version, 0444, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); static struct attribute *mc_default_attrs[] = { &dev_attr_version.attr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 
e12ee86906c6..86e277f8daf4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -798,7 +798,7 @@ static void generic_set_all(void) local_irq_restore(flags); /* Use the atomic bitops to update the global mask */ - for (count = 0; count < sizeof mask * 8; ++count) { + for (count = 0; count < sizeof(mask) * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 40eee6cc4124..2e173d47b450 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -174,12 +174,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC_SET_PAGE_ENTRY: case MTRRIOC_DEL_PAGE_ENTRY: case MTRRIOC_KILL_PAGE_ENTRY: - if (copy_from_user(&sentry, arg, sizeof sentry)) + if (copy_from_user(&sentry, arg, sizeof(sentry))) return -EFAULT; break; case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: - if (copy_from_user(&gentry, arg, sizeof gentry)) + if (copy_from_user(&gentry, arg, sizeof(gentry))) return -EFAULT; break; #ifdef CONFIG_COMPAT @@ -332,7 +332,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: - if (copy_to_user(arg, &gentry, sizeof gentry)) + if (copy_to_user(arg, &gentry, sizeof(gentry))) err = -EFAULT; break; #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 5e801c8c8ce7..374a52fa5296 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -213,8 +213,9 @@ static unsigned int mem32_serial_in(unsigned long addr, int offset) * early_pci_serial_init() * * This function is invoked when the early_printk param starts with "pciserial" - * The rest of the param should be ",B:D.F,baud" where B, D & F describe the - * location of a PCI device that must be a UART device. 
+ * The rest of the param should be "[force],B:D.F,baud", where B, D & F describe + * the location of a PCI device that must be a UART device. "force" is optional + * and overrides the use of an UART device with a wrong PCI class code. */ static __init void early_pci_serial_init(char *s) { @@ -224,17 +225,23 @@ static __init void early_pci_serial_init(char *s) u32 classcode, bar0; u16 cmdreg; char *e; + int force = 0; - - /* - * First, part the param to get the BDF values - */ if (*s == ',') ++s; if (*s == 0) return; + /* Force the use of an UART device with wrong class code */ + if (!strncmp(s, "force,", 6)) { + force = 1; + s += 6; + } + + /* + * Part the param to get the BDF values + */ bus = (u8)simple_strtoul(s, &e, 16); s = e; if (*s != ':') @@ -253,7 +260,7 @@ static __init void early_pci_serial_init(char *s) s++; /* - * Second, find the device from the BDF + * Find the device from the BDF */ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); @@ -264,8 +271,10 @@ static __init void early_pci_serial_init(char *s) */ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || - (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ - return; + (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ { + if (!force) + return; + } /* * Determine if it is IO or memory mapped @@ -289,7 +298,7 @@ static __init void early_pci_serial_init(char *s) } /* - * Lastly, initialize the hardware + * Initialize the hardware */ if (*s) { if (strcmp(s, "nocfg") == 0) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5dc377dc9d7b..7663a8eb602b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -385,7 +385,7 @@ static void __init copy_bootdata(char *real_mode_data) */ sme_map_bootdata(real_mode_data); - memcpy(&boot_params, real_mode_data, sizeof boot_params); + memcpy(&boot_params, real_mode_data, 
sizeof(boot_params)); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); if (cmd_line_ptr) { diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ef688804f80d..4588414e2561 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -115,14 +115,14 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EBADF; break; } - if (copy_from_user(&regs, uregs, sizeof regs)) { + if (copy_from_user(&regs, uregs, sizeof(regs))) { err = -EFAULT; break; } err = rdmsr_safe_regs_on_cpu(cpu, regs); if (err) break; - if (copy_to_user(uregs, &regs, sizeof regs)) + if (copy_to_user(uregs, &regs, sizeof(regs))) err = -EFAULT; break; @@ -131,14 +131,14 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EBADF; break; } - if (copy_from_user(&regs, uregs, sizeof regs)) { + if (copy_from_user(&regs, uregs, sizeof(regs))) { err = -EFAULT; break; } err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; - if (copy_to_user(uregs, &regs, sizeof regs)) + if (copy_to_user(uregs, &regs, sizeof(regs))) err = -EFAULT; break; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e4d4df37922a..c0e0101133f3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -56,17 +56,6 @@ asm (".pushsection .entry.text, \"ax\"\n" ".type _paravirt_nop, @function\n\t" ".popsection"); -/* identity function, which can be inlined */ -u32 notrace _paravirt_ident_32(u32 x) -{ - return x; -} - -u64 notrace _paravirt_ident_64(u64 x) -{ - return x; -} - void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -102,6 +91,12 @@ static unsigned paravirt_patch_call(void *insnbuf, const void *target, } #ifdef CONFIG_PARAVIRT_XXL +/* identity function, which can be inlined */ +u64 notrace _paravirt_ident_64(u64 x) +{ + return x; +} + static unsigned paravirt_patch_jmp(void *insnbuf, const void *target, unsigned long addr, unsigned len) { @@ -146,13 +141,11 @@
unsigned paravirt_patch_default(u8 type, void *insnbuf, else if (opfunc == _paravirt_nop) ret = 0; +#ifdef CONFIG_PARAVIRT_XXL /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_32) - ret = paravirt_patch_ident_32(insnbuf, len); else if (opfunc == _paravirt_ident_64) ret = paravirt_patch_ident_64(insnbuf, len); -#ifdef CONFIG_PARAVIRT_XXL else if (type == PARAVIRT_PATCH(cpu.iret) || type == PARAVIRT_PATCH(cpu.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -309,13 +302,8 @@ struct pv_info pv_info = { #endif }; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) -/* 32-bit pagetable entries */ -#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) -#else /* 64-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) -#endif struct paravirt_patch_template pv_ops = { /* Init ops. */ @@ -483,5 +471,5 @@ NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); #endif -EXPORT_SYMBOL_GPL(pv_ops); +EXPORT_SYMBOL(pv_ops); EXPORT_SYMBOL_GPL(pv_info); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 6368c22fa1fa..de138d3912e4 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,24 +10,18 @@ DEF_NATIVE(cpu, iret, "iret"); DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax"); -#endif - -#if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)"); -DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); -#endif - -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) -{ - /* arg in %eax, return in %eax */ - return 0; -} unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) { /* arg in %edx:%eax, return in %edx:%eax */ return 0; } +#endif + +#if defined(CONFIG_PARAVIRT_SPINLOCKS) +DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)"); 
+DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); +#endif extern bool pv_is_native_spin_unlock(void); extern bool pv_is_native_vcpu_is_preempted(void); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 7ca9cb726f4d..9d9e04b31077 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -15,27 +15,19 @@ DEF_NATIVE(cpu, wbinvd, "wbinvd"); DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(cpu, swapgs, "swapgs"); -#endif - -DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)"); -DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); -#endif - -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) -{ - return paravirt_patch_insns(insnbuf, len, - start__mov32, end__mov32); -} - unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, start__mov64, end__mov64); } +#endif + +#if defined(CONFIG_PARAVIRT_SPINLOCKS) +DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)"); +DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); +#endif extern bool pv_is_native_spin_unlock(void); extern bool pv_is_native_vcpu_is_preempted(void); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 31b4755369f0..0e0b4288a4b2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -701,10 +701,10 @@ static void __set_personality_x32(void) current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; /* - * in_compat_syscall() uses the presence of the x32 syscall bit + * in_32bit_syscall() uses the presence of the x32 syscall bit * flag to determine compat status. The x86 mmap() code relies on * the syscall bitness so set x32 syscall bit right here to make - * in_compat_syscall() work during exec(). + * in_32bit_syscall() work during exec(). 
* * Pretend to come from a x32 execve. */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6a78d4b36a79..f7476ce23b6e 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -105,7 +105,7 @@ out: static void find_start_end(unsigned long addr, unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!in_compat_syscall() && (flags & MAP_32BIT)) { + if (!in_32bit_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -122,7 +122,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, } *begin = get_mmap_base(1); - if (in_compat_syscall()) + if (in_32bit_syscall()) *end = task_size_32bit(); else *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); @@ -193,7 +193,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!in_compat_syscall() && (flags & MAP_32BIT)) + if (!in_32bit_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ @@ -217,9 +217,10 @@ get_unmapped_area: * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. * - * !in_compat_syscall() check to avoid high addresses for x32. + * !in_32bit_syscall() check to avoid high addresses for x32 + * (and make it no op on native i386). 
*/ - if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = 0; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8f6dcd88202e..9b7c4ca8f0a7 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,7 +306,7 @@ __visible void __noreturn handle_stack_overflow(const char *message, die(message, regs, 0); /* Be absolutely certain we don't return. */ - panic(message); + panic("%s", message); } #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 34edf198708f..78e430f4e15c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1509,7 +1509,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, index << 3 | 0x2); addr = dt.address + index * 8; - return linear_read_system(ctxt, addr, desc, sizeof *desc); + return linear_read_system(ctxt, addr, desc, sizeof(*desc)); } static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, @@ -1522,7 +1522,7 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, struct desc_struct desc; u16 sel; - memset (dt, 0, sizeof *dt); + memset(dt, 0, sizeof(*dt)); if (!ops->get_segment(ctxt, &sel, &desc, &base3, VCPU_SREG_LDTR)) return; @@ -1586,7 +1586,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - return linear_write_system(ctxt, addr, desc, sizeof *desc); + return linear_write_system(ctxt, addr, desc, sizeof(*desc)); } static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, @@ -1604,7 +1604,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 dummy; u32 base3 = 0; - memset(&seg_desc, 0, sizeof seg_desc); + memset(&seg_desc, 0, sizeof(seg_desc)); if (ctxt->mode == X86EMUL_MODE_REAL) { /* set real mode segment descriptor (keep limit etc. 
for @@ -3075,17 +3075,17 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg); + ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; save_state_to_tss16(ctxt, &tss_seg); - ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg); + ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; - ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg); + ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; @@ -3094,7 +3094,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ret = linear_write_system(ctxt, new_tss_base, &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link); + sizeof(tss_seg.prev_task_link)); if (ret != X86EMUL_CONTINUE) return ret; } @@ -3216,7 +3216,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, u32 eip_offset = offsetof(struct tss_segment_32, eip); u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); - ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg); + ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; @@ -3228,7 +3228,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; - ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg); + ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; @@ -3237,7 +3237,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = linear_write_system(ctxt, new_tss_base, &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link); + sizeof(tss_seg.prev_task_link)); if (ret != X86EMUL_CONTINUE) return ret; } diff --git 
a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3cd227ff807f..89db20f8cb70 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2409,7 +2409,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) r = kvm_apic_state_fixup(vcpu, s, true); if (r) return r; - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 66d66d77caee..5cd5647120f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2924,7 +2924,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, unsigned size; r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) goto out; r = -E2BIG; @@ -3091,11 +3091,11 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned n; r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) goto out; n = msr_list.nmsrs; msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) goto out; r = -E2BIG; if (n < msr_list.nmsrs) @@ -3117,7 +3117,7 @@ long kvm_arch_dev_ioctl(struct file *filp, struct kvm_cpuid2 cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, @@ -3126,7 +3126,7 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; @@ -3894,7 +3894,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_interrupt irq; r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) + if 
(copy_from_user(&irq, argp, sizeof(irq))) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; @@ -3912,7 +3912,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); break; @@ -3922,7 +3922,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); @@ -3933,14 +3933,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, cpuid_arg->entries); if (r) goto out; r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; @@ -3961,13 +3961,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_tpr_access_ctl tac; r = -EFAULT; - if (copy_from_user(&tac, argp, sizeof tac)) + if (copy_from_user(&tac, argp, sizeof(tac))) goto out; r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) + if (copy_to_user(argp, &tac, sizeof(tac))) goto out; r = 0; break; @@ -3980,7 +3980,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; - if (copy_from_user(&va, argp, sizeof va)) + if (copy_from_user(&va, argp, sizeof(va))) goto out; idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); @@ -3991,7 +3991,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u64 mcg_cap; r = -EFAULT; - if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap))) goto out; r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); break; @@ -4000,7 +4000,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_x86_mce mce; r = -EFAULT; - if (copy_from_user(&mce, argp, sizeof mce)) + if (copy_from_user(&mce, argp, sizeof(mce))) goto out; r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); break; @@ -4536,7 +4536,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->created_vcpus) goto set_identity_unlock; r = -EFAULT; - if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + if (copy_from_user(&ident_addr, argp, sizeof(ident_addr))) goto set_identity_unlock; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); set_identity_unlock: @@ -4620,7 +4620,7 @@ set_identity_unlock: if (r) goto get_irqchip_out; r = -EFAULT; - if (copy_to_user(argp, chip, sizeof *chip)) + if (copy_to_user(argp, chip, sizeof(*chip))) goto get_irqchip_out; r = 0; get_irqchip_out: @@ -4666,7 +4666,7 @@ set_identity_unlock: } case KVM_SET_PIT: { r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof u.ps)) + if (copy_from_user(&u.ps, argp, sizeof(u.ps))) goto out; r = -ENXIO; if (!kvm->arch.vpit) @@ -8205,7 +8205,7 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); - memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, @@ -8509,7 +8509,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_opcode = fxsave->fop; fpu->last_ip = fxsave->rip; fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space)); vcpu_put(vcpu); return 0; @@ -8530,7 +8530,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, 
struct kvm_fpu *fpu) fxsave->fop = fpu->last_opcode; fxsave->rip = fpu->last_ip; fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space)); vcpu_put(vcpu); return 0; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 00b296617ca4..92e4c4b85bba 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -92,7 +92,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ - info.high_limit = in_compat_syscall() ? + info.high_limit = in_32bit_syscall() ? task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); @@ -116,7 +116,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ - if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = PAGE_MASK & ~huge_page_mask(h); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1e95d57760cf..db3165714521 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -166,7 +166,7 @@ unsigned long get_mmap_base(int is_legacy) struct mm_struct *mm = current->mm; #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES - if (in_compat_syscall()) { + if (in_32bit_syscall()) { return is_legacy ? 
mm->mmap_compat_legacy_base : mm->mmap_compat_base; } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a80fdd7fb40f..abffa0be80da 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -399,9 +399,17 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); ret = -1; for_each_node_mask(i, physnode_mask) { + /* + * The reason we pass in blk[0] is due to + * numa_remove_memblk_from() called by + * emu_setup_memblk() will delete entry 0 + * and then move everything else up in the pi.blk + * array. Therefore we should always be looking + * at blk[0]. + */ ret = split_nodes_size_interleave_uniform(&ei, &pi, - pi.blk[i].start, pi.blk[i].end, 0, - n, &pi.blk[i], nid); + pi.blk[0].start, pi.blk[0].end, 0, + n, &pi.blk[0], nid); if (ret < 0) break; if (ret < n) { diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f799076e3d57..db7a10082238 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2309,9 +2309,13 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) /* * We should perform an IPI and flush all tlbs, - * but that can deadlock->flush only current cpu: + * but that can deadlock->flush only current cpu. + * Preemption needs to be disabled around __flush_tlb_all() due to + * CR3 reload in __native_flush_tlb(). 
*/ + preempt_disable(); __flush_tlb_all(); + preempt_enable(); arch_flush_lazy_mmu_mode(); } diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0b08067c45f3..b629f6992d9f 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -130,7 +130,7 @@ static void regex_init(int use_real_mode) REG_EXTENDED|REG_NOSUB); if (err) { - regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); + regerror(err, &sym_regex_c[i], errbuf, sizeof(errbuf)); die("%s", errbuf); } } @@ -405,7 +405,7 @@ static void read_shdrs(FILE *fp) } for (i = 0; i < ehdr.e_shnum; i++) { struct section *sec = &secs[i]; - if (fread(&shdr, sizeof shdr, 1, fp) != 1) + if (fread(&shdr, sizeof(shdr), 1, fp) != 1) die("Cannot read ELF section headers %d/%d: %s\n", i, ehdr.e_shnum, strerror(errno)); sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name); diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 413f3519d9a1..c907b20d4993 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -194,7 +194,7 @@ extern unsigned long um_vdso_addr; typedef unsigned long elf_greg_t; -#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t)) +#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t)) typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_i387_struct elf_fpregset_t; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 60c141af222b..d29b7365da8d 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -config ZONE_DMA - def_bool y - config XTENSA def_bool y select ARCH_HAS_SG_CHAIN diff --git a/arch/xtensa/boot/Makefile b/arch/xtensa/boot/Makefile index dc9e0ba7122c..294846117fc2 100644 --- a/arch/xtensa/boot/Makefile +++ b/arch/xtensa/boot/Makefile @@ -33,7 +33,7 @@ uImage: $(obj)/uImage boot-elf boot-redboot: $(addprefix $(obj)/,$(subdir-y)) $(Q)$(MAKE) $(build)=$(obj)/$@ $(MAKECMDGOALS) -OBJCOPYFLAGS = --strip-all -R .comment -R .note.gnu.build-id -O binary 
+OBJCOPYFLAGS = --strip-all -R .comment -R .notes -O binary vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b727b18a68ac..b80a430453b1 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -131,6 +131,7 @@ SECTIONS .fixup : { *(.fixup) } EXCEPTION_TABLE(16) + NOTES /* Data section */ _sdata = .; @@ -296,38 +297,11 @@ SECTIONS _end = .; - .xt.lit : { *(.xt.lit) } - .xt.prop : { *(.xt.prop) } - - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - .debug_info 0 : { *(.debug_info) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - - .xt.insn 0 : - { - *(.xt.insn) - *(.gnu.linkonce.x*) - } + DWARF_DEBUG - .xt.lit 0 : - { - *(.xt.lit) - *(.gnu.linkonce.p*) - } + .xt.prop 0 : { KEEP(*(.xt.prop .xt.prop.* .gnu.linkonce.prop.*)) } + .xt.insn 0 : { KEEP(*(.xt.insn .xt.insn.* .gnu.linkonce.x*)) } + .xt.lit 0 : { KEEP(*(.xt.lit .xt.lit.* .gnu.linkonce.p*)) } /* Sections to be discarded */ DISCARDS diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 9750a48f491b..30a48bba4a47 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -71,7 +71,7 @@ void __init zones_init(void) { /* All pages are DMA-able, so we put them all in the DMA zone. 
*/ unsigned long zones_size[MAX_NR_ZONES] = { - [ZONE_DMA] = max_low_pfn - ARCH_PFN_OFFSET, + [ZONE_NORMAL] = max_low_pfn - ARCH_PFN_OFFSET, #ifdef CONFIG_HIGHMEM [ZONE_HIGHMEM] = max_pfn - max_low_pfn, #endif diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d9a7916ff0ab..9fe5952d117d 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger @@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6075100f03a5..3a27d31fcda6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4384,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; diff --git a/block/bio.c b/block/bio.c index bbfeb4ee2892..d5368a445561 100644 --- a/block/bio.c +++ b/block/bio.c @@ -609,9 +609,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkg_association(bio, bio_src); - - blkcg_bio_issue_init(bio); + bio_clone_blkcg_association(bio, bio_src); } EXPORT_SYMBOL(__bio_clone_fast); @@ -1256,7 +1254,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (((iter->type & WRITE) && (!map_data || 
!map_data->null_mapped)) || + if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { ret = bio_copy_from_iter(bio, iter); if (ret) @@ -1956,151 +1954,69 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - bio->bi_blkg = blkg_tryget_closest(blkg); - return 0; -} - -/** - * __bio_associate_blkg_from_css - internal blkg association function - * - * This in the core association function that all association paths rely on. - * A blkg reference is taken which is released upon freeing of the bio. - */ -static int __bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; - int ret; - - rcu_read_lock(); - - if (!css || !css->parent) - blkg = q->root_blkg; - else - blkg = blkg_lookup_create(css_to_blkcg(css), q); - - ret = bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); - return ret; -} - -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. 
- */ -int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - return __bio_associate_blkg_from_css(bio, css); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - #ifdef CONFIG_MEMCG /** - * bio_associate_blkg_from_page - associate a bio with the page's blkg + * bio_associate_blkcg_from_page - associate a bio with the page's blkcg * @bio: target bio * @page: the page to lookup the blkcg from * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns NULL, fall back to the queue's - * root_blkg. - * - * Note: this must be called after bio has an associated device. + * Associate @bio with the blkcg from @page's owning memcg. This works like + * every other associate function wrt references. */ -int bio_associate_blkg_from_page(struct bio *bio, struct page *page) +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) { - struct cgroup_subsys_state *css; - int ret; + struct cgroup_subsys_state *blkcg_css; - if (unlikely(bio->bi_blkg)) + if (unlikely(bio->bi_css)) return -EBUSY; if (!page->mem_cgroup) return 0; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - - ret = __bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); - return ret; + blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, + &io_cgrp_subsys); + bio->bi_css = blkcg_css; + return 0; } #endif /* CONFIG_MEMCG */ /** - * bio_associate_create_blkg - associate a bio with a blkg from q - * @q: request_queue where bio is going + * bio_associate_blkcg - associate a bio with the specified blkcg * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. * - * Associate @bio with the blkg found from the bio's css and the request_queue. 
- * If one is not found, bio_lookup_blkg creates the blkg. This falls back to - * the queue's root_blkg if association fails. + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. */ -int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { - struct cgroup_subsys_state *css; - int ret = 0; - - /* someone has already associated this bio with a blkg */ - if (bio->bi_blkg) - return ret; - - rcu_read_lock(); - - css = blkcg_css(); - - ret = __bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); - return ret; + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; } +EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** - * bio_reassociate_blkg - reassociate a bio with a blkg from q - * @q: request_queue where bio is going + * bio_associate_blkg - associate a bio with the specified blkg * @bio: target bio + * @blkg: the blkg to associate * - * When submitting a bio, multiple recursive calls to make_request() may occur. - * This causes the initial associate done in blkcg_bio_issue_check() to be - * incorrect and reference the prior request_queue. This performs reassociation - * when this situation happens. + * Associate @bio with the blkg specified by @blkg. This is the queue specific + * blkcg information associated with the @bio, a reference will be taken on the + * @blkg and will be freed when the bio is freed. 
*/ -int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } - - return bio_associate_create_blkg(q, bio); + if (unlikely(bio->bi_blkg)) + return -EBUSY; + if (!blkg_try_get(blkg)) + return -ENODEV; + bio->bi_blkg = blkg; + return 0; } /** @@ -2113,6 +2029,10 @@ void bio_disassociate_task(struct bio *bio) put_io_context(bio->bi_ioc); bio->bi_ioc = NULL; } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; @@ -2120,16 +2040,16 @@ void bio_disassociate_task(struct bio *bio) } /** - * bio_clone_blkg_association - clone blkg association from src to dst bio + * bio_clone_blkcg_association - clone blkcg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) +void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { - if (src->bi_blkg) - bio_associate_blkg(dst, src->bi_blkg); + if (src->bi_css) + WARN_ON(bio_associate_blkcg(dst, src->bi_css)); } -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 992da5592c6e..c630e02836a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -84,37 +84,6 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(blkg); } -static void __blkg_release(struct rcu_head *rcu) -{ - struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); - - percpu_ref_exit(&blkg->refcnt); - - /* release the blkcg and parent blkg refs this blkg has been holding */ - css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - - blkg_free(blkg); -} - -/* - * A group is RCU protected, but 
having an rcu lock does not mean that one - * can access all the fields of blkg and assume these are valid. For - * example, don't try to follow throtl_data and request queue links. - * - * Having a reference to blkg under an rcu allows accesses to only values - * local to groups like group stats and group rate limits. - */ -static void blkg_release(struct percpu_ref *ref) -{ - struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); - - call_rcu(&blkg->rcu_head, __blkg_release); -} - /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -141,6 +110,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; + atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -247,11 +217,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } - ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, - GFP_NOWAIT | __GFP_NOWARN); - if (ret) - goto err_cancel_ref; - /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -284,8 +249,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_cancel_ref: - percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: @@ -296,7 +259,7 @@ err_free_blkg: } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * @@ -305,11 +268,12 @@ err_free_blkg: * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * Returns the blkg or the closest blkg if blkg_create fails as it walks - * down from root. 
+ * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; @@ -321,7 +285,7 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return q->root_blkg; + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -329,58 +293,23 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, /* * Create blkgs walking down from blkcg_root to @blkcg, so that all - * non-root blkgs have access to their parents. Returns the closest - * blkg to the intended blkg should blkg_create() fail. + * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); - struct blkcg_gq *ret_blkg = q->root_blkg; - - while (parent) { - blkg = __blkg_lookup(parent, q, false); - if (blkg) { - /* remember closest blkg */ - ret_blkg = blkg; - break; - } + + while (parent && !__blkg_lookup(parent, q, false)) { pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; - if (pos == blkcg) + if (pos == blkcg || IS_ERR(blkg)) return blkg; } } -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. 
- */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - unsigned long flags; - - if (unlikely(!blkg)) { - spin_lock_irqsave(q->queue_lock, flags); - - blkg = __blkg_lookup_create(blkcg, q); - - spin_unlock_irqrestore(q->queue_lock, flags); - } - - return blkg; -} - static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; @@ -424,7 +353,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ - percpu_ref_kill(&blkg->refcnt); + blkg_put(blkg); } /** @@ -452,6 +381,29 @@ static void blkg_destroy_all(struct request_queue *q) } /* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +void __blkg_release_rcu(struct rcu_head *rcu_head) +{ + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); +} +EXPORT_SYMBOL_GPL(__blkg_release_rcu); + +/* * The next function used by blk_queue_for_each_rl(). It's a bit tricky * because the root blkg uses @q->root_rl instead of its own rl. 
*/ @@ -1796,7 +1748,8 @@ void blkcg_maybe_throttle_current(void) blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; - if (!blkg_tryget(blkg)) + blkg = blkg_try_get(blkg); + if (!blkg) goto out; rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index bc6ea87d10e0..ce12515f9b9b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -785,6 +785,9 @@ void blk_cleanup_queue(struct request_queue *q) * prevent that q->request_fn() gets invoked after draining finished. */ blk_freeze_queue(q); + + rq_qos_exit(q); + spin_lock_irq(lock); queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); @@ -2432,7 +2435,6 @@ blk_qc_t generic_make_request(struct bio *bio) if (q) blk_queue_exit(q); q = bio->bi_disk->queue; - bio_reassociate_blkg(q, bio); flags = 0; if (bio->bi_opf & REQ_NOWAIT) flags = BLK_MQ_REQ_NOWAIT; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 28f80d227528..38c35c32aff2 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -482,12 +482,34 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg_gq *blkg = bio->bi_blkg; + struct blkcg *blkcg; + struct blkcg_gq *blkg; + struct request_queue *q = rqos->q; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blk_iolatency_enabled(blkiolat)) return; + rcu_read_lock(); + blkcg = bio_blkcg(bio); + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + if (!lock) + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + if (!lock) + spin_unlock_irq(q->queue_lock); + } + if (!blkg) + goto out; + + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio_associate_blkg(bio, blkg); +out: + rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -708,7 +730,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) 
* We could be exiting, don't access the pd unless we have a * ref on the blkg. */ - if (!blkg_tryget(blkg)) + if (!blkg_try_get(blkg)) continue; iolat = blkg_to_lat(blkg); diff --git a/block/blk-merge.c b/block/blk-merge.c index 42a46744c11b..6b5ad275ed56 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -714,6 +714,31 @@ static void blk_account_io_merge(struct request *req) part_stat_unlock(); } } +/* + * Two cases of handling DISCARD merge: + * If max_discard_segments > 1, the driver takes every bio + * as a range and send them to controller together. The ranges + * needn't to be contiguous. + * Otherwise, the bios/requests will be handled as same as + * others which should be contiguous. + */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; +} + +enum elv_merge blk_try_req_merge(struct request *req, struct request *next) +{ + if (blk_discard_mergable(req)) + return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next)) + return ELEVATOR_BACK_MERGE; + + return ELEVATOR_NO_MERGE; +} /* * For non-mq, this has to be called with the request spinlock acquired. @@ -731,12 +756,6 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - /* - * not contiguous - */ - if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) - return NULL; - if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk || req_no_special_merge(next)) @@ -760,11 +779,19 @@ static struct request *attempt_merge(struct request_queue *q, * counts here. Handle DISCARDs separately, as they * have separate settings. 
*/ - if (req_op(req) == REQ_OP_DISCARD) { + + switch (blk_try_req_merge(req, next)) { + case ELEVATOR_DISCARD_MERGE: if (!req_attempt_discard_merge(q, req, next)) return NULL; - } else if (!ll_merge_requests_fn(q, req, next)) + break; + case ELEVATOR_BACK_MERGE: + if (!ll_merge_requests_fn(q, req, next)) + return NULL; + break; + default: return NULL; + } /* * If failfast settings disagree or any of the two is already @@ -888,8 +915,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) { - if (req_op(rq) == REQ_OP_DISCARD && - queue_max_discard_segments(rq->q) > 1) + if (blk_discard_mergable(rq)) return ELEVATOR_DISCARD_MERGE; else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) return ELEVATOR_BACK_MERGE; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0641533597f1..844a454a7b3a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -1007,8 +1007,6 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - rq_qos_exit(q); - mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) elv_unregister_queue(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4bda70e8db48..db1a3a2ae006 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2115,11 +2115,21 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) +{ +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + /* fallback to root_blkg if we fail to get a blkg ref */ + if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) + bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +#endif +} + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg); 
+ struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; @@ -2138,6 +2148,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if (unlikely(blk_queue_bypass(q))) goto out_unlock; + blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; diff --git a/block/bounce.c b/block/bounce.c index cf49fe02f65c..36869afc258c 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -276,9 +276,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkg_association(bio, bio_src); - - blkcg_bio_issue_init(bio); + bio_clone_blkcg_association(bio, bio_src); return bio; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6a3d87dd3c1a..ed41aa978c4a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3759,7 +3759,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bio_blkcg(bio)->css.serial_nr; rcu_read_unlock(); /* @@ -3824,7 +3824,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index f3702e533ff4..be70ca6c85d3 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -21,6 +21,18 @@ config ASYMMETRIC_PUBLIC_KEY_SUBTYPE appropriate hash algorithms (such as SHA-1) must be available. ENOPKG will be reported if the requisite algorithm is unavailable. 
+config ASYMMETRIC_TPM_KEY_SUBTYPE + tristate "Asymmetric TPM backed private key subtype" + depends on TCG_TPM + depends on TRUSTED_KEYS + select CRYPTO_HMAC + select CRYPTO_SHA1 + select CRYPTO_HASH_INFO + help + This option provides support for TPM backed private key type handling. + Operations such as sign, verify, encrypt, decrypt are performed by + the TPM after the private key is loaded. + config X509_CERTIFICATE_PARSER tristate "X.509 certificate parser" depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE @@ -31,6 +43,25 @@ config X509_CERTIFICATE_PARSER data and provides the ability to instantiate a crypto key from a public key packet found inside the certificate. +config PKCS8_PRIVATE_KEY_PARSER + tristate "PKCS#8 private key parser" + depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + select ASN1 + select OID_REGISTRY + help + This option provides support for parsing PKCS#8 format blobs for + private key data and provides the ability to instantiate a crypto key + from that data. + +config TPM_KEY_PARSER + tristate "TPM private key parser" + depends on ASYMMETRIC_TPM_KEY_SUBTYPE + select ASN1 + help + This option provides support for parsing TPM format blobs for + private key data and provides the ability to instantiate a crypto key + from that data. 
+ config PKCS7_MESSAGE_PARSER tristate "PKCS#7 message parser" depends on X509_CERTIFICATE_PARSER diff --git a/crypto/asymmetric_keys/Makefile b/crypto/asymmetric_keys/Makefile index d4b2e1b2dc65..28b91adba2ae 100644 --- a/crypto/asymmetric_keys/Makefile +++ b/crypto/asymmetric_keys/Makefile @@ -11,6 +11,7 @@ asymmetric_keys-y := \ signature.o obj-$(CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE) += public_key.o +obj-$(CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE) += asym_tpm.o # # X.509 Certificate handling @@ -30,6 +31,19 @@ $(obj)/x509.asn1.o: $(obj)/x509.asn1.c $(obj)/x509.asn1.h $(obj)/x509_akid.asn1.o: $(obj)/x509_akid.asn1.c $(obj)/x509_akid.asn1.h # +# PKCS#8 private key handling +# +obj-$(CONFIG_PKCS8_PRIVATE_KEY_PARSER) += pkcs8_key_parser.o +pkcs8_key_parser-y := \ + pkcs8.asn1.o \ + pkcs8_parser.o + +$(obj)/pkcs8_parser.o: $(obj)/pkcs8.asn1.h +$(obj)/pkcs8-asn1.o: $(obj)/pkcs8.asn1.c $(obj)/pkcs8.asn1.h + +clean-files += pkcs8.asn1.c pkcs8.asn1.h + +# # PKCS#7 message handling # obj-$(CONFIG_PKCS7_MESSAGE_PARSER) += pkcs7_message.o @@ -61,3 +75,14 @@ verify_signed_pefile-y := \ $(obj)/mscode_parser.o: $(obj)/mscode.asn1.h $(obj)/mscode.asn1.h $(obj)/mscode.asn1.o: $(obj)/mscode.asn1.c $(obj)/mscode.asn1.h + +# +# TPM private key parsing +# +obj-$(CONFIG_TPM_KEY_PARSER) += tpm_key_parser.o +tpm_key_parser-y := \ + tpm.asn1.o \ + tpm_parser.o + +$(obj)/tpm_parser.o: $(obj)/tpm.asn1.h +$(obj)/tpm.asn1.o: $(obj)/tpm.asn1.c $(obj)/tpm.asn1.h diff --git a/crypto/asymmetric_keys/asym_tpm.c b/crypto/asymmetric_keys/asym_tpm.c new file mode 100644 index 000000000000..5d4c270463f6 --- /dev/null +++ b/crypto/asymmetric_keys/asym_tpm.c @@ -0,0 +1,988 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "ASYM-TPM: "fmt +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/scatterlist.h> +#include <linux/tpm.h> +#include <linux/tpm_command.h> +#include <crypto/akcipher.h> +#include 
<crypto/hash.h> +#include <crypto/sha.h> +#include <asm/unaligned.h> +#include <keys/asymmetric-subtype.h> +#include <keys/trusted.h> +#include <crypto/asym_tpm_subtype.h> +#include <crypto/public_key.h> + +#define TPM_ORD_FLUSHSPECIFIC 186 +#define TPM_ORD_LOADKEY2 65 +#define TPM_ORD_UNBIND 30 +#define TPM_ORD_SIGN 60 +#define TPM_LOADKEY2_SIZE 59 +#define TPM_FLUSHSPECIFIC_SIZE 18 +#define TPM_UNBIND_SIZE 63 +#define TPM_SIGN_SIZE 63 + +#define TPM_RT_KEY 0x00000001 + +/* + * Load a TPM key from the blob provided by userspace + * + * An OIAP session is opened, an authorised LoadKey2 request carrying + * @keyblob under parent @keyhandle is sent, and the response HMAC is + * checked with @keyauth. On success 0 is returned and *@newhandle is set + * to the value read from the response at TPM_DATA_OFFSET; on failure the + * negative value from the failing step is returned. + * + * NOTE(review): the 'authhmac failed' message below is emitted when + * trusted_tpm_send() fails. + */ +static int tpm_loadkey2(struct tpm_buf *tb, + uint32_t keyhandle, unsigned char *keyauth, + const unsigned char *keyblob, int keybloblen, + uint32_t *newhandle) +{ + unsigned char nonceodd[TPM_NONCE_SIZE]; + unsigned char enonce[TPM_NONCE_SIZE]; + unsigned char authdata[SHA1_DIGEST_SIZE]; + uint32_t authhandle = 0; + unsigned char cont = 0; + uint32_t ordinal; + int ret; + + ordinal = htonl(TPM_ORD_LOADKEY2); + + /* session for loading the key */ + ret = oiap(tb, &authhandle, enonce); + if (ret < 0) { + pr_info("oiap failed (%d)\n", ret); + return ret; + } + + /* generate odd nonce; only a negative return is treated as failure */ + ret = tpm_get_random(NULL, nonceodd, TPM_NONCE_SIZE); + if (ret < 0) { + pr_info("tpm_get_random failed (%d)\n", ret); + return ret; + } + + /* calculate authorization HMAC value over the ordinal and the key blob */ + ret = TSS_authhmac(authdata, keyauth, SHA1_DIGEST_SIZE, enonce, + nonceodd, cont, sizeof(uint32_t), &ordinal, + keybloblen, keyblob, 0, 0); + if (ret < 0) + return ret; + + /* build the request buffer: header, parent handle, blob, then auth data */ + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_AUTH1_COMMAND); + store32(tb, TPM_LOADKEY2_SIZE + keybloblen); + store32(tb, TPM_ORD_LOADKEY2); + store32(tb, keyhandle); + storebytes(tb, keyblob, keybloblen); + store32(tb, authhandle); + storebytes(tb, nonceodd, TPM_NONCE_SIZE); + store8(tb, cont); + storebytes(tb, authdata, SHA1_DIGEST_SIZE); + + ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE); + if (ret < 0) { + pr_info("authhmac failed (%d)\n", ret); + return ret; + } + + ret 
= TSS_checkhmac1(tb->data, ordinal, nonceodd, keyauth, + SHA1_DIGEST_SIZE, 0, 0); + if (ret < 0) { + pr_info("TSS_checkhmac1 failed (%d)\n", ret); + return ret; + } + + *newhandle = LOAD32(tb->data, TPM_DATA_OFFSET); + return 0; +} + +/* + * Execute the FlushSpecific TPM command + * + * Builds a TPM_TAG_RQU_COMMAND request for @handle with resource type + * TPM_RT_KEY in @tb and returns the result of trusted_tpm_send(). + */ +static int tpm_flushspecific(struct tpm_buf *tb, uint32_t handle) +{ + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_COMMAND); + store32(tb, TPM_FLUSHSPECIFIC_SIZE); + store32(tb, TPM_ORD_FLUSHSPECIFIC); + store32(tb, handle); + store32(tb, TPM_RT_KEY); + + return trusted_tpm_send(tb->data, MAX_BUF_SIZE); +} + +/* + * Decrypt a blob provided by userspace using a specific key handle. + * The handle is a well known handle or previously loaded by e.g. LoadKey2 + * + * At most @outlen bytes of the result are copied to @out. The return + * value is the length read from the TPM response (or a negative error), + * so it can be larger than the number of bytes actually copied. + */ +static int tpm_unbind(struct tpm_buf *tb, + uint32_t keyhandle, unsigned char *keyauth, + const unsigned char *blob, uint32_t bloblen, + void *out, uint32_t outlen) +{ + unsigned char nonceodd[TPM_NONCE_SIZE]; + unsigned char enonce[TPM_NONCE_SIZE]; + unsigned char authdata[SHA1_DIGEST_SIZE]; + uint32_t authhandle = 0; + unsigned char cont = 0; + uint32_t ordinal; + uint32_t datalen; + int ret; + + ordinal = htonl(TPM_ORD_UNBIND); + datalen = htonl(bloblen); + + /* OIAP session whose handle is placed in the Unbind request */ + ret = oiap(tb, &authhandle, enonce); + if (ret < 0) { + pr_info("oiap failed (%d)\n", ret); + return ret; + } + + /* generate odd nonce; only a negative return is treated as failure */ + ret = tpm_get_random(NULL, nonceodd, TPM_NONCE_SIZE); + if (ret < 0) { + pr_info("tpm_get_random failed (%d)\n", ret); + return ret; + } + + /* calculate authorization HMAC value over ordinal, length and blob */ + ret = TSS_authhmac(authdata, keyauth, SHA1_DIGEST_SIZE, enonce, + nonceodd, cont, sizeof(uint32_t), &ordinal, + sizeof(uint32_t), &datalen, + bloblen, blob, 0, 0); + if (ret < 0) + return ret; + + /* build the request buffer */ + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_AUTH1_COMMAND); + store32(tb, TPM_UNBIND_SIZE + bloblen); + store32(tb, TPM_ORD_UNBIND); + store32(tb, keyhandle); + store32(tb, bloblen); + 
storebytes(tb, blob, bloblen); + store32(tb, authhandle); + storebytes(tb, nonceodd, TPM_NONCE_SIZE); + store8(tb, cont); + storebytes(tb, authdata, SHA1_DIGEST_SIZE); + + ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE); /* NOTE(review): the failure message below says authhmac */ + if (ret < 0) { + pr_info("authhmac failed (%d)\n", ret); + return ret; + } + + datalen = LOAD32(tb->data, TPM_DATA_OFFSET); /* datalen is reused for the length read from the response */ + + ret = TSS_checkhmac1(tb->data, ordinal, nonceodd, + keyauth, SHA1_DIGEST_SIZE, + sizeof(uint32_t), TPM_DATA_OFFSET, + datalen, TPM_DATA_OFFSET + sizeof(uint32_t), + 0, 0); + if (ret < 0) { + pr_info("TSS_checkhmac1 failed (%d)\n", ret); + return ret; + } + + memcpy(out, tb->data + TPM_DATA_OFFSET + sizeof(uint32_t), + min(outlen, datalen)); /* the copy is capped at outlen */ + + return datalen; /* uncapped length from the response */ +} + +/* + * Sign a blob provided by userspace (that has had the hash function applied) + * using a specific key handle. The handle is assumed to have been previously + * loaded by e.g. LoadKey2. + * + * Note that the key signature scheme of the used key should be set to + * TPM_SS_RSASSAPKCS1v15_DER. This allows the hashed input to be of any size + * up to key_length_in_bytes - 11 and not be limited to size 20 like the + * TPM_SS_RSASSAPKCS1v15_SHA1 signature scheme. 
+ */ +static int tpm_sign(struct tpm_buf *tb, + uint32_t keyhandle, unsigned char *keyauth, + const unsigned char *blob, uint32_t bloblen, + void *out, uint32_t outlen) +{ + unsigned char nonceodd[TPM_NONCE_SIZE]; + unsigned char enonce[TPM_NONCE_SIZE]; + unsigned char authdata[SHA1_DIGEST_SIZE]; + uint32_t authhandle = 0; + unsigned char cont = 0; + uint32_t ordinal; + uint32_t datalen; + int ret; + + ordinal = htonl(TPM_ORD_SIGN); + datalen = htonl(bloblen); + + /* OIAP session whose handle is placed in the Sign request */ + ret = oiap(tb, &authhandle, enonce); + if (ret < 0) { + pr_info("oiap failed (%d)\n", ret); + return ret; + } + + /* generate odd nonce; only a negative return is treated as failure */ + ret = tpm_get_random(NULL, nonceodd, TPM_NONCE_SIZE); + if (ret < 0) { + pr_info("tpm_get_random failed (%d)\n", ret); + return ret; + } + + /* calculate authorization HMAC value over ordinal, length and blob */ + ret = TSS_authhmac(authdata, keyauth, SHA1_DIGEST_SIZE, enonce, + nonceodd, cont, sizeof(uint32_t), &ordinal, + sizeof(uint32_t), &datalen, + bloblen, blob, 0, 0); + if (ret < 0) + return ret; + + /* build the request buffer */ + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_AUTH1_COMMAND); + store32(tb, TPM_SIGN_SIZE + bloblen); + store32(tb, TPM_ORD_SIGN); + store32(tb, keyhandle); + store32(tb, bloblen); + storebytes(tb, blob, bloblen); + store32(tb, authhandle); + storebytes(tb, nonceodd, TPM_NONCE_SIZE); + store8(tb, cont); + storebytes(tb, authdata, SHA1_DIGEST_SIZE); + + ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE); /* NOTE(review): the failure message below says authhmac */ + if (ret < 0) { + pr_info("authhmac failed (%d)\n", ret); + return ret; + } + + datalen = LOAD32(tb->data, TPM_DATA_OFFSET); /* datalen is reused for the length read from the response */ + + ret = TSS_checkhmac1(tb->data, ordinal, nonceodd, + keyauth, SHA1_DIGEST_SIZE, + sizeof(uint32_t), TPM_DATA_OFFSET, + datalen, TPM_DATA_OFFSET + sizeof(uint32_t), + 0, 0); + if (ret < 0) { + pr_info("TSS_checkhmac1 failed (%d)\n", ret); + return ret; + } + + memcpy(out, tb->data + TPM_DATA_OFFSET + sizeof(uint32_t), + min(datalen, outlen)); /* the copy is capped at outlen */ + + return datalen; /* uncapped length from the response */ +} +/* + * Maximum buffer size for the BER/DER 
encoded public key. The public key + * is of the form SEQUENCE { INTEGER n, INTEGER e } where n is a maximum 2048 + * bit key and e is usually 65537 + * The encoding overhead is: + * - max 4 bytes for SEQUENCE + * - max 4 bytes for INTEGER n type/length + * - 257 bytes of n + * - max 2 bytes for INTEGER e type/length + * - 3 bytes of e + */ +#define PUB_KEY_BUF_SIZE (4 + 4 + 257 + 2 + 3) + +/* + * Provide a part of a description of the key for /proc/keys. + */ +static void asym_tpm_describe(const struct key *asymmetric_key, + struct seq_file *m) +{ + struct tpm_key *tk = asymmetric_key->payload.data[asym_crypto]; + + if (!tk) + return; + + seq_printf(m, "TPM1.2/Blob"); +} + +static void asym_tpm_destroy(void *payload0, void *payload3) +{ + struct tpm_key *tk = payload0; + + if (!tk) + return; + + kfree(tk->blob); + tk->blob_len = 0; + + kfree(tk); +} + +/* How many bytes will it take to encode the length */ +static inline uint32_t definite_length(uint32_t len) +{ + if (len <= 127) + return 1; + if (len <= 255) + return 2; + return 3; +} + +/* + * Write @tag followed by the DER definite-length encoding of @len at @buf + * and return the position just past what was written. + */ +static inline uint8_t *encode_tag_length(uint8_t *buf, uint8_t tag, + uint32_t len) +{ + *buf++ = tag; + + if (len <= 127) { + buf[0] = len; + return buf + 1; + } + + if (len <= 255) { + buf[0] = 0x81; + buf[1] = len; + return buf + 2; + } + + buf[0] = 0x82; + put_unaligned_be16(len, buf + 1); + return buf + 3; +} + +/* + * Encode the raw modulus @pub_key of @len bytes, together with the fixed + * exponent 0x010001, as SEQUENCE { INTEGER n, INTEGER e } into @buf and + * return the number of bytes written. + */ +static uint32_t derive_pub_key(const void *pub_key, uint32_t len, uint8_t *buf) +{ + uint8_t *cur = buf; + /* + * INTEGER n is emitted with a leading 0x00, i.e. len + 1 content bytes, + * so its length field must be sized for len + 1 as well; sizing it for + * len under-counts the SEQUENCE by one byte when len is 127 or 255. + */ + uint32_t n_len = definite_length(len + 1) + 1 + len + 1; + uint32_t e_len = definite_length(3) + 1 + 3; + uint8_t e[3] = { 0x01, 0x00, 0x01 }; + + /* SEQUENCE */ + cur = encode_tag_length(cur, 0x30, n_len + e_len); + /* INTEGER n */ + cur = encode_tag_length(cur, 0x02, len + 1); + cur[0] = 0x00; + memcpy(cur + 1, pub_key, len); + cur += len + 1; + /* INTEGER e */ + cur = encode_tag_length(cur, 0x02, sizeof(e)); + memcpy(cur, e, sizeof(e)); + cur += sizeof(e); + + return cur - buf; +} + +/* + * Determine the crypto algorithm name. 
+ */ +static int determine_akcipher(const char *encoding, const char *hash_algo, + char alg_name[CRYPTO_MAX_ALG_NAME]) +{ + if (strcmp(encoding, "pkcs1") == 0) { + if (!hash_algo) { + strcpy(alg_name, "pkcs1pad(rsa)"); + return 0; + } + + if (snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "pkcs1pad(rsa,%s)", + hash_algo) >= CRYPTO_MAX_ALG_NAME) + return -EINVAL; /* name would not fit in alg_name */ + + return 0; + } + + if (strcmp(encoding, "raw") == 0) { + strcpy(alg_name, "rsa"); + return 0; + } + + return -ENOPKG; /* neither pkcs1 nor raw */ +} + +/* + * Query information about a key. + * + * key_size, max_data_size and max_dec_size come from the key length + * parsed out of the blob; max_sig_size and max_enc_size come from + * crypto_akcipher_maxsize() of a software akcipher loaded with the + * DER-encoded public key. All four operations are reported as supported. + */ +static int tpm_key_query(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info) +{ + struct tpm_key *tk = params->key->payload.data[asym_crypto]; + int ret; + char alg_name[CRYPTO_MAX_ALG_NAME]; + struct crypto_akcipher *tfm; + uint8_t der_pub_key[PUB_KEY_BUF_SIZE]; + uint32_t der_pub_key_len; + int len; + + /* TPM only works on private keys, public keys still done in software */ + ret = determine_akcipher(params->encoding, params->hash_algo, alg_name); + if (ret < 0) + return ret; + + tfm = crypto_alloc_akcipher(alg_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + der_pub_key_len = derive_pub_key(tk->pub_key, tk->pub_key_len, + der_pub_key); + + ret = crypto_akcipher_set_pub_key(tfm, der_pub_key, der_pub_key_len); + if (ret < 0) + goto error_free_tfm; + + len = crypto_akcipher_maxsize(tfm); + + info->key_size = tk->key_len; + info->max_data_size = tk->key_len / 8; + info->max_sig_size = len; + info->max_enc_size = len; + info->max_dec_size = tk->key_len / 8; + + info->supported_ops = KEYCTL_SUPPORTS_ENCRYPT | + KEYCTL_SUPPORTS_DECRYPT | + KEYCTL_SUPPORTS_VERIFY | + KEYCTL_SUPPORTS_SIGN; + + ret = 0; +error_free_tfm: + crypto_free_akcipher(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} + +/* + * Encryption operation is performed with the public key. 
Hence it is done + * in software + */ +static int tpm_key_encrypt(struct tpm_key *tk, + struct kernel_pkey_params *params, + const void *in, void *out) +{ + char alg_name[CRYPTO_MAX_ALG_NAME]; + struct crypto_akcipher *tfm; + struct akcipher_request *req; + struct crypto_wait cwait; + struct scatterlist in_sg, out_sg; + uint8_t der_pub_key[PUB_KEY_BUF_SIZE]; + uint32_t der_pub_key_len; + int ret; + + pr_devel("==>%s()\n", __func__); + + ret = determine_akcipher(params->encoding, params->hash_algo, alg_name); + if (ret < 0) + return ret; + + tfm = crypto_alloc_akcipher(alg_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + der_pub_key_len = derive_pub_key(tk->pub_key, tk->pub_key_len, + der_pub_key); + + ret = crypto_akcipher_set_pub_key(tfm, der_pub_key, der_pub_key_len); + if (ret < 0) + goto error_free_tfm; + + req = akcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) + goto error_free_tfm; + + sg_init_one(&in_sg, in, params->in_len); + sg_init_one(&out_sg, out, params->out_len); + akcipher_request_set_crypt(req, &in_sg, &out_sg, params->in_len, + params->out_len); + crypto_init_wait(&cwait); + akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &cwait); + + ret = crypto_akcipher_encrypt(req); + ret = crypto_wait_req(ret, &cwait); + + if (ret == 0) + ret = req->dst_len; + + akcipher_request_free(req); +error_free_tfm: + crypto_free_akcipher(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} + +/* + * Decryption operation is performed with the private key in the TPM. 
+ */ +static int tpm_key_decrypt(struct tpm_key *tk, + struct kernel_pkey_params *params, + const void *in, void *out) +{ + struct tpm_buf *tb; + uint32_t keyhandle; + uint8_t srkauth[SHA1_DIGEST_SIZE]; + uint8_t keyauth[SHA1_DIGEST_SIZE]; + int r; + + pr_devel("==>%s()\n", __func__); + + if (params->hash_algo) + return -ENOPKG; + + if (strcmp(params->encoding, "pkcs1")) + return -ENOPKG; + + tb = kzalloc(sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + + /* TODO: Handle a non-all zero SRK authorization */ + memset(srkauth, 0, sizeof(srkauth)); + + r = tpm_loadkey2(tb, SRKHANDLE, srkauth, + tk->blob, tk->blob_len, &keyhandle); + if (r < 0) { + pr_devel("loadkey2 failed (%d)\n", r); + goto error; + } + + /* TODO: Handle a non-all zero key authorization */ + memset(keyauth, 0, sizeof(keyauth)); + + r = tpm_unbind(tb, keyhandle, keyauth, + in, params->in_len, out, params->out_len); + if (r < 0) + pr_devel("tpm_unbind failed (%d)\n", r); + + if (tpm_flushspecific(tb, keyhandle) < 0) + pr_devel("flushspecific failed (%d)\n", r); + +error: + kzfree(tb); + pr_devel("<==%s() = %d\n", __func__, r); + return r; +} + +/* + * Hash algorithm OIDs plus ASN.1 DER wrappings [RFC4880 sec 5.2.2]. 
+ */ +static const u8 digest_info_md5[] = { + 0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, + 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05, /* OID */ + 0x05, 0x00, 0x04, 0x10 +}; + +static const u8 digest_info_sha1[] = { + 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, + 0x2b, 0x0e, 0x03, 0x02, 0x1a, + 0x05, 0x00, 0x04, 0x14 +}; + +static const u8 digest_info_rmd160[] = { + 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, + 0x2b, 0x24, 0x03, 0x02, 0x01, + 0x05, 0x00, 0x04, 0x14 +}; + +static const u8 digest_info_sha224[] = { + 0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09, + 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04, + 0x05, 0x00, 0x04, 0x1c +}; + +static const u8 digest_info_sha256[] = { + 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, + 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, + 0x05, 0x00, 0x04, 0x20 +}; + +static const u8 digest_info_sha384[] = { + 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, + 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, + 0x05, 0x00, 0x04, 0x30 +}; + +static const u8 digest_info_sha512[] = { + 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, + 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, + 0x05, 0x00, 0x04, 0x40 +}; + +static const struct asn1_template { + const char *name; /* hash name matched by lookup_asn1() */ + const u8 *data; /* one of the digest_info_* arrays above */ + size_t size; /* sizeof that array */ +} asn1_templates[] = { +#define _(X) { #X, digest_info_##X, sizeof(digest_info_##X) } + _(md5), + _(sha1), + _(rmd160), + _(sha256), + _(sha384), + _(sha512), + _(sha224), + { NULL } /* terminator: lookup_asn1() stops at a NULL name */ +#undef _ +}; + +static const struct asn1_template *lookup_asn1(const char *name) +{ + const struct asn1_template *p; + + for (p = asn1_templates; p->name; p++) /* linear scan of the table */ + if (strcmp(name, p->name) == 0) + return p; + return NULL; /* no template with that name */ +} + +/* + * Sign operation is performed with the private key in the TPM. 
+ */ +static int tpm_key_sign(struct tpm_key *tk, + struct kernel_pkey_params *params, + const void *in, void *out) +{ + struct tpm_buf *tb; + uint32_t keyhandle; + uint8_t srkauth[SHA1_DIGEST_SIZE]; + uint8_t keyauth[SHA1_DIGEST_SIZE]; + void *asn1_wrapped = NULL; + uint32_t in_len = params->in_len; + int r; + int flush; + + pr_devel("==>%s()\n", __func__); + + if (strcmp(params->encoding, "pkcs1")) + return -ENOPKG; + + if (params->hash_algo) { + const struct asn1_template *asn1 = + lookup_asn1(params->hash_algo); + + if (!asn1) + return -ENOPKG; + + /* request enough space for the ASN.1 template + input hash */ + asn1_wrapped = kzalloc(in_len + asn1->size, GFP_KERNEL); + if (!asn1_wrapped) + return -ENOMEM; + + /* Copy ASN.1 template, then the input */ + memcpy(asn1_wrapped, asn1->data, asn1->size); + memcpy(asn1_wrapped + asn1->size, in, in_len); + + in = asn1_wrapped; + in_len += asn1->size; + } + + /* The data to sign must leave 11 bytes of the key length free */ + if (in_len > tk->key_len / 8 - 11) { + r = -EOVERFLOW; + goto error_free_asn1_wrapped; + } + + r = -ENOMEM; + tb = kzalloc(sizeof(*tb), GFP_KERNEL); + if (!tb) + goto error_free_asn1_wrapped; + + /* TODO: Handle a non-all zero SRK authorization */ + memset(srkauth, 0, sizeof(srkauth)); + + r = tpm_loadkey2(tb, SRKHANDLE, srkauth, + tk->blob, tk->blob_len, &keyhandle); + if (r < 0) { + pr_devel("loadkey2 failed (%d)\n", r); + goto error_free_tb; + } + + /* TODO: Handle a non-all zero key authorization */ + memset(keyauth, 0, sizeof(keyauth)); + + r = tpm_sign(tb, keyhandle, keyauth, in, in_len, out, params->out_len); + if (r < 0) + pr_devel("tpm_sign failed (%d)\n", r); + + /* + * The loaded key is flushed whether or not the sign worked. Log the + * flush result itself; r still holds the sign result that is returned. + */ + flush = tpm_flushspecific(tb, keyhandle); + if (flush < 0) + pr_devel("flushspecific failed (%d)\n", flush); + +error_free_tb: + kzfree(tb); +error_free_asn1_wrapped: + kfree(asn1_wrapped); + pr_devel("<==%s() = %d\n", __func__, r); + return r; +} + +/* + * Do encryption, decryption and signing ops. 
+ */ +static int tpm_key_eds_op(struct kernel_pkey_params *params, + const void *in, void *out) +{ + struct tpm_key *tk = params->key->payload.data[asym_crypto]; + int ret = -EOPNOTSUPP; + + /* Dispatch on the requested operation; any other op value hits BUG(). */ + switch (params->op) { + case kernel_pkey_encrypt: + ret = tpm_key_encrypt(tk, params, in, out); + break; + case kernel_pkey_decrypt: + ret = tpm_key_decrypt(tk, params, in, out); + break; + case kernel_pkey_sign: + ret = tpm_key_sign(tk, params, in, out); + break; + default: + BUG(); + } + + return ret; +} + +/* + * Verify a signature using a public key. + * + * This is done with a software akcipher that is given the DER-encoded + * public key derived from the blob; no TPM command is issued here. + */ +static int tpm_key_verify_signature(const struct key *key, + const struct public_key_signature *sig) +{ + const struct tpm_key *tk = key->payload.data[asym_crypto]; + struct crypto_wait cwait; + struct crypto_akcipher *tfm; + struct akcipher_request *req; + struct scatterlist sig_sg, digest_sg; + char alg_name[CRYPTO_MAX_ALG_NAME]; + uint8_t der_pub_key[PUB_KEY_BUF_SIZE]; + uint32_t der_pub_key_len; + void *output; + unsigned int outlen; + int ret; + + pr_devel("==>%s()\n", __func__); + + BUG_ON(!tk); + BUG_ON(!sig); + BUG_ON(!sig->s); + + if (!sig->digest) + return -ENOPKG; + + ret = determine_akcipher(sig->encoding, sig->hash_algo, alg_name); + if (ret < 0) + return ret; + + tfm = crypto_alloc_akcipher(alg_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + der_pub_key_len = derive_pub_key(tk->pub_key, tk->pub_key_len, + der_pub_key); + + ret = crypto_akcipher_set_pub_key(tfm, der_pub_key, der_pub_key_len); + if (ret < 0) + goto error_free_tfm; + + ret = -ENOMEM; + req = akcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) + goto error_free_tfm; + + ret = -ENOMEM; + outlen = crypto_akcipher_maxsize(tfm); + output = kmalloc(outlen, GFP_KERNEL); + if (!output) + goto error_free_req; + + sg_init_one(&sig_sg, sig->s, sig->s_size); + sg_init_one(&digest_sg, output, outlen); + akcipher_request_set_crypt(req, &sig_sg, &digest_sg, sig->s_size, + outlen); + 
crypto_init_wait(&cwait); + akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &cwait); + + /* Perform the verification calculation. This doesn't actually do the + * verification, but rather calculates the hash expected by the + * signature and returns that to us. + */ + ret = crypto_wait_req(crypto_akcipher_verify(req), &cwait); + if (ret) + goto out_free_output; + + /* Do the actual verification step. */ + if (req->dst_len != sig->digest_size || + memcmp(sig->digest, output, sig->digest_size) != 0) + ret = -EKEYREJECTED; + +out_free_output: + kfree(output); +error_free_req: + akcipher_request_free(req); +error_free_tfm: + crypto_free_akcipher(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + if (WARN_ON_ONCE(ret > 0)) + ret = -EINVAL; + return ret; +} + +/* + * Parse enough information out of TPM_KEY structure: + * TPM_STRUCT_VER -> 4 bytes + * TPM_KEY_USAGE -> 2 bytes + * TPM_KEY_FLAGS -> 4 bytes + * TPM_AUTH_DATA_USAGE -> 1 byte + * TPM_KEY_PARMS -> variable + * UINT32 PCRInfoSize -> 4 bytes + * BYTE* -> PCRInfoSize bytes + * TPM_STORE_PUBKEY + * UINT32 encDataSize; + * BYTE* -> encDataSize; + * + * TPM_KEY_PARMS: + * TPM_ALGORITHM_ID -> 4 bytes + * TPM_ENC_SCHEME -> 2 bytes + * TPM_SIG_SCHEME -> 2 bytes + * UINT32 parmSize -> 4 bytes + * BYTE* -> variable + * + * On success tk->key_len, tk->pub_key and tk->pub_key_len are filled in + * (pub_key points into tk->blob) and 0 is returned. -EBADMSG is returned + * for truncated or unexpected data and -EINVAL for an unsupported key + * length or public key size. Every size field taken from the blob is + * checked against the bytes that remain before it is used. + */ +static int extract_key_parameters(struct tpm_key *tk) +{ + const void *cur = tk->blob; + uint32_t len = tk->blob_len; + const void *pub_key; + uint32_t sz; + uint32_t key_len; + + if (len < 11) + return -EBADMSG; + + /* Ensure this is a legacy key */ + if (get_unaligned_be16(cur + 4) != 0x0015) + return -EBADMSG; + + /* Skip to TPM_KEY_PARMS */ + cur += 11; + len -= 11; + + if (len < 12) + return -EBADMSG; + + /* Make sure this is an RSA key */ + if (get_unaligned_be32(cur) != 0x00000001) + return -EBADMSG; + + /* Make sure this is TPM_ES_RSAESPKCSv15 encoding scheme */ + if (get_unaligned_be16(cur + 4) != 0x0002) + return -EBADMSG; + + /* 
Make sure this is TPM_SS_RSASSAPKCS1v15_DER signature scheme */ + if (get_unaligned_be16(cur + 6) != 0x0003) + return -EBADMSG; + + /* + * parmSize comes from the blob. Compare it against the remaining + * length by subtraction (len >= 12 was checked above) so that a huge + * value cannot wrap the sum, and require room for the 4-byte key + * length that is read next. + */ + sz = get_unaligned_be32(cur + 8); + if (sz < 4 || sz > len - 12) + return -EBADMSG; + + /* Move to TPM_RSA_KEY_PARMS */ + len -= 12; + cur += 12; + + /* Grab the RSA key length */ + key_len = get_unaligned_be32(cur); + + switch (key_len) { + case 512: + case 1024: + case 1536: + case 2048: + break; + default: + return -EINVAL; + } + + /* Move just past TPM_KEY_PARMS */ + cur += sz; + len -= sz; + + if (len < 4) + return -EBADMSG; + + /* PCRInfoSize: same wrap-free comparison as above */ + sz = get_unaligned_be32(cur); + if (sz > len - 4) + return -EBADMSG; + + /* Move to TPM_STORE_PUBKEY */ + cur += 4 + sz; + len -= 4 + sz; + + /* The 4-byte public key size must itself lie inside the blob */ + if (len < 4) + return -EBADMSG; + + /* Grab the size of the public key, it should jive with the key size */ + sz = get_unaligned_be32(cur); + if (sz > 256) + return -EINVAL; + + /* The public key bytes must lie inside the blob as well */ + if (sz > len - 4) + return -EBADMSG; + + pub_key = cur + 4; + + tk->key_len = key_len; + tk->pub_key = pub_key; + tk->pub_key_len = sz; + + return 0; +} + +/* + * Given the blob, duplicate it and parse the key parameters out of it. + * Nothing is sent to the TPM here beyond the tpm_is_tpm2() check; the key + * is loaded by tpm_loadkey2() when a decrypt or sign operation is run. + * Returns the new tpm_key or an ERR_PTR(). + */ +struct tpm_key *tpm_key_create(const void *blob, uint32_t blob_len) +{ + int r; + struct tpm_key *tk; + + r = tpm_is_tpm2(NULL); + if (r < 0) + goto error; + + /* We don't support TPM2 yet */ + if (r > 0) { + r = -ENODEV; + goto error; + } + + r = -ENOMEM; + tk = kzalloc(sizeof(struct tpm_key), GFP_KERNEL); + if (!tk) + goto error; + + tk->blob = kmemdup(blob, blob_len, GFP_KERNEL); + if (!tk->blob) + goto error_memdup; + + tk->blob_len = blob_len; + + r = extract_key_parameters(tk); + if (r < 0) + goto error_extract; + + return tk; + +error_extract: + kfree(tk->blob); + tk->blob_len = 0; +error_memdup: + kfree(tk); +error: + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(tpm_key_create); + +/* + * TPM-based asymmetric key subtype + */ +struct asymmetric_key_subtype asym_tpm_subtype = { + .owner = THIS_MODULE, + .name = "asym_tpm", + .name_len = sizeof("asym_tpm") - 1, + .describe = asym_tpm_describe, + .destroy = asym_tpm_destroy, + .query = tpm_key_query, + .eds_op = 
tpm_key_eds_op, + .verify_signature = tpm_key_verify_signature, +}; +EXPORT_SYMBOL_GPL(asym_tpm_subtype); + +MODULE_DESCRIPTION("TPM based asymmetric key subtype"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); diff --git a/crypto/asymmetric_keys/asymmetric_keys.h b/crypto/asymmetric_keys/asymmetric_keys.h index ca8e9ac34ce6..7be1ccf4fa9f 100644 --- a/crypto/asymmetric_keys/asymmetric_keys.h +++ b/crypto/asymmetric_keys/asymmetric_keys.h @@ -16,3 +16,6 @@ extern struct asymmetric_key_id *asymmetric_key_hex_to_key_id(const char *id); extern int __asymmetric_key_hex_to_key_id(const char *id, struct asymmetric_key_id *match_id, size_t hexlen); + +extern int asymmetric_key_eds_op(struct kernel_pkey_params *params, + const void *in, void *out); diff --git a/crypto/asymmetric_keys/asymmetric_type.c b/crypto/asymmetric_keys/asymmetric_type.c index 26539e9a8bda..69a0788a7de5 100644 --- a/crypto/asymmetric_keys/asymmetric_type.c +++ b/crypto/asymmetric_keys/asymmetric_type.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/ctype.h> #include <keys/system_keyring.h> +#include <keys/user-type.h> #include "asymmetric_keys.h" MODULE_LICENSE("GPL"); @@ -538,6 +539,45 @@ out: return ret; } +int asymmetric_key_eds_op(struct kernel_pkey_params *params, + const void *in, void *out) +{ + const struct asymmetric_key_subtype *subtype; + struct key *key = params->key; + int ret; + + pr_devel("==>%s()\n", __func__); + + if (key->type != &key_type_asymmetric) + return -EINVAL; /* not an asymmetric key */ + subtype = asymmetric_key_subtype(key); + if (!subtype || + !key->payload.data[0]) + return -EINVAL; /* no subtype, or payload slot 0 is empty */ + if (!subtype->eds_op) + return -ENOTSUPP; /* the subtype provides no eds_op handler */ + + ret = subtype->eds_op(params, in, out); /* delegate to the subtype */ + + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} + +static int asymmetric_key_verify_signature(struct kernel_pkey_params *params, + const void *in, const void *in2) +{ + struct public_key_signature sig = { + .s_size = params->in2_len, + .digest_size = params->in_len, + .encoding = 
params->encoding, + .hash_algo = params->hash_algo, + .digest = (void *)in, /* in is passed on as the digest */ + .s = (void *)in2, /* in2 is passed on as the signature */ + }; + + return verify_signature(params->key, &sig); +} + struct key_type key_type_asymmetric = { .name = "asymmetric", .preparse = asymmetric_key_preparse, @@ -548,6 +588,9 @@ struct key_type key_type_asymmetric = { .destroy = asymmetric_key_destroy, .describe = asymmetric_key_describe, .lookup_restriction = asymmetric_lookup_restriction, + .asym_query = query_asymmetric_key, + .asym_eds_op = asymmetric_key_eds_op, + .asym_verify_signature = asymmetric_key_verify_signature, }; EXPORT_SYMBOL_GPL(key_type_asymmetric); diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 0f134162cef4..f0d56e1a8b7e 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -271,6 +271,7 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, switch (ctx->last_oid) { case OID_rsaEncryption: ctx->sinfo->sig->pkey_algo = "rsa"; + ctx->sinfo->sig->encoding = "pkcs1"; break; default: printk("Unsupported pkey algo: %u\n", ctx->last_oid); diff --git a/crypto/asymmetric_keys/pkcs8.asn1 b/crypto/asymmetric_keys/pkcs8.asn1 new file mode 100644 index 000000000000..702c41a3c713 --- /dev/null +++ b/crypto/asymmetric_keys/pkcs8.asn1 @@ -0,0 +1,24 @@ +-- +-- This is the unencrypted variant +-- +PrivateKeyInfo ::= SEQUENCE { + version Version, + privateKeyAlgorithm PrivateKeyAlgorithmIdentifier, + privateKey PrivateKey, + attributes [0] IMPLICIT Attributes OPTIONAL +} + +Version ::= INTEGER ({ pkcs8_note_version }) + +PrivateKeyAlgorithmIdentifier ::= AlgorithmIdentifier ({ pkcs8_note_algo }) + +PrivateKey ::= OCTET STRING ({ pkcs8_note_key }) + +Attributes ::= SET OF Attribute + +Attribute ::= ANY + +AlgorithmIdentifier ::= SEQUENCE { + algorithm OBJECT IDENTIFIER ({ pkcs8_note_OID }), + parameters ANY OPTIONAL +} diff --git a/crypto/asymmetric_keys/pkcs8_parser.c b/crypto/asymmetric_keys/pkcs8_parser.c new file mode 
100644 index 000000000000..5f6a7ecc9765 --- /dev/null +++ b/crypto/asymmetric_keys/pkcs8_parser.c @@ -0,0 +1,184 @@ +/* PKCS#8 Private Key parser [RFC 5208]. + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "PKCS8: "fmt +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/oid_registry.h> +#include <keys/asymmetric-subtype.h> +#include <keys/asymmetric-parser.h> +#include <crypto/public_key.h> +#include "pkcs8.asn1.h" + +struct pkcs8_parse_context { + struct public_key *pub; /* Key object being filled in */ + unsigned long data; /* Start of data */ + enum OID last_oid; /* Last OID encountered */ + enum OID algo_oid; /* Algorithm OID */ + u32 key_size; /* Length recorded by pkcs8_note_key() */ + const void *key; /* Key data pointer recorded by pkcs8_note_key() */ +}; + +/* + * Note an OID when we find one for later processing when we know how to + * interpret it. + * + * An OID that look_up_OID() does not recognise is only logged; 0 is + * returned in either case. + */ +int pkcs8_note_OID(void *context, size_t hdrlen, + unsigned char tag, + const void *value, size_t vlen) +{ + struct pkcs8_parse_context *ctx = context; + + ctx->last_oid = look_up_OID(value, vlen); + if (ctx->last_oid == OID__NR) { + char buffer[50]; + + sprint_oid(value, vlen, buffer, sizeof(buffer)); + pr_info("Unknown OID: [%lu] %s\n", + (unsigned long)value - ctx->data, buffer); + } + return 0; +} + +/* + * Note the version number of the ASN.1 blob. + * + * Only a single-byte value of 0 is accepted; anything else is rejected + * with -EBADMSG. + */ +int pkcs8_note_version(void *context, size_t hdrlen, + unsigned char tag, + const void *value, size_t vlen) +{ + if (vlen != 1 || ((const u8 *)value)[0] != 0) { + pr_warn("Unsupported PKCS#8 version\n"); + return -EBADMSG; + } + return 0; +} + +/* + * Note the public algorithm. 
+ */ +int pkcs8_note_algo(void *context, size_t hdrlen, + unsigned char tag, + const void *value, size_t vlen) +{ + struct pkcs8_parse_context *ctx = context; + + if (ctx->last_oid != OID_rsaEncryption) + return -ENOPKG; /* only rsaEncryption is accepted */ + + ctx->pub->pkey_algo = "rsa"; + return 0; +} + +/* + * Note the key data of the ASN.1 blob. + * + * Only the pointer and length are recorded here; pkcs8_parse() duplicates + * the data once decoding has finished. + */ +int pkcs8_note_key(void *context, size_t hdrlen, + unsigned char tag, + const void *value, size_t vlen) +{ + struct pkcs8_parse_context *ctx = context; + + ctx->key = value; + ctx->key_size = vlen; + return 0; +} + +/* + * Parse a PKCS#8 private key blob. + * + * Returns a newly allocated public_key holding a copy of the key data with + * key_is_private set, or an ERR_PTR() if allocation or decoding fails. + */ +static struct public_key *pkcs8_parse(const void *data, size_t datalen) +{ + struct pkcs8_parse_context ctx; + struct public_key *pub; + long ret; + + memset(&ctx, 0, sizeof(ctx)); + + ret = -ENOMEM; + ctx.pub = kzalloc(sizeof(struct public_key), GFP_KERNEL); + if (!ctx.pub) + goto error; + + ctx.data = (unsigned long)data; + + /* Attempt to decode the private key */ + ret = asn1_ber_decoder(&pkcs8_decoder, &ctx, data, datalen); + if (ret < 0) + goto error_decode; + + ret = -ENOMEM; + pub = ctx.pub; + pub->key = kmemdup(ctx.key, ctx.key_size, GFP_KERNEL); + if (!pub->key) + goto error_decode; + + pub->keylen = ctx.key_size; + pub->key_is_private = true; + return pub; + +error_decode: + kfree(ctx.pub); +error: + return ERR_PTR(ret); +} + +/* + * Attempt to parse a data blob for a key as a PKCS#8 private key. 
+ */ +static int pkcs8_key_preparse(struct key_preparsed_payload *prep) +{ + struct public_key *pub; + + pub = pkcs8_parse(prep->data, prep->datalen); + if (IS_ERR(pub)) + return PTR_ERR(pub); + + pr_devel("Cert Key Algo: %s\n", pub->pkey_algo); + pub->id_type = "PKCS8"; + + /* We're pinning the module by being linked against it */ + __module_get(public_key_subtype.owner); + prep->payload.data[asym_subtype] = &public_key_subtype; + prep->payload.data[asym_key_ids] = NULL; + prep->payload.data[asym_crypto] = pub; /* the parsed key becomes the payload */ + prep->payload.data[asym_auth] = NULL; + prep->quotalen = 100; + return 0; +} + +static struct asymmetric_key_parser pkcs8_key_parser = { + .owner = THIS_MODULE, + .name = "pkcs8", + .parse = pkcs8_key_preparse, +}; + +/* + * Module stuff: the parser is registered on init and unregistered on exit. + */ +static int __init pkcs8_key_init(void) +{ + return register_asymmetric_key_parser(&pkcs8_key_parser); +} + +static void __exit pkcs8_key_exit(void) +{ + unregister_asymmetric_key_parser(&pkcs8_key_parser); +} + +module_init(pkcs8_key_init); +module_exit(pkcs8_key_exit); + +MODULE_DESCRIPTION("PKCS#8 certificate parser"); +MODULE_LICENSE("GPL"); diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index e929fe1e4106..f5d85b47fcc6 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -60,6 +60,165 @@ static void public_key_destroy(void *payload0, void *payload3) } /* + * Determine the crypto algorithm name. + * + * The pkcs1 encoding gives pkcs1pad(pkey_algo) or, with a hash, + * pkcs1pad(pkey_algo,hash_algo); the raw encoding gives pkey_algo itself. + * Any other encoding is -ENOPKG and a name that does not fit is -EINVAL. + */ +static +int software_key_determine_akcipher(const char *encoding, + const char *hash_algo, + const struct public_key *pkey, + char alg_name[CRYPTO_MAX_ALG_NAME]) +{ + int n; + + if (strcmp(encoding, "pkcs1") == 0) { + /* The data wangled by the RSA algorithm is typically padded + * and encoded in some manner, such as EMSA-PKCS1-1_5 [RFC3447 + * sec 8.2]. 
+ */ + if (!hash_algo) + n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, + "pkcs1pad(%s)", + pkey->pkey_algo); + else + n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, + "pkcs1pad(%s,%s)", + pkey->pkey_algo, hash_algo); + return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0; + } + + if (strcmp(encoding, "raw") == 0) { + strcpy(alg_name, pkey->pkey_algo); + return 0; + } + + return -ENOPKG; +} + +/* + * Query information about a key. + */ +static int software_key_query(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info) +{ + struct crypto_akcipher *tfm; + struct public_key *pkey = params->key->payload.data[asym_crypto]; + char alg_name[CRYPTO_MAX_ALG_NAME]; + int ret, len; + + ret = software_key_determine_akcipher(params->encoding, + params->hash_algo, + pkey, alg_name); + if (ret < 0) + return ret; + + tfm = crypto_alloc_akcipher(alg_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + if (pkey->key_is_private) + ret = crypto_akcipher_set_priv_key(tfm, + pkey->key, pkey->keylen); + else + ret = crypto_akcipher_set_pub_key(tfm, + pkey->key, pkey->keylen); + if (ret < 0) + goto error_free_tfm; + + len = crypto_akcipher_maxsize(tfm); + info->key_size = len * 8; + info->max_data_size = len; + info->max_sig_size = len; + info->max_enc_size = len; + info->max_dec_size = len; + info->supported_ops = (KEYCTL_SUPPORTS_ENCRYPT | + KEYCTL_SUPPORTS_VERIFY); + if (pkey->key_is_private) + info->supported_ops |= (KEYCTL_SUPPORTS_DECRYPT | + KEYCTL_SUPPORTS_SIGN); + ret = 0; + +error_free_tfm: + crypto_free_akcipher(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} + +/* + * Do encryption, decryption and signing ops. 
+ *
+ * The akcipher transform is chosen from params->encoding and
+ * params->hash_algo by software_key_determine_akcipher() and is keyed with
+ * the private or public key material held in the key's asym_crypto payload.
+ * params->op selects between encrypt, decrypt and sign; @in (params->in_len
+ * bytes) is the input and @out (params->out_len bytes) receives the result.
+ *
+ * Returns the length of the output (req->dst_len) on success or a negative
+ * error code on failure.
+ */
+static int software_key_eds_op(struct kernel_pkey_params *params,
+			       const void *in, void *out)
+{
+	const struct public_key *pkey = params->key->payload.data[asym_crypto];
+	struct akcipher_request *req;
+	struct crypto_akcipher *tfm;
+	struct crypto_wait cwait;
+	struct scatterlist in_sg, out_sg;
+	char alg_name[CRYPTO_MAX_ALG_NAME];
+	int ret;
+
+	pr_devel("==>%s()\n", __func__);
+
+	ret = software_key_determine_akcipher(params->encoding,
+					      params->hash_algo,
+					      pkey, alg_name);
+	if (ret < 0)
+		return ret;
+
+	tfm = crypto_alloc_akcipher(alg_name, 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	/*
+	 * ret is still 0 at this point, so it has to be set before taking
+	 * the error path; otherwise a failed request allocation would be
+	 * reported to the caller as a successful zero-length result.
+	 */
+	ret = -ENOMEM;
+	req = akcipher_request_alloc(tfm, GFP_KERNEL);
+	if (!req)
+		goto error_free_tfm;
+
+	if (pkey->key_is_private)
+		ret = crypto_akcipher_set_priv_key(tfm,
+						   pkey->key, pkey->keylen);
+	else
+		ret = crypto_akcipher_set_pub_key(tfm,
+						  pkey->key, pkey->keylen);
+	if (ret)
+		goto error_free_req;
+
+	sg_init_one(&in_sg, in, params->in_len);
+	sg_init_one(&out_sg, out, params->out_len);
+	akcipher_request_set_crypt(req, &in_sg, &out_sg, params->in_len,
+				   params->out_len);
+	crypto_init_wait(&cwait);
+	akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+				      CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &cwait);
+
+	/* Perform the encryption calculation. */
+	switch (params->op) {
+	case kernel_pkey_encrypt:
+		ret = crypto_akcipher_encrypt(req);
+		break;
+	case kernel_pkey_decrypt:
+		ret = crypto_akcipher_decrypt(req);
+		break;
+	case kernel_pkey_sign:
+		ret = crypto_akcipher_sign(req);
+		break;
+	default:
+		BUG();
+	}
+
+	/* Wait for an asynchronous completion, then report the output size */
+	ret = crypto_wait_req(ret, &cwait);
+	if (ret == 0)
+		ret = req->dst_len;
+
+error_free_req:
+	akcipher_request_free(req);
+error_free_tfm:
+	crypto_free_akcipher(tfm);
+	pr_devel("<==%s() = %d\n", __func__, ret);
+	return ret;
+}
+
+/*
  * Verify a signature using a public key.
*/ int public_key_verify_signature(const struct public_key *pkey, @@ -69,8 +228,7 @@ int public_key_verify_signature(const struct public_key *pkey, struct crypto_akcipher *tfm; struct akcipher_request *req; struct scatterlist sig_sg, digest_sg; - const char *alg_name; - char alg_name_buf[CRYPTO_MAX_ALG_NAME]; + char alg_name[CRYPTO_MAX_ALG_NAME]; void *output; unsigned int outlen; int ret; @@ -81,21 +239,11 @@ int public_key_verify_signature(const struct public_key *pkey, BUG_ON(!sig); BUG_ON(!sig->s); - if (!sig->digest) - return -ENOPKG; - - alg_name = sig->pkey_algo; - if (strcmp(sig->pkey_algo, "rsa") == 0) { - /* The data wangled by the RSA algorithm is typically padded - * and encoded in some manner, such as EMSA-PKCS1-1_5 [RFC3447 - * sec 8.2]. - */ - if (snprintf(alg_name_buf, CRYPTO_MAX_ALG_NAME, - "pkcs1pad(rsa,%s)", sig->hash_algo - ) >= CRYPTO_MAX_ALG_NAME) - return -EINVAL; - alg_name = alg_name_buf; - } + ret = software_key_determine_akcipher(sig->encoding, + sig->hash_algo, + pkey, alg_name); + if (ret < 0) + return ret; tfm = crypto_alloc_akcipher(alg_name, 0, 0); if (IS_ERR(tfm)) @@ -106,7 +254,12 @@ int public_key_verify_signature(const struct public_key *pkey, if (!req) goto error_free_tfm; - ret = crypto_akcipher_set_pub_key(tfm, pkey->key, pkey->keylen); + if (pkey->key_is_private) + ret = crypto_akcipher_set_priv_key(tfm, + pkey->key, pkey->keylen); + else + ret = crypto_akcipher_set_pub_key(tfm, + pkey->key, pkey->keylen); if (ret) goto error_free_req; @@ -167,6 +320,8 @@ struct asymmetric_key_subtype public_key_subtype = { .name_len = sizeof("public_key") - 1, .describe = public_key_describe, .destroy = public_key_destroy, + .query = software_key_query, + .eds_op = software_key_eds_op, .verify_signature = public_key_verify_signature_2, }; EXPORT_SYMBOL_GPL(public_key_subtype); diff --git a/crypto/asymmetric_keys/signature.c b/crypto/asymmetric_keys/signature.c index 28198314bc39..ad95a58c6642 100644 --- a/crypto/asymmetric_keys/signature.c 
+++ b/crypto/asymmetric_keys/signature.c
@@ -16,7 +16,9 @@
 #include <linux/export.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/keyctl.h>
 #include <crypto/public_key.h>
+#include <keys/user-type.h>
 #include "asymmetric_keys.h"
 
 /*
@@ -37,6 +39,99 @@ void public_key_signature_free(struct public_key_signature *sig)
 EXPORT_SYMBOL_GPL(public_key_signature_free);
 
 /**
+ * query_asymmetric_key - Get information about an asymmetric key.
+ * @params: Various parameters.
+ * @info: Where to put the information.
+ *
+ * Returns -EINVAL if params->key is not an asymmetric key or has no subtype
+ * or payload, -ENOTSUPP if the subtype provides no query operation, and
+ * otherwise whatever the subtype's query operation returns.
+ */
+int query_asymmetric_key(const struct kernel_pkey_params *params,
+			 struct kernel_pkey_query *info)
+{
+	const struct asymmetric_key_subtype *subtype;
+	struct key *key = params->key;
+	int ret;
+
+	pr_devel("==>%s()\n", __func__);
+
+	if (key->type != &key_type_asymmetric)
+		return -EINVAL;
+	subtype = asymmetric_key_subtype(key);
+	if (!subtype ||
+	    !key->payload.data[0])
+		return -EINVAL;
+	if (!subtype->query)
+		return -ENOTSUPP;
+
+	ret = subtype->query(params, info);
+
+	pr_devel("<==%s() = %d\n", __func__, ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(query_asymmetric_key);
+
+/**
+ * encrypt_blob - Encrypt data using an asymmetric key
+ * @params: Various parameters
+ * @data: Data blob to be encrypted, length params->in_len
+ * @enc: Encrypted data buffer, length params->out_len
+ *
+ * Encrypt the specified data blob using the private key specified by
+ * params->key.  The encrypted data is wrapped in an encoding if
+ * params->encoding is specified (eg. "pkcs1").
+ *
+ * Returns the length of the data placed in the encrypted data buffer or an
+ * error.
+ */
+int encrypt_blob(struct kernel_pkey_params *params,
+		 const void *data, void *enc)
+{
+	params->op = kernel_pkey_encrypt;
+	return asymmetric_key_eds_op(params, data, enc);
+}
+EXPORT_SYMBOL_GPL(encrypt_blob);
+
+/**
+ * decrypt_blob - Decrypt data using an asymmetric key
+ * @params: Various parameters
+ * @enc: Encrypted data to be decrypted, length params->in_len
+ * @data: Decrypted data buffer, length params->out_len
+ *
+ * Decrypt the specified data blob using the private key specified by
+ * params->key.  The decrypted data is wrapped in an encoding if
+ * params->encoding is specified (eg. "pkcs1").
+ *
+ * Returns the length of the data placed in the decrypted data buffer or an
+ * error.
+ */
+int decrypt_blob(struct kernel_pkey_params *params,
+		 const void *enc, void *data)
+{
+	params->op = kernel_pkey_decrypt;
+	return asymmetric_key_eds_op(params, enc, data);
+}
+EXPORT_SYMBOL_GPL(decrypt_blob);
+
+/**
+ * create_signature - Sign some data using an asymmetric key
+ * @params: Various parameters
+ * @data: Data blob to be signed, length params->in_len
+ * @enc: Signature buffer, length params->out_len
+ *
+ * Sign the specified data blob using the private key specified by params->key.
+ * The signature is wrapped in an encoding if params->encoding is specified
+ * (eg. "pkcs1").  If the encoding needs to know the digest type, this can be
+ * passed through params->hash_algo (eg. "sha1").
+ *
+ * Returns the length of the data placed in the signature buffer or an error.
+ */
+int create_signature(struct kernel_pkey_params *params,
+		     const void *data, void *enc)
+{
+	params->op = kernel_pkey_sign;
+	return asymmetric_key_eds_op(params, data, enc);
+}
+EXPORT_SYMBOL_GPL(create_signature);
+
+/**
  * verify_signature - Initiate the use of an asymmetric key to verify a signature
  * @key: The asymmetric key to verify against
  * @sig: The signature to check
diff --git a/crypto/asymmetric_keys/tpm.asn1 b/crypto/asymmetric_keys/tpm.asn1
new file mode 100644
index 000000000000..d7f194232f30
--- /dev/null
+++ b/crypto/asymmetric_keys/tpm.asn1
@@ -0,0 +1,5 @@
+--
+-- Unencrypted TPM Blob.  For details of the format, see:
+-- http://david.woodhou.se/draft-woodhouse-cert-best-practice.html#I-D.mavrogiannopoulos-tpmuri
+--
+PrivateKeyInfo ::= OCTET STRING ({ tpm_note_key })
diff --git a/crypto/asymmetric_keys/tpm_parser.c b/crypto/asymmetric_keys/tpm_parser.c
new file mode 100644
index 000000000000..96405d8dcd98
--- /dev/null
+++ b/crypto/asymmetric_keys/tpm_parser.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "TPM-PARSER: "fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <keys/asymmetric-subtype.h>
+#include <keys/asymmetric-parser.h>
+#include <crypto/asym_tpm_subtype.h>
+#include "tpm.asn1.h"
+
+struct tpm_parse_context {
+	const void	*blob;
+	u32		blob_len;
+};
+
+/*
+ * Note the key data of the ASN.1 blob.
+ */
+int tpm_note_key(void *context, size_t hdrlen,
+		 unsigned char tag,
+		 const void *value, size_t vlen)
+{
+	struct tpm_parse_context *ctx = context;
+
+	ctx->blob = value;
+	ctx->blob_len = vlen;
+
+	return 0;
+}
+
+/*
+ * Parse a TPM-encrypted private key blob.
+ */ +static struct tpm_key *tpm_parse(const void *data, size_t datalen) +{ + struct tpm_parse_context ctx; + long ret; + + memset(&ctx, 0, sizeof(ctx)); + + /* Attempt to decode the private key */ + ret = asn1_ber_decoder(&tpm_decoder, &ctx, data, datalen); + if (ret < 0) + goto error; + + return tpm_key_create(ctx.blob, ctx.blob_len); + +error: + return ERR_PTR(ret); +} +/* + * Attempt to parse a data blob for a key as a TPM private key blob. + */ +static int tpm_key_preparse(struct key_preparsed_payload *prep) +{ + struct tpm_key *tk; + + /* + * TPM 1.2 keys are max 2048 bits long, so assume the blob is no + * more than 4x that + */ + if (prep->datalen > 256 * 4) + return -EMSGSIZE; + + tk = tpm_parse(prep->data, prep->datalen); + + if (IS_ERR(tk)) + return PTR_ERR(tk); + + /* We're pinning the module by being linked against it */ + __module_get(asym_tpm_subtype.owner); + prep->payload.data[asym_subtype] = &asym_tpm_subtype; + prep->payload.data[asym_key_ids] = NULL; + prep->payload.data[asym_crypto] = tk; + prep->payload.data[asym_auth] = NULL; + prep->quotalen = 100; + return 0; +} + +static struct asymmetric_key_parser tpm_key_parser = { + .owner = THIS_MODULE, + .name = "tpm_parser", + .parse = tpm_key_preparse, +}; + +static int __init tpm_key_init(void) +{ + return register_asymmetric_key_parser(&tpm_key_parser); +} + +static void __exit tpm_key_exit(void) +{ + unregister_asymmetric_key_parser(&tpm_key_parser); +} + +module_init(tpm_key_init); +module_exit(tpm_key_exit); + +MODULE_DESCRIPTION("TPM private key-blob parser"); +MODULE_LICENSE("GPL v2"); diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index b6cabac4b62b..991f4d735a4e 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -199,35 +199,32 @@ int x509_note_pkey_algo(void *context, size_t hdrlen, case OID_md4WithRSAEncryption: ctx->cert->sig->hash_algo = "md4"; - ctx->cert->sig->pkey_algo = "rsa"; 
- break; + goto rsa_pkcs1; case OID_sha1WithRSAEncryption: ctx->cert->sig->hash_algo = "sha1"; - ctx->cert->sig->pkey_algo = "rsa"; - break; + goto rsa_pkcs1; case OID_sha256WithRSAEncryption: ctx->cert->sig->hash_algo = "sha256"; - ctx->cert->sig->pkey_algo = "rsa"; - break; + goto rsa_pkcs1; case OID_sha384WithRSAEncryption: ctx->cert->sig->hash_algo = "sha384"; - ctx->cert->sig->pkey_algo = "rsa"; - break; + goto rsa_pkcs1; case OID_sha512WithRSAEncryption: ctx->cert->sig->hash_algo = "sha512"; - ctx->cert->sig->pkey_algo = "rsa"; - break; + goto rsa_pkcs1; case OID_sha224WithRSAEncryption: ctx->cert->sig->hash_algo = "sha224"; - ctx->cert->sig->pkey_algo = "rsa"; - break; + goto rsa_pkcs1; } +rsa_pkcs1: + ctx->cert->sig->pkey_algo = "rsa"; + ctx->cert->sig->encoding = "pkcs1"; ctx->algo_oid = ctx->last_oid; return 0; } diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c index 812476e46821..cfc04e15fd97 100644 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@ -392,7 +392,8 @@ static int pkcs1pad_sign(struct akcipher_request *req) if (!ctx->key_size) return -EINVAL; - digest_size = digest_info->size; + if (digest_info) + digest_size = digest_info->size; if (req->src_len + digest_size > ctx->key_size - 11) return -EOVERFLOW; @@ -412,8 +413,9 @@ static int pkcs1pad_sign(struct akcipher_request *req) memset(req_ctx->in_buf + 1, 0xff, ps_end - 1); req_ctx->in_buf[ps_end] = 0x00; - memcpy(req_ctx->in_buf + ps_end + 1, digest_info->data, - digest_info->size); + if (digest_info) + memcpy(req_ctx->in_buf + ps_end + 1, digest_info->data, + digest_info->size); pkcs1pad_sg_set_buf(req_ctx->in_sg, req_ctx->in_buf, ctx->key_size - 1 - req->src_len, req->src); @@ -475,10 +477,13 @@ static int pkcs1pad_verify_complete(struct akcipher_request *req, int err) goto done; pos++; - if (crypto_memneq(out_buf + pos, digest_info->data, digest_info->size)) - goto done; + if (digest_info) { + if (crypto_memneq(out_buf + pos, digest_info->data, + digest_info->size)) + goto 
done; - pos += digest_info->size; + pos += digest_info->size; + } err = 0; @@ -608,11 +613,14 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb) hash_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(hash_name)) - return PTR_ERR(hash_name); + hash_name = NULL; - digest_info = rsa_lookup_asn1(hash_name); - if (!digest_info) - return -EINVAL; + if (hash_name) { + digest_info = rsa_lookup_asn1(hash_name); + if (!digest_info) + return -EINVAL; + } else + digest_info = NULL; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) @@ -632,14 +640,29 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb) err = -ENAMETOOLONG; - if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, - "pkcs1pad(%s,%s)", rsa_alg->base.cra_name, hash_name) >= - CRYPTO_MAX_ALG_NAME || - snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, - "pkcs1pad(%s,%s)", - rsa_alg->base.cra_driver_name, hash_name) >= - CRYPTO_MAX_ALG_NAME) - goto out_drop_alg; + if (!hash_name) { + if (snprintf(inst->alg.base.cra_name, + CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)", + rsa_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + + if (snprintf(inst->alg.base.cra_driver_name, + CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)", + rsa_alg->base.cra_driver_name) >= + CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + } else { + if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, + "pkcs1pad(%s,%s)", rsa_alg->base.cra_name, + hash_name) >= CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + + if (snprintf(inst->alg.base.cra_driver_name, + CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s,%s)", + rsa_alg->base.cra_driver_name, + hash_name) >= CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + } inst->alg.base.cra_flags = rsa_alg->base.cra_flags & CRYPTO_ALG_ASYNC; inst->alg.base.cra_priority = rsa_alg->base.cra_priority; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index a7c2673ffd36..824ae985ad93 100644 --- a/drivers/acpi/device_pm.c +++ 
b/drivers/acpi/device_pm.c @@ -126,6 +126,7 @@ int acpi_device_get_power(struct acpi_device *device, int *state) return 0; } +EXPORT_SYMBOL(acpi_device_get_power); static int acpi_dev_pm_explicit_set(struct acpi_device *adev, int state) { diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index 3b25a643058c..21b9b2f2470a 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -155,10 +155,9 @@ struct logical_input { int release_data; } std; struct { /* valid when type == INPUT_TYPE_KBD */ - /* strings can be non null-terminated */ - char press_str[sizeof(void *) + sizeof(int)]; - char repeat_str[sizeof(void *) + sizeof(int)]; - char release_str[sizeof(void *) + sizeof(int)]; + char press_str[sizeof(void *) + sizeof(int)] __nonstring; + char repeat_str[sizeof(void *) + sizeof(int)] __nonstring; + char release_str[sizeof(void *) + sizeof(int)] __nonstring; } kbd; } u; }; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index df8103dd40ac..c18586fccb6f 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -396,15 +396,14 @@ static struct brd_device *brd_alloc(int i) disk->first_minor = i * max_part; disk->fops = &brd_fops; disk->private_data = brd; - disk->queue = brd->brd_queue; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); - disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; + brd->brd_queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; /* Tell the block layer that this is not a rotational device */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, brd->brd_queue); return brd; @@ -436,6 +435,7 @@ static struct brd_device *brd_init_one(int i, bool *new) brd = brd_alloc(i); if (brd) { + brd->brd_disk->queue = brd->brd_queue; add_disk(brd->brd_disk); 
list_add_tail(&brd->brd_list, &brd_devices); } @@ -503,8 +503,14 @@ static int __init brd_init(void) /* point of no return */ - list_for_each_entry(brd, &brd_devices, brd_list) + list_for_each_entry(brd, &brd_devices, brd_list) { + /* + * associate with queue just before adding disk for + * avoiding to mess up failure path + */ + brd->brd_disk->queue = brd->brd_queue; add_disk(brd->brd_disk); + } blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, THIS_MODULE, brd_probe, NULL, NULL); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 55fd104f1ed4..fa8204214ac0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1856,7 +1856,7 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, /* THINK if (signal_pending) return ... ? */ - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); if (sock == connection->data.socket) { rcu_read_lock(); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fc67fd853375..61c392752fe4 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -516,7 +516,7 @@ static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flag struct msghdr msg = { .msg_flags = (flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL) }; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size); return sock_recvmsg(sock, &msg, msg.msg_flags); } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index abad6d15f956..cb0cc8685076 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,7 +77,6 @@ #include <linux/falloc.h> #include <linux/uio.h> #include <linux/ioprio.h> -#include <linux/blk-cgroup.h> #include "loop.h" @@ -269,7 +268,7 @@ static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) struct iov_iter i; ssize_t bw; - iov_iter_bvec(&i, ITER_BVEC | WRITE, bvec, 1, bvec->bv_len); + iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); file_start_write(file); bw = vfs_iter_write(file, &i, ppos, 0); @@ -347,7 +346,7 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq, ssize_t len; rq_for_each_segment(bvec, rq, iter) { - iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len); + iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) return len; @@ -388,7 +387,7 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq, b.bv_offset = 0; b.bv_len = bvec.bv_len; - iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len); + iov_iter_bvec(&i, READ, &b, 1, b.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) { ret = len; @@ -555,8 +554,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, } atomic_set(&cmd->ref, 2); - iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, - segments, blk_rq_bytes(rq)); + iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; @@ -1761,8 +1759,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { - cmd->css = &bio_blkcg(rq->bio)->css; + if 
(cmd->use_aio && rq->bio && rq->bio->bi_css) { + cmd->css = rq->bio->bi_css; css_get(cmd->css); } else #endif diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index dfc8de6ce525..a7daa8acbab3 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1942,8 +1942,8 @@ static int exec_drive_taskfile(struct driver_data *dd, dev_warn(&dd->pdev->dev, "data movement but " "sect_count is 0\n"); - err = -EINVAL; - goto abort; + err = -EINVAL; + goto abort; } } } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 14a51254c3db..4d4d6129ff66 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -473,7 +473,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) u32 nbd_cmd_flags = 0; int sent = nsock->sent, skip = 0; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); + iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); switch (req_op(req)) { case REQ_OP_DISCARD: @@ -564,8 +564,7 @@ send_pages: dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", req, bvec.bv_len); - iov_iter_bvec(&from, ITER_BVEC | WRITE, - &bvec, 1, bvec.bv_len); + iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len); if (skip) { if (skip >= iov_iter_count(&from)) { skip -= iov_iter_count(&from); @@ -624,7 +623,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) int ret = 0; reply.magic = 0; - iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply)); + iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result <= 0) { if (!nbd_disconnected(config)) @@ -678,8 +677,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) struct bio_vec bvec; rq_for_each_segment(bvec, req, iter) { - iov_iter_bvec(&to, ITER_BVEC | READ, - &bvec, 1, bvec.bv_len); + iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if 
(result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", @@ -1073,7 +1071,7 @@ static void send_disconnects(struct nbd_device *nbd) for (i = 0; i < config->num_connections; i++) { struct nbd_sock *nsock = config->socks[i]; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); + iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); if (ret <= 0) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index a11f4ba98b05..55c77e44bb2d 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -620,4 +620,22 @@ config RISCV_TIMER is accessed via both the SBI and the rdcycle instruction. This is required for all RISC-V systems. +config CSKY_MP_TIMER + bool "SMP Timer for the C-SKY platform" if COMPILE_TEST + depends on CSKY + select TIMER_OF + help + Say yes here to enable C-SKY SMP timer driver used for C-SKY SMP + system. + csky,mptimer is not only used in SMP system, it also could be used + single core system. It's not a mmio reg and it use mtcr/mfcr instruction. + +config GX6605S_TIMER + bool "Gx6605s SOC system timer driver" if COMPILE_TEST + depends on CSKY + select CLKSRC_MMIO + select TIMER_OF + help + This option enables support for gx6605s SOC's timer. 
+ endmenu diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index e33b21d3f9d8..dd9138104568 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -79,3 +79,5 @@ obj-$(CONFIG_CLKSRC_ST_LPC) += clksrc_st_lpc.o obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_ATCPIT100_TIMER) += timer-atcpit100.o obj-$(CONFIG_RISCV_TIMER) += riscv_timer.o +obj-$(CONFIG_CSKY_MP_TIMER) += timer-mp-csky.o +obj-$(CONFIG_GX6605S_TIMER) += timer-gx6605s.o diff --git a/drivers/clocksource/timer-gx6605s.c b/drivers/clocksource/timer-gx6605s.c new file mode 100644 index 000000000000..80d0939d040b --- /dev/null +++ b/drivers/clocksource/timer-gx6605s.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. + +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/sched_clock.h> + +#include "timer-of.h" + +#define CLKSRC_OFFSET 0x40 + +#define TIMER_STATUS 0x00 +#define TIMER_VALUE 0x04 +#define TIMER_CONTRL 0x10 +#define TIMER_CONFIG 0x20 +#define TIMER_DIV 0x24 +#define TIMER_INI 0x28 + +#define GX6605S_STATUS_CLR BIT(0) +#define GX6605S_CONTRL_RST BIT(0) +#define GX6605S_CONTRL_START BIT(1) +#define GX6605S_CONFIG_EN BIT(0) +#define GX6605S_CONFIG_IRQ_EN BIT(1) + +static irqreturn_t gx6605s_timer_interrupt(int irq, void *dev) +{ + struct clock_event_device *ce = dev; + void __iomem *base = timer_of_base(to_timer_of(ce)); + + writel_relaxed(GX6605S_STATUS_CLR, base + TIMER_STATUS); + + ce->event_handler(ce); + + return IRQ_HANDLED; +} + +static int gx6605s_timer_set_oneshot(struct clock_event_device *ce) +{ + void __iomem *base = timer_of_base(to_timer_of(ce)); + + /* reset and stop counter */ + writel_relaxed(GX6605S_CONTRL_RST, base + TIMER_CONTRL); + + /* enable with irq and start */ + writel_relaxed(GX6605S_CONFIG_EN | GX6605S_CONFIG_IRQ_EN, + base + TIMER_CONFIG); + + return 0; +} + +static int gx6605s_timer_set_next_event(unsigned long delta, + struct 
clock_event_device *ce) +{ + void __iomem *base = timer_of_base(to_timer_of(ce)); + + /* use reset to pause timer */ + writel_relaxed(GX6605S_CONTRL_RST, base + TIMER_CONTRL); + + /* config next timeout value */ + writel_relaxed(ULONG_MAX - delta, base + TIMER_INI); + writel_relaxed(GX6605S_CONTRL_START, base + TIMER_CONTRL); + + return 0; +} + +static int gx6605s_timer_shutdown(struct clock_event_device *ce) +{ + void __iomem *base = timer_of_base(to_timer_of(ce)); + + writel_relaxed(0, base + TIMER_CONTRL); + writel_relaxed(0, base + TIMER_CONFIG); + + return 0; +} + +static struct timer_of to = { + .flags = TIMER_OF_IRQ | TIMER_OF_BASE | TIMER_OF_CLOCK, + .clkevt = { + .rating = 300, + .features = CLOCK_EVT_FEAT_DYNIRQ | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = gx6605s_timer_shutdown, + .set_state_oneshot = gx6605s_timer_set_oneshot, + .set_next_event = gx6605s_timer_set_next_event, + .cpumask = cpu_possible_mask, + }, + .of_irq = { + .handler = gx6605s_timer_interrupt, + .flags = IRQF_TIMER | IRQF_IRQPOLL, + }, +}; + +static u64 notrace gx6605s_sched_clock_read(void) +{ + void __iomem *base; + + base = timer_of_base(&to) + CLKSRC_OFFSET; + + return (u64)readl_relaxed(base + TIMER_VALUE); +} + +static void gx6605s_clkevt_init(void __iomem *base) +{ + writel_relaxed(0, base + TIMER_DIV); + writel_relaxed(0, base + TIMER_CONFIG); + + clockevents_config_and_register(&to.clkevt, timer_of_rate(&to), 2, + ULONG_MAX); +} + +static int gx6605s_clksrc_init(void __iomem *base) +{ + writel_relaxed(0, base + TIMER_DIV); + writel_relaxed(0, base + TIMER_INI); + + writel_relaxed(GX6605S_CONTRL_RST, base + TIMER_CONTRL); + + writel_relaxed(GX6605S_CONFIG_EN, base + TIMER_CONFIG); + + writel_relaxed(GX6605S_CONTRL_START, base + TIMER_CONTRL); + + sched_clock_register(gx6605s_sched_clock_read, 32, timer_of_rate(&to)); + + return clocksource_mmio_init(base + TIMER_VALUE, "gx6605s", + timer_of_rate(&to), 200, 32, clocksource_mmio_readl_up); +} + +static int __init 
gx6605s_timer_init(struct device_node *np) +{ + int ret; + + /* + * The timer driver is for nationalchip gx6605s SOC and there are two + * same timer in gx6605s. We use one for clkevt and another for clksrc. + * + * The timer is mmio map to access, so we need give mmio address in dts. + * + * It provides a 32bit countup timer and interrupt will be caused by + * count-overflow. + * So we need set-next-event by ULONG_MAX - delta in TIMER_INI reg. + * + * The counter at 0x0 offset is clock event. + * The counter at 0x40 offset is clock source. + * They are the same in hardware, just different used by driver. + */ + ret = timer_of_init(np, &to); + if (ret) + return ret; + + gx6605s_clkevt_init(timer_of_base(&to)); + + return gx6605s_clksrc_init(timer_of_base(&to) + CLKSRC_OFFSET); +} +TIMER_OF_DECLARE(csky_gx6605s_timer, "csky,gx6605s-timer", gx6605s_timer_init); diff --git a/drivers/clocksource/timer-mp-csky.c b/drivers/clocksource/timer-mp-csky.c new file mode 100644 index 000000000000..a8acc431a774 --- /dev/null +++ b/drivers/clocksource/timer-mp-csky.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
+ +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/sched_clock.h> +#include <linux/cpu.h> +#include <linux/of_irq.h> +#include <asm/reg_ops.h> + +#include "timer-of.h" + +#define PTIM_CCVR "cr<3, 14>" +#define PTIM_CTLR "cr<0, 14>" +#define PTIM_LVR "cr<6, 14>" +#define PTIM_TSR "cr<1, 14>" + +static int csky_mptimer_irq; + +static int csky_mptimer_set_next_event(unsigned long delta, + struct clock_event_device *ce) +{ + mtcr(PTIM_LVR, delta); + + return 0; +} + +static int csky_mptimer_shutdown(struct clock_event_device *ce) +{ + mtcr(PTIM_CTLR, 0); + + return 0; +} + +static int csky_mptimer_oneshot(struct clock_event_device *ce) +{ + mtcr(PTIM_CTLR, 1); + + return 0; +} + +static int csky_mptimer_oneshot_stopped(struct clock_event_device *ce) +{ + mtcr(PTIM_CTLR, 0); + + return 0; +} + +static DEFINE_PER_CPU(struct timer_of, csky_to) = { + .flags = TIMER_OF_CLOCK, + .clkevt = { + .rating = 300, + .features = CLOCK_EVT_FEAT_PERCPU | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = csky_mptimer_shutdown, + .set_state_oneshot = csky_mptimer_oneshot, + .set_state_oneshot_stopped = csky_mptimer_oneshot_stopped, + .set_next_event = csky_mptimer_set_next_event, + }, +}; + +static irqreturn_t csky_timer_interrupt(int irq, void *dev) +{ + struct timer_of *to = this_cpu_ptr(&csky_to); + + mtcr(PTIM_TSR, 0); + + to->clkevt.event_handler(&to->clkevt); + + return IRQ_HANDLED; +} + +/* + * clock event for percpu + */ +static int csky_mptimer_starting_cpu(unsigned int cpu) +{ + struct timer_of *to = per_cpu_ptr(&csky_to, cpu); + + to->clkevt.cpumask = cpumask_of(cpu); + + clockevents_config_and_register(&to->clkevt, timer_of_rate(to), + 2, ULONG_MAX); + + enable_percpu_irq(csky_mptimer_irq, 0); + + return 0; +} + +static int csky_mptimer_dying_cpu(unsigned int cpu) +{ + disable_percpu_irq(csky_mptimer_irq); + + return 0; +} + +/* + * clock source + */ +static u64 sched_clock_read(void) +{ + return (u64)mfcr(PTIM_CCVR); +} + +static u64 
clksrc_read(struct clocksource *c) +{ + return (u64)mfcr(PTIM_CCVR); +} + +struct clocksource csky_clocksource = { + .name = "csky", + .rating = 400, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .read = clksrc_read, +}; + +static int __init csky_mptimer_init(struct device_node *np) +{ + int ret, cpu, cpu_rollback; + struct timer_of *to = NULL; + + /* + * Csky_mptimer is designed for C-SKY SMP multi-processors and + * every core has it's own private irq and regs for clkevt and + * clksrc. + * + * The regs is accessed by cpu instruction: mfcr/mtcr instead of + * mmio map style. So we needn't mmio-address in dts, but we still + * need to give clk and irq number. + * + * We use private irq for the mptimer and irq number is the same + * for every core. So we use request_percpu_irq() in timer_of_init. + */ + csky_mptimer_irq = irq_of_parse_and_map(np, 0); + if (csky_mptimer_irq <= 0) + return -EINVAL; + + ret = request_percpu_irq(csky_mptimer_irq, csky_timer_interrupt, + "csky_mp_timer", &csky_to); + if (ret) + return -EINVAL; + + for_each_possible_cpu(cpu) { + to = per_cpu_ptr(&csky_to, cpu); + ret = timer_of_init(np, to); + if (ret) + goto rollback; + } + + clocksource_register_hz(&csky_clocksource, timer_of_rate(to)); + sched_clock_register(sched_clock_read, 32, timer_of_rate(to)); + + ret = cpuhp_setup_state(CPUHP_AP_CSKY_TIMER_STARTING, + "clockevents/csky/timer:starting", + csky_mptimer_starting_cpu, + csky_mptimer_dying_cpu); + if (ret) + return -EINVAL; + + return 0; + +rollback: + for_each_possible_cpu(cpu_rollback) { + if (cpu_rollback == cpu) + break; + + to = per_cpu_ptr(&csky_to, cpu_rollback); + timer_of_cleanup(to); + } + return -EINVAL; +} +TIMER_OF_DECLARE(csky_mptimer, "csky,mptimer", csky_mptimer_init); diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index df9467eef32a..41c9ccdd20d6 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -234,6 +234,7 @@ config EDAC_SKX depends on PCI && X86_64 && 
X86_MCE_INTEL && PCI_MMCONFIG depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y select DMI + select ACPI_ADXL if ACPI help Support for error detection and correction the Intel Skylake server Integrated Memory Controllers. If your diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index dd209e0dd9ab..a99ea61dad32 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -26,6 +26,7 @@ #include <linux/bitmap.h> #include <linux/math64.h> #include <linux/mod_devicetable.h> +#include <linux/adxl.h> #include <acpi/nfit.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> @@ -35,6 +36,7 @@ #include "edac_module.h" #define EDAC_MOD_STR "skx_edac" +#define MSG_SIZE 1024 /* * Debug macros @@ -54,6 +56,29 @@ static LIST_HEAD(skx_edac_list); static u64 skx_tolm, skx_tohm; +static char *skx_msg; +static unsigned int nvdimm_count; + +enum { + INDEX_SOCKET, + INDEX_MEMCTRL, + INDEX_CHANNEL, + INDEX_DIMM, + INDEX_MAX +}; + +static const char * const component_names[] = { + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", +}; + +static int component_indices[ARRAY_SIZE(component_names)]; +static int adxl_component_count; +static const char * const *adxl_component_names; +static u64 *adxl_values; +static char *adxl_msg; #define NUM_IMC 2 /* memory controllers per socket */ #define NUM_CHANNELS 3 /* channels per memory controller */ @@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, u16 flags; u64 size = 0; + nvdimm_count++; + dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, imc->src_id, 0); @@ -941,12 +968,46 @@ static void teardown_skx_debug(void) } #endif /*CONFIG_EDAC_DEBUG*/ +static bool skx_adxl_decode(struct decoded_addr *res) + +{ + int i, len = 0; + + if (res->addr >= skx_tohm || (res->addr >= skx_tolm && + res->addr < BIT_ULL(32))) { + edac_dbg(0, "Address 0x%llx out 
of range\n", res->addr); + return false; + } + + if (adxl_decode(res->addr, adxl_values)) { + edac_dbg(0, "Failed to decode 0x%llx\n", res->addr); + return false; + } + + res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + + for (i = 0; i < adxl_component_count; i++) { + if (adxl_values[i] == ~0x0ull) + continue; + + len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx", + adxl_component_names[i], adxl_values[i]); + if (MSG_SIZE - len <= 0) + break; + } + + return true; +} + static void skx_mce_output_error(struct mem_ctl_info *mci, const struct mce *m, struct decoded_addr *res) { enum hw_event_mc_err_type tp_event; - char *type, *optype, msg[256]; + char *type, *optype; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; } } + if (adxl_component_count) { + snprintf(skx_msg, MSG_SIZE, "%s%s err_code:%04x:%04x %s", + overflow ? " OVERFLOW" : "", + (uncorrected_error && recoverable) ? " recoverable" : "", + mscod, errcode, adxl_msg); + } else { + snprintf(skx_msg, MSG_SIZE, + "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x", + overflow ? " OVERFLOW" : "", + (uncorrected_error && recoverable) ? " recoverable" : "", + mscod, errcode, + res->socket, res->imc, res->rank, + res->bank_group, res->bank_address, res->row, res->column); + } - snprintf(msg, sizeof(msg), - "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x", - overflow ? " OVERFLOW" : "", - (uncorrected_error && recoverable) ? 
" recoverable" : "", - mscod, errcode, - res->socket, res->imc, res->rank, - res->bank_group, res->bank_address, res->row, res->column); - - edac_dbg(0, "%s\n", msg); + edac_dbg(0, "%s\n", skx_msg); /* Call the helper to output message */ edac_mc_handle_error(tp_event, mci, core_err_cnt, m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, res->channel, res->dimm, -1, - optype, msg); + optype, skx_msg); +} + +static struct mem_ctl_info *get_mci(int src_id, int lmc) +{ + struct skx_dev *d; + + if (lmc > NUM_IMC - 1) { + skx_printk(KERN_ERR, "Bad lmc %d\n", lmc); + return NULL; + } + + list_for_each_entry(d, &skx_edac_list, list) { + if (d->imc[0].src_id == src_id) + return d->imc[lmc].mci; + } + + skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc); + + return NULL; } static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, @@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; + memset(&res, 0, sizeof(res)); res.addr = mce->addr; - if (!skx_decode(&res)) + + if (adxl_component_count) { + if (!skx_adxl_decode(&res)) + return NOTIFY_DONE; + + mci = get_mci(res.socket, res.imc); + } else { + if (!skx_decode(&res)) + return NOTIFY_DONE; + + mci = res.dev->imc[res.imc].mci; + } + + if (!mci) return NOTIFY_DONE; - mci = res.dev->imc[res.imc].mci; if (mce->mcgstatus & MCG_STATUS_MCIP) type = "Exception"; @@ -1094,6 +1193,62 @@ static void skx_remove(void) } } +static void __init skx_adxl_get(void) +{ + const char * const *names; + int i, j; + + names = adxl_get_component_names(); + if (!names) { + skx_printk(KERN_NOTICE, "No firmware support for address translation."); + skx_printk(KERN_CONT, " Only decoding DDR4 address!\n"); + return; + } + + for (i = 0; i < INDEX_MAX; i++) { + for (j = 0; names[j]; j++) { + if (!strcmp(component_names[i], names[j])) { + component_indices[i] = j; + break; + } + } + + 
if (!names[j]) + goto err; + } + + adxl_component_names = names; + while (*names++) + adxl_component_count++; + + adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values), + GFP_KERNEL); + if (!adxl_values) { + adxl_component_count = 0; + return; + } + + adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL); + if (!adxl_msg) { + adxl_component_count = 0; + kfree(adxl_values); + } + + return; +err: + skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ", + component_names[i]); + for (j = 0; names[j]; j++) + skx_printk(KERN_CONT, "%s ", names[j]); + skx_printk(KERN_CONT, "\n"); +} + +static void __exit skx_adxl_put(void) +{ + kfree(adxl_values); + kfree(adxl_msg); +} + /* * skx_init: * make sure we are running on the correct cpu model @@ -1158,6 +1313,15 @@ static int __init skx_init(void) } } + skx_msg = kzalloc(MSG_SIZE, GFP_KERNEL); + if (!skx_msg) { + rc = -ENOMEM; + goto fail; + } + + if (nvdimm_count) + skx_adxl_get(); + /* Ensure that the OPSTATE is set correctly for POLL or NMI */ opstate_init(); @@ -1176,6 +1340,9 @@ static void __exit skx_exit(void) edac_dbg(2, "\n"); mce_unregister_decode_chain(&skx_mce_dec); skx_remove(); + if (nvdimm_count) + skx_adxl_put(); + kfree(skx_msg); teardown_skx_debug(); } diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 3e626fd9bd4e..8061667a6765 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -229,14 +229,6 @@ sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor, return 0; } -static inline bool is_compat(void) -{ - if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall()) - return true; - - return false; -} - static void copy_out_compat(struct efi_variable *dst, struct compat_efi_variable *src) { @@ -263,7 +255,7 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count) u8 *data; int err; - if (is_compat()) { + if (in_compat_syscall()) { struct compat_efi_variable *compat; if (count != sizeof(*compat)) @@ 
-324,7 +316,7 @@ efivar_show_raw(struct efivar_entry *entry, char *buf) &entry->var.DataSize, entry->var.Data)) return -EIO; - if (is_compat()) { + if (in_compat_syscall()) { compat = (struct compat_efi_variable *)buf; size = sizeof(*compat); @@ -418,7 +410,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, struct compat_efi_variable *compat = (struct compat_efi_variable *)buf; struct efi_variable *new_var = (struct efi_variable *)buf; struct efivar_entry *new_entry; - bool need_compat = is_compat(); + bool need_compat = in_compat_syscall(); efi_char16_t *name; unsigned long size; u32 attributes; @@ -495,7 +487,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (is_compat()) { + if (in_compat_syscall()) { if (count != sizeof(*compat)) return -EINVAL; diff --git a/drivers/fsi/fsi-sbefifo.c b/drivers/fsi/fsi-sbefifo.c index ae861342626e..d92f5b87c251 100644 --- a/drivers/fsi/fsi-sbefifo.c +++ b/drivers/fsi/fsi-sbefifo.c @@ -638,7 +638,7 @@ static void sbefifo_collect_async_ffdc(struct sbefifo *sbefifo) } ffdc_iov.iov_base = ffdc; ffdc_iov.iov_len = SBEFIFO_MAX_FFDC_SIZE; - iov_iter_kvec(&ffdc_iter, WRITE | ITER_KVEC, &ffdc_iov, 1, SBEFIFO_MAX_FFDC_SIZE); + iov_iter_kvec(&ffdc_iter, WRITE, &ffdc_iov, 1, SBEFIFO_MAX_FFDC_SIZE); cmd[0] = cpu_to_be32(2); cmd[1] = cpu_to_be32(SBEFIFO_CMD_GET_SBE_FFDC); rc = sbefifo_do_command(sbefifo, cmd, 2, &ffdc_iter); @@ -735,7 +735,7 @@ int sbefifo_submit(struct device *dev, const __be32 *command, size_t cmd_len, rbytes = (*resp_len) * sizeof(__be32); resp_iov.iov_base = response; resp_iov.iov_len = rbytes; - iov_iter_kvec(&resp_iter, WRITE | ITER_KVEC, &resp_iov, 1, rbytes); + iov_iter_kvec(&resp_iter, WRITE, &resp_iov, 1, rbytes); /* Perform the command */ mutex_lock(&sbefifo->lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c index 297a5490ad8c..0a4fba196b84 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c @@ -135,7 +135,8 @@ static int acp_poweroff(struct generic_pm_domain *genpd) * 2. power off the acp tiles * 3. check and enter ulv state */ - if (adev->powerplay.pp_funcs->set_powergating_by_smu) + if (adev->powerplay.pp_funcs && + adev->powerplay.pp_funcs->set_powergating_by_smu) amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_ACP, true); } return 0; @@ -517,7 +518,8 @@ static int acp_set_powergating_state(void *handle, struct amdgpu_device *adev = (struct amdgpu_device *)handle; bool enable = state == AMD_PG_STATE_GATE ? true : false; - if (adev->powerplay.pp_funcs->set_powergating_by_smu) + if (adev->powerplay.pp_funcs && + adev->powerplay.pp_funcs->set_powergating_by_smu) amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_ACP, enable); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1e4dd09a5072..30bc345d6fdf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1493,8 +1493,6 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) } adev->powerplay.pp_feature = amdgpu_pp_feature_mask; - if (amdgpu_sriov_vf(adev)) - adev->powerplay.pp_feature &= ~PP_GFXOFF_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { if ((amdgpu_ip_block_mask & (1 << i)) == 0) { @@ -1600,7 +1598,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) } } - if (adev->powerplay.pp_funcs->load_firmware) { + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->load_firmware) { r = adev->powerplay.pp_funcs->load_firmware(adev->powerplay.pp_handle); if (r) { pr_err("firmware loading failed\n"); @@ -3341,7 +3339,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, kthread_park(ring->sched.thread); - if (job && job->base.sched == &ring->sched) + if (job && job->base.sched != &ring->sched) continue; 
drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 28781414d71c..943dbf3c5da1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -114,8 +114,8 @@ uint amdgpu_pg_mask = 0xffffffff; uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu = NULL; char *amdgpu_virtual_display = NULL; -/* OverDrive(bit 14) disabled by default*/ -uint amdgpu_pp_feature_mask = 0xffffbfff; +/* OverDrive(bit 14),gfxoff(bit 15),stutter mode(bit 17) disabled by default*/ +uint amdgpu_pp_feature_mask = 0xfffd3fff; int amdgpu_ngg = 0; int amdgpu_prim_buf_per_se = 0; int amdgpu_pos_buf_per_se = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 790fd5408ddf..1a656b8657f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -392,7 +392,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) if (!(adev->powerplay.pp_feature & PP_GFXOFF_MASK)) return; - if (!adev->powerplay.pp_funcs->set_powergating_by_smu) + if (!adev->powerplay.pp_funcs || !adev->powerplay.pp_funcs->set_powergating_by_smu) return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index 94055a485e01..59cc678de8c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -704,7 +704,10 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev, return ret; if (adev->powerplay.pp_funcs->force_clock_level) - amdgpu_dpm_force_clock_level(adev, PP_SCLK, mask); + ret = amdgpu_dpm_force_clock_level(adev, PP_SCLK, mask); + + if (ret) + return -EINVAL; return count; } @@ -737,7 +740,10 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev, return ret; if (adev->powerplay.pp_funcs->force_clock_level) - amdgpu_dpm_force_clock_level(adev, PP_MCLK, mask); + ret = 
amdgpu_dpm_force_clock_level(adev, PP_MCLK, mask); + + if (ret) + return -EINVAL; return count; } @@ -770,7 +776,10 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev, return ret; if (adev->powerplay.pp_funcs->force_clock_level) - amdgpu_dpm_force_clock_level(adev, PP_PCIE, mask); + ret = amdgpu_dpm_force_clock_level(adev, PP_PCIE, mask); + + if (ret) + return -EINVAL; return count; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 6904d794d60a..352b30409060 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -542,7 +542,8 @@ static void amdgpu_vm_pt_next_leaf(struct amdgpu_device *adev, struct amdgpu_vm_pt_cursor *cursor) { amdgpu_vm_pt_next(adev, cursor); - while (amdgpu_vm_pt_descendant(adev, cursor)); + if (cursor->pfn != ~0ll) + while (amdgpu_vm_pt_descendant(adev, cursor)); } /** @@ -3234,8 +3235,10 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) } rbtree_postorder_for_each_entry_safe(mapping, tmp, &vm->va.rb_root, rb) { + /* Don't remove the mapping here, we don't want to trigger a + * rebalance and the tree is about to be destroyed anyway. 
+ */ list_del(&mapping->list); - amdgpu_vm_it_remove(mapping, &vm->va); kfree(mapping); } list_for_each_entry_safe(mapping, tmp, &vm->freed, list) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 3d0f277a6523..617b0c8908a3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4815,8 +4815,10 @@ static int gfx_v8_0_kcq_resume(struct amdgpu_device *adev) if (r) goto done; - /* Test KCQs */ - for (i = 0; i < adev->gfx.num_compute_rings; i++) { + /* Test KCQs - reversing the order of rings seems to fix ring test failure + * after GPU reset + */ + for (i = adev->gfx.num_compute_rings - 1; i >= 0; i--) { ring = &adev->gfx.compute_ring[i]; ring->ready = true; r = amdgpu_ring_test_ring(ring); diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c index 14649f8475f3..fd23ba1226a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c @@ -280,7 +280,7 @@ void mmhub_v1_0_update_power_gating(struct amdgpu_device *adev, return; if (enable && adev->pg_flags & AMD_PG_SUPPORT_MMHUB) { - if (adev->powerplay.pp_funcs->set_powergating_by_smu) + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_powergating_by_smu) amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GMC, true); } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 04fa3d972636..7a8c9172d30a 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1366,7 +1366,8 @@ static int sdma_v4_0_hw_init(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs->set_powergating_by_smu) + if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs && + adev->powerplay.pp_funcs->set_powergating_by_smu) amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_SDMA, false); sdma_v4_0_init_golden_registers(adev); @@ -1386,7 +1387,8 @@ static int sdma_v4_0_hw_fini(void *handle) sdma_v4_0_ctx_switch_enable(adev, false); sdma_v4_0_enable(adev, false); - if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs->set_powergating_by_smu) + if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs + && adev->powerplay.pp_funcs->set_powergating_by_smu) amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_SDMA, true); return 0; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e224f23e2215..b0df6dc9a775 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1524,6 +1524,13 @@ static int amdgpu_dm_backlight_update_status(struct backlight_device *bd) { struct amdgpu_display_manager *dm = bl_get_data(bd); + /* + * PWM interperts 0 as 100% rather than 0% because of HW + * limitation for level 0.So limiting minimum brightness level + * to 1. 
+ */ + if (bd->props.brightness < 1) + return 1; if (dc_link_set_backlight_level(dm->backlight_link, bd->props.brightness, 0, 0)) return 0; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_pp_smu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_pp_smu.c index 0fab64a2a915..12001a006b2d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_pp_smu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_pp_smu.c @@ -101,7 +101,7 @@ bool dm_pp_apply_display_requirements( adev->pm.pm_display_cfg.displays[i].controller_id = dc_cfg->pipe_idx + 1; } - if (adev->powerplay.pp_funcs->display_configuration_change) + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->display_configuration_change) adev->powerplay.pp_funcs->display_configuration_change( adev->powerplay.pp_handle, &adev->pm.pm_display_cfg); @@ -304,7 +304,7 @@ bool dm_pp_get_clock_levels_by_type( struct amd_pp_simple_clock_info validation_clks = { 0 }; uint32_t i; - if (adev->powerplay.pp_funcs->get_clock_by_type) { + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_clock_by_type) { if (adev->powerplay.pp_funcs->get_clock_by_type(pp_handle, dc_to_pp_clock_type(clk_type), &pp_clks)) { /* Error in pplib. Provide default values. */ @@ -315,7 +315,7 @@ bool dm_pp_get_clock_levels_by_type( pp_to_dc_clock_levels(&pp_clks, dc_clks, clk_type); - if (adev->powerplay.pp_funcs->get_display_mode_validation_clocks) { + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_display_mode_validation_clocks) { if (adev->powerplay.pp_funcs->get_display_mode_validation_clocks( pp_handle, &validation_clks)) { /* Error in pplib. Provide default values. 
*/ @@ -398,6 +398,9 @@ bool dm_pp_get_clock_levels_by_type_with_voltage( struct pp_clock_levels_with_voltage pp_clk_info = {0}; const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; + if (!pp_funcs || !pp_funcs->get_clock_by_type_with_voltage) + return false; + if (pp_funcs->get_clock_by_type_with_voltage(pp_handle, dc_to_pp_clock_type(clk_type), &pp_clk_info)) @@ -438,7 +441,7 @@ bool dm_pp_apply_clock_for_voltage_request( if (!pp_clock_request.clock_type) return false; - if (adev->powerplay.pp_funcs->display_clock_voltage_request) + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->display_clock_voltage_request) ret = adev->powerplay.pp_funcs->display_clock_voltage_request( adev->powerplay.pp_handle, &pp_clock_request); @@ -455,7 +458,7 @@ bool dm_pp_get_static_clocks( struct amd_pp_clock_info pp_clk_info = {0}; int ret = 0; - if (adev->powerplay.pp_funcs->get_current_clocks) + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_current_clocks) ret = adev->powerplay.pp_funcs->get_current_clocks( adev->powerplay.pp_handle, &pp_clk_info); @@ -505,6 +508,9 @@ void pp_rv_set_wm_ranges(struct pp_smu *pp, wm_with_clock_ranges.num_wm_dmif_sets = ranges->num_reader_wm_sets; wm_with_clock_ranges.num_wm_mcif_sets = ranges->num_writer_wm_sets; + if (!pp_funcs || !pp_funcs->set_watermarks_for_clocks_ranges) + return; + for (i = 0; i < wm_with_clock_ranges.num_wm_dmif_sets; i++) { if (ranges->reader_wm_sets[i].wm_inst > 3) wm_dce_clocks[i].wm_set_id = WM_SET_A; diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c index de190935f0a4..e3624ca24574 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c @@ -568,7 +568,7 @@ static struct input_pixel_processor *dce110_ipp_create( static const struct encoder_feature_support link_enc_feature = { .max_hdmi_deep_color = COLOR_DEPTH_121212, - 
.max_hdmi_pixel_clock = 594000, + .max_hdmi_pixel_clock = 300000, .flags.bits.IS_HBR2_CAPABLE = true, .flags.bits.IS_TPS3_CAPABLE = true }; diff --git a/drivers/gpu/drm/amd/display/dc/os_types.h b/drivers/gpu/drm/amd/display/dc/os_types.h index a407892905af..c0d9f332baed 100644 --- a/drivers/gpu/drm/amd/display/dc/os_types.h +++ b/drivers/gpu/drm/amd/display/dc/os_types.h @@ -40,8 +40,6 @@ #define LITTLEENDIAN_CPU #endif -#undef READ -#undef WRITE #undef FRAME_SIZE #define dm_output_to_console(fmt, ...) DRM_DEBUG_KMS(fmt, ##__VA_ARGS__) diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index e8964cae6b93..d6aa1d414320 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ -723,11 +723,14 @@ static int pp_dpm_force_clock_level(void *handle, pr_info("%s was not implemented.\n", __func__); return 0; } + + if (hwmgr->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL) { + pr_info("force clock level is for dpm manual mode only.\n"); + return -EINVAL; + } + mutex_lock(&hwmgr->smu_lock); - if (hwmgr->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) - ret = hwmgr->hwmgr_func->force_clock_level(hwmgr, type, mask); - else - ret = -EINVAL; + ret = hwmgr->hwmgr_func->force_clock_level(hwmgr, type, mask); mutex_unlock(&hwmgr->smu_lock); return ret; } @@ -963,6 +966,7 @@ static int pp_dpm_switch_power_profile(void *handle, static int pp_set_power_limit(void *handle, uint32_t limit) { struct pp_hwmgr *hwmgr = handle; + uint32_t max_power_limit; if (!hwmgr || !hwmgr->pm_en) return -EINVAL; @@ -975,7 +979,13 @@ static int pp_set_power_limit(void *handle, uint32_t limit) if (limit == 0) limit = hwmgr->default_power_limit; - if (limit > hwmgr->default_power_limit) + max_power_limit = hwmgr->default_power_limit; + if (hwmgr->od_enabled) { + max_power_limit *= (100 + hwmgr->platform_descriptor.TDPODLimit); + max_power_limit /= 100; + } + + if (limit > max_power_limit) return -EINVAL; 
mutex_lock(&hwmgr->smu_lock); @@ -994,8 +1004,13 @@ static int pp_get_power_limit(void *handle, uint32_t *limit, bool default_limit) mutex_lock(&hwmgr->smu_lock); - if (default_limit) + if (default_limit) { *limit = hwmgr->default_power_limit; + if (hwmgr->od_enabled) { + *limit *= (100 + hwmgr->platform_descriptor.TDPODLimit); + *limit /= 100; + } + } else *limit = hwmgr->power_limit; @@ -1303,12 +1318,12 @@ static int pp_enable_mgpu_fan_boost(void *handle) { struct pp_hwmgr *hwmgr = handle; - if (!hwmgr || !hwmgr->pm_en) + if (!hwmgr) return -EINVAL; - if (hwmgr->hwmgr_func->enable_mgpu_fan_boost == NULL) { + if (!hwmgr->pm_en || + hwmgr->hwmgr_func->enable_mgpu_fan_boost == NULL) return 0; - } mutex_lock(&hwmgr->smu_lock); hwmgr->hwmgr_func->enable_mgpu_fan_boost(hwmgr); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index 6c99cbf51c08..ed35ec0341e6 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -3588,9 +3588,10 @@ static int smu7_find_dpm_states_clocks_in_dpm_table(struct pp_hwmgr *hwmgr, cons break; } - if (i >= sclk_table->count) + if (i >= sclk_table->count) { data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_SCLK; - else { + sclk_table->dpm_levels[i-1].value = sclk; + } else { /* TODO: Check SCLK in DAL's minimum clocks * in case DeepSleep divider update is required. 
*/ @@ -3605,9 +3606,10 @@ static int smu7_find_dpm_states_clocks_in_dpm_table(struct pp_hwmgr *hwmgr, cons break; } - if (i >= mclk_table->count) + if (i >= mclk_table->count) { data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_MCLK; - + mclk_table->dpm_levels[i-1].value = mclk; + } if (data->display_timing.num_existing_displays != hwmgr->display_config->num_display) data->need_update_smu7_dpm_table |= DPMTABLE_UPDATE_MCLK; diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c index 4714b5b59825..99a33c33a32c 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c @@ -718,7 +718,7 @@ int smu_set_watermarks_for_clocks_ranges(void *wt_table, table->WatermarkRow[1][i].MaxClock = cpu_to_le16((uint16_t) (wm_with_clock_ranges->wm_dmif_clocks_ranges[i].wm_max_dcfclk_clk_in_khz) / - 100); + 1000); table->WatermarkRow[1][i].MinUclk = cpu_to_le16((uint16_t) (wm_with_clock_ranges->wm_dmif_clocks_ranges[i].wm_min_mem_clk_in_khz) / diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c index 419a1d77d661..8c4db86bb4b7 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c @@ -1333,7 +1333,6 @@ static int vega10_setup_default_dpm_tables(struct pp_hwmgr *hwmgr) if (hwmgr->platform_descriptor.overdriveLimit.memoryClock == 0) hwmgr->platform_descriptor.overdriveLimit.memoryClock = dpm_table->dpm_levels[dpm_table->count-1].value; - vega10_init_dpm_state(&(dpm_table->dpm_state)); data->dpm_table.eclk_table.count = 0; @@ -3249,6 +3248,37 @@ static int vega10_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, static int vega10_find_dpm_states_clocks_in_dpm_table(struct pp_hwmgr *hwmgr, const void *input) { struct vega10_hwmgr *data = hwmgr->backend; + const struct phm_set_power_state_input *states = + (const struct 
phm_set_power_state_input *)input; + const struct vega10_power_state *vega10_ps = + cast_const_phw_vega10_power_state(states->pnew_state); + struct vega10_single_dpm_table *sclk_table = &(data->dpm_table.gfx_table); + uint32_t sclk = vega10_ps->performance_levels + [vega10_ps->performance_level_count - 1].gfx_clock; + struct vega10_single_dpm_table *mclk_table = &(data->dpm_table.mem_table); + uint32_t mclk = vega10_ps->performance_levels + [vega10_ps->performance_level_count - 1].mem_clock; + uint32_t i; + + for (i = 0; i < sclk_table->count; i++) { + if (sclk == sclk_table->dpm_levels[i].value) + break; + } + + if (i >= sclk_table->count) { + data->need_update_dpm_table |= DPMTABLE_OD_UPDATE_SCLK; + sclk_table->dpm_levels[i-1].value = sclk; + } + + for (i = 0; i < mclk_table->count; i++) { + if (mclk == mclk_table->dpm_levels[i].value) + break; + } + + if (i >= mclk_table->count) { + data->need_update_dpm_table |= DPMTABLE_OD_UPDATE_MCLK; + mclk_table->dpm_levels[i-1].value = mclk; + } if (data->display_timing.num_existing_displays != hwmgr->display_config->num_display) data->need_update_dpm_table |= DPMTABLE_UPDATE_MCLK; @@ -4529,11 +4559,13 @@ static int vega10_set_sclk_od(struct pp_hwmgr *hwmgr, uint32_t value) if (vega10_ps->performance_levels [vega10_ps->performance_level_count - 1].gfx_clock > - hwmgr->platform_descriptor.overdriveLimit.engineClock) + hwmgr->platform_descriptor.overdriveLimit.engineClock) { vega10_ps->performance_levels [vega10_ps->performance_level_count - 1].gfx_clock = hwmgr->platform_descriptor.overdriveLimit.engineClock; - + pr_warn("max sclk supported by vbios is %d\n", + hwmgr->platform_descriptor.overdriveLimit.engineClock); + } return 0; } @@ -4581,10 +4613,13 @@ static int vega10_set_mclk_od(struct pp_hwmgr *hwmgr, uint32_t value) if (vega10_ps->performance_levels [vega10_ps->performance_level_count - 1].mem_clock > - hwmgr->platform_descriptor.overdriveLimit.memoryClock) + hwmgr->platform_descriptor.overdriveLimit.memoryClock) { 
vega10_ps->performance_levels [vega10_ps->performance_level_count - 1].mem_clock = hwmgr->platform_descriptor.overdriveLimit.memoryClock; + pr_warn("max mclk supported by vbios is %d\n", + hwmgr->platform_descriptor.overdriveLimit.memoryClock); + } return 0; } diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c index 9600e2f226e9..74bc37308dc0 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c @@ -2356,6 +2356,13 @@ static int vega12_gfx_off_control(struct pp_hwmgr *hwmgr, bool enable) return vega12_disable_gfx_off(hwmgr); } +static int vega12_get_performance_level(struct pp_hwmgr *hwmgr, const struct pp_hw_power_state *state, + PHM_PerformanceLevelDesignation designation, uint32_t index, + PHM_PerformanceLevel *level) +{ + return 0; +} + static const struct pp_hwmgr_func vega12_hwmgr_funcs = { .backend_init = vega12_hwmgr_backend_init, .backend_fini = vega12_hwmgr_backend_fini, @@ -2406,6 +2413,7 @@ static const struct pp_hwmgr_func vega12_hwmgr_funcs = { .register_irq_handlers = smu9_register_irq_handlers, .start_thermal_controller = vega12_start_thermal_controller, .powergate_gfx = vega12_gfx_off_control, + .get_performance_level = vega12_get_performance_level, }; int vega12_hwmgr_init(struct pp_hwmgr *hwmgr) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index b4dbbb7c334c..57143d51e3ee 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -1875,38 +1875,20 @@ static int vega20_get_gpu_power(struct pp_hwmgr *hwmgr, return ret; } -static int vega20_get_current_gfx_clk_freq(struct pp_hwmgr *hwmgr, uint32_t *gfx_freq) +static int vega20_get_current_clk_freq(struct pp_hwmgr *hwmgr, + PPCLK_e clk_id, uint32_t *clk_freq) { - uint32_t gfx_clk = 0; int ret = 0; - *gfx_freq = 0; + *clk_freq = 0; 
PP_ASSERT_WITH_CODE((ret = smum_send_msg_to_smc_with_parameter(hwmgr, - PPSMC_MSG_GetDpmClockFreq, (PPCLK_GFXCLK << 16))) == 0, - "[GetCurrentGfxClkFreq] Attempt to get Current GFXCLK Frequency Failed!", + PPSMC_MSG_GetDpmClockFreq, (clk_id << 16))) == 0, + "[GetCurrentClkFreq] Attempt to get Current Frequency Failed!", return ret); - gfx_clk = smum_get_argument(hwmgr); + *clk_freq = smum_get_argument(hwmgr); - *gfx_freq = gfx_clk * 100; - - return 0; -} - -static int vega20_get_current_mclk_freq(struct pp_hwmgr *hwmgr, uint32_t *mclk_freq) -{ - uint32_t mem_clk = 0; - int ret = 0; - - *mclk_freq = 0; - - PP_ASSERT_WITH_CODE((ret = smum_send_msg_to_smc_with_parameter(hwmgr, - PPSMC_MSG_GetDpmClockFreq, (PPCLK_UCLK << 16))) == 0, - "[GetCurrentMClkFreq] Attempt to get Current MCLK Frequency Failed!", - return ret); - mem_clk = smum_get_argument(hwmgr); - - *mclk_freq = mem_clk * 100; + *clk_freq = *clk_freq * 100; return 0; } @@ -1937,12 +1919,16 @@ static int vega20_read_sensor(struct pp_hwmgr *hwmgr, int idx, switch (idx) { case AMDGPU_PP_SENSOR_GFX_SCLK: - ret = vega20_get_current_gfx_clk_freq(hwmgr, (uint32_t *)value); + ret = vega20_get_current_clk_freq(hwmgr, + PPCLK_GFXCLK, + (uint32_t *)value); if (!ret) *size = 4; break; case AMDGPU_PP_SENSOR_GFX_MCLK: - ret = vega20_get_current_mclk_freq(hwmgr, (uint32_t *)value); + ret = vega20_get_current_clk_freq(hwmgr, + PPCLK_UCLK, + (uint32_t *)value); if (!ret) *size = 4; break; @@ -2012,7 +1998,6 @@ int vega20_display_clock_voltage_request(struct pp_hwmgr *hwmgr, if (data->smu_features[GNLD_DPM_DCEFCLK].enabled) { switch (clk_type) { case amd_pp_dcef_clock: - clk_freq = clock_req->clock_freq_in_khz / 100; clk_select = PPCLK_DCEFCLK; break; case amd_pp_disp_clock: @@ -2041,11 +2026,20 @@ int vega20_display_clock_voltage_request(struct pp_hwmgr *hwmgr, return result; } +static int vega20_get_performance_level(struct pp_hwmgr *hwmgr, const struct pp_hw_power_state *state, + PHM_PerformanceLevelDesignation designation, 
uint32_t index, + PHM_PerformanceLevel *level) +{ + return 0; +} + static int vega20_notify_smc_display_config_after_ps_adjustment( struct pp_hwmgr *hwmgr) { struct vega20_hwmgr *data = (struct vega20_hwmgr *)(hwmgr->backend); + struct vega20_single_dpm_table *dpm_table = + &data->dpm_table.mem_table; struct PP_Clocks min_clocks = {0}; struct pp_display_clock_request clock_req; int ret = 0; @@ -2063,7 +2057,7 @@ static int vega20_notify_smc_display_config_after_ps_adjustment( if (data->smu_features[GNLD_DPM_DCEFCLK].supported) { clock_req.clock_type = amd_pp_dcef_clock; - clock_req.clock_freq_in_khz = min_clocks.dcefClock; + clock_req.clock_freq_in_khz = min_clocks.dcefClock * 10; if (!vega20_display_clock_voltage_request(hwmgr, &clock_req)) { if (data->smu_features[GNLD_DS_DCEFCLK].supported) PP_ASSERT_WITH_CODE((ret = smum_send_msg_to_smc_with_parameter( @@ -2076,6 +2070,15 @@ static int vega20_notify_smc_display_config_after_ps_adjustment( } } + if (data->smu_features[GNLD_DPM_UCLK].enabled) { + dpm_table->dpm_state.hard_min_level = min_clocks.memoryClock / 100; + PP_ASSERT_WITH_CODE(!(ret = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_SetHardMinByFreq, + (PPCLK_UCLK << 16 ) | dpm_table->dpm_state.hard_min_level)), + "[SetHardMinFreq] Set hard min uclk failed!", + return ret); + } + return 0; } @@ -2353,7 +2356,7 @@ static int vega20_get_sclks(struct pp_hwmgr *hwmgr, for (i = 0; i < count; i++) { clocks->data[i].clocks_in_khz = - dpm_table->dpm_levels[i].value * 100; + dpm_table->dpm_levels[i].value * 1000; clocks->data[i].latency_in_us = 0; } @@ -2383,7 +2386,7 @@ static int vega20_get_memclocks(struct pp_hwmgr *hwmgr, for (i = 0; i < count; i++) { clocks->data[i].clocks_in_khz = data->mclk_latency_table.entries[i].frequency = - dpm_table->dpm_levels[i].value * 100; + dpm_table->dpm_levels[i].value * 1000; clocks->data[i].latency_in_us = data->mclk_latency_table.entries[i].latency = vega20_get_mem_latency(hwmgr, dpm_table->dpm_levels[i].value); @@ 
-2408,7 +2411,7 @@ static int vega20_get_dcefclocks(struct pp_hwmgr *hwmgr, for (i = 0; i < count; i++) { clocks->data[i].clocks_in_khz = - dpm_table->dpm_levels[i].value * 100; + dpm_table->dpm_levels[i].value * 1000; clocks->data[i].latency_in_us = 0; } @@ -2431,7 +2434,7 @@ static int vega20_get_socclocks(struct pp_hwmgr *hwmgr, for (i = 0; i < count; i++) { clocks->data[i].clocks_in_khz = - dpm_table->dpm_levels[i].value * 100; + dpm_table->dpm_levels[i].value * 1000; clocks->data[i].latency_in_us = 0; } @@ -2582,11 +2585,11 @@ static int vega20_odn_edit_dpm_table(struct pp_hwmgr *hwmgr, return -EINVAL; } - if (input_clk < clocks.data[0].clocks_in_khz / 100 || + if (input_clk < clocks.data[0].clocks_in_khz / 1000 || input_clk > od8_settings[OD8_SETTING_UCLK_FMAX].max_value) { pr_info("clock freq %d is not within allowed range [%d - %d]\n", input_clk, - clocks.data[0].clocks_in_khz / 100, + clocks.data[0].clocks_in_khz / 1000, od8_settings[OD8_SETTING_UCLK_FMAX].max_value); return -EINVAL; } @@ -2726,7 +2729,7 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, switch (type) { case PP_SCLK: - ret = vega20_get_current_gfx_clk_freq(hwmgr, &now); + ret = vega20_get_current_clk_freq(hwmgr, PPCLK_GFXCLK, &now); PP_ASSERT_WITH_CODE(!ret, "Attempt to get current gfx clk Failed!", return ret); @@ -2738,12 +2741,12 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, for (i = 0; i < clocks.num_levels; i++) size += sprintf(buf + size, "%d: %uMhz %s\n", - i, clocks.data[i].clocks_in_khz / 100, + i, clocks.data[i].clocks_in_khz / 1000, (clocks.data[i].clocks_in_khz == now) ? 
"*" : ""); break; case PP_MCLK: - ret = vega20_get_current_mclk_freq(hwmgr, &now); + ret = vega20_get_current_clk_freq(hwmgr, PPCLK_UCLK, &now); PP_ASSERT_WITH_CODE(!ret, "Attempt to get current mclk freq Failed!", return ret); @@ -2755,7 +2758,7 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, for (i = 0; i < clocks.num_levels; i++) size += sprintf(buf + size, "%d: %uMhz %s\n", - i, clocks.data[i].clocks_in_khz / 100, + i, clocks.data[i].clocks_in_khz / 1000, (clocks.data[i].clocks_in_khz == now) ? "*" : ""); break; @@ -2820,7 +2823,7 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, return ret); size += sprintf(buf + size, "MCLK: %7uMhz %10uMhz\n", - clocks.data[0].clocks_in_khz / 100, + clocks.data[0].clocks_in_khz / 1000, od8_settings[OD8_SETTING_UCLK_FMAX].max_value); } @@ -3476,6 +3479,8 @@ static const struct pp_hwmgr_func vega20_hwmgr_funcs = { vega20_set_watermarks_for_clocks_ranges, .display_clock_voltage_request = vega20_display_clock_voltage_request, + .get_performance_level = + vega20_get_performance_level, /* UMD pstate, profile related */ .force_dpm_level = vega20_dpm_force_dpm_level, diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c index e5f7f8230065..97f8a1a970c3 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c @@ -642,8 +642,14 @@ static int check_powerplay_tables( "Unsupported PPTable format!", return -1); PP_ASSERT_WITH_CODE(powerplay_table->sHeader.structuresize > 0, "Invalid PowerPlay Table!", return -1); - PP_ASSERT_WITH_CODE(powerplay_table->smcPPTable.Version == PPTABLE_V20_SMU_VERSION, - "Unmatch PPTable version, vbios update may be needed!", return -1); + + if (powerplay_table->smcPPTable.Version != PPTABLE_V20_SMU_VERSION) { + pr_info("Unmatch PPTable version: " + "pptable from VBIOS is V%d while driver supported is V%d!", + 
powerplay_table->smcPPTable.Version, + PPTABLE_V20_SMU_VERSION); + return -EINVAL; + } //dump_pptable(&powerplay_table->smcPPTable); @@ -716,10 +722,6 @@ static int append_vbios_pptable(struct pp_hwmgr *hwmgr, PPTable_t *ppsmc_pptable "[appendVbiosPPTable] Failed to retrieve Smc Dpm Table from VBIOS!", return -1); - memset(ppsmc_pptable->Padding32, - 0, - sizeof(struct atom_smc_dpm_info_v4_4) - - sizeof(struct atom_common_table_header)); ppsmc_pptable->MaxVoltageStepGfx = smc_dpm_table->maxvoltagestepgfx; ppsmc_pptable->MaxVoltageStepSoc = smc_dpm_table->maxvoltagestepsoc; @@ -778,22 +780,19 @@ static int append_vbios_pptable(struct pp_hwmgr *hwmgr, PPTable_t *ppsmc_pptable ppsmc_pptable->FllGfxclkSpreadPercent = smc_dpm_table->fllgfxclkspreadpercent; ppsmc_pptable->FllGfxclkSpreadFreq = smc_dpm_table->fllgfxclkspreadfreq; - if ((smc_dpm_table->table_header.format_revision == 4) && - (smc_dpm_table->table_header.content_revision == 4)) { - for (i = 0; i < I2C_CONTROLLER_NAME_COUNT; i++) { - ppsmc_pptable->I2cControllers[i].Enabled = - smc_dpm_table->i2ccontrollers[i].enabled; - ppsmc_pptable->I2cControllers[i].SlaveAddress = - smc_dpm_table->i2ccontrollers[i].slaveaddress; - ppsmc_pptable->I2cControllers[i].ControllerPort = - smc_dpm_table->i2ccontrollers[i].controllerport; - ppsmc_pptable->I2cControllers[i].ThermalThrottler = - smc_dpm_table->i2ccontrollers[i].thermalthrottler; - ppsmc_pptable->I2cControllers[i].I2cProtocol = - smc_dpm_table->i2ccontrollers[i].i2cprotocol; - ppsmc_pptable->I2cControllers[i].I2cSpeed = - smc_dpm_table->i2ccontrollers[i].i2cspeed; - } + for (i = 0; i < I2C_CONTROLLER_NAME_COUNT; i++) { + ppsmc_pptable->I2cControllers[i].Enabled = + smc_dpm_table->i2ccontrollers[i].enabled; + ppsmc_pptable->I2cControllers[i].SlaveAddress = + smc_dpm_table->i2ccontrollers[i].slaveaddress; + ppsmc_pptable->I2cControllers[i].ControllerPort = + smc_dpm_table->i2ccontrollers[i].controllerport; + ppsmc_pptable->I2cControllers[i].ThermalThrottler = + 
smc_dpm_table->i2ccontrollers[i].thermalthrottler; + ppsmc_pptable->I2cControllers[i].I2cProtocol = + smc_dpm_table->i2ccontrollers[i].i2cprotocol; + ppsmc_pptable->I2cControllers[i].I2cSpeed = + smc_dpm_table->i2ccontrollers[i].i2cspeed; } return 0; @@ -882,15 +881,10 @@ static int init_powerplay_table_information( if (pptable_information->smc_pptable == NULL) return -ENOMEM; - if (powerplay_table->smcPPTable.Version <= 2) - memcpy(pptable_information->smc_pptable, - &(powerplay_table->smcPPTable), - sizeof(PPTable_t) - - sizeof(I2cControllerConfig_t) * I2C_CONTROLLER_NAME_COUNT); - else - memcpy(pptable_information->smc_pptable, - &(powerplay_table->smcPPTable), - sizeof(PPTable_t)); + memcpy(pptable_information->smc_pptable, + &(powerplay_table->smcPPTable), + sizeof(PPTable_t)); + result = append_vbios_pptable(hwmgr, (pptable_information->smc_pptable)); diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu11_driver_if.h b/drivers/gpu/drm/amd/powerplay/inc/smu11_driver_if.h index 2998a49960ed..63d5cf691549 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/smu11_driver_if.h +++ b/drivers/gpu/drm/amd/powerplay/inc/smu11_driver_if.h @@ -29,7 +29,7 @@ // any structure is changed in this file #define SMU11_DRIVER_IF_VERSION 0x12 -#define PPTABLE_V20_SMU_VERSION 2 +#define PPTABLE_V20_SMU_VERSION 3 #define NUM_GFXCLK_DPM_LEVELS 16 #define NUM_VCLK_DPM_LEVELS 8 diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smu8_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smu8_smumgr.c index f836d30fdd44..09b844ec3eab 100644 --- a/drivers/gpu/drm/amd/powerplay/smumgr/smu8_smumgr.c +++ b/drivers/gpu/drm/amd/powerplay/smumgr/smu8_smumgr.c @@ -71,7 +71,11 @@ static int smu8_send_msg_to_smc_async(struct pp_hwmgr *hwmgr, uint16_t msg) result = PHM_WAIT_FIELD_UNEQUAL(hwmgr, SMU_MP1_SRBM2P_RESP_0, CONTENT, 0); if (result != 0) { + /* Read the last message to SMU, to report actual cause */ + uint32_t val = cgs_read_register(hwmgr->device, + mmSMU_MP1_SRBM2P_MSG_0); 
pr_err("smu8_send_msg_to_smc_async (0x%04x) failed\n", msg); + pr_err("SMU still servicing msg (0x%04x)\n", val); return result; } diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index f8a931cf3665..680566d97adc 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -458,18 +458,6 @@ static void ti_sn_bridge_enable(struct drm_bridge *bridge) unsigned int val; int ret; - /* - * FIXME: - * This 70ms was found necessary by experimentation. If it's not - * present, link training fails. It seems like it can go anywhere from - * pre_enable() up to semi-auto link training initiation below. - * - * Neither the datasheet for the bridge nor the panel tested mention a - * delay of this magnitude in the timing requirements. So for now, add - * the mystery delay until someone figures out a better fix. - */ - msleep(70); - /* DSI_A lane config */ val = CHA_DSI_LANES(4 - pdata->dsi->lanes); regmap_update_bits(pdata->regmap, SN_DSI_LANES_REG, @@ -536,7 +524,22 @@ static void ti_sn_bridge_pre_enable(struct drm_bridge *bridge) /* configure bridge ref_clk */ ti_sn_bridge_set_refclk_freq(pdata); - /* in case drm_panel is connected then HPD is not supported */ + /* + * HPD on this bridge chip is a bit useless. This is an eDP bridge + * so the HPD is an internal signal that's only there to signal that + * the panel is done powering up. ...but the bridge chip debounces + * this signal by between 100 ms and 400 ms (depending on process, + * voltage, and temperate--I measured it at about 200 ms). One + * particular panel asserted HPD 84 ms after it was powered on meaning + * that we saw HPD 284 ms after power on. ...but the same panel said + * that instead of looking at HPD you could just hardcode a delay of + * 200 ms. We'll assume that the panel driver will have the hardcoded + * delay in its prepare and always disable HPD. 
+ * + * If HPD somehow makes sense on some future panel we'll have to + * change this to be conditional on someone specifying that HPD should + * be used. + */ regmap_update_bits(pdata->regmap, SN_HPD_DISABLE_REG, HPD_DISABLE, HPD_DISABLE); diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 701cb334e1ea..d8b526b7932c 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -308,6 +308,26 @@ update_connector_routing(struct drm_atomic_state *state, return 0; } + crtc_state = drm_atomic_get_new_crtc_state(state, + new_connector_state->crtc); + /* + * For compatibility with legacy users, we want to make sure that + * we allow DPMS On->Off modesets on unregistered connectors. Modesets + * which would result in anything else must be considered invalid, to + * avoid turning on new displays on dead connectors. + * + * Since the connector can be unregistered at any point during an + * atomic check or commit, this is racy. But that's OK: all we care + * about is ensuring that userspace can't do anything but shut off the + * display on a connector that was destroyed after its been notified, + * not before. 
+ */ + if (drm_connector_is_unregistered(connector) && crtc_state->active) { + DRM_DEBUG_ATOMIC("[CONNECTOR:%d:%s] is not registered\n", + connector->base.id, connector->name); + return -EINVAL; + } + funcs = connector->helper_private; if (funcs->atomic_best_encoder) @@ -352,7 +372,6 @@ update_connector_routing(struct drm_atomic_state *state, set_best_encoder(state, new_connector_state, new_encoder); - crtc_state = drm_atomic_get_new_crtc_state(state, new_connector_state->crtc); crtc_state->connectors_changed = true; DRM_DEBUG_ATOMIC("[CONNECTOR:%d:%s] using [ENCODER:%d:%s] on [CRTC:%d:%s]\n", diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index 1e40e5decbe9..4943cef178be 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -379,7 +379,8 @@ void drm_connector_cleanup(struct drm_connector *connector) /* The connector should have been removed from userspace long before * it is finally destroyed. */ - if (WARN_ON(connector->registered)) + if (WARN_ON(connector->registration_state == + DRM_CONNECTOR_REGISTERED)) drm_connector_unregister(connector); if (connector->tile_group) { @@ -436,7 +437,7 @@ int drm_connector_register(struct drm_connector *connector) return 0; mutex_lock(&connector->mutex); - if (connector->registered) + if (connector->registration_state != DRM_CONNECTOR_INITIALIZING) goto unlock; ret = drm_sysfs_connector_add(connector); @@ -456,7 +457,7 @@ int drm_connector_register(struct drm_connector *connector) drm_mode_object_register(connector->dev, &connector->base); - connector->registered = true; + connector->registration_state = DRM_CONNECTOR_REGISTERED; goto unlock; err_debugfs: @@ -478,7 +479,7 @@ EXPORT_SYMBOL(drm_connector_register); void drm_connector_unregister(struct drm_connector *connector) { mutex_lock(&connector->mutex); - if (!connector->registered) { + if (connector->registration_state != DRM_CONNECTOR_REGISTERED) { mutex_unlock(&connector->mutex); return; } @@ -489,7 +490,7 
@@ void drm_connector_unregister(struct drm_connector *connector) drm_sysfs_connector_remove(connector); drm_debugfs_connector_remove(connector); - connector->registered = false; + connector->registration_state = DRM_CONNECTOR_UNREGISTERED; mutex_unlock(&connector->mutex); } EXPORT_SYMBOL(drm_connector_unregister); diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index ff0bfc65a8c1..b506e3622b08 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -122,6 +122,9 @@ static const struct edid_quirk { /* SDC panel of Lenovo B50-80 reports 8 bpc, but is a 6 bpc panel */ { "SDC", 0x3652, EDID_QUIRK_FORCE_6BPC }, + /* BOE model 0x0771 reports 8 bpc, but is a 6 bpc panel */ + { "BOE", 0x0771, EDID_QUIRK_FORCE_6BPC }, + /* Belinea 10 15 55 */ { "MAX", 1516, EDID_QUIRK_PREFER_LARGE_60 }, { "MAX", 0x77e, EDID_QUIRK_PREFER_LARGE_60 }, diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 3fae4dab295f..13f9b56a9ce7 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -5102,19 +5102,13 @@ intel_dp_long_pulse(struct intel_connector *connector, */ status = connector_status_disconnected; goto out; - } else { - /* - * If display is now connected check links status, - * there has been known issues of link loss triggering - * long pulse. - * - * Some sinks (eg. ASUS PB287Q) seem to perform some - * weird HPD ping pong during modesets. So we can apparently - * end up with HPD going low during a modeset, and then - * going back up soon after. And once that happens we must - * retrain the link to get a picture. That's in case no - * userspace component reacted to intermittent HPD dip. - */ + } + + /* + * Some external monitors do not signal loss of link synchronization + * with an IRQ_HPD, so force a link status check. 
+ */ + if (!intel_dp_is_edp(intel_dp)) { struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; intel_dp_retrain_link(encoder, ctx); diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 7f155b4f1a7d..1b00f8ea145b 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -77,7 +77,7 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder, pipe_config->pbn = mst_pbn; /* Zombie connectors can't have VCPI slots */ - if (READ_ONCE(connector->registered)) { + if (!drm_connector_is_unregistered(connector)) { slots = drm_dp_atomic_find_vcpi_slots(state, &intel_dp->mst_mgr, port, @@ -313,7 +313,7 @@ static int intel_dp_mst_get_ddc_modes(struct drm_connector *connector) struct edid *edid; int ret; - if (!READ_ONCE(connector->registered)) + if (drm_connector_is_unregistered(connector)) return intel_connector_update_modes(connector, NULL); edid = drm_dp_mst_get_edid(connector, &intel_dp->mst_mgr, intel_connector->port); @@ -329,7 +329,7 @@ intel_dp_mst_detect(struct drm_connector *connector, bool force) struct intel_connector *intel_connector = to_intel_connector(connector); struct intel_dp *intel_dp = intel_connector->mst_port; - if (!READ_ONCE(connector->registered)) + if (drm_connector_is_unregistered(connector)) return connector_status_disconnected; return drm_dp_mst_detect_port(connector, &intel_dp->mst_mgr, intel_connector->port); @@ -372,7 +372,7 @@ intel_dp_mst_mode_valid(struct drm_connector *connector, int bpp = 24; /* MST uses fixed bpp */ int max_rate, mode_rate, max_lanes, max_link_clock; - if (!READ_ONCE(connector->registered)) + if (drm_connector_is_unregistered(connector)) return MODE_ERROR; if (mode->flags & DRM_MODE_FLAG_DBLSCAN) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index 6bb78076b5b5..6cbbae3f438b 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ 
-881,22 +881,16 @@ nv50_mstc_atomic_best_encoder(struct drm_connector *connector, { struct nv50_head *head = nv50_head(connector_state->crtc); struct nv50_mstc *mstc = nv50_mstc(connector); - if (mstc->port) { - struct nv50_mstm *mstm = mstc->mstm; - return &mstm->msto[head->base.index]->encoder; - } - return NULL; + + return &mstc->mstm->msto[head->base.index]->encoder; } static struct drm_encoder * nv50_mstc_best_encoder(struct drm_connector *connector) { struct nv50_mstc *mstc = nv50_mstc(connector); - if (mstc->port) { - struct nv50_mstm *mstm = mstc->mstm; - return &mstm->msto[0]->encoder; - } - return NULL; + + return &mstc->mstm->msto[0]->encoder; } static enum drm_mode_status diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index 97964f7f2ace..a04ffb3b2174 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -56,6 +56,8 @@ struct panel_desc { /** * @prepare: the time (in milliseconds) that it takes for the panel to * become ready and start receiving video data + * @hpd_absent_delay: Add this to the prepare delay if we know Hot + * Plug Detect isn't used. 
* @enable: the time (in milliseconds) that it takes for the panel to * display the first valid frame after starting to receive * video data @@ -66,6 +68,7 @@ struct panel_desc { */ struct { unsigned int prepare; + unsigned int hpd_absent_delay; unsigned int enable; unsigned int disable; unsigned int unprepare; @@ -79,6 +82,7 @@ struct panel_simple { struct drm_panel base; bool prepared; bool enabled; + bool no_hpd; const struct panel_desc *desc; @@ -202,6 +206,7 @@ static int panel_simple_unprepare(struct drm_panel *panel) static int panel_simple_prepare(struct drm_panel *panel) { struct panel_simple *p = to_panel_simple(panel); + unsigned int delay; int err; if (p->prepared) @@ -215,8 +220,11 @@ static int panel_simple_prepare(struct drm_panel *panel) gpiod_set_value_cansleep(p->enable_gpio, 1); - if (p->desc->delay.prepare) - msleep(p->desc->delay.prepare); + delay = p->desc->delay.prepare; + if (p->no_hpd) + delay += p->desc->delay.hpd_absent_delay; + if (delay) + msleep(delay); p->prepared = true; @@ -305,6 +313,8 @@ static int panel_simple_probe(struct device *dev, const struct panel_desc *desc) panel->prepared = false; panel->desc = desc; + panel->no_hpd = of_property_read_bool(dev->of_node, "no-hpd"); + panel->supply = devm_regulator_get(dev, "power"); if (IS_ERR(panel->supply)) return PTR_ERR(panel->supply); @@ -1363,7 +1373,7 @@ static const struct panel_desc innolux_n156bge_l21 = { }, }; -static const struct drm_display_mode innolux_tv123wam_mode = { +static const struct drm_display_mode innolux_p120zdg_bf1_mode = { .clock = 206016, .hdisplay = 2160, .hsync_start = 2160 + 48, @@ -1377,15 +1387,16 @@ static const struct drm_display_mode innolux_tv123wam_mode = { .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC, }; -static const struct panel_desc innolux_tv123wam = { - .modes = &innolux_tv123wam_mode, +static const struct panel_desc innolux_p120zdg_bf1 = { + .modes = &innolux_p120zdg_bf1_mode, .num_modes = 1, .bpc = 8, .size = { - .width = 259, - 
.height = 173, + .width = 254, + .height = 169, }, .delay = { + .hpd_absent_delay = 200, .unprepare = 500, }, }; @@ -2445,8 +2456,8 @@ static const struct of_device_id platform_of_match[] = { .compatible = "innolux,n156bge-l21", .data = &innolux_n156bge_l21, }, { - .compatible = "innolux,tv123wam", - .data = &innolux_tv123wam, + .compatible = "innolux,p120zdg-bf1", + .data = &innolux_p120zdg_bf1, }, { .compatible = "innolux,zj070na-01p", .data = &innolux_zj070na_01p, diff --git a/drivers/irqchip/irq-mvebu-sei.c b/drivers/irqchip/irq-mvebu-sei.c index 566d69a2edbc..add4c9c934c8 100644 --- a/drivers/irqchip/irq-mvebu-sei.c +++ b/drivers/irqchip/irq-mvebu-sei.c @@ -384,9 +384,9 @@ static int mvebu_sei_probe(struct platform_device *pdev) sei->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); sei->base = devm_ioremap_resource(sei->dev, sei->res); - if (!sei->base) { + if (IS_ERR(sei->base)) { dev_err(sei->dev, "Failed to remap SEI resource\n"); - return -ENODEV; + return PTR_ERR(sei->base); } /* Retrieve the SEI capabilities with the interrupt ranges */ diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index b05022f94f18..072bb5e36c18 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -718,8 +718,7 @@ l1oip_socket_thread(void *data) printk(KERN_DEBUG "%s: socket created and open\n", __func__); while (!signal_pending(current)) { - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, - recvbuf_size); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, recvbuf_size); recvlen = sock_recvmsg(socket, &msg, 0); if (recvlen > 0) { l1oip_socket_parse(hc, &sin_rx, recvbuf, recvlen); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f3fb5bb8c82a..ac1cffd2a09b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); + 
bio_clone_blkcg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index 3370a4138e94..951c984de61a 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -8,7 +8,9 @@ lkdtm-$(CONFIG_LKDTM) += perms.o lkdtm-$(CONFIG_LKDTM) += refcount.o lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o lkdtm-$(CONFIG_LKDTM) += usercopy.o +lkdtm-$(CONFIG_LKDTM) += stackleak.o +KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_rodata.o := n OBJCOPYFLAGS := diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index 5a755590d3dc..2837dc77478e 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -184,6 +184,7 @@ static const struct crashtype crashtypes[] = { CRASHTYPE(USERCOPY_STACK_BEYOND), CRASHTYPE(USERCOPY_KERNEL), CRASHTYPE(USERCOPY_KERNEL_DS), + CRASHTYPE(STACKLEAK_ERASING), }; diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 07db641d71d0..3c6fd327e166 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -84,4 +84,7 @@ void lkdtm_USERCOPY_STACK_BEYOND(void); void lkdtm_USERCOPY_KERNEL(void); void lkdtm_USERCOPY_KERNEL_DS(void); +/* lkdtm_stackleak.c */ +void lkdtm_STACKLEAK_ERASING(void); + #endif diff --git a/drivers/misc/lkdtm/stackleak.c b/drivers/misc/lkdtm/stackleak.c new file mode 100644 index 000000000000..d5a084475abc --- /dev/null +++ b/drivers/misc/lkdtm/stackleak.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code tests that the current task stack is properly erased (filled + * with STACKLEAK_POISON). 
+ * + * Authors: + * Alexander Popov <alex.popov@linux.com> + * Tycho Andersen <tycho@tycho.ws> + */ + +#include "lkdtm.h" +#include <linux/stackleak.h> + +void lkdtm_STACKLEAK_ERASING(void) +{ + unsigned long *sp, left, found, i; + const unsigned long check_depth = + STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); + + /* + * For the details about the alignment of the poison values, see + * the comment in stackleak_track_stack(). + */ + sp = PTR_ALIGN(&i, sizeof(unsigned long)); + + left = ((unsigned long)sp & (THREAD_SIZE - 1)) / sizeof(unsigned long); + sp--; + + /* + * One 'long int' at the bottom of the thread stack is reserved + * and not poisoned. + */ + if (left > 1) { + left--; + } else { + pr_err("FAIL: not enough stack space for the test\n"); + return; + } + + pr_info("checking unused part of the thread stack (%lu bytes)...\n", + left * sizeof(unsigned long)); + + /* + * Search for 'check_depth' poison values in a row (just like + * stackleak_erase() does). + */ + for (i = 0, found = 0; i < left && found <= check_depth; i++) { + if (*(sp - i) == STACKLEAK_POISON) + found++; + else + found = 0; + } + + if (found <= check_depth) { + pr_err("FAIL: thread stack is not erased (checked %lu bytes)\n", + i * sizeof(unsigned long)); + return; + } + + pr_info("first %lu bytes are unpoisoned\n", + (i - found) * sizeof(unsigned long)); + + /* The rest of thread stack should be erased */ + for (; i < left; i++) { + if (*(sp - i) != STACKLEAK_POISON) { + pr_err("FAIL: thread stack is NOT properly erased\n"); + return; + } + } + + pr_info("OK: the rest of the thread stack is properly erased\n"); + return; +} diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index bd52f29b4a4e..264f4ed8eef2 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -3030,7 +3030,7 @@ ssize_t vmci_qpair_enqueue(struct vmci_qp *qpair, if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; - 
iov_iter_kvec(&from, WRITE | ITER_KVEC, &v, 1, buf_size); + iov_iter_kvec(&from, WRITE, &v, 1, buf_size); qp_lock(qpair); @@ -3074,7 +3074,7 @@ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; - iov_iter_kvec(&to, READ | ITER_KVEC, &v, 1, buf_size); + iov_iter_kvec(&to, READ, &v, 1, buf_size); qp_lock(qpair); @@ -3119,7 +3119,7 @@ ssize_t vmci_qpair_peek(struct vmci_qp *qpair, if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; - iov_iter_kvec(&to, READ | ITER_KVEC, &v, 1, buf_size); + iov_iter_kvec(&to, READ, &v, 1, buf_size); qp_lock(qpair); diff --git a/drivers/mtd/ubi/attach.c b/drivers/mtd/ubi/attach.c index 93ceea4f27d5..e294d3986ba9 100644 --- a/drivers/mtd/ubi/attach.c +++ b/drivers/mtd/ubi/attach.c @@ -1072,6 +1072,7 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, * be a result of power cut during erasure. */ ai->maybe_bad_peb_count += 1; + /* fall through */ case UBI_IO_BAD_HDR: /* * If we're facing a bad VID header we have to drop *all* diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index d2a726654ff1..a4e3454133a4 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1334,8 +1334,10 @@ static int bytes_str_to_int(const char *str) switch (*endp) { case 'G': result *= 1024; + /* fall through */ case 'M': result *= 1024; + /* fall through */ case 'K': result *= 1024; if (endp[1] == 'i' && endp[2] == 'B') diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c index b12023bc2cab..a5bab614ff84 100644 --- a/drivers/net/ntb_netdev.c +++ b/drivers/net/ntb_netdev.c @@ -71,7 +71,6 @@ static unsigned int tx_start = 10; static unsigned int tx_stop = 5; struct ntb_netdev { - struct list_head list; struct pci_dev *pdev; struct net_device *ndev; struct ntb_transport_qp *qp; @@ -81,8 +80,6 @@ struct ntb_netdev { #define NTB_TX_TIMEOUT_MS 1000 #define NTB_RXQ_SIZE 100 -static LIST_HEAD(dev_list); - static void ntb_netdev_event_handler(void *data, int 
link_is_up) { struct net_device *ndev = data; @@ -236,7 +233,7 @@ static void ntb_netdev_tx_timer(struct timer_list *t) struct net_device *ndev = dev->ndev; if (ntb_transport_tx_free_entry(dev->qp) < tx_stop) { - mod_timer(&dev->tx_timer, jiffies + msecs_to_jiffies(tx_time)); + mod_timer(&dev->tx_timer, jiffies + usecs_to_jiffies(tx_time)); } else { /* Make sure anybody stopping the queue after this sees the new * value of ntb_transport_tx_free_entry() @@ -452,7 +449,7 @@ static int ntb_netdev_probe(struct device *client_dev) if (rc) goto err1; - list_add(&dev->list, &dev_list); + dev_set_drvdata(client_dev, ndev); dev_info(&pdev->dev, "%s created\n", ndev->name); return 0; @@ -465,27 +462,8 @@ err: static void ntb_netdev_remove(struct device *client_dev) { - struct ntb_dev *ntb; - struct net_device *ndev; - struct pci_dev *pdev; - struct ntb_netdev *dev; - bool found = false; - - ntb = dev_ntb(client_dev->parent); - pdev = ntb->pdev; - - list_for_each_entry(dev, &dev_list, list) { - if (dev->pdev == pdev) { - found = true; - break; - } - } - if (!found) - return; - - list_del(&dev->list); - - ndev = dev->ndev; + struct net_device *ndev = dev_get_drvdata(client_dev); + struct ntb_netdev *dev = netdev_priv(ndev); unregister_netdev(ndev); ntb_transport_free_queue(dev->qp); diff --git a/drivers/ntb/hw/idt/Kconfig b/drivers/ntb/hw/idt/Kconfig index b360e5613b9f..f8948cf515ce 100644 --- a/drivers/ntb/hw/idt/Kconfig +++ b/drivers/ntb/hw/idt/Kconfig @@ -1,6 +1,7 @@ config NTB_IDT tristate "IDT PCIe-switch Non-Transparent Bridge support" depends on PCI + select HWMON help This driver supports NTB of cappable IDT PCIe-switches. @@ -23,9 +24,7 @@ config NTB_IDT BAR settings of peer NT-functions, the BAR setups can't be done over kernel PCI fixups. That's why the alternative pre-initialization techniques like BIOS using SMBus interface or EEPROM should be - utilized. 
Additionally if one needs to have temperature sensor - information printed to system log, the corresponding registers must - be initialized within BIOS/EEPROM as well. + utilized. If unsure, say N. diff --git a/drivers/ntb/hw/idt/ntb_hw_idt.c b/drivers/ntb/hw/idt/ntb_hw_idt.c index dbe72f116017..1dede87dd54f 100644 --- a/drivers/ntb/hw/idt/ntb_hw_idt.c +++ b/drivers/ntb/hw/idt/ntb_hw_idt.c @@ -4,7 +4,7 @@ * * GPL LICENSE SUMMARY * - * Copyright (C) 2016 T-Platforms All Rights Reserved. + * Copyright (C) 2016-2018 T-Platforms JSC All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -49,11 +49,14 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/pci.h> #include <linux/aer.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/debugfs.h> +#include <linux/hwmon.h> +#include <linux/hwmon-sysfs.h> #include <linux/ntb.h> #include "ntb_hw_idt.h" @@ -1105,9 +1108,9 @@ static struct idt_mw_cfg *idt_scan_mws(struct idt_ntb_dev *ndev, int port, } /* Allocate memory for memory window descriptors */ - ret_mws = devm_kcalloc(&ndev->ntb.pdev->dev, *mw_cnt, - sizeof(*ret_mws), GFP_KERNEL); - if (IS_ERR_OR_NULL(ret_mws)) + ret_mws = devm_kcalloc(&ndev->ntb.pdev->dev, *mw_cnt, sizeof(*ret_mws), + GFP_KERNEL); + if (!ret_mws) return ERR_PTR(-ENOMEM); /* Copy the info of detected memory windows */ @@ -1320,7 +1323,7 @@ static int idt_ntb_peer_mw_set_trans(struct ntb_dev *ntb, int pidx, int widx, idt_nt_write(ndev, bar->ltbase, (u32)addr); idt_nt_write(ndev, bar->utbase, (u32)(addr >> 32)); /* Set the custom BAR aperture limit */ - limit = pci_resource_start(ntb->pdev, mw_cfg->bar) + size; + limit = pci_bus_address(ntb->pdev, mw_cfg->bar) + size; idt_nt_write(ndev, bar->limit, (u32)limit); if (IS_FLD_SET(BARSETUP_TYPE, data, 64)) idt_nt_write(ndev, (bar + 1)->limit, (limit >> 32)); @@ 
-1821,61 +1824,284 @@ static int idt_ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx, * 7. Temperature sensor operations * * IDT PCIe-switch has an embedded temperature sensor, which can be used to - * warn a user-space of possible chip overheating. Since workload temperature - * can be different on different platforms, temperature thresholds as well as - * general sensor settings must be setup in the framework of BIOS/EEPROM - * initializations. It includes the actual sensor enabling as well. + * check current chip core temperature. Since a workload environment can be + * different on different platforms, an offset and ADC/filter settings can be + * specified. Although the offset configuration is only exposed to the sysfs + * hwmon interface at the moment. The rest of the settings can be adjusted + * for instance by the BIOS/EEPROM firmware. *============================================================================= */ /* + * idt_get_deg() - convert millidegree Celsius value to just degree + * @mdegC: IN - millidegree Celsius value + * + * Return: Degree corresponding to the passed millidegree value + */ +static inline s8 idt_get_deg(long mdegC) +{ + return mdegC / 1000; +} + +/* + * idt_get_frac() - retrieve 0/0.5 fraction of the millidegree Celsius value + * @mdegC: IN - millidegree Celsius value + * + * Return: 0/0.5 degree fraction of the passed millidegree value + */ +static inline u8 idt_get_deg_frac(long mdegC) +{ + return (mdegC % 1000) >= 500 ? 5 : 0; +} + +/* + * idt_get_temp_fmt() - convert millidegree Celsius value to 0:7:1 format + * @mdegC: IN - millidegree Celsius value + * + * Return: 0:7:1 format acceptable by the IDT temperature sensor + */ +static inline u8 idt_temp_get_fmt(long mdegC) +{ + return (idt_get_deg(mdegC) << 1) | (idt_get_deg_frac(mdegC) ? 
1 : 0); +} + +/* + * idt_get_temp_sval() - convert temp sample to signed millidegree Celsius + * @data: IN - shifted to LSB 8-bits temperature sample + * + * Return: signed millidegree Celsius + */ +static inline long idt_get_temp_sval(u32 data) +{ + return ((s8)data / 2) * 1000 + (data & 0x1 ? 500 : 0); +} + +/* + * idt_get_temp_sval() - convert temp sample to unsigned millidegree Celsius + * @data: IN - shifted to LSB 8-bits temperature sample + * + * Return: unsigned millidegree Celsius + */ +static inline long idt_get_temp_uval(u32 data) +{ + return (data / 2) * 1000 + (data & 0x1 ? 500 : 0); +} + +/* * idt_read_temp() - read temperature from chip sensor * @ntb: NTB device context. - * @val: OUT - integer value of temperature - * @frac: OUT - fraction + * @type: IN - type of the temperature value to read + * @val: OUT - integer value of temperature in millidegree Celsius */ -static void idt_read_temp(struct idt_ntb_dev *ndev, unsigned char *val, - unsigned char *frac) +static void idt_read_temp(struct idt_ntb_dev *ndev, + const enum idt_temp_val type, long *val) { u32 data; - /* Read the data from TEMP field of the TMPSTS register */ - data = idt_sw_read(ndev, IDT_SW_TMPSTS); - data = GET_FIELD(TMPSTS_TEMP, data); - /* TEMP field has one fractional bit and seven integer bits */ - *val = data >> 1; - *frac = ((data & 0x1) ? 
5 : 0); + /* Alter the temperature field in accordance with the passed type */ + switch (type) { + case IDT_TEMP_CUR: + data = GET_FIELD(TMPSTS_TEMP, + idt_sw_read(ndev, IDT_SW_TMPSTS)); + break; + case IDT_TEMP_LOW: + data = GET_FIELD(TMPSTS_LTEMP, + idt_sw_read(ndev, IDT_SW_TMPSTS)); + break; + case IDT_TEMP_HIGH: + data = GET_FIELD(TMPSTS_HTEMP, + idt_sw_read(ndev, IDT_SW_TMPSTS)); + break; + case IDT_TEMP_OFFSET: + /* This is the only field with signed 0:7:1 format */ + data = GET_FIELD(TMPADJ_OFFSET, + idt_sw_read(ndev, IDT_SW_TMPADJ)); + *val = idt_get_temp_sval(data); + return; + default: + data = GET_FIELD(TMPSTS_TEMP, + idt_sw_read(ndev, IDT_SW_TMPSTS)); + break; + } + + /* The rest of the fields accept unsigned 0:7:1 format */ + *val = idt_get_temp_uval(data); } /* - * idt_temp_isr() - temperature sensor alarm events ISR - * @ndev: IDT NTB hardware driver descriptor - * @ntint_sts: NT-function interrupt status + * idt_write_temp() - write temperature to the chip sensor register + * @ntb: NTB device context. 
+ * @type: IN - type of the temperature value to change + * @val: IN - integer value of temperature in millidegree Celsius + */ +static void idt_write_temp(struct idt_ntb_dev *ndev, + const enum idt_temp_val type, const long val) +{ + unsigned int reg; + u32 data; + u8 fmt; + + /* Retrieve the properly formatted temperature value */ + fmt = idt_temp_get_fmt(val); + + mutex_lock(&ndev->hwmon_mtx); + switch (type) { + case IDT_TEMP_LOW: + reg = IDT_SW_TMPALARM; + data = SET_FIELD(TMPALARM_LTEMP, idt_sw_read(ndev, reg), fmt) & + ~IDT_TMPALARM_IRQ_MASK; + break; + case IDT_TEMP_HIGH: + reg = IDT_SW_TMPALARM; + data = SET_FIELD(TMPALARM_HTEMP, idt_sw_read(ndev, reg), fmt) & + ~IDT_TMPALARM_IRQ_MASK; + break; + case IDT_TEMP_OFFSET: + reg = IDT_SW_TMPADJ; + data = SET_FIELD(TMPADJ_OFFSET, idt_sw_read(ndev, reg), fmt); + break; + default: + goto inval_spin_unlock; + } + + idt_sw_write(ndev, reg, data); + +inval_spin_unlock: + mutex_unlock(&ndev->hwmon_mtx); +} + +/* + * idt_sysfs_show_temp() - printout corresponding temperature value + * @dev: Pointer to the NTB device structure + * @da: Sensor device attribute structure + * @buf: Buffer to print temperature out * - * It handles events of temperature crossing alarm thresholds. Since reading - * of TMPALARM register clears it up, the function doesn't analyze the - * read value, instead the current temperature value just warningly printed to - * log. - * The method is called from PCIe ISR bottom-half routine. 
+ * Return: Number of written symbols or negative error */ -static void idt_temp_isr(struct idt_ntb_dev *ndev, u32 ntint_sts) +static ssize_t idt_sysfs_show_temp(struct device *dev, + struct device_attribute *da, char *buf) { - unsigned char val, frac; + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + struct idt_ntb_dev *ndev = dev_get_drvdata(dev); + enum idt_temp_val type = attr->index; + long mdeg; - /* Read the current temperature value */ - idt_read_temp(ndev, &val, &frac); + idt_read_temp(ndev, type, &mdeg); + return sprintf(buf, "%ld\n", mdeg); +} - /* Read the temperature alarm to clean the alarm status out */ - /*(void)idt_sw_read(ndev, IDT_SW_TMPALARM);*/ +/* + * idt_sysfs_set_temp() - set corresponding temperature value + * @dev: Pointer to the NTB device structure + * @da: Sensor device attribute structure + * @buf: Buffer to print temperature out + * @count: Size of the passed buffer + * + * Return: Number of written symbols or negative error + */ +static ssize_t idt_sysfs_set_temp(struct device *dev, + struct device_attribute *da, const char *buf, + size_t count) +{ + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + struct idt_ntb_dev *ndev = dev_get_drvdata(dev); + enum idt_temp_val type = attr->index; + long mdeg; + int ret; - /* Clean the corresponding interrupt bit */ - idt_nt_write(ndev, IDT_NT_NTINTSTS, IDT_NTINTSTS_TMPSENSOR); + ret = kstrtol(buf, 10, &mdeg); + if (ret) + return ret; + + /* Clamp the passed value in accordance with the type */ + if (type == IDT_TEMP_OFFSET) + mdeg = clamp_val(mdeg, IDT_TEMP_MIN_OFFSET, + IDT_TEMP_MAX_OFFSET); + else + mdeg = clamp_val(mdeg, IDT_TEMP_MIN_MDEG, IDT_TEMP_MAX_MDEG); + + idt_write_temp(ndev, type, mdeg); + + return count; +} + +/* + * idt_sysfs_reset_hist() - reset temperature history + * @dev: Pointer to the NTB device structure + * @da: Sensor device attribute structure + * @buf: Buffer to print temperature out + * @count: Size of the passed buffer + * + * Return: 
Number of written symbols or negative error + */ +static ssize_t idt_sysfs_reset_hist(struct device *dev, + struct device_attribute *da, + const char *buf, size_t count) +{ + struct idt_ntb_dev *ndev = dev_get_drvdata(dev); + + /* Just set the maximal value to the lowest temperature field and + * minimal value to the highest temperature field + */ + idt_write_temp(ndev, IDT_TEMP_LOW, IDT_TEMP_MAX_MDEG); + idt_write_temp(ndev, IDT_TEMP_HIGH, IDT_TEMP_MIN_MDEG); - dev_dbg(&ndev->ntb.pdev->dev, - "Temp sensor IRQ detected %#08x", ntint_sts); + return count; +} + +/* + * Hwmon IDT sysfs attributes + */ +static SENSOR_DEVICE_ATTR(temp1_input, 0444, idt_sysfs_show_temp, NULL, + IDT_TEMP_CUR); +static SENSOR_DEVICE_ATTR(temp1_lowest, 0444, idt_sysfs_show_temp, NULL, + IDT_TEMP_LOW); +static SENSOR_DEVICE_ATTR(temp1_highest, 0444, idt_sysfs_show_temp, NULL, + IDT_TEMP_HIGH); +static SENSOR_DEVICE_ATTR(temp1_offset, 0644, idt_sysfs_show_temp, + idt_sysfs_set_temp, IDT_TEMP_OFFSET); +static DEVICE_ATTR(temp1_reset_history, 0200, NULL, idt_sysfs_reset_hist); - /* Print temperature value to log */ - dev_warn(&ndev->ntb.pdev->dev, "Temperature %hhu.%hhu", val, frac); +/* + * Hwmon IDT sysfs attributes group + */ +static struct attribute *idt_temp_attrs[] = { + &sensor_dev_attr_temp1_input.dev_attr.attr, + &sensor_dev_attr_temp1_lowest.dev_attr.attr, + &sensor_dev_attr_temp1_highest.dev_attr.attr, + &sensor_dev_attr_temp1_offset.dev_attr.attr, + &dev_attr_temp1_reset_history.attr, + NULL +}; +ATTRIBUTE_GROUPS(idt_temp); + +/* + * idt_init_temp() - initialize temperature sensor interface + * @ndev: IDT NTB hardware driver descriptor + * + * Simple sensor initializarion method is responsible for device switching + * on and resource management based hwmon interface registration. Note, that + * since the device is shared we won't disable it on remove, but leave it + * working until the system is powered off. 
+ */ +static void idt_init_temp(struct idt_ntb_dev *ndev) +{ + struct device *hwmon; + + /* Enable sensor if it hasn't been already */ + idt_sw_write(ndev, IDT_SW_TMPCTL, 0x0); + + /* Initialize hwmon interface fields */ + mutex_init(&ndev->hwmon_mtx); + + hwmon = devm_hwmon_device_register_with_groups(&ndev->ntb.pdev->dev, + ndev->swcfg->name, ndev, idt_temp_groups); + if (IS_ERR(hwmon)) { + dev_err(&ndev->ntb.pdev->dev, "Couldn't create hwmon device"); + return; + } + + dev_dbg(&ndev->ntb.pdev->dev, "Temperature HWmon interface registered"); } /*============================================================================= @@ -1931,7 +2157,7 @@ static int idt_init_isr(struct idt_ntb_dev *ndev) goto err_free_vectors; } - /* Unmask Message/Doorbell/SE/Temperature interrupts */ + /* Unmask Message/Doorbell/SE interrupts */ ntint_mask = idt_nt_read(ndev, IDT_NT_NTINTMSK) & ~IDT_NTINTMSK_ALL; idt_nt_write(ndev, IDT_NT_NTINTMSK, ntint_mask); @@ -1946,7 +2172,6 @@ err_free_vectors: return ret; } - /* * idt_deinit_ist() - deinitialize PCIe interrupt handler * @ndev: IDT NTB hardware driver descriptor @@ -2007,12 +2232,6 @@ static irqreturn_t idt_thread_isr(int irq, void *devid) handled = true; } - /* Handle temperature sensor interrupt */ - if (ntint_sts & IDT_NTINTSTS_TMPSENSOR) { - idt_temp_isr(ndev, ntint_sts); - handled = true; - } - dev_dbg(&ndev->ntb.pdev->dev, "IDT IRQs 0x%08x handled", ntint_sts); return handled ? 
IRQ_HANDLED : IRQ_NONE; @@ -2123,9 +2342,9 @@ static ssize_t idt_dbgfs_info_read(struct file *filp, char __user *ubuf, size_t count, loff_t *offp) { struct idt_ntb_dev *ndev = filp->private_data; - unsigned char temp, frac, idx, pidx, cnt; + unsigned char idx, pidx, cnt; + unsigned long irqflags, mdeg; ssize_t ret = 0, off = 0; - unsigned long irqflags; enum ntb_speed speed; enum ntb_width width; char *strbuf; @@ -2274,9 +2493,10 @@ static ssize_t idt_dbgfs_info_read(struct file *filp, char __user *ubuf, off += scnprintf(strbuf + off, size - off, "\n"); /* Current temperature */ - idt_read_temp(ndev, &temp, &frac); + idt_read_temp(ndev, IDT_TEMP_CUR, &mdeg); off += scnprintf(strbuf + off, size - off, - "Switch temperature\t\t- %hhu.%hhuC\n", temp, frac); + "Switch temperature\t\t- %hhd.%hhuC\n", + idt_get_deg(mdeg), idt_get_deg_frac(mdeg)); /* Copy the buffer to the User Space */ ret = simple_read_from_buffer(ubuf, count, offp, strbuf, off); @@ -2390,7 +2610,7 @@ static struct idt_ntb_dev *idt_create_dev(struct pci_dev *pdev, /* Allocate memory for the IDT PCIe-device descriptor */ ndev = devm_kzalloc(&pdev->dev, sizeof(*ndev), GFP_KERNEL); - if (IS_ERR_OR_NULL(ndev)) { + if (!ndev) { dev_err(&pdev->dev, "Memory allocation failed for descriptor"); return ERR_PTR(-ENOMEM); } @@ -2571,6 +2791,9 @@ static int idt_pci_probe(struct pci_dev *pdev, /* Initialize Messaging subsystem */ idt_init_msg(ndev); + /* Initialize hwmon interface */ + idt_init_temp(ndev); + /* Initialize IDT interrupts handler */ ret = idt_init_isr(ndev); if (ret != 0) diff --git a/drivers/ntb/hw/idt/ntb_hw_idt.h b/drivers/ntb/hw/idt/ntb_hw_idt.h index 856fd182f6f4..2f1aa121b0cf 100644 --- a/drivers/ntb/hw/idt/ntb_hw_idt.h +++ b/drivers/ntb/hw/idt/ntb_hw_idt.h @@ -4,7 +4,7 @@ * * GPL LICENSE SUMMARY * - * Copyright (C) 2016 T-Platforms All Rights Reserved. + * Copyright (C) 2016-2018 T-Platforms JSC All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -47,9 +47,9 @@ #include <linux/pci_ids.h> #include <linux/interrupt.h> #include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/ntb.h> - /* * Macro is used to create the struct pci_device_id that matches * the supported IDT PCIe-switches @@ -688,15 +688,14 @@ * @IDT_NTINTMSK_DBELL: Doorbell interrupt mask bit * @IDT_NTINTMSK_SEVENT: Switch Event interrupt mask bit * @IDT_NTINTMSK_TMPSENSOR: Temperature sensor interrupt mask bit - * @IDT_NTINTMSK_ALL: All the useful interrupts mask + * @IDT_NTINTMSK_ALL: NTB-related interrupts mask */ #define IDT_NTINTMSK_MSG 0x00000001U #define IDT_NTINTMSK_DBELL 0x00000002U #define IDT_NTINTMSK_SEVENT 0x00000008U #define IDT_NTINTMSK_TMPSENSOR 0x00000080U #define IDT_NTINTMSK_ALL \ - (IDT_NTINTMSK_MSG | IDT_NTINTMSK_DBELL | \ - IDT_NTINTMSK_SEVENT | IDT_NTINTMSK_TMPSENSOR) + (IDT_NTINTMSK_MSG | IDT_NTINTMSK_DBELL | IDT_NTINTMSK_SEVENT) /* * NTGSIGNAL register fields related constants @@ -886,12 +885,60 @@ #define IDT_SWPxMSGCTL_PART_FLD 4 /* + * TMPCTL register fields related constants + * @IDT_TMPCTL_LTH_MASK: Low temperature threshold field mask + * @IDT_TMPCTL_LTH_FLD: Low temperature threshold field offset + * @IDT_TMPCTL_MTH_MASK: Middle temperature threshold field mask + * @IDT_TMPCTL_MTH_FLD: Middle temperature threshold field offset + * @IDT_TMPCTL_HTH_MASK: High temperature threshold field mask + * @IDT_TMPCTL_HTH_FLD: High temperature threshold field offset + * @IDT_TMPCTL_PDOWN: Temperature sensor power down + */ +#define IDT_TMPCTL_LTH_MASK 0x000000FFU +#define IDT_TMPCTL_LTH_FLD 0 +#define IDT_TMPCTL_MTH_MASK 0x0000FF00U +#define IDT_TMPCTL_MTH_FLD 8 +#define IDT_TMPCTL_HTH_MASK 0x00FF0000U +#define IDT_TMPCTL_HTH_FLD 16 +#define IDT_TMPCTL_PDOWN 0x80000000U + +/* * TMPSTS register fields related constants * @IDT_TMPSTS_TEMP_MASK: Current temperature field mask 
* @IDT_TMPSTS_TEMP_FLD: Current temperature field offset + * @IDT_TMPSTS_LTEMP_MASK: Lowest temperature field mask + * @IDT_TMPSTS_LTEMP_FLD: Lowest temperature field offset + * @IDT_TMPSTS_HTEMP_MASK: Highest temperature field mask + * @IDT_TMPSTS_HTEMP_FLD: Highest temperature field offset */ #define IDT_TMPSTS_TEMP_MASK 0x000000FFU #define IDT_TMPSTS_TEMP_FLD 0 +#define IDT_TMPSTS_LTEMP_MASK 0x0000FF00U +#define IDT_TMPSTS_LTEMP_FLD 8 +#define IDT_TMPSTS_HTEMP_MASK 0x00FF0000U +#define IDT_TMPSTS_HTEMP_FLD 16 + +/* + * TMPALARM register fields related constants + * @IDT_TMPALARM_LTEMP_MASK: Lowest temperature field mask + * @IDT_TMPALARM_LTEMP_FLD: Lowest temperature field offset + * @IDT_TMPALARM_HTEMP_MASK: Highest temperature field mask + * @IDT_TMPALARM_HTEMP_FLD: Highest temperature field offset + * @IDT_TMPALARM_IRQ_MASK: Alarm IRQ status mask + */ +#define IDT_TMPALARM_LTEMP_MASK 0x0000FF00U +#define IDT_TMPALARM_LTEMP_FLD 8 +#define IDT_TMPALARM_HTEMP_MASK 0x00FF0000U +#define IDT_TMPALARM_HTEMP_FLD 16 +#define IDT_TMPALARM_IRQ_MASK 0x3F000000U + +/* + * TMPADJ register fields related constants + * @IDT_TMPADJ_OFFSET_MASK: Temperature value offset field mask + * @IDT_TMPADJ_OFFSET_FLD: Temperature value offset field offset + */ +#define IDT_TMPADJ_OFFSET_MASK 0x000000FFU +#define IDT_TMPADJ_OFFSET_FLD 0 /* * Helper macro to get/set the corresponding field value @@ -951,6 +998,32 @@ #define IDT_DIR_SIZE_ALIGN 1 /* + * IDT PCIe-switch temperature sensor value limits + * @IDT_TEMP_MIN_MDEG: Minimal integer value of temperature + * @IDT_TEMP_MAX_MDEG: Maximal integer value of temperature + * @IDT_TEMP_MIN_OFFSET:Minimal integer value of temperature offset + * @IDT_TEMP_MAX_OFFSET:Maximal integer value of temperature offset + */ +#define IDT_TEMP_MIN_MDEG 0 +#define IDT_TEMP_MAX_MDEG 127500 +#define IDT_TEMP_MIN_OFFSET -64000 +#define IDT_TEMP_MAX_OFFSET 63500 + +/* + * Temperature sensor values enumeration + * @IDT_TEMP_CUR: Current temperature + * 
@IDT_TEMP_LOW: Lowest historical temperature + * @IDT_TEMP_HIGH: Highest historical temperature + * @IDT_TEMP_OFFSET: Current temperature offset + */ +enum idt_temp_val { + IDT_TEMP_CUR, + IDT_TEMP_LOW, + IDT_TEMP_HIGH, + IDT_TEMP_OFFSET +}; + +/* * IDT Memory Windows type. Depending on the device settings, IDT supports * Direct Address Translation MW registers and Lookup Table registers * @IDT_MW_DIR: Direct address translation @@ -1044,6 +1117,8 @@ struct idt_ntb_peer { * @msg_mask_lock: Message mask register lock * @gasa_lock: GASA registers access lock * + * @hwmon_mtx: Temperature sensor interface update mutex + * * @dbgfs_info: DebugFS info node */ struct idt_ntb_dev { @@ -1071,6 +1146,8 @@ struct idt_ntb_dev { spinlock_t msg_mask_lock; spinlock_t gasa_lock; + struct mutex hwmon_mtx; + struct dentry *dbgfs_info; }; #define to_ndev_ntb(__ntb) container_of(__ntb, struct idt_ntb_dev, ntb) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index 6aa573227279..2ad263f708da 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -265,7 +265,7 @@ static inline int ndev_db_clear_mask(struct intel_ntb_dev *ndev, u64 db_bits, return 0; } -static inline int ndev_vec_mask(struct intel_ntb_dev *ndev, int db_vector) +static inline u64 ndev_vec_mask(struct intel_ntb_dev *ndev, int db_vector) { u64 shift, mask; diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 939895966476..3bfdb4562408 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -194,6 +194,8 @@ struct ntb_transport_mw { void __iomem *vbase; size_t xlat_size; size_t buff_size; + size_t alloc_size; + void *alloc_addr; void *virt_addr; dma_addr_t dma_addr; }; @@ -672,13 +674,59 @@ static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw) return; ntb_mw_clear_trans(nt->ndev, PIDX, num_mw); - dma_free_coherent(&pdev->dev, mw->buff_size, - mw->virt_addr, mw->dma_addr); + 
dma_free_coherent(&pdev->dev, mw->alloc_size, + mw->alloc_addr, mw->dma_addr); mw->xlat_size = 0; mw->buff_size = 0; + mw->alloc_size = 0; + mw->alloc_addr = NULL; mw->virt_addr = NULL; } +static int ntb_alloc_mw_buffer(struct ntb_transport_mw *mw, + struct device *dma_dev, size_t align) +{ + dma_addr_t dma_addr; + void *alloc_addr, *virt_addr; + int rc; + + alloc_addr = dma_alloc_coherent(dma_dev, mw->alloc_size, + &dma_addr, GFP_KERNEL); + if (!alloc_addr) { + dev_err(dma_dev, "Unable to alloc MW buff of size %zu\n", + mw->alloc_size); + return -ENOMEM; + } + virt_addr = alloc_addr; + + /* + * we must ensure that the memory address allocated is BAR size + * aligned in order for the XLAT register to take the value. This + * is a requirement of the hardware. It is recommended to setup CMA + * for BAR sizes equal or greater than 4MB. + */ + if (!IS_ALIGNED(dma_addr, align)) { + if (mw->alloc_size > mw->buff_size) { + virt_addr = PTR_ALIGN(alloc_addr, align); + dma_addr = ALIGN(dma_addr, align); + } else { + rc = -ENOMEM; + goto err; + } + } + + mw->alloc_addr = alloc_addr; + mw->virt_addr = virt_addr; + mw->dma_addr = dma_addr; + + return 0; + +err: + dma_free_coherent(dma_dev, mw->alloc_size, alloc_addr, dma_addr); + + return rc; +} + static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, resource_size_t size) { @@ -710,28 +758,20 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, /* Alloc memory for receiving data. Must be aligned */ mw->xlat_size = xlat_size; mw->buff_size = buff_size; + mw->alloc_size = buff_size; - mw->virt_addr = dma_alloc_coherent(&pdev->dev, buff_size, - &mw->dma_addr, GFP_KERNEL); - if (!mw->virt_addr) { - mw->xlat_size = 0; - mw->buff_size = 0; - dev_err(&pdev->dev, "Unable to alloc MW buff of size %zu\n", - buff_size); - return -ENOMEM; - } - - /* - * we must ensure that the memory address allocated is BAR size - * aligned in order for the XLAT register to take the value. This - * is a requirement of the hardware. 
It is recommended to setup CMA - * for BAR sizes equal or greater than 4MB. - */ - if (!IS_ALIGNED(mw->dma_addr, xlat_align)) { - dev_err(&pdev->dev, "DMA memory %pad is not aligned\n", - &mw->dma_addr); - ntb_free_mw(nt, num_mw); - return -ENOMEM; + rc = ntb_alloc_mw_buffer(mw, &pdev->dev, xlat_align); + if (rc) { + mw->alloc_size *= 2; + rc = ntb_alloc_mw_buffer(mw, &pdev->dev, xlat_align); + if (rc) { + dev_err(&pdev->dev, + "Unable to alloc aligned MW buff\n"); + mw->xlat_size = 0; + mw->buff_size = 0; + mw->alloc_size = 0; + return rc; + } } /* Notify HW the memory location of the receive buffer */ @@ -1278,6 +1318,7 @@ static void ntb_rx_copy_callback(void *data, case DMA_TRANS_READ_FAILED: case DMA_TRANS_WRITE_FAILED: entry->errors++; + /* fall through */ case DMA_TRANS_ABORTED: { struct ntb_transport_qp *qp = entry->qp; @@ -1533,6 +1574,7 @@ static void ntb_tx_copy_callback(void *data, case DMA_TRANS_READ_FAILED: case DMA_TRANS_WRITE_FAILED: entry->errors++; + /* fall through */ case DMA_TRANS_ABORTED: { void __iomem *offset = diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e52b9d3c0bd6..0b70c8bab045 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1704,7 +1704,6 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, op->fcp_req.rspaddr = &op->rsp_iu; op->fcp_req.rsplen = sizeof(op->rsp_iu); op->fcp_req.done = nvme_fc_fcpio_done; - op->fcp_req.private = &op->fcp_req.first_sgl[SG_CHUNK_SIZE]; op->ctrl = ctrl; op->queue = queue; op->rq = rq; @@ -1752,6 +1751,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, if (res) return res; op->op.fcp_req.first_sgl = &op->sgl[0]; + op->op.fcp_req.private = &op->priv[0]; return res; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f30031945ee4..c33bb201b884 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1663,6 +1663,9 @@ static void nvme_map_cmb(struct nvme_dev *dev) struct pci_dev *pdev = to_pci_dev(dev->dev); int 
bar; + if (dev->cmb_size) + return; + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!dev->cmbsz) return; @@ -2147,7 +2150,6 @@ static void nvme_pci_disable(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); - nvme_release_cmb(dev); pci_free_irq_vectors(pdev); if (pci_is_enabled(pdev)) { @@ -2595,6 +2597,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); + nvme_release_cmb(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 39d972e2595f..01feebec29ea 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -101,7 +101,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, rw = READ; } - iov_iter_bvec(&iter, ITER_BVEC | rw, req->f.bvec, nr_segs, count); + iov_iter_bvec(&iter, rw, req->f.bvec, nr_segs, count); iocb->ki_pos = pos; iocb->ki_filp = req->ns->file; diff --git a/drivers/of/base.c b/drivers/of/base.c index d023cf303d56..09692c9b32a7 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -777,8 +777,6 @@ struct device_node *of_get_next_cpu_node(struct device_node *prev) if (!(of_node_name_eq(next, "cpu") || (next->type && !of_node_cmp(next->type, "cpu")))) continue; - if (!__of_device_is_available(next)) - continue; if (of_node_get(next)) break; } diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 504d252716f2..27e5dd47a01f 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -447,10 +447,9 @@ config PWM_TEGRA config PWM_TIECAP tristate "ECAP PWM support" - depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_KEYSTONE + depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_KEYSTONE || ARCH_K3 help - PWM driver support for the ECAP APWM controller found on AM33XX - TI SOC + PWM driver support for the ECAP APWM controller found on TI SOCs To 
compile this driver as a module, choose M here: the module will be called pwm-tiecap. diff --git a/drivers/pwm/pwm-lpss-platform.c b/drivers/pwm/pwm-lpss-platform.c index 5561b9e190f8..757230e1f575 100644 --- a/drivers/pwm/pwm-lpss-platform.c +++ b/drivers/pwm/pwm-lpss-platform.c @@ -30,6 +30,7 @@ static const struct pwm_lpss_boardinfo pwm_lpss_bsw_info = { .clk_rate = 19200000, .npwm = 1, .base_unit_bits = 16, + .other_devices_aml_touches_pwm_regs = true, }; /* Broxton */ @@ -60,6 +61,7 @@ static int pwm_lpss_probe_platform(struct platform_device *pdev) platform_set_drvdata(pdev, lpwm); + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); @@ -74,13 +76,29 @@ static int pwm_lpss_remove_platform(struct platform_device *pdev) return pwm_lpss_remove(lpwm); } -static SIMPLE_DEV_PM_OPS(pwm_lpss_platform_pm_ops, - pwm_lpss_suspend, - pwm_lpss_resume); +static int pwm_lpss_prepare(struct device *dev) +{ + struct pwm_lpss_chip *lpwm = dev_get_drvdata(dev); + + /* + * If other device's AML code touches the PWM regs on suspend/resume + * force runtime-resume the PWM controller to allow this. 
+ */ + if (lpwm->info->other_devices_aml_touches_pwm_regs) + return 0; /* Force runtime-resume */ + + return 1; /* If runtime-suspended leave as is */ +} + +static const struct dev_pm_ops pwm_lpss_platform_pm_ops = { + .prepare = pwm_lpss_prepare, + SET_SYSTEM_SLEEP_PM_OPS(pwm_lpss_suspend, pwm_lpss_resume) +}; static const struct acpi_device_id pwm_lpss_acpi_match[] = { { "80860F09", (unsigned long)&pwm_lpss_byt_info }, { "80862288", (unsigned long)&pwm_lpss_bsw_info }, + { "80862289", (unsigned long)&pwm_lpss_bsw_info }, { "80865AC8", (unsigned long)&pwm_lpss_bxt_info }, { }, }; diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 4721a264bac2..2ac3a2aa9e53 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -32,15 +32,6 @@ /* Size of each PWM register space if multiple */ #define PWM_SIZE 0x400 -#define MAX_PWMS 4 - -struct pwm_lpss_chip { - struct pwm_chip chip; - void __iomem *regs; - const struct pwm_lpss_boardinfo *info; - u32 saved_ctrl[MAX_PWMS]; -}; - static inline struct pwm_lpss_chip *to_lpwm(struct pwm_chip *chip) { return container_of(chip, struct pwm_lpss_chip, chip); @@ -97,7 +88,7 @@ static void pwm_lpss_prepare(struct pwm_lpss_chip *lpwm, struct pwm_device *pwm, unsigned long long on_time_div; unsigned long c = lpwm->info->clk_rate, base_unit_range; unsigned long long base_unit, freq = NSEC_PER_SEC; - u32 ctrl; + u32 orig_ctrl, ctrl; do_div(freq, period_ns); @@ -114,13 +105,17 @@ static void pwm_lpss_prepare(struct pwm_lpss_chip *lpwm, struct pwm_device *pwm, do_div(on_time_div, period_ns); on_time_div = 255ULL - on_time_div; - ctrl = pwm_lpss_read(pwm); + orig_ctrl = ctrl = pwm_lpss_read(pwm); ctrl &= ~PWM_ON_TIME_DIV_MASK; ctrl &= ~(base_unit_range << PWM_BASE_UNIT_SHIFT); base_unit &= base_unit_range; ctrl |= (u32) base_unit << PWM_BASE_UNIT_SHIFT; ctrl |= on_time_div; - pwm_lpss_write(pwm, ctrl); + + if (orig_ctrl != ctrl) { + pwm_lpss_write(pwm, ctrl); + pwm_lpss_write(pwm, ctrl | PWM_SW_UPDATE); + } } static 
inline void pwm_lpss_cond_enable(struct pwm_device *pwm, bool cond) @@ -144,7 +139,6 @@ static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm, return ret; } pwm_lpss_prepare(lpwm, pwm, state->duty_cycle, state->period); - pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_SW_UPDATE); pwm_lpss_cond_enable(pwm, lpwm->info->bypass == false); ret = pwm_lpss_wait_for_update(pwm); if (ret) { @@ -157,7 +151,6 @@ static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (ret) return ret; pwm_lpss_prepare(lpwm, pwm, state->duty_cycle, state->period); - pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_SW_UPDATE); return pwm_lpss_wait_for_update(pwm); } } else if (pwm_is_enabled(pwm)) { @@ -168,8 +161,42 @@ static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +/* This function gets called once from pwmchip_add to get the initial state */ +static void pwm_lpss_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct pwm_lpss_chip *lpwm = to_lpwm(chip); + unsigned long base_unit_range; + unsigned long long base_unit, freq, on_time_div; + u32 ctrl; + + base_unit_range = BIT(lpwm->info->base_unit_bits); + + ctrl = pwm_lpss_read(pwm); + on_time_div = 255 - (ctrl & PWM_ON_TIME_DIV_MASK); + base_unit = (ctrl >> PWM_BASE_UNIT_SHIFT) & (base_unit_range - 1); + + freq = base_unit * lpwm->info->clk_rate; + do_div(freq, base_unit_range); + if (freq == 0) + state->period = NSEC_PER_SEC; + else + state->period = NSEC_PER_SEC / (unsigned long)freq; + + on_time_div *= state->period; + do_div(on_time_div, 255); + state->duty_cycle = on_time_div; + + state->polarity = PWM_POLARITY_NORMAL; + state->enabled = !!(ctrl & PWM_ENABLE); + + if (state->enabled) + pm_runtime_get(chip->dev); +} + static const struct pwm_ops pwm_lpss_ops = { .apply = pwm_lpss_apply, + .get_state = pwm_lpss_get_state, .owner = THIS_MODULE, }; @@ -214,6 +241,12 @@ EXPORT_SYMBOL_GPL(pwm_lpss_probe); int pwm_lpss_remove(struct 
pwm_lpss_chip *lpwm) { + int i; + + for (i = 0; i < lpwm->info->npwm; i++) { + if (pwm_is_enabled(&lpwm->chip.pwms[i])) + pm_runtime_put(lpwm->chip.dev); + } return pwmchip_remove(&lpwm->chip); } EXPORT_SYMBOL_GPL(pwm_lpss_remove); diff --git a/drivers/pwm/pwm-lpss.h b/drivers/pwm/pwm-lpss.h index 7a4238ad1fcb..3236be835bd9 100644 --- a/drivers/pwm/pwm-lpss.h +++ b/drivers/pwm/pwm-lpss.h @@ -16,13 +16,25 @@ #include <linux/device.h> #include <linux/pwm.h> -struct pwm_lpss_chip; +#define MAX_PWMS 4 + +struct pwm_lpss_chip { + struct pwm_chip chip; + void __iomem *regs; + const struct pwm_lpss_boardinfo *info; + u32 saved_ctrl[MAX_PWMS]; +}; struct pwm_lpss_boardinfo { unsigned long clk_rate; unsigned int npwm; unsigned long base_unit_bits; bool bypass; + /* + * On some devices the _PS0/_PS3 AML code of the GPU (GFX0) device + * messes with the PWM0 controllers state, + */ + bool other_devices_aml_touches_pwm_regs; }; struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r, diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c index 748f614d5375..a41812fc6f95 100644 --- a/drivers/pwm/pwm-rcar.c +++ b/drivers/pwm/pwm-rcar.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R-Car PWM Timer driver * * Copyright (C) 2015 Renesas Electronics Corporation - * - * This is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. */ #include <linux/clk.h> diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 29267d12fb4c..4a855a21b782 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R-Mobile TPU PWM driver * * Copyright (C) 2012 Renesas Solutions Corp. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/clk.h> diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index f8ebbece57b7..48c4595a0ffc 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -300,7 +300,6 @@ static const struct of_device_id tegra_pwm_of_match[] = { { .compatible = "nvidia,tegra186-pwm", .data = &tegra186_pwm_soc }, { } }; - MODULE_DEVICE_TABLE(of, tegra_pwm_of_match); static const struct dev_pm_ops tegra_pwm_pm_ops = { diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c index 7c71cdb8a9d8..ceb233dd6048 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -249,6 +249,7 @@ static void pwm_export_release(struct device *child) static int pwm_export_child(struct device *parent, struct pwm_device *pwm) { struct pwm_export *export; + char *pwm_prop[2]; int ret; if (test_and_set_bit(PWMF_EXPORTED, &pwm->flags)) @@ -263,7 +264,6 @@ static int pwm_export_child(struct device *parent, struct pwm_device *pwm) export->pwm = pwm; mutex_init(&export->lock); - export->child.class = parent->class; export->child.release = pwm_export_release; export->child.parent = parent; export->child.devt = MKDEV(0, 0); @@ -277,6 +277,10 @@ static int pwm_export_child(struct device *parent, struct pwm_device *pwm) export = NULL; return ret; } + pwm_prop[0] = kasprintf(GFP_KERNEL, "EXPORT=pwm%u", pwm->hwpwm); + pwm_prop[1] = NULL; + kobject_uevent_env(&parent->kobj, KOBJ_CHANGE, pwm_prop); + kfree(pwm_prop[0]); return 0; } @@ -289,6 +293,7 @@ static int pwm_unexport_match(struct device *child, void *data) 
static int pwm_unexport_child(struct device *parent, struct pwm_device *pwm) { struct device *child; + char *pwm_prop[2]; if (!test_and_clear_bit(PWMF_EXPORTED, &pwm->flags)) return -ENODEV; @@ -297,6 +302,11 @@ static int pwm_unexport_child(struct device *parent, struct pwm_device *pwm) if (!child) return -ENODEV; + pwm_prop[0] = kasprintf(GFP_KERNEL, "UNEXPORT=pwm%u", pwm->hwpwm); + pwm_prop[1] = NULL; + kobject_uevent_env(&parent->kobj, KOBJ_CHANGE, pwm_prop); + kfree(pwm_prop[0]); + /* for device_find_child() */ put_device(child); device_unregister(child); diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 05293babb031..2d655a97b959 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -143,7 +143,9 @@ static int twa_poll_status_gone(TW_Device_Extension *tw_dev, u32 flag, int secon static int twa_post_command_packet(TW_Device_Extension *tw_dev, int request_id, char internal); static int twa_reset_device_extension(TW_Device_Extension *tw_dev); static int twa_reset_sequence(TW_Device_Extension *tw_dev, int soft_reset); -static int twa_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id, char *cdb, int use_sg, TW_SG_Entry *sglistarg); +static int twa_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id, + unsigned char *cdb, int use_sg, + TW_SG_Entry *sglistarg); static void twa_scsiop_execute_scsi_complete(TW_Device_Extension *tw_dev, int request_id); static char *twa_string_lookup(twa_message_type *table, unsigned int aen_code); @@ -278,7 +280,7 @@ out: static int twa_aen_drain_queue(TW_Device_Extension *tw_dev, int no_check_reset) { int request_id = 0; - char cdb[TW_MAX_CDB_LEN]; + unsigned char cdb[TW_MAX_CDB_LEN]; TW_SG_Entry sglist[1]; int finished = 0, count = 0; TW_Command_Full *full_command_packet; @@ -423,7 +425,7 @@ static void twa_aen_queue_event(TW_Device_Extension *tw_dev, TW_Command_Apache_H /* This function will read the aen queue from the isr */ static int twa_aen_read_queue(TW_Device_Extension 
*tw_dev, int request_id) { - char cdb[TW_MAX_CDB_LEN]; + unsigned char cdb[TW_MAX_CDB_LEN]; TW_SG_Entry sglist[1]; TW_Command_Full *full_command_packet; int retval = 1; @@ -1798,7 +1800,9 @@ out: static DEF_SCSI_QCMD(twa_scsi_queue) /* This function hands scsi cdb's to the firmware */ -static int twa_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id, char *cdb, int use_sg, TW_SG_Entry *sglistarg) +static int twa_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id, + unsigned char *cdb, int use_sg, + TW_SG_Entry *sglistarg) { TW_Command_Full *full_command_packet; TW_Command_Apache *command_packet; diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 266bdac75304..480cf82700e9 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -287,7 +287,9 @@ static int twl_post_command_packet(TW_Device_Extension *tw_dev, int request_id) } /* End twl_post_command_packet() */ /* This function hands scsi cdb's to the firmware */ -static int twl_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id, char *cdb, int use_sg, TW_SG_Entry_ISO *sglistarg) +static int twl_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id, + unsigned char *cdb, int use_sg, + TW_SG_Entry_ISO *sglistarg) { TW_Command_Full *full_command_packet; TW_Command_Apache *command_packet; @@ -372,7 +374,7 @@ out: /* This function will read the aen queue from the isr */ static int twl_aen_read_queue(TW_Device_Extension *tw_dev, int request_id) { - char cdb[TW_MAX_CDB_LEN]; + unsigned char cdb[TW_MAX_CDB_LEN]; TW_SG_Entry_ISO sglist[1]; TW_Command_Full *full_command_packet; int retval = 1; @@ -554,7 +556,7 @@ out: static int twl_aen_drain_queue(TW_Device_Extension *tw_dev, int no_check_reset) { int request_id = 0; - char cdb[TW_MAX_CDB_LEN]; + unsigned char cdb[TW_MAX_CDB_LEN]; TW_SG_Entry_ISO sglist[1]; int finished = 0, count = 0; TW_Command_Full *full_command_packet; diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 
70988c381268..f07444d30b21 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -538,7 +538,7 @@ config SCSI_HPTIOP config SCSI_BUSLOGIC tristate "BusLogic SCSI support" - depends on (PCI || ISA || MCA) && SCSI && ISA_DMA_API && VIRT_TO_BUS + depends on (PCI || ISA) && SCSI && ISA_DMA_API && VIRT_TO_BUS ---help--- This is support for BusLogic MultiMaster and FlashPoint SCSI Host Adapters. Consult the SCSI-HOWTO, available from @@ -1175,12 +1175,12 @@ config SCSI_LPFC_DEBUG_FS config SCSI_SIM710 tristate "Simple 53c710 SCSI support (Compaq, NCR machines)" - depends on (EISA || MCA) && SCSI + depends on EISA && SCSI select SCSI_SPI_ATTRS ---help--- This driver is for NCR53c710 based SCSI host adapters. - It currently supports Compaq EISA cards and NCR MCA cards + It currently supports Compaq EISA cards. config SCSI_DC395x tristate "Tekram DC395(U/UW/F) and DC315(U) SCSI support" diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index 4d7b0e0adbf7..301b3cad15f8 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -269,7 +269,7 @@ static LIST_HEAD(aha152x_host_list); /* DEFINES */ /* For PCMCIA cards, always use AUTOCONF */ -#if defined(PCMCIA) || defined(MODULE) +#if defined(AHA152X_PCMCIA) || defined(MODULE) #if !defined(AUTOCONF) #define AUTOCONF #endif @@ -297,7 +297,7 @@ CMD_INC_RESID(struct scsi_cmnd *cmd, int inc) #define DELAY_DEFAULT 1000 -#if defined(PCMCIA) +#if defined(AHA152X_PCMCIA) #define IRQ_MIN 0 #define IRQ_MAX 16 #else @@ -328,7 +328,7 @@ MODULE_AUTHOR("Jürgen Fischer"); MODULE_DESCRIPTION(AHA152X_REVID); MODULE_LICENSE("GPL"); -#if !defined(PCMCIA) +#if !defined(AHA152X_PCMCIA) #if defined(MODULE) static int io[] = {0, 0}; module_param_hw_array(io, int, ioport, NULL, 0); @@ -391,7 +391,7 @@ static struct isapnp_device_id id_table[] = { MODULE_DEVICE_TABLE(isapnp, id_table); #endif /* ISAPNP */ -#endif /* !PCMCIA */ +#endif /* !AHA152X_PCMCIA */ static struct scsi_host_template aha152x_driver_template; @@ -863,7 
+863,7 @@ void aha152x_release(struct Scsi_Host *shpnt) if (shpnt->irq) free_irq(shpnt->irq, shpnt); -#if !defined(PCMCIA) +#if !defined(AHA152X_PCMCIA) if (shpnt->io_port) release_region(shpnt->io_port, IO_RANGE); #endif @@ -2924,7 +2924,7 @@ static struct scsi_host_template aha152x_driver_template = { .slave_alloc = aha152x_adjust_queue, }; -#if !defined(PCMCIA) +#if !defined(AHA152X_PCMCIA) static int setup_count; static struct aha152x_setup setup[2]; @@ -3392,4 +3392,4 @@ static int __init aha152x_setup(char *str) __setup("aha152x=", aha152x_setup); #endif -#endif /* !PCMCIA */ +#endif /* !AHA152X_PCMCIA */ diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index 3df1428df317..311d23c727ce 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -790,12 +790,11 @@ static int mvs_task_prep(struct sas_task *task, struct mvs_info *mvi, int is_tmf slot->n_elem = n_elem; slot->slot_tag = tag; - slot->buf = dma_pool_alloc(mvi->dma_pool, GFP_ATOMIC, &slot->buf_dma); + slot->buf = dma_pool_zalloc(mvi->dma_pool, GFP_ATOMIC, &slot->buf_dma); if (!slot->buf) { rc = -ENOMEM; goto err_out_tag; } - memset(slot->buf, 0, MVS_SLOT_BUF_SZ); tei.task = task; tei.hdr = &mvi->slot[tag]; @@ -1906,8 +1905,7 @@ static void mvs_work_queue(struct work_struct *work) if (phy->phy_event & PHY_PLUG_OUT) { u32 tmp; - struct sas_identify_frame *id; - id = (struct sas_identify_frame *)phy->frame_rcvd; + tmp = MVS_CHIP_DISP->read_phy_ctl(mvi, phy_no); phy->phy_event &= ~PHY_PLUG_OUT; if (!(tmp & PHY_READY_MASK)) { diff --git a/drivers/scsi/pcmcia/aha152x_core.c b/drivers/scsi/pcmcia/aha152x_core.c index dba3716511c5..24b89228b241 100644 --- a/drivers/scsi/pcmcia/aha152x_core.c +++ b/drivers/scsi/pcmcia/aha152x_core.c @@ -1,3 +1,3 @@ -#define PCMCIA 1 +#define AHA152X_PCMCIA 1 #define AHA152X_STAT 1 #include "aha152x.c" diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index b28f159fdaee..0bb9ac6ece92 100644 --- 
a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -218,7 +218,7 @@ qla2x00_sysfs_write_nvram(struct file *filp, struct kobject *kobj, mutex_lock(&ha->optrom_mutex); if (qla2x00_chip_is_down(vha)) { - mutex_unlock(&vha->hw->optrom_mutex); + mutex_unlock(&ha->optrom_mutex); return -EAGAIN; } diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index c72d8012fe2a..6fe20c27acc1 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -425,7 +425,7 @@ void qla24xx_handle_adisc_event(scsi_qla_host_t *vha, struct event_arg *ea) __qla24xx_handle_gpdb_event(vha, ea); } -int qla_post_els_plogi_work(struct scsi_qla_host *vha, fc_port_t *fcport) +static int qla_post_els_plogi_work(struct scsi_qla_host *vha, fc_port_t *fcport) { struct qla_work_evt *e; @@ -680,7 +680,7 @@ static void qla24xx_handle_gnl_done_event(scsi_qla_host_t *vha, fcport); break; } - /* drop through */ + /* fall through */ default: if (fcport_is_smaller(fcport)) { /* local adapter is bigger */ @@ -1551,7 +1551,8 @@ void qla24xx_handle_relogin_event(scsi_qla_host_t *vha, } -void qla_handle_els_plogi_done(scsi_qla_host_t *vha, struct event_arg *ea) +static void qla_handle_els_plogi_done(scsi_qla_host_t *vha, + struct event_arg *ea) { ql_dbg(ql_dbg_disc, vha, 0x2118, "%s %d %8phC post PRLI\n", diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index 86fb8b21aa71..032635321ad6 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -1195,8 +1195,8 @@ qla24xx_walk_and_build_prot_sglist(struct qla_hw_data *ha, srb_t *sp, * @sp: SRB command to process * @cmd_pkt: Command type 3 IOCB * @tot_dsds: Total number of segments to transfer - * @tot_prot_dsds: - * @fw_prot_opts: + * @tot_prot_dsds: Total number of segments with protection information + * @fw_prot_opts: Protection options to be passed to firmware */ inline int qla24xx_build_scsi_crc_2_iocbs(srb_t *sp, struct 
cmd_type_crc_2 *cmd_pkt, diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index d73b04e40590..30d3090842f8 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -25,7 +25,7 @@ static int qla2x00_error_entry(scsi_qla_host_t *, struct rsp_que *, /** * qla2100_intr_handler() - Process interrupts for the ISP2100 and ISP2200. - * @irq: + * @irq: interrupt number * @dev_id: SCSI driver HA context * * Called by system whenever the host adapter generates an interrupt. @@ -144,7 +144,7 @@ qla2x00_check_reg16_for_disconnect(scsi_qla_host_t *vha, uint16_t reg) /** * qla2300_intr_handler() - Process interrupts for the ISP23xx and ISP63xx. - * @irq: + * @irq: interrupt number * @dev_id: SCSI driver HA context * * Called by system whenever the host adapter generates an interrupt. @@ -3109,7 +3109,7 @@ done: /** * qla24xx_intr_handler() - Process interrupts for the ISP23xx and ISP24xx. - * @irq: + * @irq: interrupt number * @dev_id: SCSI driver HA context * * Called by system whenever the host adapter generates an interrupt. 
diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 2f3e5075ae76..191b6b7c8747 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -3478,9 +3478,9 @@ qla8044_read_serdes_word(scsi_qla_host_t *vha, uint32_t addr, uint32_t *data) /** * qla2x00_set_serdes_params() - * @vha: HA context - * @sw_em_1g: - * @sw_em_2g: - * @sw_em_4g: + * @sw_em_1g: serial link options + * @sw_em_2g: serial link options + * @sw_em_4g: serial link options * * Returns */ diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c index 521a51370554..60f964c53c01 100644 --- a/drivers/scsi/qla2xxx/qla_mr.c +++ b/drivers/scsi/qla2xxx/qla_mr.c @@ -2212,7 +2212,7 @@ qlafx00_ioctl_iosb_entry(scsi_qla_host_t *vha, struct req_que *req, struct bsg_job *bsg_job; struct fc_bsg_reply *bsg_reply; struct srb_iocb *iocb_job; - int res; + int res = 0; struct qla_mt_iocb_rsp_fx00 fstatus; uint8_t *fw_sts_ptr; @@ -2624,7 +2624,7 @@ qlafx00_status_cont_entry(struct rsp_que *rsp, sts_cont_entry_t *pkt) * qlafx00_multistatus_entry() - Process Multi response queue entries. 
* @vha: SCSI driver HA context * @rsp: response queue - * @pkt: + * @pkt: received packet */ static void qlafx00_multistatus_entry(struct scsi_qla_host *vha, @@ -2681,12 +2681,10 @@ qlafx00_multistatus_entry(struct scsi_qla_host *vha, * @vha: SCSI driver HA context * @rsp: response queue * @pkt: Entry pointer - * @estatus: - * @etype: */ static void qlafx00_error_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, - struct sts_entry_fx00 *pkt, uint8_t estatus, uint8_t etype) + struct sts_entry_fx00 *pkt) { srb_t *sp; struct qla_hw_data *ha = vha->hw; @@ -2695,9 +2693,6 @@ qlafx00_error_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, struct req_que *req = NULL; int res = DID_ERROR << 16; - ql_dbg(ql_dbg_async, vha, 0x507f, - "type of error status in response: 0x%x\n", estatus); - req = ha->req_q_map[que]; sp = qla2x00_get_sp_from_handle(vha, func, req, pkt); @@ -2745,9 +2740,11 @@ qlafx00_process_response_queue(struct scsi_qla_host *vha, if (pkt->entry_status != 0 && pkt->entry_type != IOCTL_IOSB_TYPE_FX00) { + ql_dbg(ql_dbg_async, vha, 0x507f, + "type of error status in response: 0x%x\n", + pkt->entry_status); qlafx00_error_entry(vha, rsp, - (struct sts_entry_fx00 *)pkt, pkt->entry_status, - pkt->entry_type); + (struct sts_entry_fx00 *)pkt); continue; } @@ -2867,7 +2864,7 @@ qlafx00_async_event(scsi_qla_host_t *vha) /** * qlafx00x_mbx_completion() - Process mailbox command completions. * @vha: SCSI driver HA context - * @mb0: + * @mb0: value to be written into mailbox register 0 */ static void qlafx00_mbx_completion(scsi_qla_host_t *vha, uint32_t mb0) @@ -2893,7 +2890,7 @@ qlafx00_mbx_completion(scsi_qla_host_t *vha, uint32_t mb0) /** * qlafx00_intr_handler() - Process interrupts for the ISPFX00. - * @irq: + * @irq: interrupt number * @dev_id: SCSI driver HA context * * Called by system whenever the host adapter generates an interrupt. 
diff --git a/drivers/scsi/qla2xxx/qla_nx.c b/drivers/scsi/qla2xxx/qla_nx.c index 121e18b3b9f8..f2f54806f4da 100644 --- a/drivers/scsi/qla2xxx/qla_nx.c +++ b/drivers/scsi/qla2xxx/qla_nx.c @@ -2010,7 +2010,7 @@ qla82xx_mbx_completion(scsi_qla_host_t *vha, uint16_t mb0) /** * qla82xx_intr_handler() - Process interrupts for the ISP23xx and ISP63xx. - * @irq: + * @irq: interrupt number * @dev_id: SCSI driver HA context * * Called by system whenever the host adapter generates an interrupt. diff --git a/drivers/scsi/qla2xxx/qla_nx2.c b/drivers/scsi/qla2xxx/qla_nx2.c index 3a2b0282df14..fe856b602e03 100644 --- a/drivers/scsi/qla2xxx/qla_nx2.c +++ b/drivers/scsi/qla2xxx/qla_nx2.c @@ -3878,7 +3878,7 @@ out: #define PF_BITS_MASK (0xF << 16) /** * qla8044_intr_handler() - Process interrupts for the ISP8044 - * @irq: + * @irq: interrupt number * @dev_id: SCSI driver HA context * * Called by system whenever the host adapter generates an interrupt. diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 8794e54f43a9..518f15141170 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1749,7 +1749,7 @@ qla2x00_loop_reset(scsi_qla_host_t *vha) static void __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res) { - int cnt, status; + int cnt; unsigned long flags; srb_t *sp; scsi_qla_host_t *vha = qp->vha; @@ -1799,8 +1799,8 @@ __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res) if (!sp_get(sp)) { spin_unlock_irqrestore (qp->qp_lock_ptr, flags); - status = qla2xxx_eh_abort( - GET_CMD_SP(sp)); + qla2xxx_eh_abort( + GET_CMD_SP(sp)); spin_lock_irqsave (qp->qp_lock_ptr, flags); } diff --git a/drivers/scsi/qla2xxx/qla_sup.c b/drivers/scsi/qla2xxx/qla_sup.c index 4499c787165f..2a3055c799fb 100644 --- a/drivers/scsi/qla2xxx/qla_sup.c +++ b/drivers/scsi/qla2xxx/qla_sup.c @@ -2229,7 +2229,7 @@ qla2x00_erase_flash_sector(struct qla_hw_data *ha, uint32_t addr, /** * qla2x00_get_flash_manufacturer() - Read manufacturer ID from flash chip. 
- * @ha: + * @ha: host adapter * @man_id: Flash manufacturer ID * @flash_id: Flash ID */ diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 39828207bc1d..c4504740f0e2 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -4540,7 +4540,7 @@ static int qlt_issue_task_mgmt(struct fc_port *sess, u64 lun, case QLA_TGT_CLEAR_TS: case QLA_TGT_ABORT_TS: abort_cmds_for_lun(vha, lun, a->u.isp24.fcp_hdr.s_id); - /* drop through */ + /* fall through */ case QLA_TGT_CLEAR_ACA: h = qlt_find_qphint(vha, mcmd->unpacked_lun); mcmd->qpair = h->qpair; @@ -6598,9 +6598,9 @@ static void qlt_lport_dump(struct scsi_qla_host *vha, u64 wwpn, * qla_tgt_lport_register - register lport with external module * * @target_lport_ptr: pointer for tcm_qla2xxx specific lport data - * @phys_wwpn: - * @npiv_wwpn: - * @npiv_wwnn: + * @phys_wwpn: physical port WWPN + * @npiv_wwpn: NPIV WWPN + * @npiv_wwnn: NPIV WWNN * @callback: lport initialization callback for tcm_qla2xxx code */ int qlt_lport_register(void *target_lport_ptr, u64 phys_wwpn, diff --git a/drivers/soc/ti/knav_qmss.h b/drivers/soc/ti/knav_qmss.h index 7c128132799e..4c28fa938ac7 100644 --- a/drivers/soc/ti/knav_qmss.h +++ b/drivers/soc/ti/knav_qmss.h @@ -329,8 +329,8 @@ struct knav_range_ops { }; struct knav_irq_info { - int irq; - u32 cpu_map; + int irq; + struct cpumask *cpu_mask; }; struct knav_range_info { diff --git a/drivers/soc/ti/knav_qmss_acc.c b/drivers/soc/ti/knav_qmss_acc.c index 316e82e46f6c..2f7fb2dcc1d6 100644 --- a/drivers/soc/ti/knav_qmss_acc.c +++ b/drivers/soc/ti/knav_qmss_acc.c @@ -205,18 +205,18 @@ static int knav_range_setup_acc_irq(struct knav_range_info *range, { struct knav_device *kdev = range->kdev; struct knav_acc_channel *acc; - unsigned long cpu_map; + struct cpumask *cpu_mask; int ret = 0, irq; u32 old, new; if (range->flags & RANGE_MULTI_QUEUE) { acc = range->acc; irq = range->irqs[0].irq; - cpu_map = range->irqs[0].cpu_map; + cpu_mask 
= range->irqs[0].cpu_mask; } else { acc = range->acc + queue; irq = range->irqs[queue].irq; - cpu_map = range->irqs[queue].cpu_map; + cpu_mask = range->irqs[queue].cpu_mask; } old = acc->open_mask; @@ -239,8 +239,8 @@ static int knav_range_setup_acc_irq(struct knav_range_info *range, acc->name, acc->name); ret = request_irq(irq, knav_acc_int_handler, 0, acc->name, range); - if (!ret && cpu_map) { - ret = irq_set_affinity_hint(irq, to_cpumask(&cpu_map)); + if (!ret && cpu_mask) { + ret = irq_set_affinity_hint(irq, cpu_mask); if (ret) { dev_warn(range->kdev->dev, "Failed to set IRQ affinity\n"); diff --git a/drivers/soc/ti/knav_qmss_queue.c b/drivers/soc/ti/knav_qmss_queue.c index b5d5673c255c..8b418379272d 100644 --- a/drivers/soc/ti/knav_qmss_queue.c +++ b/drivers/soc/ti/knav_qmss_queue.c @@ -118,19 +118,17 @@ static int knav_queue_setup_irq(struct knav_range_info *range, struct knav_queue_inst *inst) { unsigned queue = inst->id - range->queue_base; - unsigned long cpu_map; int ret = 0, irq; if (range->flags & RANGE_HAS_IRQ) { irq = range->irqs[queue].irq; - cpu_map = range->irqs[queue].cpu_map; ret = request_irq(irq, knav_queue_int_handler, 0, inst->irq_name, inst); if (ret) return ret; disable_irq(irq); - if (cpu_map) { - ret = irq_set_affinity_hint(irq, to_cpumask(&cpu_map)); + if (range->irqs[queue].cpu_mask) { + ret = irq_set_affinity_hint(irq, range->irqs[queue].cpu_mask); if (ret) { dev_warn(range->kdev->dev, "Failed to set IRQ affinity\n"); @@ -1262,9 +1260,19 @@ static int knav_setup_queue_range(struct knav_device *kdev, range->num_irqs++; - if (IS_ENABLED(CONFIG_SMP) && oirq.args_count == 3) - range->irqs[i].cpu_map = - (oirq.args[2] & 0x0000ff00) >> 8; + if (IS_ENABLED(CONFIG_SMP) && oirq.args_count == 3) { + unsigned long mask; + int bit; + + range->irqs[i].cpu_mask = devm_kzalloc(dev, + cpumask_size(), GFP_KERNEL); + if (!range->irqs[i].cpu_mask) + return -ENOMEM; + + mask = (oirq.args[2] & 0x0000ff00) >> 8; + for_each_set_bit(bit, &mask, 
BITS_PER_LONG) + cpumask_set_cpu(bit, range->irqs[i].cpu_mask); + } } range->num_irqs = min(range->num_irqs, range->num_queues); diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 1227872227dc..36b742932c72 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -1245,8 +1245,7 @@ static int iscsit_do_rx_data( return -1; memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, - count->iov, count->iov_count, data); + iov_iter_kvec(&msg.msg_iter, READ, count->iov, count->iov_count, data); while (msg_data_left(&msg)) { rx_loop = sock_recvmsg(conn->sock, &msg, MSG_WAITALL); @@ -1302,8 +1301,7 @@ int tx_data( memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, - iov, iov_count, data); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, iov_count, data); while (msg_data_left(&msg)) { int tx_loop = sock_sendmsg(conn->sock, &msg); diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index e46ca968009c..4f134b0c3e29 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -268,7 +268,7 @@ target_emulate_report_target_port_groups(struct se_cmd *cmd) } transport_kunmap_data_sg(cmd); - target_complete_cmd(cmd, GOOD); + target_complete_cmd_with_length(cmd, GOOD, rd_len + 4); return 0; } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 16751ae55d7b..49b110d1b972 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -303,7 +303,7 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, len += sg->length; } - iov_iter_bvec(&iter, ITER_BVEC | is_write, bvec, sgl_nents, len); + iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); aio_cmd->cmd = cmd; aio_cmd->len = len; @@ -353,7 +353,7 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, len += sg->length; 
} - iov_iter_bvec(&iter, ITER_BVEC, bvec, sgl_nents, len); + iov_iter_bvec(&iter, READ, bvec, sgl_nents, len); if (is_write) ret = vfs_iter_write(fd, &iter, &pos, 0); else @@ -490,7 +490,7 @@ fd_execute_write_same(struct se_cmd *cmd) len += se_dev->dev_attrib.block_size; } - iov_iter_bvec(&iter, ITER_BVEC, bvec, nolb, len); + iov_iter_bvec(&iter, READ, bvec, nolb, len); ret = vfs_iter_write(fd_dev->fd_file, &iter, &pos, 0); kfree(bvec); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 4cf33e2cc705..e31e4fc31aa1 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -205,19 +205,19 @@ void transport_subsystem_check_init(void) if (sub_api_initialized) return; - ret = request_module("target_core_iblock"); + ret = IS_ENABLED(CONFIG_TCM_IBLOCK) && request_module("target_core_iblock"); if (ret != 0) pr_err("Unable to load target_core_iblock\n"); - ret = request_module("target_core_file"); + ret = IS_ENABLED(CONFIG_TCM_FILEIO) && request_module("target_core_file"); if (ret != 0) pr_err("Unable to load target_core_file\n"); - ret = request_module("target_core_pscsi"); + ret = IS_ENABLED(CONFIG_TCM_PSCSI) && request_module("target_core_pscsi"); if (ret != 0) pr_err("Unable to load target_core_pscsi\n"); - ret = request_module("target_core_user"); + ret = IS_ENABLED(CONFIG_TCM_USER2) && request_module("target_core_user"); if (ret != 0) pr_err("Unable to load target_core_user\n"); diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 9756752c0681..45da3e01c7b0 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -309,7 +309,7 @@ int usbip_recv(struct socket *sock, void *buf, int size) if (!sock || !buf || !size) return -EINVAL; - iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size); usbip_dbg_xmit("enter\n"); diff --git a/drivers/vhost/scsi.c 
b/drivers/vhost/scsi.c index c24bb690680b..50dffe83714c 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -203,6 +203,19 @@ struct vhost_scsi { int vs_events_nr; /* num of pending events, protected by vq->mutex */ }; +/* + * Context for processing request and control queue operations. + */ +struct vhost_scsi_ctx { + int head; + unsigned int out, in; + size_t req_size, rsp_size; + size_t out_size, in_size; + u8 *target, *lunp; + void *req; + struct iov_iter out_iter; +}; + static struct workqueue_struct *vhost_scsi_workqueue; /* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */ @@ -800,24 +813,120 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs, pr_err("Faulted on virtio_scsi_cmd_resp\n"); } +static int +vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + int ret = -ENXIO; + + vc->head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &vc->out, &vc->in, + NULL, NULL); + + pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", + vc->head, vc->out, vc->in); + + /* On error, stop handling until the next kick. */ + if (unlikely(vc->head < 0)) + goto done; + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (vc->head == vq->num) { + if (unlikely(vhost_enable_notify(&vs->dev, vq))) { + vhost_disable_notify(&vs->dev, vq); + ret = -EAGAIN; + } + goto done; + } + + /* + * Get the size of request and response buffers. + * FIXME: Not correct for BIDI operation + */ + vc->out_size = iov_length(vq->iov, vc->out); + vc->in_size = iov_length(&vq->iov[vc->out], vc->in); + + /* + * Copy over the virtio-scsi request header, which for a + * ANY_LAYOUT enabled guest may span multiple iovecs, or a + * single iovec may contain both the header + outgoing + * WRITE payloads. + * + * copy_from_iter() will advance out_iter, so that it will + * point at the start of the outgoing WRITE payload, if + * DMA_TO_DEVICE is set. 
+ */ + iov_iter_init(&vc->out_iter, WRITE, vq->iov, vc->out, vc->out_size); + ret = 0; + +done: + return ret; +} + +static int +vhost_scsi_chk_size(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc) +{ + if (unlikely(vc->in_size < vc->rsp_size)) { + vq_err(vq, + "Response buf too small, need min %zu bytes got %zu", + vc->rsp_size, vc->in_size); + return -EINVAL; + } else if (unlikely(vc->out_size < vc->req_size)) { + vq_err(vq, + "Request buf too small, need min %zu bytes got %zu", + vc->req_size, vc->out_size); + return -EIO; + } + + return 0; +} + +static int +vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc, + struct vhost_scsi_tpg **tpgp) +{ + int ret = -EIO; + + if (unlikely(!copy_from_iter_full(vc->req, vc->req_size, + &vc->out_iter))) { + vq_err(vq, "Faulted on copy_from_iter\n"); + } else if (unlikely(*vc->lunp != 1)) { + /* virtio-scsi spec requires byte 0 of the lun to be 1 */ + vq_err(vq, "Illegal virtio-scsi lun: %u\n", *vc->lunp); + } else { + struct vhost_scsi_tpg **vs_tpg, *tpg; + + vs_tpg = vq->private_data; /* validated at handler entry */ + + tpg = READ_ONCE(vs_tpg[*vc->target]); + if (unlikely(!tpg)) { + vq_err(vq, "Target 0x%x does not exist\n", *vc->target); + } else { + if (tpgp) + *tpgp = tpg; + ret = 0; + } + } + + return ret; +} + static void vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) { struct vhost_scsi_tpg **vs_tpg, *tpg; struct virtio_scsi_cmd_req v_req; struct virtio_scsi_cmd_req_pi v_req_pi; + struct vhost_scsi_ctx vc; struct vhost_scsi_cmd *cmd; - struct iov_iter out_iter, in_iter, prot_iter, data_iter; + struct iov_iter in_iter, prot_iter, data_iter; u64 tag; u32 exp_data_len, data_direction; - unsigned int out = 0, in = 0; - int head, ret, prot_bytes; - size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); - size_t out_size, in_size; + int ret, prot_bytes; u16 lun; - u8 *target, *lunp, task_attr; + u8 task_attr; bool t10_pi = vhost_has_feature(vq, 
VIRTIO_SCSI_F_T10_PI); - void *req, *cdb; + void *cdb; mutex_lock(&vq->mutex); /* @@ -828,85 +937,47 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (!vs_tpg) goto out; + memset(&vc, 0, sizeof(vc)); + vc.rsp_size = sizeof(struct virtio_scsi_cmd_resp); + vhost_disable_notify(&vs->dev, vq); for (;;) { - head = vhost_get_vq_desc(vq, vq->iov, - ARRAY_SIZE(vq->iov), &out, &in, - NULL, NULL); - pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", - head, out, in); - /* On error, stop handling until the next kick. */ - if (unlikely(head < 0)) - break; - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vs->dev, vq))) { - vhost_disable_notify(&vs->dev, vq); - continue; - } - break; - } - /* - * Check for a sane response buffer so we can report early - * errors back to the guest. - */ - if (unlikely(vq->iov[out].iov_len < rsp_size)) { - vq_err(vq, "Expecting at least virtio_scsi_cmd_resp" - " size, got %zu bytes\n", vq->iov[out].iov_len); - break; - } + ret = vhost_scsi_get_desc(vs, vq, &vc); + if (ret) + goto err; + /* * Setup pointers and values based upon different virtio-scsi * request header if T10_PI is enabled in KVM guest. 
*/ if (t10_pi) { - req = &v_req_pi; - req_size = sizeof(v_req_pi); - lunp = &v_req_pi.lun[0]; - target = &v_req_pi.lun[1]; + vc.req = &v_req_pi; + vc.req_size = sizeof(v_req_pi); + vc.lunp = &v_req_pi.lun[0]; + vc.target = &v_req_pi.lun[1]; } else { - req = &v_req; - req_size = sizeof(v_req); - lunp = &v_req.lun[0]; - target = &v_req.lun[1]; + vc.req = &v_req; + vc.req_size = sizeof(v_req); + vc.lunp = &v_req.lun[0]; + vc.target = &v_req.lun[1]; } - /* - * FIXME: Not correct for BIDI operation - */ - out_size = iov_length(vq->iov, out); - in_size = iov_length(&vq->iov[out], in); /* - * Copy over the virtio-scsi request header, which for a - * ANY_LAYOUT enabled guest may span multiple iovecs, or a - * single iovec may contain both the header + outgoing - * WRITE payloads. - * - * copy_from_iter() will advance out_iter, so that it will - * point at the start of the outgoing WRITE payload, if - * DMA_TO_DEVICE is set. + * Validate the size of request and response buffers. + * Check for a sane response buffer so we can report + * early errors back to the guest. */ - iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size); + ret = vhost_scsi_chk_size(vq, &vc); + if (ret) + goto err; - if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) { - vq_err(vq, "Faulted on copy_from_iter\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } - /* virtio-scsi spec requires byte 0 of the lun to be 1 */ - if (unlikely(*lunp != 1)) { - vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } + ret = vhost_scsi_get_req(vq, &vc, &tpg); + if (ret) + goto err; + + ret = -EIO; /* bad target on any error from here on */ - tpg = READ_ONCE(vs_tpg[*target]); - if (unlikely(!tpg)) { - /* Target does not exist, fail the request */ - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } /* * Determine data_direction by calculating the total outgoing * iovec sizes + incoming iovec sizes vs. 
virtio-scsi request + @@ -924,17 +995,17 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ prot_bytes = 0; - if (out_size > req_size) { + if (vc.out_size > vc.req_size) { data_direction = DMA_TO_DEVICE; - exp_data_len = out_size - req_size; - data_iter = out_iter; - } else if (in_size > rsp_size) { + exp_data_len = vc.out_size - vc.req_size; + data_iter = vc.out_iter; + } else if (vc.in_size > vc.rsp_size) { data_direction = DMA_FROM_DEVICE; - exp_data_len = in_size - rsp_size; + exp_data_len = vc.in_size - vc.rsp_size; - iov_iter_init(&in_iter, READ, &vq->iov[out], in, - rsp_size + exp_data_len); - iov_iter_advance(&in_iter, rsp_size); + iov_iter_init(&in_iter, READ, &vq->iov[vc.out], vc.in, + vc.rsp_size + exp_data_len); + iov_iter_advance(&in_iter, vc.rsp_size); data_iter = in_iter; } else { data_direction = DMA_NONE; @@ -950,21 +1021,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (data_direction != DMA_TO_DEVICE) { vq_err(vq, "Received non zero pi_bytesout," " but wrong data_direction\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout); } else if (v_req_pi.pi_bytesin) { if (data_direction != DMA_FROM_DEVICE) { vq_err(vq, "Received non zero pi_bytesin," " but wrong data_direction\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin); } /* - * Set prot_iter to data_iter, and advance past any + * Set prot_iter to data_iter and truncate it to + * prot_bytes, and advance data_iter past any * preceeding prot_bytes that may be present. 
* * Also fix up the exp_data_len to reflect only the @@ -973,6 +1043,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (prot_bytes) { exp_data_len -= prot_bytes; prot_iter = data_iter; + iov_iter_truncate(&prot_iter, prot_bytes); iov_iter_advance(&data_iter, prot_bytes); } tag = vhost64_to_cpu(vq, v_req_pi.tag); @@ -996,8 +1067,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) vq_err(vq, "Received SCSI CDB with command_size: %d that" " exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n", scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr, exp_data_len + prot_bytes, @@ -1005,13 +1075,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (IS_ERR(cmd)) { vq_err(vq, "vhost_scsi_get_tag failed %ld\n", PTR_ERR(cmd)); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - cmd->tvc_resp_iov = vq->iov[out]; - cmd->tvc_in_iovs = in; + cmd->tvc_resp_iov = vq->iov[vc.out]; + cmd->tvc_in_iovs = vc.in; pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", cmd->tvc_cdb[0], cmd->tvc_lun); @@ -1019,14 +1088,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) " %d\n", cmd, exp_data_len, prot_bytes, data_direction); if (data_direction != DMA_NONE) { - ret = vhost_scsi_mapal(cmd, - prot_bytes, &prot_iter, - exp_data_len, &data_iter); - if (unlikely(ret)) { + if (unlikely(vhost_scsi_mapal(cmd, prot_bytes, + &prot_iter, exp_data_len, + &data_iter))) { vq_err(vq, "Failed to map iov to sgl\n"); vhost_scsi_release_cmd(&cmd->tvc_se_cmd); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } } /* @@ -1034,7 +1101,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) * complete the virtio-scsi request in TCM callback context via * vhost_scsi_queue_data_in() 
and vhost_scsi_queue_status() */ - cmd->tvc_vq_desc = head; + cmd->tvc_vq_desc = vc.head; /* * Dispatch cmd descriptor for cmwq execution in process * context provided by vhost_scsi_workqueue. This also ensures @@ -1043,6 +1110,166 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ INIT_WORK(&cmd->work, vhost_scsi_submission_work); queue_work(vhost_scsi_workqueue, &cmd->work); + ret = 0; +err: + /* + * ENXIO: No more requests, or read error, wait for next kick + * EINVAL: Invalid response buffer, drop the request + * EIO: Respond with bad target + * EAGAIN: Pending request + */ + if (ret == -ENXIO) + break; + else if (ret == -EIO) + vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + } +out: + mutex_unlock(&vq->mutex); +} + +static void +vhost_scsi_send_tmf_reject(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + struct virtio_scsi_ctrl_tmf_resp __user *resp; + struct virtio_scsi_ctrl_tmf_resp rsp; + int ret; + + pr_debug("%s\n", __func__); + memset(&rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED; + resp = vq->iov[vc->out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n"); +} + +static void +vhost_scsi_send_an_resp(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + struct virtio_scsi_ctrl_an_resp __user *resp; + struct virtio_scsi_ctrl_an_resp rsp; + int ret; + + pr_debug("%s\n", __func__); + memset(&rsp, 0, sizeof(rsp)); /* event_actual = 0 */ + rsp.response = VIRTIO_SCSI_S_OK; + resp = vq->iov[vc->out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio_scsi_ctrl_an_resp\n"); +} + +static void +vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) +{ + union { + __virtio32 
type; + struct virtio_scsi_ctrl_an_req an; + struct virtio_scsi_ctrl_tmf_req tmf; + } v_req; + struct vhost_scsi_ctx vc; + size_t typ_size; + int ret; + + mutex_lock(&vq->mutex); + /* + * We can handle the vq only after the endpoint is setup by calling the + * VHOST_SCSI_SET_ENDPOINT ioctl. + */ + if (!vq->private_data) + goto out; + + memset(&vc, 0, sizeof(vc)); + + vhost_disable_notify(&vs->dev, vq); + + for (;;) { + ret = vhost_scsi_get_desc(vs, vq, &vc); + if (ret) + goto err; + + /* + * Get the request type first in order to setup + * other parameters dependent on the type. + */ + vc.req = &v_req.type; + typ_size = sizeof(v_req.type); + + if (unlikely(!copy_from_iter_full(vc.req, typ_size, + &vc.out_iter))) { + vq_err(vq, "Faulted on copy_from_iter tmf type\n"); + /* + * The size of the response buffer depends on the + * request type and must be validated against it. + * Since the request type is not known, don't send + * a response. + */ + continue; + } + + switch (v_req.type) { + case VIRTIO_SCSI_T_TMF: + vc.req = &v_req.tmf; + vc.req_size = sizeof(struct virtio_scsi_ctrl_tmf_req); + vc.rsp_size = sizeof(struct virtio_scsi_ctrl_tmf_resp); + vc.lunp = &v_req.tmf.lun[0]; + vc.target = &v_req.tmf.lun[1]; + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: + vc.req = &v_req.an; + vc.req_size = sizeof(struct virtio_scsi_ctrl_an_req); + vc.rsp_size = sizeof(struct virtio_scsi_ctrl_an_resp); + vc.lunp = &v_req.an.lun[0]; + vc.target = NULL; + break; + default: + vq_err(vq, "Unknown control request %d", v_req.type); + continue; + } + + /* + * Validate the size of request and response buffers. + * Check for a sane response buffer so we can report + * early errors back to the guest. + */ + ret = vhost_scsi_chk_size(vq, &vc); + if (ret) + goto err; + + /* + * Get the rest of the request now that its size is known. 
+ */ + vc.req += typ_size; + vc.req_size -= typ_size; + + ret = vhost_scsi_get_req(vq, &vc, NULL); + if (ret) + goto err; + + if (v_req.type == VIRTIO_SCSI_T_TMF) + vhost_scsi_send_tmf_reject(vs, vq, &vc); + else + vhost_scsi_send_an_resp(vs, vq, &vc); +err: + /* + * ENXIO: No more requests, or read error, wait for next kick + * EINVAL: Invalid response buffer, drop the request + * EIO: Respond with bad target + * EAGAIN: Pending request + */ + if (ret == -ENXIO) + break; + else if (ret == -EIO) + vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); } out: mutex_unlock(&vq->mutex); @@ -1050,7 +1277,12 @@ out: static void vhost_scsi_ctl_handle_kick(struct vhost_work *work) { + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev); + pr_debug("%s: The handling func for control queue.\n", __func__); + vhost_scsi_ctl_handle_vq(vs, vq); } static void diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index d1c1f6283729..728ecd1eea30 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -41,13 +41,34 @@ #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 +#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NOMEMALLOC) +/* The order of free page blocks to report to host */ +#define VIRTIO_BALLOON_FREE_PAGE_ORDER (MAX_ORDER - 1) +/* The size of a free page block in bytes */ +#define VIRTIO_BALLOON_FREE_PAGE_SIZE \ + (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT)) + #ifdef CONFIG_BALLOON_COMPACTION static struct vfsmount *balloon_mnt; #endif +enum virtio_balloon_vq { + VIRTIO_BALLOON_VQ_INFLATE, + VIRTIO_BALLOON_VQ_DEFLATE, + VIRTIO_BALLOON_VQ_STATS, + VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_MAX +}; + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue 
*inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; + + /* Balloon's own wq for cpu-intensive work items */ + struct workqueue_struct *balloon_wq; + /* The free page reporting work item submitted to the balloon wq */ + struct work_struct report_free_page_work; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; @@ -57,6 +78,18 @@ struct virtio_balloon { spinlock_t stop_update_lock; bool stop_update; + /* The list of allocated free pages, waiting to be given back to mm */ + struct list_head free_page_list; + spinlock_t free_page_list_lock; + /* The number of free page blocks on the above list */ + unsigned long num_free_page_blocks; + /* The cmd id received from host */ + u32 cmd_id_received; + /* The cmd id that is actively in use */ + __virtio32 cmd_id_active; + /* Buffer to store the stop sign */ + __virtio32 cmd_id_stop; + /* Waiting for host to ack the pages we released. */ wait_queue_head_t acked; @@ -320,17 +353,6 @@ static void stats_handle_request(struct virtio_balloon *vb) virtqueue_kick(vq); } -static void virtballoon_changed(struct virtio_device *vdev) -{ - struct virtio_balloon *vb = vdev->priv; - unsigned long flags; - - spin_lock_irqsave(&vb->stop_update_lock, flags); - if (!vb->stop_update) - queue_work(system_freezable_wq, &vb->update_balloon_size_work); - spin_unlock_irqrestore(&vb->stop_update_lock, flags); -} - static inline s64 towards_target(struct virtio_balloon *vb) { s64 target; @@ -347,6 +369,60 @@ static inline s64 towards_target(struct virtio_balloon *vb) return target - vb->num_pages; } +/* Gives back @num_to_return blocks of free pages to mm. 
*/ +static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, + unsigned long num_to_return) +{ + struct page *page; + unsigned long num_returned; + + spin_lock_irq(&vb->free_page_list_lock); + for (num_returned = 0; num_returned < num_to_return; num_returned++) { + page = balloon_page_pop(&vb->free_page_list); + if (!page) + break; + free_pages((unsigned long)page_address(page), + VIRTIO_BALLOON_FREE_PAGE_ORDER); + } + vb->num_free_page_blocks -= num_returned; + spin_unlock_irq(&vb->free_page_list_lock); + + return num_returned; +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + unsigned long flags; + s64 diff = towards_target(vb); + + if (diff) { + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) + queue_work(system_freezable_wq, + &vb->update_balloon_size_work); + spin_unlock_irqrestore(&vb->stop_update_lock, flags); + } + + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + virtio_cread(vdev, struct virtio_balloon_config, + free_page_report_cmd_id, &vb->cmd_id_received); + if (vb->cmd_id_received == VIRTIO_BALLOON_CMD_ID_DONE) { + /* Pass ULONG_MAX to give back all the free pages */ + return_free_pages_to_mm(vb, ULONG_MAX); + } else if (vb->cmd_id_received != VIRTIO_BALLOON_CMD_ID_STOP && + vb->cmd_id_received != + virtio32_to_cpu(vdev, vb->cmd_id_active)) { + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) { + queue_work(vb->balloon_wq, + &vb->report_free_page_work); + } + spin_unlock_irqrestore(&vb->stop_update_lock, flags); + } + } +} + static void update_balloon_size(struct virtio_balloon *vb) { u32 actual = vb->num_pages; @@ -389,26 +465,44 @@ static void update_balloon_size_func(struct work_struct *work) static int init_vqs(struct virtio_balloon *vb) { - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; - static const char * const names[] = { "inflate", "deflate", 
"stats" }; - int err, nvqs; + struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX]; + vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX]; + const char *names[VIRTIO_BALLOON_VQ_MAX]; + int err; /* - * We expect two virtqueues: inflate and deflate, and - * optionally stat. + * Inflateq and deflateq are used unconditionally. The names[] + * will be NULL if the related feature is not enabled, which will + * cause no allocation for the corresponding virtqueue in find_vqs. */ - nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2; - err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); + callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack; + names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate"; + callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack; + names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate"; + names[VIRTIO_BALLOON_VQ_STATS] = NULL; + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { + names[VIRTIO_BALLOON_VQ_STATS] = "stats"; + callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request; + } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq"; + callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + } + + err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, + vqs, callbacks, names, NULL, NULL); if (err) return err; - vb->inflate_vq = vqs[0]; - vb->deflate_vq = vqs[1]; + vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE]; + vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE]; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { struct scatterlist sg; unsigned int num_stats; - vb->stats_vq = vqs[2]; + vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS]; /* * Prime this virtqueue with one buffer so the hypervisor can @@ -426,9 +520,145 @@ static int init_vqs(struct virtio_balloon *vb) } virtqueue_kick(vb->stats_vq); } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + vb->free_page_vq = 
vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + + return 0; +} + +static int send_cmd_id_start(struct virtio_balloon *vb) +{ + struct scatterlist sg; + struct virtqueue *vq = vb->free_page_vq; + int err, unused; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + vb->cmd_id_active = cpu_to_virtio32(vb->vdev, vb->cmd_id_received); + sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active)); + err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL); + if (!err) + virtqueue_kick(vq); + return err; +} + +static int send_cmd_id_stop(struct virtio_balloon *vb) +{ + struct scatterlist sg; + struct virtqueue *vq = vb->free_page_vq; + int err, unused; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop)); + err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL); + if (!err) + virtqueue_kick(vq); + return err; +} + +static int get_free_page_and_send(struct virtio_balloon *vb) +{ + struct virtqueue *vq = vb->free_page_vq; + struct page *page; + struct scatterlist sg; + int err, unused; + void *p; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG, + VIRTIO_BALLOON_FREE_PAGE_ORDER); + /* + * When the allocation returns NULL, it indicates that we have got all + * the possible free pages, so return -EINTR to stop. + */ + if (!page) + return -EINTR; + + p = page_address(page); + sg_init_one(&sg, p, VIRTIO_BALLOON_FREE_PAGE_SIZE); + /* There is always 1 entry reserved for the cmd id to use. 
*/ + if (vq->num_free > 1) { + err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL); + if (unlikely(err)) { + free_pages((unsigned long)p, + VIRTIO_BALLOON_FREE_PAGE_ORDER); + return err; + } + virtqueue_kick(vq); + spin_lock_irq(&vb->free_page_list_lock); + balloon_page_push(&vb->free_page_list, page); + vb->num_free_page_blocks++; + spin_unlock_irq(&vb->free_page_list_lock); + } else { + /* + * The vq has no available entry to add this page block, so + * just free it. + */ + free_pages((unsigned long)p, VIRTIO_BALLOON_FREE_PAGE_ORDER); + } + + return 0; +} + +static int send_free_pages(struct virtio_balloon *vb) +{ + int err; + u32 cmd_id_active; + + while (1) { + /* + * If a stop id or a new cmd id was just received from host, + * stop the reporting. + */ + cmd_id_active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active); + if (cmd_id_active != vb->cmd_id_received) + break; + + /* + * The free page blocks are allocated and sent to host one by + * one. + */ + err = get_free_page_and_send(vb); + if (err == -EINTR) + break; + else if (unlikely(err)) + return err; + } + return 0; } +static void report_free_page_func(struct work_struct *work) +{ + int err; + struct virtio_balloon *vb = container_of(work, struct virtio_balloon, + report_free_page_work); + struct device *dev = &vb->vdev->dev; + + /* Start by sending the received cmd id to host with an outbuf. */ + err = send_cmd_id_start(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a start id, err = %d\n", err); + + err = send_free_pages(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a free page, err = %d\n", err); + + /* End by sending a stop id to host with an outbuf. 
*/ + err = send_cmd_id_stop(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a stop id, err = %d\n", err); +} + #ifdef CONFIG_BALLOON_COMPACTION /* * virtballoon_migratepage - perform the balloon page migration on behalf of @@ -512,14 +742,23 @@ static struct file_system_type balloon_fs = { #endif /* CONFIG_BALLOON_COMPACTION */ -static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, - struct shrink_control *sc) +static unsigned long shrink_free_pages(struct virtio_balloon *vb, + unsigned long pages_to_free) { - unsigned long pages_to_free, pages_freed = 0; - struct virtio_balloon *vb = container_of(shrinker, - struct virtio_balloon, shrinker); + unsigned long blocks_to_free, blocks_freed; - pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE; + pages_to_free = round_up(pages_to_free, + 1 << VIRTIO_BALLOON_FREE_PAGE_ORDER); + blocks_to_free = pages_to_free >> VIRTIO_BALLOON_FREE_PAGE_ORDER; + blocks_freed = return_free_pages_to_mm(vb, blocks_to_free); + + return blocks_freed << VIRTIO_BALLOON_FREE_PAGE_ORDER; +} + +static unsigned long shrink_balloon_pages(struct virtio_balloon *vb, + unsigned long pages_to_free) +{ + unsigned long pages_freed = 0; /* * One invocation of leak_balloon can deflate at most @@ -527,12 +766,33 @@ static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, * multiple times to deflate pages till reaching pages_to_free. 
*/ while (vb->num_pages && pages_to_free) { + pages_freed += leak_balloon(vb, pages_to_free) / + VIRTIO_BALLOON_PAGES_PER_PAGE; pages_to_free -= pages_freed; - pages_freed += leak_balloon(vb, pages_to_free); } update_balloon_size(vb); - return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE; + return pages_freed; +} + +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_to_free, pages_freed = 0; + struct virtio_balloon *vb = container_of(shrinker, + struct virtio_balloon, shrinker); + + pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + pages_freed = shrink_free_pages(vb, pages_to_free); + + if (pages_freed >= pages_to_free) + return pages_freed; + + pages_freed += shrink_balloon_pages(vb, pages_to_free - pages_freed); + + return pages_freed; } static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, @@ -540,8 +800,12 @@ static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, { struct virtio_balloon *vb = container_of(shrinker, struct virtio_balloon, shrinker); + unsigned long count; - return vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE; + count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE; + count += vb->num_free_page_blocks >> VIRTIO_BALLOON_FREE_PAGE_ORDER; + + return count; } static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) @@ -561,6 +825,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; + __u32 poison_val; int err; if (!vdev->config->get) { @@ -604,6 +869,36 @@ static int virtballoon_probe(struct virtio_device *vdev) } vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; #endif + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + /* + * There is always one entry reserved for cmd id, so the ring + * 
size needs to be at least two to report free page hints. + */ + if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { + err = -ENOSPC; + goto out_del_vqs; + } + vb->balloon_wq = alloc_workqueue("balloon-wq", + WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); + if (!vb->balloon_wq) { + err = -ENOMEM; + goto out_del_vqs; + } + INIT_WORK(&vb->report_free_page_work, report_free_page_func); + vb->cmd_id_received = VIRTIO_BALLOON_CMD_ID_STOP; + vb->cmd_id_active = cpu_to_virtio32(vb->vdev, + VIRTIO_BALLOON_CMD_ID_STOP); + vb->cmd_id_stop = cpu_to_virtio32(vb->vdev, + VIRTIO_BALLOON_CMD_ID_STOP); + vb->num_free_page_blocks = 0; + spin_lock_init(&vb->free_page_list_lock); + INIT_LIST_HEAD(&vb->free_page_list); + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { + memset(&poison_val, PAGE_POISON, sizeof(poison_val)); + virtio_cwrite(vb->vdev, struct virtio_balloon_config, + poison_val, &poison_val); + } + } /* * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a * shrinker needs to be registered to relieve memory pressure. 
@@ -611,7 +906,7 @@ static int virtballoon_probe(struct virtio_device *vdev) if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { err = virtio_balloon_register_shrinker(vb); if (err) - goto out_del_vqs; + goto out_del_balloon_wq; } virtio_device_ready(vdev); @@ -619,6 +914,9 @@ static int virtballoon_probe(struct virtio_device *vdev) virtballoon_changed(vdev); return 0; +out_del_balloon_wq: + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + destroy_workqueue(vb->balloon_wq); out_del_vqs: vdev->config->del_vqs(vdev); out_free_vb: @@ -652,6 +950,11 @@ static void virtballoon_remove(struct virtio_device *vdev) cancel_work_sync(&vb->update_balloon_size_work); cancel_work_sync(&vb->update_balloon_stats_work); + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + cancel_work_sync(&vb->report_free_page_work); + destroy_workqueue(vb->balloon_wq); + } + remove_common(vb); #ifdef CONFIG_BALLOON_COMPACTION if (vb->vb_dev_info.inode) @@ -695,6 +998,9 @@ static int virtballoon_restore(struct virtio_device *vdev) static int virtballoon_validate(struct virtio_device *vdev) { + if (!page_poisoning_enabled()) + __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); + __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); return 0; } @@ -703,6 +1009,8 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, + VIRTIO_BALLOON_F_FREE_PAGE_HINT, + VIRTIO_BALLOON_F_PAGE_POISON, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index b1092fbefa63..2e5d845b5091 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -137,13 +137,13 @@ static void pvcalls_conn_back_read(void *opaque) if (masked_prod < masked_cons) { vec[0].iov_base = data->in + masked_prod; vec[0].iov_len = wanted; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 1, wanted); + iov_iter_kvec(&msg.msg_iter, 
WRITE, vec, 1, wanted); } else { vec[0].iov_base = data->in + masked_prod; vec[0].iov_len = array_size - masked_prod; vec[1].iov_base = data->in; vec[1].iov_len = wanted - vec[0].iov_len; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 2, wanted); + iov_iter_kvec(&msg.msg_iter, WRITE, vec, 2, wanted); } atomic_set(&map->read, 0); @@ -195,13 +195,13 @@ static void pvcalls_conn_back_write(struct sock_mapping *map) if (pvcalls_mask(prod, array_size) > pvcalls_mask(cons, array_size)) { vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); vec[0].iov_len = size; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 1, size); + iov_iter_kvec(&msg.msg_iter, READ, vec, 1, size); } else { vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); vec[0].iov_len = array_size - pvcalls_mask(cons, array_size); vec[1].iov_base = data->out; vec[1].iov_len = size - vec[0].iov_len; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 2, size); + iov_iter_kvec(&msg.msg_iter, READ, vec, 2, size); } atomic_set(&map->write, 0); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index e1cbdfdb7c68..0bcbcc20f769 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -65,7 +65,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) if (retval == 0) return retval; - iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE); + iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE); retval = p9_client_read(fid, page_offset(page), &to, &err); if (err) { @@ -175,7 +175,7 @@ static int v9fs_vfs_writepage_locked(struct page *page) bvec.bv_page = page; bvec.bv_offset = 0; bvec.bv_len = len; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len); + iov_iter_bvec(&from, WRITE, &bvec, 1, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index cb6c4031af55..00745147329d 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -123,7 +123,7 @@ static int v9fs_dir_readdir(struct file *file, struct 
dir_context *ctx) if (rdir->tail == rdir->head) { struct iov_iter to; int n; - iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen); + iov_iter_kvec(&to, READ, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 352abc39e891..ac8ff8ca4c11 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,7 +32,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size); + iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -107,7 +107,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); + iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index ebba3b18e5da..701aaa9b1899 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -27,3 +27,15 @@ config AFS_FSCACHE help Say Y here if you want AFS data to be cached locally on disk through the generic filesystem cache manager + +config AFS_DEBUG_CURSOR + bool "AFS server cursor debugging" + depends on AFS_FS + help + Say Y here to cause the contents of a server cursor to be dumped to + the dmesg log if the server rotation algorithm fails to successfully + contact a server. + + See <file:Documentation/filesystems/afs.txt> for more information. + + If unsure, say N. 
diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 546874057bd3..0738e2bf5193 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -17,6 +17,7 @@ kafs-y := \ file.o \ flock.o \ fsclient.o \ + fs_probe.o \ inode.o \ main.o \ misc.o \ @@ -29,9 +30,13 @@ kafs-y := \ super.o \ netdevices.o \ vlclient.o \ + vl_list.o \ + vl_probe.o \ + vl_rotate.o \ volume.o \ write.o \ - xattr.o + xattr.o \ + yfsclient.o kafs-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 55a756c60746..967db336d11a 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -64,19 +64,25 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, /* * Parse a text string consisting of delimited addresses. */ -struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, - char delim, - unsigned short service, - unsigned short port) +struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, + const char *text, size_t len, + char delim, + unsigned short service, + unsigned short port) { + struct afs_vlserver_list *vllist; struct afs_addr_list *alist; const char *p, *end = text + len; + const char *problem; unsigned int nr = 0; + int ret = -ENOMEM; _enter("%*.*s,%c", (int)len, (int)len, text, delim); - if (!len) + if (!len) { + _leave(" = -EDESTADDRREQ [empty]"); return ERR_PTR(-EDESTADDRREQ); + } if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len))) delim = ','; @@ -84,18 +90,24 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, /* Count the addresses */ p = text; do { - if (!*p) - return ERR_PTR(-EINVAL); + if (!*p) { + problem = "nul"; + goto inval; + } if (*p == delim) continue; nr++; if (*p == '[') { p++; - if (p == end) - return ERR_PTR(-EINVAL); + if (p == end) { + problem = "brace1"; + goto inval; + } p = memchr(p, ']', end - p); - if (!p) - return ERR_PTR(-EINVAL); + if (!p) { + problem = "brace2"; + goto inval; + } p++; if (p >= end) break; @@ -109,10 
+121,19 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, _debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES); - alist = afs_alloc_addrlist(nr, service, port); - if (!alist) + vllist = afs_alloc_vlserver_list(1); + if (!vllist) return ERR_PTR(-ENOMEM); + vllist->nr_servers = 1; + vllist->servers[0].server = afs_alloc_vlserver("<dummy>", 7, AFS_VL_PORT); + if (!vllist->servers[0].server) + goto error_vl; + + alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT); + if (!alist) + goto error; + /* Extract the addresses */ p = text; do { @@ -135,17 +156,21 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, break; } - if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) + if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) { family = AF_INET; - else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) + } else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) { family = AF_INET6; - else + } else { + problem = "family"; goto bad_address; + } - if (stop != q) + p = q; + if (stop != p) { + problem = "nostop"; goto bad_address; + } - p = q; if (q < end && *q == ']') p++; @@ -154,18 +179,23 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, /* Port number specification "+1234" */ xport = 0; p++; - if (p >= end || !isdigit(*p)) + if (p >= end || !isdigit(*p)) { + problem = "port"; goto bad_address; + } do { xport *= 10; xport += *p - '0'; - if (xport > 65535) + if (xport > 65535) { + problem = "pval"; goto bad_address; + } p++; } while (p < end && isdigit(*p)); } else if (*p == delim) { p++; } else { + problem = "weird"; goto bad_address; } } @@ -177,12 +207,23 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, } while (p < end); + rcu_assign_pointer(vllist->servers[0].server->addresses, alist); _leave(" = [nr %u]", alist->nr_addrs); - return alist; + return vllist; -bad_address: - kfree(alist); +inval: + _leave(" = -EINVAL [%s %zu %*.*s]", + problem, p - text, (int)len, (int)len, text); return 
ERR_PTR(-EINVAL); +bad_address: + _leave(" = -EINVAL [%s %zu %*.*s]", + problem, p - text, (int)len, (int)len, text); + ret = -EINVAL; +error: + afs_put_addrlist(alist); +error_vl: + afs_put_vlserverlist(net, vllist); + return ERR_PTR(ret); } /* @@ -201,30 +242,34 @@ static int afs_cmp_addr_list(const struct afs_addr_list *a1, /* * Perform a DNS query for VL servers and build a up an address list. */ -struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) +struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) { - struct afs_addr_list *alist; - char *vllist = NULL; + struct afs_vlserver_list *vllist; + char *result = NULL; int ret; _enter("%s", cell->name); - ret = dns_query("afsdb", cell->name, cell->name_len, - "", &vllist, _expiry); - if (ret < 0) + ret = dns_query("afsdb", cell->name, cell->name_len, "srv=1", + &result, _expiry); + if (ret < 0) { + _leave(" = %d [dns]", ret); return ERR_PTR(ret); - - alist = afs_parse_text_addrs(vllist, strlen(vllist), ',', - VL_SERVICE, AFS_VL_PORT); - if (IS_ERR(alist)) { - kfree(vllist); - if (alist != ERR_PTR(-ENOMEM)) - pr_err("Failed to parse DNS data\n"); - return alist; } - kfree(vllist); - return alist; + if (*_expiry == 0) + *_expiry = ktime_get_real_seconds() + 60; + + if (ret > 1 && result[0] == 0) + vllist = afs_extract_vlserver_list(cell, result, ret); + else + vllist = afs_parse_text_addrs(cell->net, result, ret, ',', + VL_SERVICE, AFS_VL_PORT); + kfree(result); + if (IS_ERR(vllist) && vllist != ERR_PTR(-ENOMEM)) + pr_err("Failed to parse DNS data %ld\n", PTR_ERR(vllist)); + + return vllist; } /* @@ -258,6 +303,8 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); srx = &alist->addrs[i]; + srx->srx_family = AF_RXRPC; + srx->transport_type = SOCK_DGRAM; srx->transport_len = sizeof(srx->transport.sin); srx->transport.sin.sin_family = AF_INET; srx->transport.sin.sin_port = htons(port); @@ 
-296,6 +343,8 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); srx = &alist->addrs[i]; + srx->srx_family = AF_RXRPC; + srx->transport_type = SOCK_DGRAM; srx->transport_len = sizeof(srx->transport.sin6); srx->transport.sin6.sin6_family = AF_INET6; srx->transport.sin6.sin6_port = htons(port); @@ -308,25 +357,33 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) */ bool afs_iterate_addresses(struct afs_addr_cursor *ac) { - _enter("%hu+%hd", ac->start, (short)ac->index); + unsigned long set, failed; + int index; if (!ac->alist) return false; - if (ac->begun) { - ac->index++; - if (ac->index == ac->alist->nr_addrs) - ac->index = 0; + set = ac->alist->responded; + failed = ac->alist->failed; + _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index); - if (ac->index == ac->start) { - ac->error = -EDESTADDRREQ; - return false; - } - } + ac->nr_iterations++; + + set &= ~(failed | ac->tried); + + if (!set) + return false; - ac->begun = true; + index = READ_ONCE(ac->alist->preferred); + if (test_bit(index, &set)) + goto selected; + + index = __ffs(set); + +selected: + ac->index = index; + set_bit(index, &ac->tried); ac->responded = false; - ac->addr = &ac->alist->addrs[ac->index]; return true; } @@ -339,53 +396,13 @@ int afs_end_cursor(struct afs_addr_cursor *ac) alist = ac->alist; if (alist) { - if (ac->responded && ac->index != ac->start) - WRITE_ONCE(alist->index, ac->index); + if (ac->responded && + ac->index != alist->preferred && + test_bit(ac->alist->preferred, &ac->tried)) + WRITE_ONCE(alist->preferred, ac->index); afs_put_addrlist(alist); + ac->alist = NULL; } - ac->addr = NULL; - ac->alist = NULL; - ac->begun = false; return ac->error; } - -/* - * Set the address cursor for iterating over VL servers. 
- */ -int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell) -{ - struct afs_addr_list *alist; - int ret; - - if (!rcu_access_pointer(cell->vl_addrs)) { - ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET, - TASK_INTERRUPTIBLE); - if (ret < 0) - return ret; - - if (!rcu_access_pointer(cell->vl_addrs) && - ktime_get_real_seconds() < cell->dns_expiry) - return cell->error; - } - - read_lock(&cell->vl_addrs_lock); - alist = rcu_dereference_protected(cell->vl_addrs, - lockdep_is_held(&cell->vl_addrs_lock)); - if (alist->nr_addrs > 0) - afs_get_addrlist(alist); - else - alist = NULL; - read_unlock(&cell->vl_addrs_lock); - - if (!alist) - return -EDESTADDRREQ; - - ac->alist = alist; - ac->addr = NULL; - ac->start = READ_ONCE(alist->index); - ac->index = ac->start; - ac->error = 0; - ac->begun = false; - return 0; -} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b4ff1f7ae4ab..d12ffb457e47 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -23,9 +23,9 @@ #define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ #define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ -typedef unsigned afs_volid_t; -typedef unsigned afs_vnodeid_t; -typedef unsigned long long afs_dataversion_t; +typedef u64 afs_volid_t; +typedef u64 afs_vnodeid_t; +typedef u64 afs_dataversion_t; typedef enum { AFSVL_RWVOL, /* read/write volume */ @@ -52,8 +52,9 @@ typedef enum { */ struct afs_fid { afs_volid_t vid; /* volume ID */ - afs_vnodeid_t vnode; /* file index within volume */ - unsigned unique; /* unique ID number (file index version) */ + afs_vnodeid_t vnode; /* Lower 64-bits of file index within volume */ + u32 vnode_hi; /* Upper 32-bits of file index */ + u32 unique; /* unique ID number (file index version) */ }; /* @@ -67,14 +68,14 @@ typedef enum { } afs_callback_type_t; struct afs_callback { + time64_t expires_at; /* Time at which expires */ unsigned version; /* Callback version */ - unsigned expiry; /* Time at which expires */ afs_callback_type_t 
type; /* Type of callback */ }; struct afs_callback_break { struct afs_fid fid; /* File identifier */ - struct afs_callback cb; /* Callback details */ + //struct afs_callback cb; /* Callback details */ }; #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ @@ -129,19 +130,18 @@ typedef u32 afs_access_t; struct afs_file_status { u64 size; /* file size */ afs_dataversion_t data_version; /* current data version */ - time_t mtime_client; /* last time client changed data */ - time_t mtime_server; /* last time server changed data */ - unsigned abort_code; /* Abort if bulk-fetching this failed */ - - afs_file_type_t type; /* file type */ - unsigned nlink; /* link count */ - u32 author; /* author ID */ - u32 owner; /* owner ID */ - u32 group; /* group ID */ + struct timespec64 mtime_client; /* Last time client changed data */ + struct timespec64 mtime_server; /* Last time server changed data */ + s64 author; /* author ID */ + s64 owner; /* owner ID */ + s64 group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ + afs_file_type_t type; /* file type */ + u32 nlink; /* link count */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ + u32 abort_code; /* Abort if bulk-fetching this failed */ }; /* @@ -158,25 +158,27 @@ struct afs_file_status { * AFS volume synchronisation information */ struct afs_volsync { - time_t creation; /* volume creation time */ + time64_t creation; /* volume creation time */ }; /* * AFS volume status record */ struct afs_volume_status { - u32 vid; /* volume ID */ - u32 parent_id; /* parent volume ID */ + afs_volid_t vid; /* volume ID */ + afs_volid_t parent_id; /* parent volume ID */ u8 online; /* true if volume currently online and available */ u8 in_service; /* true if volume currently in service */ u8 blessed; /* same as in_service */ u8 needs_salvage; /* true if consistency 
checking required */ u32 type; /* volume type (afs_voltype_t) */ - u32 min_quota; /* minimum space set aside (blocks) */ - u32 max_quota; /* maximum space this volume may occupy (blocks) */ - u32 blocks_in_use; /* space this volume currently occupies (blocks) */ - u32 part_blocks_avail; /* space available in volume's partition */ - u32 part_max_blocks; /* size of volume's partition */ + u64 min_quota; /* minimum space set aside (blocks) */ + u64 max_quota; /* maximum space this volume may occupy (blocks) */ + u64 blocks_in_use; /* space this volume currently occupies (blocks) */ + u64 part_blocks_avail; /* space available in volume's partition */ + u64 part_max_blocks; /* size of volume's partition */ + s64 vol_copy_date; + s64 vol_backup_date; }; #define AFS_BLOCK_SIZE 1024 diff --git a/fs/afs/cache.c b/fs/afs/cache.c index b1c31ec4523a..f6d0a21e8052 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -49,7 +49,7 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, struct afs_vnode *vnode = cookie_netfs_data; struct afs_vnode_cache_aux aux; - _enter("{%x,%x,%llx},%p,%u", + _enter("{%llx,%x,%llx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, buflen); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 5f261fbf2182..1c7955f5cdaf 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -210,12 +210,10 @@ void afs_init_callback_state(struct afs_server *server) /* * actually break a callback */ -void afs_break_callback(struct afs_vnode *vnode) +void __afs_break_callback(struct afs_vnode *vnode) { _enter(""); - write_seqlock(&vnode->cb_lock); - clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; @@ -230,7 +228,12 @@ void afs_break_callback(struct afs_vnode *vnode) afs_lock_may_be_available(vnode); spin_unlock(&vnode->lock); } +} +void afs_break_callback(struct afs_vnode *vnode) +{ + write_seqlock(&vnode->cb_lock); + 
__afs_break_callback(vnode); write_sequnlock(&vnode->cb_lock); } @@ -310,14 +313,10 @@ void afs_break_callbacks(struct afs_server *server, size_t count, /* TODO: Sort the callback break list by volume ID */ for (; count > 0; callbacks++, count--) { - _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", + _debug("- Fid { vl=%08llx n=%llu u=%u }", callbacks->fid.vid, callbacks->fid.vnode, - callbacks->fid.unique, - callbacks->cb.version, - callbacks->cb.expiry, - callbacks->cb.type - ); + callbacks->fid.unique); afs_break_one_callback(server, &callbacks->fid); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 6127f0fcd62c..cf445dbd5f2e 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,6 +20,8 @@ #include "internal.h" static unsigned __read_mostly afs_cell_gc_delay = 10; +static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; +static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; static void afs_manage_cell(struct work_struct *); @@ -119,7 +121,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, */ static struct afs_cell *afs_alloc_cell(struct afs_net *net, const char *name, unsigned int namelen, - const char *vllist) + const char *addresses) { struct afs_cell *cell; int i, ret; @@ -134,7 +136,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, if (namelen == 5 && memcmp(name, "@cell", 5) == 0) return ERR_PTR(-EINVAL); - _enter("%*.*s,%s", namelen, namelen, name, vllist); + _enter("%*.*s,%s", namelen, namelen, name, addresses); cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL); if (!cell) { @@ -153,23 +155,26 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, (1 << AFS_CELL_FL_NO_LOOKUP_YET)); INIT_LIST_HEAD(&cell->proc_volumes); rwlock_init(&cell->proc_lock); - rwlock_init(&cell->vl_addrs_lock); + rwlock_init(&cell->vl_servers_lock); /* Fill in the VL server list if we were given a list of addresses to * use. 
*/ - if (vllist) { - struct afs_addr_list *alist; - - alist = afs_parse_text_addrs(vllist, strlen(vllist), ':', - VL_SERVICE, AFS_VL_PORT); - if (IS_ERR(alist)) { - ret = PTR_ERR(alist); + if (addresses) { + struct afs_vlserver_list *vllist; + + vllist = afs_parse_text_addrs(net, + addresses, strlen(addresses), ':', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(vllist)) { + ret = PTR_ERR(vllist); goto parse_failed; } - rcu_assign_pointer(cell->vl_addrs, alist); + rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_expiry = TIME64_MAX; + } else { + cell->dns_expiry = ktime_get_real_seconds(); } _leave(" = %p", cell); @@ -356,26 +361,40 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) */ static void afs_update_cell(struct afs_cell *cell) { - struct afs_addr_list *alist, *old; - time64_t now, expiry; + struct afs_vlserver_list *vllist, *old; + unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl); + unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl); + time64_t now, expiry = 0; _enter("%s", cell->name); - alist = afs_dns_query(cell, &expiry); - if (IS_ERR(alist)) { - switch (PTR_ERR(alist)) { + vllist = afs_dns_query(cell, &expiry); + + now = ktime_get_real_seconds(); + if (min_ttl > max_ttl) + max_ttl = min_ttl; + if (expiry < now + min_ttl) + expiry = now + min_ttl; + else if (expiry > now + max_ttl) + expiry = now + max_ttl; + + if (IS_ERR(vllist)) { + switch (PTR_ERR(vllist)) { case -ENODATA: - /* The DNS said that the cell does not exist */ + case -EDESTADDRREQ: + /* The DNS said that the cell does not exist or there + * weren't any addresses to be had. 
+ */ set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags); clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); - cell->dns_expiry = ktime_get_real_seconds() + 61; + cell->dns_expiry = expiry; break; case -EAGAIN: case -ECONNREFUSED: default: set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); - cell->dns_expiry = ktime_get_real_seconds() + 10; + cell->dns_expiry = now + 10; break; } @@ -387,12 +406,12 @@ static void afs_update_cell(struct afs_cell *cell) /* Exclusion on changing vl_addrs is achieved by a * non-reentrant work item. */ - old = rcu_dereference_protected(cell->vl_addrs, true); - rcu_assign_pointer(cell->vl_addrs, alist); + old = rcu_dereference_protected(cell->vl_servers, true); + rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_expiry = expiry; if (old) - afs_put_addrlist(old); + afs_put_vlserverlist(cell->net, old); } if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags)) @@ -414,7 +433,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); - afs_put_addrlist(rcu_access_pointer(cell->vl_addrs)); + afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 9e51d6fe7e8f..8ee5972893ed 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -16,6 +16,7 @@ #include <linux/ip.h> #include "internal.h" #include "afs_cm.h" +#include "protocol_yfs.h" static int afs_deliver_cb_init_call_back_state(struct afs_call *); static int afs_deliver_cb_init_call_back_state3(struct afs_call *); @@ -30,6 +31,8 @@ static void SRXAFSCB_Probe(struct work_struct *); static void SRXAFSCB_ProbeUuid(struct work_struct *); static void SRXAFSCB_TellMeAboutYourself(struct work_struct *); +static int afs_deliver_yfs_cb_callback(struct afs_call *); + #define CM_NAME(name) \ const char afs_SRXCB##name##_name[] __tracepoint_string = \ "CB." 
#name @@ -101,12 +104,25 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { }; /* + * YFS CB.CallBack operation type + */ +static CM_NAME(YFS_CallBack); +static const struct afs_call_type afs_SRXYFSCB_CallBack = { + .name = afs_SRXCBYFS_CallBack_name, + .deliver = afs_deliver_yfs_cb_callback, + .destructor = afs_cm_destructor, + .work = SRXAFSCB_CallBack, +}; + +/* * route an incoming cache manager call * - return T if supported, F if not */ bool afs_cm_incoming_call(struct afs_call *call) { - _enter("{CB.OP %u}", call->operation_ID); + _enter("{%u, CB.OP %u}", call->service_id, call->operation_ID); + + call->epoch = rxrpc_kernel_get_epoch(call->net->socket, call->rxcall); switch (call->operation_ID) { case CBCallBack: @@ -127,12 +143,102 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBTellMeAboutYourself: call->type = &afs_SRXCBTellMeAboutYourself; return true; + case YFSCBCallBack: + if (call->service_id != YFS_CM_SERVICE) + return false; + call->type = &afs_SRXYFSCB_CallBack; + return true; default: return false; } } /* + * Record a probe to the cache manager from a server. 
+ */ +static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server) +{ + _enter(""); + + if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) && + !test_bit(AFS_SERVER_FL_PROBING, &server->flags)) { + if (server->cm_epoch == call->epoch) + return 0; + + if (!server->probe.said_rebooted) { + pr_notice("kAFS: FS rebooted %pU\n", &server->uuid); + server->probe.said_rebooted = true; + } + } + + spin_lock(&server->probe_lock); + + if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) { + server->cm_epoch = call->epoch; + server->probe.cm_epoch = call->epoch; + goto out; + } + + if (server->probe.cm_probed && + call->epoch != server->probe.cm_epoch && + !server->probe.said_inconsistent) { + pr_notice("kAFS: FS endpoints inconsistent %pU\n", + &server->uuid); + server->probe.said_inconsistent = true; + } + + if (!server->probe.cm_probed || call->epoch == server->cm_epoch) + server->probe.cm_epoch = server->cm_epoch; + +out: + server->probe.cm_probed = true; + spin_unlock(&server->probe_lock); + return 0; +} + +/* + * Find the server record by peer address and record a probe to the cache + * manager from a server. + */ +static int afs_find_cm_server_by_peer(struct afs_call *call) +{ + struct sockaddr_rxrpc srx; + struct afs_server *server; + + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + + server = afs_find_server(call->net, &srx); + if (!server) { + trace_afs_cm_no_server(call, &srx); + return 0; + } + + call->cm_server = server; + return afs_record_cm_probe(call, server); +} + +/* + * Find the server record by server UUID and record a probe to the cache + * manager from a server. 
+ */ +static int afs_find_cm_server_by_uuid(struct afs_call *call, + struct afs_uuid *uuid) +{ + struct afs_server *server; + + rcu_read_lock(); + server = afs_find_server_by_uuid(call->net, call->request); + rcu_read_unlock(); + if (!server) { + trace_afs_cm_no_server_u(call, call->request); + return 0; + } + + call->cm_server = server; + return afs_record_cm_probe(call, server); +} + +/* * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) @@ -168,7 +274,6 @@ static void SRXAFSCB_CallBack(struct work_struct *work) static int afs_deliver_cb_callback(struct afs_call *call) { struct afs_callback_break *cb; - struct sockaddr_rxrpc srx; __be32 *bp; int ret, loop; @@ -176,32 +281,32 @@ static int afs_deliver_cb_callback(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_cb_fid_count); call->buffer = kmalloc(array3_size(call->count, 3, 4), GFP_KERNEL); if (!call->buffer) return -ENOMEM; - call->offset = 0; + afs_extract_to_buf(call, call->count * 3 * 4); call->unmarshall++; case 2: _debug("extract FID array"); - ret = afs_extract_data(call, call->buffer, - call->count * 3 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -218,59 +323,46 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb->fid.vid = ntohl(*bp++); cb->fid.vnode = ntohl(*bp++); cb->fid.unique = ntohl(*bp++); - cb->cb.type = AFSCM_CB_UNTYPED; } - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* extract the callback array and its count in 
two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count2 = ntohl(call->tmp); _debug("CB count: %u", call->count2); if (call->count2 != call->count && call->count2 != 0) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_cb_count); + call->_iter = &call->iter; + iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4); call->unmarshall++; case 4: - _debug("extract CB array"); - ret = afs_extract_data(call, call->buffer, - call->count2 * 3 * 4, false); + _debug("extract discard %zu/%u", + iov_iter_count(&call->iter), call->count2 * 3 * 4); + + ret = afs_extract_data(call, false); if (ret < 0) return ret; - _debug("unmarshall CB array"); - cb = call->request; - bp = call->buffer; - for (loop = call->count2; loop > 0; loop--, cb++) { - cb->cb.version = ntohl(*bp++); - cb->cb.expiry = ntohl(*bp++); - cb->cb.type = ntohl(*bp++); - } - - call->offset = 0; call->unmarshall++; case 5: break; } if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; + return afs_io_error(call, afs_io_error_cm_reply); /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - call->cm_server = afs_find_server(call->net, &srx); - if (!call->cm_server) - trace_afs_cm_no_server(call, &srx); - - return afs_queue_call_work(call); + return afs_find_cm_server_by_peer(call); } /* @@ -294,24 +386,18 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) */ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { - struct sockaddr_rxrpc srx; int ret; _enter(""); - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - - ret = afs_extract_data(call, NULL, 0, false); + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); if (ret < 0) return ret; /* 
we'll need the file server record as that tells us which set of * vnodes to operate upon */ - call->cm_server = afs_find_server(call->net, &srx); - if (!call->cm_server) - trace_afs_cm_no_server(call, &srx); - - return afs_queue_call_work(call); + return afs_find_cm_server_by_peer(call); } /* @@ -330,16 +416,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; + afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; case 1: _debug("extract UUID"); - ret = afs_extract_data(call, call->buffer, - 11 * sizeof(__be32), false); + ret = afs_extract_data(call, false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -362,7 +447,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) for (loop = 0; loop < 6; loop++) r->node[loop] = ntohl(b[loop + 5]); - call->offset = 0; call->unmarshall++; case 2: @@ -370,17 +454,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) } if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; + return afs_io_error(call, afs_io_error_cm_reply); /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rcu_read_lock(); - call->cm_server = afs_find_server_by_uuid(call->net, call->request); - rcu_read_unlock(); - if (!call->cm_server) - trace_afs_cm_no_server_u(call, call->request); - - return afs_queue_call_work(call); + return afs_find_cm_server_by_uuid(call, call->request); } /* @@ -405,14 +483,14 @@ static int afs_deliver_cb_probe(struct afs_call *call) _enter(""); - ret = afs_extract_data(call, NULL, 0, false); + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); if (ret < 0) return ret; if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; - - return afs_queue_call_work(call); + return afs_io_error(call, 
afs_io_error_cm_reply); + return afs_find_cm_server_by_peer(call); } /* @@ -453,16 +531,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; + afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; case 1: _debug("extract UUID"); - ret = afs_extract_data(call, call->buffer, - 11 * sizeof(__be32), false); + ret = afs_extract_data(call, false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -485,7 +562,6 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) for (loop = 0; loop < 6; loop++) r->node[loop] = ntohl(b[loop + 5]); - call->offset = 0; call->unmarshall++; case 2: @@ -493,9 +569,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) } if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; - - return afs_queue_call_work(call); + return afs_io_error(call, afs_io_error_cm_reply); + return afs_find_cm_server_by_uuid(call, call->request); } /* @@ -570,12 +645,88 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) _enter(""); - ret = afs_extract_data(call, NULL, 0, false); + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); if (ret < 0) return ret; if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; + return afs_io_error(call, afs_io_error_cm_reply); + return afs_find_cm_server_by_peer(call); +} + +/* + * deliver request data to a YFS CB.CallBack call + */ +static int afs_deliver_yfs_cb_callback(struct afs_call *call) +{ + struct afs_callback_break *cb; + struct yfs_xdr_YFSFid *bp; + size_t size; + int ret, loop; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + + /* extract the FID array and its count in two steps */ + case 1: + _debug("extract FID count"); + ret = afs_extract_data(call, true); + if (ret < 
0) + return ret; + + call->count = ntohl(call->tmp); + _debug("FID count: %u", call->count); + if (call->count > YFSCBMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_cb_fid_count); + + size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid)); + call->buffer = kmalloc(size, GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + afs_extract_to_buf(call, size); + call->unmarshall++; + + case 2: + _debug("extract FID array"); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + _debug("unmarshall FID array"); + call->request = kcalloc(call->count, + sizeof(struct afs_callback_break), + GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->fid.vid = xdr_to_u64(bp->volume); + cb->fid.vnode = xdr_to_u64(bp->vnode.lo); + cb->fid.vnode_hi = ntohl(bp->vnode.hi); + cb->fid.unique = ntohl(bp->vnode.unique); + bp++; + } + + afs_extract_to_tmp(call); + call->unmarshall++; + + case 3: + break; + } + + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); - return afs_queue_call_work(call); + /* We'll need the file server record as that tells us which set of + * vnodes to operate upon. 
+ */ + return afs_find_cm_server_by_peer(call); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 855bf2b79fed..43dea3b00c29 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -138,6 +138,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, ntohs(dbuf->blocks[tmp].hdr.magic)); trace_afs_dir_check_failed(dvnode, off, i_size); kunmap(page); + trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); goto error; } @@ -190,9 +191,11 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) retry: i_size = i_size_read(&dvnode->vfs_inode); if (i_size < 2048) - return ERR_PTR(-EIO); - if (i_size > 2048 * 1024) + return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small)); + if (i_size > 2048 * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); return ERR_PTR(-EFBIG); + } _enter("%llu", i_size); @@ -315,7 +318,8 @@ content_has_grown: /* * deal with one block in an AFS directory */ -static int afs_dir_iterate_block(struct dir_context *ctx, +static int afs_dir_iterate_block(struct afs_vnode *dvnode, + struct dir_context *ctx, union afs_xdr_dir_block *block, unsigned blkoff) { @@ -365,7 +369,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx, " (len %u/%zu)", blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); - return -EIO; + return afs_bad(dvnode, afs_file_error_dir_over_end); } if (!(block->hdr.bitmap[next / 8] & (1 << (next % 8)))) { @@ -373,7 +377,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx, " %u unmarked extension (len %u/%zu)", blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); - return -EIO; + return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); } _debug("ENT[%zu.%u]: ext %u/%zu", @@ -442,7 +446,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ page = req->pages[blkoff / PAGE_SIZE]; if (!page) { - ret = -EIO; + ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } mark_page_accessed(page); 
@@ -455,7 +459,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / sizeof(union afs_xdr_dir_block)]; - ret = afs_dir_iterate_block(ctx, dblock, blkoff); + ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff); if (ret != 1) { kunmap(page); goto out; @@ -548,7 +552,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, } *fid = cookie.fid; - _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique); + _leave(" = 0 { vn=%llu u=%u }", fid->vnode, fid->unique); return 0; } @@ -826,7 +830,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, struct key *key; int ret; - _enter("{%x:%u},%p{%pd},", + _enter("{%llx:%llu},%p{%pd},", dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry); ASSERTCMP(d_inode(dentry), ==, NULL); @@ -896,7 +900,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); - _enter("{v={%x:%u} n=%pd fl=%lx},", + _enter("{v={%llx:%llu} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); } else { @@ -965,7 +969,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) /* if the vnode ID has changed, then the dirent points to a * different file */ if (fid.vnode != vnode->fid.vnode) { - _debug("%pd: dirent changed [%u != %u]", + _debug("%pd: dirent changed [%llu != %llu]", dentry, fid.vnode, vnode->fid.vnode); goto not_found; @@ -1085,6 +1089,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + afs_vnode_commit_status(fc, vnode, 0); d_add(new_dentry, inode); } @@ -1104,7 +1109,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mode |= S_IFDIR; - _enter("{%x:%u},{%pd},%ho", + _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); key = 
afs_request_key(dvnode->volume->cell); @@ -1169,12 +1174,12 @@ static void afs_dir_remove_subdir(struct dentry *dentry) static int afs_rmdir(struct inode *dir, struct dentry *dentry) { struct afs_fs_cursor fc; - struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; struct key *key; u64 data_version = dvnode->status.data_version; int ret; - _enter("{%x:%u},{%pd}", + _enter("{%llx:%llu},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); key = afs_request_key(dvnode->volume->cell); @@ -1183,11 +1188,19 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) goto error; } + /* Try to make sure we have a callback promise on the victim. */ + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); + ret = afs_validate(vnode, key); + if (ret < 0) + goto error_key; + } + ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_fs_remove(&fc, dentry->d_name.name, true, + afs_fs_remove(&fc, vnode, dentry->d_name.name, true, data_version); } @@ -1201,6 +1214,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) } } +error_key: key_put(key); error: return ret; @@ -1231,7 +1245,9 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key, if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - if (dir_valid) { + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + /* Already done */ + } else if (dir_valid) { drop_nlink(&vnode->vfs_inode); if (vnode->vfs_inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); @@ -1260,13 +1276,13 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key, static int afs_unlink(struct inode *dir, struct dentry *dentry) { struct afs_fs_cursor fc; - struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; struct key *key; unsigned long 
d_version = (unsigned long)dentry->d_fsdata; u64 data_version = dvnode->status.data_version; int ret; - _enter("{%x:%u},{%pd}", + _enter("{%llx:%llu},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); if (dentry->d_name.len >= AFSNAMEMAX) @@ -1290,7 +1306,18 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_fs_remove(&fc, dentry->d_name.name, false, + + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) && + !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) { + yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name, + data_version); + if (fc.ac.error != -ECONNABORTED || + fc.ac.abort_code != RXGEN_OPCODE) + continue; + set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags); + } + + afs_fs_remove(&fc, vnode, dentry->d_name.name, false, data_version); } @@ -1330,7 +1357,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, mode |= S_IFREG; - _enter("{%x:%u},{%pd},%ho,", + _enter("{%llx:%llu},{%pd},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); ret = -ENAMETOOLONG; @@ -1393,7 +1420,7 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode = AFS_FS_I(dir); data_version = dvnode->status.data_version; - _enter("{%x:%u},{%x:%u},{%pd}", + _enter("{%llx:%llu},{%llx:%llu},{%pd}", vnode->fid.vid, vnode->fid.vnode, dvnode->fid.vid, dvnode->fid.vnode, dentry); @@ -1464,7 +1491,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, u64 data_version = dvnode->status.data_version; int ret; - _enter("{%x:%u},{%pd},%s", + _enter("{%llx:%llu},{%pd},%s", dvnode->fid.vid, dvnode->fid.vnode, dentry, content); @@ -1540,7 +1567,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, orig_data_version = orig_dvnode->status.data_version; new_data_version = new_dvnode->status.data_version; - _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", + 
_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, vnode->fid.vid, vnode->fid.vnode, new_dvnode->fid.vid, new_dvnode->fid.vnode, @@ -1607,7 +1634,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); - _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f29c6dade7f6..a9ba81ddf154 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -46,7 +46,7 @@ static int afs_probe_cell_name(struct dentry *dentry) return 0; } - ret = dns_query("afsdb", name, len, "", NULL, NULL); + ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL); if (ret == -ENODATA) ret = -EDESTADDRREQ; return ret; @@ -62,7 +62,7 @@ struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) struct inode *inode; int ret = -ENOENT; - _enter("%p{%pd}, {%x:%u}", + _enter("%p{%pd}, {%llx:%llu}", dentry, dentry, vnode->fid.vid, vnode->fid.vnode); if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) diff --git a/fs/afs/file.c b/fs/afs/file.c index 7d4f26198573..d6bc3f5d784b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -121,7 +121,7 @@ int afs_open(struct inode *inode, struct file *file) struct key *key; int ret; - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { @@ -170,7 +170,7 @@ int afs_release(struct inode *inode, struct file *file) struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = file->private_data; - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); if ((file->f_mode & FMODE_WRITE)) return vfs_fsync(file, 0); @@ -228,7 +228,7 @@ int 
afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x,,,", + _enter("%s{%llx:%llu.%u},%x,,,", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -634,7 +634,7 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); unsigned long priv; - _enter("{{%x:%u}[%lu],%lx},%x", + _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, gfp_flags); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index dc62d15a964b..0568fd986821 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -29,7 +29,7 @@ static const struct file_lock_operations afs_lock_ops = { */ void afs_lock_may_be_available(struct afs_vnode *vnode) { - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); } @@ -76,7 +76,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x,%u", + _enter("%s{%llx:%llu.%u},%x,%u", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -107,7 +107,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x", + _enter("%s{%llx:%llu.%u},%x", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -138,7 +138,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x", + _enter("%s{%llx:%llu.%u},%x", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -175,7 +175,7 @@ void afs_lock_work(struct work_struct *work) struct key *key; int ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); spin_lock(&vnode->lock); @@ -192,7 +192,7 @@ again: ret = afs_release_lock(vnode, vnode->lock_key); if 
(ret < 0) printk(KERN_WARNING "AFS:" - " Failed to release lock on {%x:%x} error %d\n", + " Failed to release lock on {%llx:%llx} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); @@ -229,7 +229,7 @@ again: key_put(key); if (ret < 0) - pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n", + pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); @@ -430,7 +430,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) struct key *key = afs_file_key(file); int ret; - _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); /* only whole-file locks are supported */ if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) @@ -582,7 +582,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); int ret; - _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); /* Flush all pending writes before doing anything with locks. 
*/ vfs_fsync(file, 0); @@ -639,7 +639,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); - _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, fl->fl_type, fl->fl_flags, (long long) fl->fl_start, (long long) fl->fl_end); @@ -662,7 +662,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); - _enter("{%x:%u},%d,{t=%x,fl=%x}", + _enter("{%llx:%llu},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, fl->fl_type, fl->fl_flags); diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c new file mode 100644 index 000000000000..d049cb459742 --- /dev/null +++ b/fs/afs/fs_probe.c @@ -0,0 +1,270 @@ +/* AFS fileserver probing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "afs_fs.h" +#include "internal.h" +#include "protocol_yfs.h" + +static bool afs_fs_probe_done(struct afs_server *server) +{ + if (!atomic_dec_and_test(&server->probe_outstanding)) + return false; + + wake_up_var(&server->probe_outstanding); + clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags); + wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING); + return true; +} + +/* + * Process the result of probing a fileserver. This is called after successful + * or failed delivery of an FS.GetCapabilities operation. 
+ */ +void afs_fileserver_probe_result(struct afs_call *call) +{ + struct afs_addr_list *alist = call->alist; + struct afs_server *server = call->reply[0]; + unsigned int server_index = (long)call->reply[1]; + unsigned int index = call->addr_ix; + unsigned int rtt = UINT_MAX; + bool have_result = false; + u64 _rtt; + int ret = call->error; + + _enter("%pU,%u", &server->uuid, index); + + spin_lock(&server->probe_lock); + + switch (ret) { + case 0: + server->probe.error = 0; + goto responded; + case -ECONNABORTED: + if (!server->probe.responded) { + server->probe.abort_code = call->abort_code; + server->probe.error = ret; + } + goto responded; + case -ENOMEM: + case -ENONET: + server->probe.local_failure = true; + afs_io_error(call, afs_io_error_fs_probe_fail); + goto out; + case -ECONNRESET: /* Responded, but call expired. */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + default: + clear_bit(index, &alist->responded); + set_bit(index, &alist->failed); + if (!server->probe.responded && + (server->probe.error == 0 || + server->probe.error == -ETIMEDOUT || + server->probe.error == -ETIME)) + server->probe.error = ret; + afs_io_error(call, afs_io_error_fs_probe_fail); + goto out; + } + +responded: + set_bit(index, &alist->responded); + clear_bit(index, &alist->failed); + + if (call->service_id == YFS_FS_SERVICE) { + server->probe.is_yfs = true; + set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } else { + server->probe.not_yfs = true; + if (!server->probe.is_yfs) { + clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } + } + + /* Get the RTT and scale it to fit into a 32-bit value that represents + * over a minute of time so that we can access it with one instruction + * on a 32-bit system. + */ + _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); + _rtt /= 64; + rtt = (_rtt > UINT_MAX) ? 
UINT_MAX : _rtt; + if (rtt < server->probe.rtt) { + server->probe.rtt = rtt; + alist->preferred = index; + have_result = true; + } + + smp_wmb(); /* Set rtt before responded. */ + server->probe.responded = true; + set_bit(AFS_SERVER_FL_PROBED, &server->flags); +out: + spin_unlock(&server->probe_lock); + + _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", + server_index, index, &alist->addrs[index].transport, + (unsigned int)rtt, ret); + + have_result |= afs_fs_probe_done(server); + if (have_result) { + server->probe.have_result = true; + wake_up_var(&server->probe.have_result); + wake_up_all(&server->probe_wq); + } +} + +/* + * Probe all of a fileserver's addresses to find out the best route and to + * query its capabilities. + */ +static int afs_do_probe_fileserver(struct afs_net *net, + struct afs_server *server, + struct key *key, + unsigned int server_index) +{ + struct afs_addr_cursor ac = { + .index = 0, + }; + int ret; + + _enter("%pU", &server->uuid); + + read_lock(&server->fs_lock); + ac.alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->fs_lock)); + read_unlock(&server->fs_lock); + + atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + memset(&server->probe, 0, sizeof(server->probe)); + server->probe.rtt = UINT_MAX; + + for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { + ret = afs_fs_get_capabilities(net, server, &ac, key, server_index, + true); + if (ret != -EINPROGRESS) { + afs_fs_probe_done(server); + return ret; + } + } + + return 0; +} + +/* + * Send off probes to all unprobed servers. 
+ */ +int afs_probe_fileservers(struct afs_net *net, struct key *key, + struct afs_server_list *list) +{ + struct afs_server *server; + int i, ret; + + for (i = 0; i < list->nr_servers; i++) { + server = list->servers[i].server; + if (test_bit(AFS_SERVER_FL_PROBED, &server->flags)) + continue; + + if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags)) { + ret = afs_do_probe_fileserver(net, server, key, i); + if (ret) + return ret; + } + } + + return 0; +} + +/* + * Wait for the first as-yet untried fileserver to respond. + */ +int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) +{ + struct wait_queue_entry *waits; + struct afs_server *server; + unsigned int rtt = UINT_MAX; + bool have_responders = false; + int pref = -1, i; + + _enter("%u,%lx", slist->nr_servers, untried); + + /* Only wait for servers that have a probe outstanding. */ + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + if (!test_bit(AFS_SERVER_FL_PROBING, &server->flags)) + __clear_bit(i, &untried); + if (server->probe.responded) + have_responders = true; + } + } + if (have_responders || !untried) + return 0; + + waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); + if (!waits) + return -ENOMEM; + + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + init_waitqueue_entry(&waits[i], current); + add_wait_queue(&server->probe_wq, &waits[i]); + } + } + + for (;;) { + bool still_probing = false; + + set_current_state(TASK_INTERRUPTIBLE); + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + if (server->probe.responded) + goto stop; + if (test_bit(AFS_SERVER_FL_PROBING, &server->flags)) + still_probing = true; + } + } + + if (!still_probing || unlikely(signal_pending(current))) + goto stop; + schedule(); + } + +stop: + set_current_state(TASK_RUNNING); + + for (i 
= 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + if (server->probe.responded && + server->probe.rtt < rtt) { + pref = i; + rtt = server->probe.rtt; + } + + remove_wait_queue(&server->probe_wq, &waits[i]); + } + } + + kfree(waits); + + if (pref == -1 && signal_pending(current)) + return -ERESTARTSYS; + + if (pref >= 0) + slist->preferred = pref; + return 0; +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 50929cb91732..ca08c83168f5 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -17,15 +17,10 @@ #include "internal.h" #include "afs_fs.h" #include "xdr_fs.h" +#include "protocol_yfs.h" static const struct afs_fid afs_zero_fid; -/* - * We need somewhere to discard into in case the server helpfully returns more - * than we asked for in FS.FetchData{,64}. - */ -static u8 afs_discard_buffer[64]; - static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) { call->cbi = afs_get_cb_interest(cbi); @@ -75,8 +70,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode, struct timespec64 t; umode_t mode; - t.tv_sec = status->mtime_client; - t.tv_nsec = 0; + t = status->mtime_client; vnode->vfs_inode.i_ctime = t; vnode->vfs_inode.i_mtime = t; vnode->vfs_inode.i_atime = t; @@ -96,7 +90,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode, if (!(flags & AFS_VNODE_NOT_YET_SET)) { if (expected_version && *expected_version != status->data_version) { - _debug("vnode modified %llx on {%x:%u} [exp %llx]", + _debug("vnode modified %llx on {%llx:%llu} [exp %llx]", (unsigned long long) status->data_version, vnode->fid.vid, vnode->fid.vnode, (unsigned long long) *expected_version); @@ -170,7 +164,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, if (type != status->type && vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - pr_warning("Vnode %x:%x:%x changed type %u to %u\n", + pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n", vnode->fid.vid, 
vnode->fid.vnode, vnode->fid.unique, @@ -200,8 +194,10 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, EXTRACT_M(mode); EXTRACT_M(group); - status->mtime_client = ntohl(xdr->mtime_client); - status->mtime_server = ntohl(xdr->mtime_server); + status->mtime_client.tv_sec = ntohl(xdr->mtime_client); + status->mtime_client.tv_nsec = 0; + status->mtime_server.tv_sec = ntohl(xdr->mtime_server); + status->mtime_server.tv_nsec = 0; status->lock_count = ntohl(xdr->lock_count); size = (u64)ntohl(xdr->size_lo); @@ -233,7 +229,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, bad: xdr_dump_bad(*_bp); - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); } /* @@ -273,7 +269,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, write_seqlock(&vnode->cb_lock); - if (call->cb_break == afs_cb_break_sum(vnode, cbi)) { + if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) { vnode->cb_version = ntohl(*bp++); cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); @@ -293,13 +289,19 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, *_bp = bp; } -static void xdr_decode_AFSCallBack_raw(const __be32 **_bp, +static ktime_t xdr_decode_expiry(struct afs_call *call, u32 expiry) +{ + return ktime_add_ns(call->reply_time, expiry * NSEC_PER_SEC); +} + +static void xdr_decode_AFSCallBack_raw(struct afs_call *call, + const __be32 **_bp, struct afs_callback *cb) { const __be32 *bp = *_bp; cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); + cb->expires_at = xdr_decode_expiry(call, ntohl(*bp++)); cb->type = ntohl(*bp++); *_bp = bp; } @@ -311,14 +313,18 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { const __be32 *bp = *_bp; + u32 creation; - volsync->creation = ntohl(*bp++); + creation = ntohl(*bp++); bp++; /* spare2 */ bp++; /* spare3 */ bp++; /* spare4 */ bp++; /* spare5 */ bp++; /* spare6 */ *_bp = bp; + + if (volsync) + volsync->creation = 
creation; } /* @@ -379,6 +385,8 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, vs->blocks_in_use = ntohl(*bp++); vs->part_blocks_avail = ntohl(*bp++); vs->part_max_blocks = ntohl(*bp++); + vs->vol_copy_date = 0; + vs->vol_backup_date = 0; *_bp = bp; } @@ -395,16 +403,16 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) if (ret < 0) return ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; xdr_decode_AFSCallBack(call, vnode, &bp); - if (call->reply[1]) - xdr_decode_AFSVolSync(&bp, call->reply[1]); + xdr_decode_AFSVolSync(&bp, call->reply[1]); _leave(" = 0 [done]"); return 0; @@ -431,7 +439,10 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_fetch_file_status(fc, volsync, new_inode); + + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode, @@ -445,6 +456,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy call->reply[0] = vnode; call->reply[1] = volsync; call->expected_version = new_inode ? 
1 : vnode->status.data_version; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -468,139 +480,117 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) struct afs_read *req = call->reply[2]; const __be32 *bp; unsigned int size; - void *buffer; int ret; - _enter("{%u,%zu/%u;%llu/%llu}", - call->unmarshall, call->offset, call->count, - req->remain, req->actual_len); + _enter("{%u,%zu/%llu}", + call->unmarshall, iov_iter_count(&call->iter), req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - call->offset = 0; + req->index = 0; + req->offset = req->pos & (PAGE_SIZE - 1); call->unmarshall++; - if (call->operation_ID != FSFETCHDATA64) { - call->unmarshall++; - goto no_msw; + if (call->operation_ID == FSFETCHDATA64) { + afs_extract_to_tmp64(call); + } else { + call->tmp_u = htonl(0); + afs_extract_to_tmp(call); } - /* extract the upper part of the returned data length of an - * FSFETCHDATA64 op (which should always be 0 using this - * client) */ - case 1: - _debug("extract data length (MSW)"); - ret = afs_extract_data(call, &call->tmp, 4, true); - if (ret < 0) - return ret; - - req->actual_len = ntohl(call->tmp); - req->actual_len <<= 32; - call->offset = 0; - call->unmarshall++; - - no_msw: /* extract the returned data length */ - case 2: + case 1: _debug("extract data length"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->actual_len |= ntohl(call->tmp); + req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - - req->remain = req->actual_len; - call->offset = req->pos & (PAGE_SIZE - 1); - req->index = 0; - if (req->actual_len == 0) + req->remain = min(req->len, req->actual_len); + if (req->remain == 0) goto no_more_data; + call->unmarshall++; begin_page: ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - call->offset) - size = PAGE_SIZE - call->offset; + if 
(req->remain > PAGE_SIZE - req->offset) + size = PAGE_SIZE - req->offset; else size = req->remain; - call->count = call->offset + size; - ASSERTCMP(call->count, <=, PAGE_SIZE); - req->remain -= size; + call->bvec[0].bv_len = size; + call->bvec[0].bv_offset = req->offset; + call->bvec[0].bv_page = req->pages[req->index]; + iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); + ASSERTCMP(size, <=, PAGE_SIZE); /* extract the returned data */ - case 3: - _debug("extract data %llu/%llu %zu/%u", - req->remain, req->actual_len, call->offset, call->count); + case 2: + _debug("extract data %zu/%llu", + iov_iter_count(&call->iter), req->remain); - buffer = kmap(req->pages[req->index]); - ret = afs_extract_data(call, buffer, call->count, true); - kunmap(req->pages[req->index]); + ret = afs_extract_data(call, true); if (ret < 0) return ret; - if (call->offset == PAGE_SIZE) { + req->remain -= call->bvec[0].bv_len; + req->offset += call->bvec[0].bv_len; + ASSERTCMP(req->offset, <=, PAGE_SIZE); + if (req->offset == PAGE_SIZE) { + req->offset = 0; if (req->page_done) req->page_done(call, req); req->index++; - if (req->remain > 0) { - call->offset = 0; - if (req->index >= req->nr_pages) { - call->unmarshall = 4; - goto begin_discard; - } + if (req->remain > 0) goto begin_page; - } } - goto no_more_data; + + ASSERTCMP(req->remain, ==, 0); + if (req->actual_len <= req->len) + goto no_more_data; /* Discard any excess data the server gave us */ - begin_discard: - case 4: - size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain); - call->count = size; - _debug("extract discard %llu/%llu %zu/%u", - req->remain, req->actual_len, call->offset, call->count); - - call->offset = 0; - ret = afs_extract_data(call, afs_discard_buffer, call->count, true); - req->remain -= call->offset; + iov_iter_discard(&call->iter, READ, req->actual_len - req->len); + call->unmarshall = 3; + case 3: + _debug("extract discard %zu/%llu", + iov_iter_count(&call->iter), req->actual_len - req->len); + + ret 
= afs_extract_data(call, true); if (ret < 0) return ret; - if (req->remain > 0) - goto begin_discard; no_more_data: - call->offset = 0; - call->unmarshall = 5; + call->unmarshall = 4; + afs_extract_to_buf(call, (21 + 3 + 6) * 4); /* extract the metadata */ - case 5: - ret = afs_extract_data(call, call->buffer, - (21 + 3 + 6) * 4, false); + case 4: + ret = afs_extract_data(call, false); if (ret < 0) return ret; bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &vnode->status.data_version, req) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req); + if (ret < 0) + return ret; xdr_decode_AFSCallBack(call, vnode, &bp); - if (call->reply[1]) - xdr_decode_AFSVolSync(&bp, call->reply[1]); + xdr_decode_AFSVolSync(&bp, call->reply[1]); - call->offset = 0; call->unmarshall++; - case 6: + case 5: break; } for (; req->index < req->nr_pages; req->index++) { - if (call->count < PAGE_SIZE) + if (req->offset < PAGE_SIZE) zero_user_segment(req->pages[req->index], - call->count, PAGE_SIZE); + req->offset, PAGE_SIZE); if (req->page_done) req->page_done(call, req); - call->count = 0; + req->offset = 0; } _leave(" = 0 [done]"); @@ -653,6 +643,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[1] = NULL; /* volsync */ call->reply[2] = req; call->expected_version = vnode->status.data_version; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -682,6 +673,9 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_fetch_data(fc, req); + if (upper_32_bits(req->pos) || upper_32_bits(req->len) || upper_32_bits(req->pos + req->len)) @@ -698,6 +692,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[1] = NULL; /* 
volsync */ call->reply[2] = req; call->expected_version = vnode->status.data_version; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -733,11 +728,14 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || - afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); - xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); + ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_AFSCallBack_raw(call, &bp, call->reply[3]); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -778,6 +776,15 @@ int afs_fs_create(struct afs_fs_cursor *fc, size_t namesz, reqsz, padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)){ + if (S_ISDIR(mode)) + return yfs_fs_make_dir(fc, name, mode, current_data_version, + newfid, newstatus, newcb); + else + return yfs_fs_create_file(fc, name, mode, current_data_version, + newfid, newstatus, newcb); + } + _enter(""); namesz = strlen(name); @@ -796,6 +803,7 @@ int afs_fs_create(struct afs_fs_cursor *fc, call->reply[2] = newstatus; call->reply[3] = newcb; call->expected_version = current_data_version + 1; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -839,9 +847,10 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, 
&vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -868,15 +877,18 @@ static const struct afs_call_type afs_RXFSRemoveDir = { /* * remove a file or directory */ -int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, - u64 current_data_version) +int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, bool isdir, u64 current_data_version) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode *dvnode = fc->vnode; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct afs_net *net = afs_v2net(dvnode); size_t namesz, reqsz, padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_remove(fc, vnode, name, isdir, current_data_version); + _enter(""); namesz = strlen(name); @@ -890,15 +902,16 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, return -ENOMEM; call->key = fc->key; - call->reply[0] = vnode; + call->reply[0] = dvnode; + call->reply[1] = vnode; call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; *bp++ = htonl(isdir ? 
FSREMOVEDIR : FSREMOVEFILE); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(dvnode->fid.vid); + *bp++ = htonl(dvnode->fid.vnode); + *bp++ = htonl(dvnode->fid.unique); *bp++ = htonl(namesz); memcpy(bp, name, namesz); bp = (void *) bp + namesz; @@ -908,7 +921,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, } afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); + trace_afs_make_fs_call(call, &dvnode->fid); return afs_make_call(&fc->ac, call, GFP_NOFS, false); } @@ -929,10 +942,13 @@ static int afs_deliver_fs_link(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || - afs_decode_status(call, &bp, &dvnode->status, dvnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL); + if (ret < 0) + return ret; + ret = afs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -961,6 +977,9 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, size_t namesz, reqsz, padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_link(fc, vnode, name, current_data_version); + _enter(""); namesz = strlen(name); @@ -1016,10 +1035,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) || - afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, 
call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1052,6 +1074,10 @@ int afs_fs_symlink(struct afs_fs_cursor *fc, size_t namesz, reqsz, padsz, c_namesz, c_padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_symlink(fc, name, contents, current_data_version, + newfid, newstatus); + _enter(""); namesz = strlen(name); @@ -1122,13 +1148,16 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); - if (new_dvnode != orig_dvnode && - afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, - &call->expected_version_2, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + if (new_dvnode != orig_dvnode) { + ret = afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL); + if (ret < 0) + return ret; + } /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1161,6 +1190,12 @@ int afs_fs_rename(struct afs_fs_cursor *fc, size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_rename(fc, orig_name, + new_dvnode, new_name, + current_orig_data_version, + current_new_data_version); + _enter(""); o_namesz = strlen(orig_name); @@ -1231,9 +1266,10 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, 
&bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ afs_pages_written_back(vnode, call); @@ -1273,7 +1309,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc, struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); call = afs_alloc_flat_call(net, &afs_RXFSStoreData64, @@ -1330,7 +1366,10 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, loff_t size, pos, i_size; __be32 *bp; - _enter(",%x,{%x:%u},,", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_store_data(fc, mapping, first, last, offset, to); + + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); size = (loff_t)to - (loff_t)offset; @@ -1407,9 +1446,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1451,7 +1491,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); @@ -1498,7 +1538,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) struct afs_net *net = afs_v2net(vnode); __be32 *bp; - 
_enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); @@ -1544,10 +1584,13 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_setattr(fc, attr); + if (attr->ia_valid & ATTR_SIZE) return afs_fs_setattr_size(fc, attr); - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus, @@ -1581,164 +1624,114 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; + u32 size; int ret; _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: - call->offset = 0; call->unmarshall++; + afs_extract_to_buf(call, 12 * 4); /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, call->buffer, - 12 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; bp = call->buffer; xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]); - call->offset = 0; call->unmarshall++; + afs_extract_to_tmp(call); /* extract the volume name length */ case 2: - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_volname_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); call->unmarshall++; /* extract the volume name */ case 3: _debug("extract volname"); - if (call->count > 0) { - ret = afs_extract_data(call, call->reply[2], - call->count, true); - if (ret < 0) - return ret; - } + ret = afs_extract_data(call, 
true); + if (ret < 0) + return ret; p = call->reply[2]; p[call->count] = 0; _debug("volname '%s'", p); - - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; - /* extract the volume name padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_volname_padding; - } - call->count = 4 - (call->count & 3); - - case 4: - ret = afs_extract_data(call, call->buffer, - call->count, true); - if (ret < 0) - return ret; - - call->offset = 0; - call->unmarshall++; - no_volname_padding: - /* extract the offline message length */ - case 5: - ret = afs_extract_data(call, &call->tmp, 4, true); + case 4: + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_offline_msg_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); call->unmarshall++; /* extract the offline message */ - case 6: + case 5: _debug("extract offline"); - if (call->count > 0) { - ret = afs_extract_data(call, call->reply[2], - call->count, true); - if (ret < 0) - return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; p = call->reply[2]; p[call->count] = 0; _debug("offline '%s'", p); - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; - /* extract the offline message padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_offline_padding; - } - call->count = 4 - (call->count & 3); - - case 7: - ret = afs_extract_data(call, call->buffer, - call->count, true); - if (ret < 0) - return ret; - - call->offset = 0; - call->unmarshall++; - no_offline_padding: - /* extract the message of the day length */ - case 8: - ret = afs_extract_data(call, &call->tmp, 4, true); + case 6: + ret = afs_extract_data(call, true); if (ret < 0) return ret; 
call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_motd_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); call->unmarshall++; /* extract the message of the day */ - case 9: + case 7: _debug("extract motd"); - if (call->count > 0) { - ret = afs_extract_data(call, call->reply[2], - call->count, true); - if (ret < 0) - return ret; - } + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; p = call->reply[2]; p[call->count] = 0; _debug("motd '%s'", p); - call->offset = 0; call->unmarshall++; - /* extract the message of the day padding */ - call->count = (4 - (call->count & 3)) & 3; - - case 10: - ret = afs_extract_data(call, call->buffer, - call->count, false); - if (ret < 0) - return ret; - - call->offset = 0; - call->unmarshall++; - case 11: + case 8: break; } @@ -1778,6 +1771,9 @@ int afs_fs_get_volume_status(struct afs_fs_cursor *fc, __be32 *bp; void *tmpbuf; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_get_volume_status(fc, vs); + _enter(""); tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); @@ -1867,6 +1863,9 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_set_lock(fc, type); + _enter(""); call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4); @@ -1899,6 +1898,9 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_extend_lock(fc); + _enter(""); call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4); @@ -1930,6 +1932,9 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc) struct afs_net *net = 
afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_release_lock(fc); + _enter(""); call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); @@ -2004,19 +2009,16 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) u32 count; int ret; - _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter)); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* Extract the capabilities word count */ case 1: - ret = afs_extract_data(call, &call->tmp, - 1 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -2024,24 +2026,17 @@ again: call->count = count; call->count2 = count; - call->offset = 0; + iov_iter_discard(&call->iter, READ, count * sizeof(__be32)); call->unmarshall++; /* Extract capabilities words */ case 2: - count = min(call->count, 16U); - ret = afs_extract_data(call, call->buffer, - count * sizeof(__be32), - call->count > 16); + ret = afs_extract_data(call, false); if (ret < 0) return ret; /* TODO: Examine capabilities */ - call->count -= count; - if (call->count > 0) - goto again; - call->offset = 0; call->unmarshall++; break; } @@ -2050,6 +2045,14 @@ again: return 0; } +static void afs_destroy_fs_get_capabilities(struct afs_call *call) +{ + struct afs_server *server = call->reply[0]; + + afs_put_server(call->net, server); + afs_flat_call_destructor(call); +} + /* * FS.GetCapabilities operation type */ @@ -2057,7 +2060,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .name = "FS.GetCapabilities", .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, - .destructor = afs_flat_call_destructor, + .done = afs_fileserver_probe_result, + .destructor = afs_destroy_fs_get_capabilities, }; /* @@ -2067,7 +2071,9 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { int 
afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, struct afs_addr_cursor *ac, - struct key *key) + struct key *key, + unsigned int server_index, + bool async) { struct afs_call *call; __be32 *bp; @@ -2079,6 +2085,10 @@ int afs_fs_get_capabilities(struct afs_net *net, return -ENOMEM; call->key = key; + call->reply[0] = afs_get_server(server); + call->reply[1] = (void *)(long)server_index; + call->upgrade = true; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -2086,7 +2096,7 @@ int afs_fs_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_fs_call(call, NULL); - return afs_make_call(ac, call, GFP_NOFS, false); + return afs_make_call(ac, call, GFP_NOFS, async); } /* @@ -2097,7 +2107,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) struct afs_file_status *status = call->reply[1]; struct afs_callback *callback = call->reply[2]; struct afs_volsync *volsync = call->reply[3]; - struct afs_vnode *vnode = call->reply[0]; + struct afs_fid *fid = call->reply[0]; const __be32 *bp; int ret; @@ -2105,21 +2115,16 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) if (ret < 0) return ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", fid->vid, fid->vnode); /* unmarshall the reply once we've received all of it */ bp = call->buffer; - afs_decode_status(call, &bp, status, vnode, - &call->expected_version, NULL); - callback[call->count].version = ntohl(bp[0]); - callback[call->count].expiry = ntohl(bp[1]); - callback[call->count].type = ntohl(bp[2]); - if (vnode) - xdr_decode_AFSCallBack(call, vnode, &bp); - else - bp += 3; - if (volsync) - xdr_decode_AFSVolSync(&bp, volsync); + ret = afs_decode_status(call, &bp, status, NULL, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_AFSCallBack_raw(call, &bp, callback); + xdr_decode_AFSVolSync(&bp, volsync); _leave(" = 0 [done]"); return 0; @@ -2148,7 +2153,10 
@@ int afs_fs_fetch_status(struct afs_fs_cursor *fc, struct afs_call *call; __be32 *bp; - _enter(",%x,{%x:%u},,", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_fetch_status(fc, net, fid, status, callback, volsync); + + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), fid->vid, fid->vnode); call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); @@ -2158,11 +2166,12 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc, } call->key = fc->key; - call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[0] = fid; call->reply[1] = status; call->reply[2] = callback; call->reply[3] = volsync; call->expected_version = 1; /* vnode->status.data_version */ + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -2193,38 +2202,40 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* Extract the file status count and array in two steps */ case 1: _debug("extract status count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; tmp = ntohl(call->tmp); _debug("status count: %u/%u", tmp, call->count2); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_count); call->count = 0; call->unmarshall++; more_counts: - call->offset = 0; + afs_extract_to_buf(call, 21 * sizeof(__be32)); case 2: _debug("extract status array %u", call->count); - ret = afs_extract_data(call, call->buffer, 21 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; bp = call->buffer; statuses = call->reply[1]; - if (afs_decode_status(call, &bp, &statuses[call->count], - call->count == 0 ? 
vnode : NULL, - NULL, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL); + if (ret < 0) + return ret; call->count++; if (call->count < call->count2) @@ -2232,27 +2243,28 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->count = 0; call->unmarshall++; - call->offset = 0; + afs_extract_to_tmp(call); /* Extract the callback count and array in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_cb_count); call->count = 0; call->unmarshall++; more_cbs: - call->offset = 0; + afs_extract_to_buf(call, 3 * sizeof(__be32)); case 4: _debug("extract CB array"); - ret = afs_extract_data(call, call->buffer, 3 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -2260,7 +2272,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) bp = call->buffer; callbacks = call->reply[2]; callbacks[call->count].version = ntohl(bp[0]); - callbacks[call->count].expiry = ntohl(bp[1]); + callbacks[call->count].expires_at = xdr_decode_expiry(call, ntohl(bp[1])); callbacks[call->count].type = ntohl(bp[2]); statuses = call->reply[1]; if (call->count == 0 && vnode && statuses[0].abort_code == 0) @@ -2269,19 +2281,17 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) if (call->count < call->count2) goto more_cbs; - call->offset = 0; + afs_extract_to_buf(call, 6 * sizeof(__be32)); call->unmarshall++; case 5: - ret = afs_extract_data(call, call->buffer, 6 * 4, false); + ret = afs_extract_data(call, false); if (ret < 0) return ret; bp = call->buffer; - if (call->reply[3]) - xdr_decode_AFSVolSync(&bp, 
call->reply[3]); + xdr_decode_AFSVolSync(&bp, call->reply[3]); - call->offset = 0; call->unmarshall++; case 6: @@ -2317,7 +2327,11 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, __be32 *bp; int i; - _enter(",%x,{%x:%u},%u", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_inline_bulk_status(fc, net, fids, statuses, callbacks, + nr_fids, volsync); + + _enter(",%x,{%llx:%llu},%u", key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus, @@ -2334,6 +2348,7 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, call->reply[2] = callbacks; call->reply[3] = volsync; call->count2 = nr_fids; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 479b7fdda124..4c6d8e1112c2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -82,7 +82,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key) default: printk("kAFS: AFS vnode with undefined type\n"); read_sequnlock_excl(&vnode->cb_lock); - return afs_protocol_error(NULL, -EBADMSG); + return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type); } inode->i_blocks = 0; @@ -100,7 +100,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) struct afs_fs_cursor fc; int ret; - _enter("%s,{%x:%u.%u,S=%lx}", + _enter("%s,{%llx:%llu.%u,S=%lx}", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, vnode->flags); @@ -127,9 +127,9 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) int afs_iget5_test(struct inode *inode, void *opaque) { struct afs_iget_data *data = opaque; + struct afs_vnode *vnode = AFS_FS_I(inode); - return inode->i_ino == data->fid.vnode && - inode->i_generation == data->fid.unique; + return memcmp(&vnode->fid, &data->fid, sizeof(data->fid)) == 0; } /* @@ -150,11 +150,14 @@ static int afs_iget5_set(struct inode *inode, void 
*opaque) struct afs_iget_data *data = opaque; struct afs_vnode *vnode = AFS_FS_I(inode); - inode->i_ino = data->fid.vnode; - inode->i_generation = data->fid.unique; vnode->fid = data->fid; vnode->volume = data->volume; + /* YFS supports 96-bit vnode IDs, but Linux only supports + * 64-bit inode numbers. + */ + inode->i_ino = data->fid.vnode; + inode->i_generation = data->fid.unique; return 0; } @@ -193,7 +196,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }", + _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", inode, inode->i_ino, data.fid.vid, data.fid.vnode, data.fid.unique); @@ -252,8 +255,8 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) key.vnode_id = vnode->fid.vnode; key.unique = vnode->fid.unique; - key.vnode_id_ext[0] = 0; - key.vnode_id_ext[1] = 0; + key.vnode_id_ext[0] = vnode->fid.vnode >> 32; + key.vnode_id_ext[1] = vnode->fid.vnode_hi; aux.data_version = vnode->status.data_version; vnode->cache = fscache_acquire_cookie(vnode->volume->cache, @@ -277,7 +280,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, struct inode *inode; int ret; - _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique); + _enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique); as = sb->s_fs_info; data.volume = as->volume; @@ -289,7 +292,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { vl=%x vn=%x, u=%x }", + _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }", inode, fid->vid, fid->vnode, fid->unique); vnode = AFS_FS_I(inode); @@ -314,11 +317,11 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, * didn't give us a callback) */ vnode->cb_version = 0; vnode->cb_type = 0; - vnode->cb_expires_at = 0; + vnode->cb_expires_at = ktime_get(); } else { vnode->cb_version = cb->version; vnode->cb_type = cb->type; - vnode->cb_expires_at = cb->expiry; + 
vnode->cb_expires_at = cb->expires_at; vnode->cb_interest = afs_get_cb_interest(cbi); set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } @@ -352,7 +355,7 @@ bad_inode: */ void afs_zap_data(struct afs_vnode *vnode) { - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); #ifdef CONFIG_AFS_FSCACHE fscache_invalidate(vnode->cache); @@ -382,7 +385,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) bool valid = false; int ret; - _enter("{v={%x:%u} fl=%lx},%x", + _enter("{v={%llx:%llu} fl=%lx},%x", vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); @@ -501,7 +504,7 @@ void afs_evict_inode(struct inode *inode) vnode = AFS_FS_I(inode); - _enter("{%x:%u.%d}", + _enter("{%llx:%llu.%d}", vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); @@ -550,7 +553,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) struct key *key; int ret; - _enter("{%x:%u},{n=%pd},%x", + _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 72de1f157d20..5da3b09b7518 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -22,6 +22,7 @@ #include <linux/backing-dev.h> #include <linux/uuid.h> #include <linux/mm_types.h> +#include <linux/dns_resolver.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -75,10 +76,13 @@ struct afs_addr_list { u32 version; /* Version */ unsigned char max_addrs; unsigned char nr_addrs; - unsigned char index; /* Address currently in use */ + unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; unsigned long probed; /* Mask of servers that have been probed */ - unsigned long yfs; /* Mask of servers that are YFS */ + unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long responded; /* Mask of addrs that 
responded */ struct sockaddr_rxrpc addrs[]; #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; @@ -88,6 +92,7 @@ struct afs_addr_list { */ struct afs_call { const struct afs_call_type *type; /* type of call */ + struct afs_addr_list *alist; /* Address is alist[addr_ix] */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ @@ -98,16 +103,22 @@ struct afs_call { struct afs_cb_interest *cbi; /* Callback interest for server used */ void *request; /* request data (first part) */ struct address_space *mapping; /* Pages being written from */ + struct iov_iter iter; /* Buffer iterator */ + struct iov_iter *_iter; /* Iterator currently in use */ + union { /* Convenience for ->iter */ + struct kvec kvec[1]; + struct bio_vec bvec[1]; + }; void *buffer; /* reply receive buffer */ void *reply[4]; /* Where to put the reply */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ - size_t offset; /* offset into received data store */ atomic_t usage; enum afs_call_state state; spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ + u32 epoch; unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned first_offset; /* offset into mapping[first] */ @@ -117,19 +128,28 @@ struct afs_call { unsigned count2; /* count used in unmarshalling */ }; unsigned char unmarshall; /* unmarshalling phase */ + unsigned char addr_ix; /* Address in ->alist */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool ret_reply0; /* T if should return reply[0] on success */ bool upgrade; /* T to request service upgrade */ + bool want_reply_time; /* T if want reply_time */ u16 service_id; /* 
Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ - __be32 tmp; /* place to extract temporary data */ + union { /* place to extract temporary data */ + struct { + __be32 tmp_u; + __be32 tmp; + } __attribute__((packed)); + __be64 tmp64; + }; afs_dataversion_t expected_version; /* Updated version expected from store */ afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */ + ktime_t reply_time; /* Time of first reply packet */ }; struct afs_call_type { @@ -146,6 +166,9 @@ struct afs_call_type { /* Work function */ void (*work)(struct work_struct *work); + + /* Call done function (gets called immediately on success or failure) */ + void (*done)(struct afs_call *call); }; /* @@ -185,6 +208,7 @@ struct afs_read { refcount_t usage; unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; + unsigned int offset; /* offset into current page */ void (*page_done)(struct afs_call *, struct afs_read *); struct page **pages; struct page *array[]; @@ -343,13 +367,70 @@ struct afs_cell { rwlock_t proc_lock; /* VL server list. */ - rwlock_t vl_addrs_lock; /* Lock on vl_addrs */ - struct afs_addr_list __rcu *vl_addrs; /* List of VL servers */ + rwlock_t vl_servers_lock; /* Lock on vl_servers */ + struct afs_vlserver_list __rcu *vl_servers; + u8 name_len; /* Length of name */ char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */ }; /* + * Volume Location server record. 
+ */ +struct afs_vlserver { + struct rcu_head rcu; + struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */ + unsigned long flags; +#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ +#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ +#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ + rwlock_t lock; /* Lock on addresses */ + atomic_t usage; + + /* Probe state */ + wait_queue_head_t probe_wq; + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { + unsigned int rtt; /* RTT as ktime/64 */ + u32 abort_code; + short error; + bool have_result; + bool responded:1; + bool is_yfs:1; + bool not_yfs:1; + bool local_failure:1; + } probe; + + u16 port; + u16 name_len; /* Length of name */ + char name[]; /* Server name, case-flattened */ +}; + +/* + * Weighted list of Volume Location servers. + */ +struct afs_vlserver_entry { + u16 priority; /* Preference (as SRV) */ + u16 weight; /* Weight (as SRV) */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + struct afs_vlserver *server; +}; + +struct afs_vlserver_list { + struct rcu_head rcu; + atomic_t usage; + u8 nr_servers; + u8 index; /* Server currently in use */ + u8 preferred; /* Preferred server */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + rwlock_t lock; + struct afs_vlserver_entry servers[]; +}; + +/* * Cached VLDB entry. * * This is pointed to by cell->vldb_entries, indexed by name. 
@@ -403,8 +484,12 @@ struct afs_server { #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ #define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ +#define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */ +#define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */ +#define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */ atomic_t usage; u32 addr_version; /* Address list version */ + u32 cm_epoch; /* Server RxRPC epoch */ /* file service access */ rwlock_t fs_lock; /* access lock */ @@ -413,6 +498,26 @@ struct afs_server { struct hlist_head cb_volumes; /* List of volume interests on this server */ unsigned cb_s_break; /* Break-everything counter. */ rwlock_t cb_break_lock; /* Volume finding lock */ + + /* Probe state */ + wait_queue_head_t probe_wq; + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { + unsigned int rtt; /* RTT as ktime/64 */ + u32 abort_code; + u32 cm_epoch; + short error; + bool have_result; + bool responded:1; + bool is_yfs:1; + bool not_yfs:1; + bool local_failure:1; + bool no_epoch:1; + bool cm_probed:1; + bool said_rebooted:1; + bool said_inconsistent:1; + } probe; }; /* @@ -447,8 +552,8 @@ struct afs_server_entry { struct afs_server_list { refcount_t usage; - unsigned short nr_servers; - unsigned short index; /* Server currently in use */ + unsigned char nr_servers; + unsigned char preferred; /* Preferred server */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; @@ -550,6 +655,15 @@ struct afs_vnode { afs_callback_type_t cb_type; /* type of callback */ }; +static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) +{ +#ifdef CONFIG_AFS_FSCACHE + return vnode->cache; +#else + return NULL; +#endif +} + /* * cached security record for one user's attempt to access a vnode 
*/ @@ -586,13 +700,31 @@ struct afs_interface { */ struct afs_addr_cursor { struct afs_addr_list *alist; /* Current address list (pins ref) */ - struct sockaddr_rxrpc *addr; + unsigned long tried; /* Tried addresses */ + signed char index; /* Current address */ + bool responded; /* T if the current address responded */ + unsigned short nr_iterations; /* Number of address iterations */ + short error; u32 abort_code; - unsigned short start; /* Starting point in alist->addrs[] */ - unsigned short index; /* Wrapping offset from start to current addr */ +}; + +/* + * Cursor for iterating over a set of volume location servers. + */ +struct afs_vl_cursor { + struct afs_addr_cursor ac; + struct afs_cell *cell; /* The cell we're querying */ + struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ + struct afs_vlserver *server; /* Server on which this resides */ + struct key *key; /* Key for the server */ + unsigned long untried; /* Bitmask of untried servers */ + short index; /* Current server */ short error; - bool begun; /* T if we've begun iteration */ - bool responded; /* T if the current address responded */ + unsigned short flags; +#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ +#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ +#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ + unsigned short nr_iterations; /* Number of server iterations */ }; /* @@ -604,10 +736,11 @@ struct afs_fs_cursor { struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */ struct key *key; /* Key for the server */ + unsigned long untried; /* Bitmask of untried servers */ unsigned int cb_break; /* cb_break + cb_s_break before the call */ unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */ - unsigned char start; /* Initial index in server list */ - unsigned char index; /* Number of servers tried beyond start */ + short index; /* 
Current server */ + short error; unsigned short flags; #define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */ @@ -615,6 +748,7 @@ struct afs_fs_cursor { #define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */ #define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ #define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ + unsigned short nr_iterations; /* Number of server iterations */ }; /* @@ -640,12 +774,12 @@ extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, unsigned short, unsigned short); extern void afs_put_addrlist(struct afs_addr_list *); -extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char, - unsigned short, unsigned short); -extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *); +extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, + const char *, size_t, char, + unsigned short, unsigned short); +extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); extern bool afs_iterate_addresses(struct afs_addr_cursor *); extern int afs_end_cursor(struct afs_addr_cursor *); -extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *); extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); @@ -668,6 +802,7 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; * callback.c */ extern void afs_init_callback_state(struct afs_server *); +extern void __afs_break_callback(struct afs_vnode *); extern void afs_break_callback(struct afs_vnode *); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); @@ -688,10 +823,13 @@ static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; } -static inline unsigned int 
afs_cb_break_sum(struct afs_vnode *vnode, - struct afs_cb_interest *cbi) +static inline bool afs_cb_is_broken(unsigned int cb_break, + const struct afs_vnode *vnode, + const struct afs_cb_interest *cbi) { - return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break; + return !cbi || cb_break != (vnode->cb_break + + cbi->server->cb_s_break + + vnode->volume->cb_v_break); } /* @@ -781,7 +919,7 @@ extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64, struct afs_fid *, struct afs_file_status *, struct afs_callback *); -extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64); +extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64); extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, struct afs_fid *, struct afs_file_status *); @@ -797,7 +935,7 @@ extern int afs_fs_release_lock(struct afs_fs_cursor *); extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); + struct afs_addr_cursor *, struct key *, unsigned int, bool); extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, struct afs_fid *, struct afs_file_status *, struct afs_callback *, unsigned int, @@ -807,6 +945,13 @@ extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, struct afs_callback *, struct afs_volsync *); /* + * fs_probe.c + */ +extern void afs_fileserver_probe_result(struct afs_call *); +extern int afs_probe_fileservers(struct afs_net *, struct key *, struct afs_server_list *); +extern int afs_wait_for_fs_probes(struct 
afs_server_list *, unsigned long); + +/* * inode.c */ extern int afs_fetch_status(struct afs_vnode *, struct key *, bool); @@ -922,7 +1067,6 @@ extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); -extern int afs_queue_call_work(struct afs_call *); extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, @@ -930,12 +1074,39 @@ extern struct afs_call *afs_alloc_flat_call(struct afs_net *, extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, void *, size_t, bool); -extern int afs_protocol_error(struct afs_call *, int); +extern int afs_extract_data(struct afs_call *, bool); +extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause); + +static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) +{ + call->kvec[0].iov_base = buf; + call->kvec[0].iov_len = size; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, size); +} + +static inline void afs_extract_to_tmp(struct afs_call *call) +{ + afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); +} + +static inline void afs_extract_to_tmp64(struct afs_call *call) +{ + afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); +} + +static inline void afs_extract_discard(struct afs_call *call, size_t size) +{ + iov_iter_discard(&call->iter, READ, size); +} + +static inline void afs_extract_to_buf(struct afs_call *call, size_t size) +{ + afs_extract_begin(call, call->buffer, size); +} static inline int afs_transfer_reply(struct afs_call *call) { - return afs_extract_data(call, call->buffer, call->reply_max, false); + 
return afs_extract_data(call, false); } static inline bool afs_check_call_state(struct afs_call *call, @@ -1012,7 +1183,6 @@ extern void afs_put_server(struct afs_net *, struct afs_server *); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); -extern bool afs_probe_fileserver(struct afs_fs_cursor *); extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *); /* @@ -1039,14 +1209,51 @@ extern void afs_fs_exit(void); /* * vlclient.c */ -extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *, - struct afs_addr_cursor *, - struct key *, const char *, int); -extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *, - struct key *, const uuid_t *); -extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *); -extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *, - struct key *, const uuid_t *); +extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, + const char *, int); +extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); +extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *, + struct afs_vlserver *, unsigned int, bool); +extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); + +/* + * vl_probe.c + */ +extern void afs_vlserver_probe_result(struct afs_call *); +extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *); +extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); + +/* + * vl_rotate.c + */ +extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, + struct afs_cell *, struct key *); +extern bool afs_select_vlserver(struct afs_vl_cursor *); +extern bool afs_select_current_vlserver(struct 
afs_vl_cursor *); +extern int afs_end_vlserver_operation(struct afs_vl_cursor *); + +/* + * vlserver_list.c + */ +static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) +{ + atomic_inc(&vlserver->usage); + return vlserver; +} + +static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) +{ + if (vllist) + atomic_inc(&vllist->usage); + return vllist; +} + +extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); +extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); +extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); +extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); +extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, + const void *, size_t); /* * volume.c @@ -1089,6 +1296,36 @@ extern int afs_launder_page(struct page *); extern const struct xattr_handler *afs_xattr_handlers[]; extern ssize_t afs_listxattr(struct dentry *, char *, size_t); +/* + * yfsclient.c + */ +extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool); +extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); +extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, u64, + struct afs_fid *, struct afs_file_status *, struct afs_callback *); +extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, u64, + struct afs_fid *, struct afs_file_status *, struct afs_callback *); +extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64); +extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, + struct afs_fid *, struct afs_file_status *); +extern int yfs_fs_rename(struct afs_fs_cursor *, const 
char *, + struct afs_vnode *, const char *, u64, u64); +extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *, + pgoff_t, pgoff_t, unsigned, unsigned); +extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *); +extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *); +extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t); +extern int yfs_fs_extend_lock(struct afs_fs_cursor *); +extern int yfs_fs_release_lock(struct afs_fs_cursor *); +extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_volsync *); +extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, unsigned int, + struct afs_volsync *); /* * Miscellaneous inline functions. @@ -1120,6 +1357,17 @@ static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc, } } +static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) +{ + trace_afs_io_error(call->debug_id, -EIO, where); + return -EIO; +} + +static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) +{ + trace_afs_file_error(vnode, -EIO, where); + return -EIO; +} /*****************************************************************************/ /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 99fd13500a97..2e51c6994148 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -130,9 +130,10 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) goto error_no_page; } - ret = -EIO; - if (PageError(page)) + if (PageError(page)) { + ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); goto error; + } buf = kmap_atomic(page); memcpy(devname, buf, size); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 9101f62707af..be2ee3bbd0a9 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -17,6 +17,11 @@ #include <linux/uaccess.h> 
#include "internal.h" +struct afs_vl_seq_net_private { + struct seq_net_private seq; /* Must be first */ + struct afs_vlserver_list *vllist; +}; + static inline struct afs_net *afs_seq2net(struct seq_file *m) { return afs_net(seq_file_net(m)); @@ -32,16 +37,24 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m) */ static int afs_proc_cells_show(struct seq_file *m, void *v) { - struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + struct afs_vlserver_list *vllist; + struct afs_cell *cell; if (v == SEQ_START_TOKEN) { /* display header on line 1 */ - seq_puts(m, "USE NAME\n"); + seq_puts(m, "USE TTL SV NAME\n"); return 0; } + cell = list_entry(v, struct afs_cell, proc_link); + vllist = rcu_dereference(cell->vl_servers); + /* display one cell per line on subsequent lines */ - seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name); + seq_printf(m, "%3u %6lld %2u %s\n", + atomic_read(&cell->usage), + cell->dns_expiry - ktime_get_real_seconds(), + vllist ? 
vllist->nr_servers : 0, + cell->name); return 0; } @@ -208,7 +221,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%3d %08x %s\n", + seq_printf(m, "%3d %08llx %s\n", atomic_read(&vol->usage), vol->vid, afs_vol_types[vol->type]); @@ -247,61 +260,102 @@ static const struct seq_operations afs_proc_cell_volumes_ops = { .show = afs_proc_cell_volumes_show, }; +static const char *const dns_record_sources[NR__dns_record_source + 1] = { + [DNS_RECORD_UNAVAILABLE] = "unav", + [DNS_RECORD_FROM_CONFIG] = "cfg", + [DNS_RECORD_FROM_DNS_A] = "A", + [DNS_RECORD_FROM_DNS_AFSDB] = "AFSDB", + [DNS_RECORD_FROM_DNS_SRV] = "SRV", + [DNS_RECORD_FROM_NSS] = "nss", + [NR__dns_record_source] = "[weird]" +}; + +static const char *const dns_lookup_statuses[NR__dns_lookup_status + 1] = { + [DNS_LOOKUP_NOT_DONE] = "no-lookup", + [DNS_LOOKUP_GOOD] = "good", + [DNS_LOOKUP_GOOD_WITH_BAD] = "good/bad", + [DNS_LOOKUP_BAD] = "bad", + [DNS_LOOKUP_GOT_NOT_FOUND] = "not-found", + [DNS_LOOKUP_GOT_LOCAL_FAILURE] = "local-failure", + [DNS_LOOKUP_GOT_TEMP_FAILURE] = "temp-failure", + [DNS_LOOKUP_GOT_NS_FAILURE] = "ns-failure", + [NR__dns_lookup_status] = "[weird]" +}; + /* * Display the list of Volume Location servers we're using for a cell. 
*/ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) { - struct sockaddr_rxrpc *addr = v; + const struct afs_vl_seq_net_private *priv = m->private; + const struct afs_vlserver_list *vllist = priv->vllist; + const struct afs_vlserver_entry *entry; + const struct afs_vlserver *vlserver; + const struct afs_addr_list *alist; + int i; - /* display header on line 1 */ - if (v == (void *)1) { - seq_puts(m, "ADDRESS\n"); + if (v == SEQ_START_TOKEN) { + seq_printf(m, "# source %s, status %s\n", + dns_record_sources[vllist->source], + dns_lookup_statuses[vllist->status]); return 0; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%pISp\n", &addr->transport); + entry = v; + vlserver = entry->server; + alist = rcu_dereference(vlserver->addresses); + + seq_printf(m, "%s [p=%hu w=%hu s=%s,%s]:\n", + vlserver->name, entry->priority, entry->weight, + dns_record_sources[alist ? alist->source : entry->source], + dns_lookup_statuses[alist ? alist->status : entry->status]); + if (alist) { + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " %c %pISpc\n", + alist->preferred == i ? 
'>' : '-', + &alist->addrs[i].transport); + } return 0; } static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { - struct afs_addr_list *alist; + struct afs_vl_seq_net_private *priv = m->private; + struct afs_vlserver_list *vllist; struct afs_cell *cell = PDE_DATA(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); - alist = rcu_dereference(cell->vl_addrs); + vllist = rcu_dereference(cell->vl_servers); + priv->vllist = vllist; - /* allow for the header line */ - if (!pos) - return (void *) 1; - pos--; + if (pos < 0) + *_pos = pos = 0; + if (pos == 0) + return SEQ_START_TOKEN; - if (!alist || pos >= alist->nr_addrs) + if (!vllist || pos - 1 >= vllist->nr_servers) return NULL; - return alist->addrs + pos; + return &vllist->servers[pos - 1]; } static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_addr_list *alist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_vl_seq_net_private *priv = m->private; + struct afs_vlserver_list *vllist = priv->vllist; loff_t pos; - alist = rcu_dereference(cell->vl_addrs); - pos = *_pos; - (*_pos)++; - if (!alist || pos >= alist->nr_addrs) + pos++; + *_pos = pos; + if (!vllist || pos - 1 >= vllist->nr_servers) return NULL; - return alist->addrs + pos; + return &vllist->servers[pos - 1]; } static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v) @@ -337,11 +391,11 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) &server->uuid, atomic_read(&server->usage), &alist->addrs[0].transport, - alist->index == 0 ? "*" : ""); + alist->preferred == 0 ? "*" : ""); for (i = 1; i < alist->nr_addrs; i++) seq_printf(m, " %pISpc%s\n", &alist->addrs[i].transport, - alist->index == i ? "*" : ""); + alist->preferred == i ? 
"*" : ""); return 0; } @@ -562,7 +616,7 @@ int afs_proc_cell_setup(struct afs_cell *cell) if (!proc_create_net_data("vlservers", 0444, dir, &afs_proc_cell_vlservers_ops, - sizeof(struct seq_net_private), + sizeof(struct afs_vl_seq_net_private), cell) || !proc_create_net_data("volumes", 0444, dir, &afs_proc_cell_volumes_ops, diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h new file mode 100644 index 000000000000..07bc10f076aa --- /dev/null +++ b/fs/afs/protocol_yfs.h @@ -0,0 +1,163 @@ +/* YFS protocol bits + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define YFS_FS_SERVICE 2500 +#define YFS_CM_SERVICE 2501 + +#define YFSCBMAX 1024 + +enum YFS_CM_Operations { + YFSCBProbe = 206, /* probe client */ + YFSCBGetLock = 207, /* get contents of CM lock table */ + YFSCBXStatsVersion = 209, /* get version of extended statistics */ + YFSCBGetXStats = 210, /* get contents of extended statistics data */ + YFSCBInitCallBackState3 = 213, /* initialise callback state, version 3 */ + YFSCBProbeUuid = 214, /* check the client hasn't rebooted */ + YFSCBGetServerPrefs = 215, + YFSCBGetCellServDV = 216, + YFSCBGetLocalCell = 217, + YFSCBGetCacheConfig = 218, + YFSCBGetCellByNum = 65537, + YFSCBTellMeAboutYourself = 65538, /* get client capabilities */ + YFSCBCallBack = 64204, +}; + +enum YFS_FS_Operations { + YFSFETCHACL = 64131, /* YFS Fetch file ACL */ + YFSFETCHSTATUS = 64132, /* YFS Fetch file status */ + YFSSTOREACL = 64134, /* YFS Store file ACL */ + YFSSTORESTATUS = 64135, /* YFS Store file status */ + YFSREMOVEFILE = 64136, /* YFS Remove a file */ + YFSCREATEFILE = 64137, /* YFS Create a file */ + YFSRENAME = 64138, /* YFS Rename or move a 
file or directory */ + YFSSYMLINK = 64139, /* YFS Create a symbolic link */ + YFSLINK = 64140, /* YFS Create a hard link */ + YFSMAKEDIR = 64141, /* YFS Create a directory */ + YFSREMOVEDIR = 64142, /* YFS Remove a directory */ + YFSGETVOLUMESTATUS = 64149, /* YFS Get volume status information */ + YFSSETVOLUMESTATUS = 64150, /* YFS Set volume status information */ + YFSSETLOCK = 64156, /* YFS Request a file lock */ + YFSEXTENDLOCK = 64157, /* YFS Extend a file lock */ + YFSRELEASELOCK = 64158, /* YFS Release a file lock */ + YFSLOOKUP = 64161, /* YFS lookup file in directory */ + YFSFLUSHCPS = 64165, + YFSFETCHOPAQUEACL = 64168, + YFSWHOAMI = 64170, + YFSREMOVEACL = 64171, + YFSREMOVEFILE2 = 64173, + YFSSTOREOPAQUEACL2 = 64174, + YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ + YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ + YFSSTOREDATA64 = 64538, /* YFS Store file data */ + YFSUPDATESYMLINK = 64540, +}; + +struct yfs_xdr_u64 { + __be32 msw; + __be32 lsw; +} __packed; + +static inline u64 xdr_to_u64(const struct yfs_xdr_u64 x) +{ + return ((u64)ntohl(x.msw) << 32) | ntohl(x.lsw); +} + +static inline struct yfs_xdr_u64 u64_to_xdr(const u64 x) +{ + return (struct yfs_xdr_u64){ .msw = htonl(x >> 32), .lsw = htonl(x) }; +} + +struct yfs_xdr_vnode { + struct yfs_xdr_u64 lo; + __be32 hi; + __be32 unique; +} __packed; + +struct yfs_xdr_YFSFid { + struct yfs_xdr_u64 volume; + struct yfs_xdr_vnode vnode; +} __packed; + + +struct yfs_xdr_YFSFetchStatus { + __be32 type; + __be32 nlink; + struct yfs_xdr_u64 size; + struct yfs_xdr_u64 data_version; + struct yfs_xdr_u64 author; + struct yfs_xdr_u64 owner; + struct yfs_xdr_u64 group; + __be32 mode; + __be32 caller_access; + __be32 anon_access; + struct yfs_xdr_vnode parent; + __be32 data_access_protocol; + struct yfs_xdr_u64 mtime_client; + struct yfs_xdr_u64 mtime_server; + __be32 lock_count; + __be32 abort_code; +} __packed; + +struct yfs_xdr_YFSCallBack { + __be32 version; + struct 
yfs_xdr_u64 expiration_time; + __be32 type; +} __packed; + +struct yfs_xdr_YFSStoreStatus { + __be32 mask; + __be32 mode; + struct yfs_xdr_u64 mtime_client; + struct yfs_xdr_u64 owner; + struct yfs_xdr_u64 group; +} __packed; + +struct yfs_xdr_RPCFlags { + __be32 rpc_flags; +} __packed; + +struct yfs_xdr_YFSVolSync { + struct yfs_xdr_u64 vol_creation_date; + struct yfs_xdr_u64 vol_update_date; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 blocks_in_use; + struct yfs_xdr_u64 blocks_avail; +} __packed; + +enum yfs_volume_type { + yfs_volume_type_ro = 0, + yfs_volume_type_rw = 1, +}; + +#define yfs_FVSOnline 0x1 +#define yfs_FVSInservice 0x2 +#define yfs_FVSBlessed 0x4 +#define yfs_FVSNeedsSalvage 0x8 + +struct yfs_xdr_YFSFetchVolumeStatus { + struct yfs_xdr_u64 vid; + struct yfs_xdr_u64 parent_id; + __be32 flags; + __be32 type; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 blocks_in_use; + struct yfs_xdr_u64 part_blocks_avail; + struct yfs_xdr_u64 part_max_blocks; + struct yfs_xdr_u64 vol_copy_date; + struct yfs_xdr_u64 vol_backup_date; +} __packed; + +struct yfs_xdr_YFSStoreVolumeStatus { + __be32 mask; + struct yfs_xdr_u64 min_quota; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 file_quota; +} __packed; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 1faef56b12bd..00504254c1c2 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -19,14 +19,6 @@ #include "afs_fs.h" /* - * Initialise a filesystem server cursor for iterating over FS servers. - */ -static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) -{ - memset(fc, 0, sizeof(*fc)); -} - -/* * Begin an operation on the fileserver. 
* * Fileserver operations are serialised on the server by vnode, so we serialise @@ -35,13 +27,14 @@ static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, struct key *key) { - afs_init_fs_cursor(fc, vnode); + memset(fc, 0, sizeof(*fc)); fc->vnode = vnode; fc->key = key; fc->ac.error = SHRT_MAX; + fc->error = -EDESTADDRREQ; if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - fc->ac.error = -EINTR; + fc->error = -EINTR; fc->flags |= AFS_FS_CURSOR_STOP; return false; } @@ -65,12 +58,15 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, fc->server_list = afs_get_serverlist(vnode->volume->servers); read_unlock(&vnode->volume->servers_lock); + fc->untried = (1UL << fc->server_list->nr_servers) - 1; + fc->index = READ_ONCE(fc->server_list->preferred); + cbi = vnode->cb_interest; if (cbi) { /* See if the vnode's preferred record is still available */ for (i = 0; i < fc->server_list->nr_servers; i++) { if (fc->server_list->servers[i].cb_interest == cbi) { - fc->start = i; + fc->index = i; goto found_interest; } } @@ -80,7 +76,7 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, * and have to return an error. 
*/ if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; return false; } @@ -94,12 +90,9 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, afs_put_cb_interest(afs_v2net(vnode), cbi); cbi = NULL; - } else { - fc->start = READ_ONCE(fc->server_list->index); } found_interest: - fc->index = fc->start; return true; } @@ -117,7 +110,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) default: m = "busy"; break; } - pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); + pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); } /* @@ -127,7 +120,7 @@ static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) { msleep_interruptible(1000); if (signal_pending(current)) { - fc->ac.error = -ERESTARTSYS; + fc->error = -ERESTARTSYS; return false; } @@ -143,27 +136,32 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = fc->vnode; + u32 rtt, abort_code; + int error = fc->ac.error, i; - _enter("%u/%u,%u/%u,%d,%d", - fc->index, fc->start, - fc->ac.index, fc->ac.start, - fc->ac.error, fc->ac.abort_code); + _enter("%lx[%d],%lx[%d],%d,%d", + fc->untried, fc->index, + fc->ac.tried, fc->ac.index, + error, fc->ac.abort_code); if (fc->flags & AFS_FS_CURSOR_STOP) { _leave(" = f [stopped]"); return false; } + fc->nr_iterations++; + /* Evaluate the result of the previous operation, if there was one. */ - switch (fc->ac.error) { + switch (error) { case SHRT_MAX: goto start; case 0: default: /* Success or local failure. Stop. */ + fc->error = error; fc->flags |= AFS_FS_CURSOR_STOP; - _leave(" = f [okay/local %d]", fc->ac.error); + _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: @@ -178,7 +176,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * - May indicate that the fileserver couldn't attach to the vol. 
*/ if (fc->flags & AFS_FS_CURSOR_VNOVOL) { - fc->ac.error = -EREMOTEIO; + fc->error = -EREMOTEIO; goto next_server; } @@ -187,12 +185,12 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) write_unlock(&vnode->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); - if (fc->ac.error < 0) - goto failed; + error = afs_check_volume_status(vnode->volume, fc->key); + if (error < 0) + goto failed_set_error; if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { - fc->ac.error = -ENOMEDIUM; + fc->error = -ENOMEDIUM; goto failed; } @@ -200,7 +198,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * it's the fileserver having trouble. */ if (vnode->volume->servers == fc->server_list) { - fc->ac.error = -EREMOTEIO; + fc->error = -EREMOTEIO; goto next_server; } @@ -215,7 +213,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case VONLINE: case VDISKFULL: case VOVERQUOTA: - fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + fc->error = afs_abort_to_error(fc->ac.abort_code); goto next_server; case VOFFLINE: @@ -224,11 +222,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); } if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { - fc->ac.error = -EADV; + fc->error = -EADV; goto failed; } if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; goto failed; } goto busy; @@ -240,7 +238,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * have a file lock we need to maintain. */ if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { - fc->ac.error = -EBUSY; + fc->error = -EBUSY; goto failed; } if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { @@ -269,16 +267,16 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * honour, just in case someone sets up a loop. 
*/ if (fc->flags & AFS_FS_CURSOR_VMOVED) { - fc->ac.error = -EREMOTEIO; + fc->error = -EREMOTEIO; goto failed; } fc->flags |= AFS_FS_CURSOR_VMOVED; set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); - if (fc->ac.error < 0) - goto failed; + error = afs_check_volume_status(vnode->volume, fc->key); + if (error < 0) + goto failed_set_error; /* If the server list didn't change, then the VLDB is * out of sync with the fileservers. This is hopefully @@ -290,7 +288,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * TODO: Retry a few times with sleeps. */ if (vnode->volume->servers == fc->server_list) { - fc->ac.error = -ENOMEDIUM; + fc->error = -ENOMEDIUM; goto failed; } @@ -299,20 +297,25 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) default: clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); - fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + fc->error = afs_abort_to_error(fc->ac.abort_code); goto failed; } + case -ETIMEDOUT: + case -ETIME: + if (fc->error != -EDESTADDRREQ) + goto iterate_address; + /* Fall through */ case -ENETUNREACH: case -EHOSTUNREACH: case -ECONNREFUSED: - case -ETIMEDOUT: - case -ETIME: _debug("no conn"); + fc->error = error; goto iterate_address; case -ECONNRESET: _debug("call reset"); + fc->error = error; goto failed; } @@ -328,15 +331,57 @@ start: /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. 
*/ - fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); - if (fc->ac.error < 0) - goto failed; + error = afs_check_volume_status(vnode->volume, fc->key); + if (error < 0) + goto failed_set_error; if (!afs_start_fs_iteration(fc, vnode)) goto failed; -use_server: - _debug("use"); + _debug("__ VOL %llx __", vnode->volume->vid); + error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list); + if (error < 0) + goto failed_set_error; + +pick_server: + _debug("pick [%lx]", fc->untried); + + error = afs_wait_for_fs_probes(fc->server_list, fc->untried); + if (error < 0) + goto failed_set_error; + + /* Pick the untried server with the lowest RTT. If we have outstanding + * callbacks, we stick with the server we're already using if we can. + */ + if (fc->cbi) { + _debug("cbi %u", fc->index); + if (test_bit(fc->index, &fc->untried)) + goto selected_server; + afs_put_cb_interest(afs_v2net(vnode), fc->cbi); + fc->cbi = NULL; + _debug("nocbi"); + } + + fc->index = -1; + rtt = U32_MAX; + for (i = 0; i < fc->server_list->nr_servers; i++) { + struct afs_server *s = fc->server_list->servers[i].server; + + if (!test_bit(i, &fc->untried) || !s->probe.responded) + continue; + if (s->probe.rtt < rtt) { + fc->index = i; + rtt = s->probe.rtt; + } + } + + if (fc->index == -1) + goto no_more_servers; + +selected_server: + _debug("use %d", fc->index); + __clear_bit(fc->index, &fc->untried); + /* We're starting on a different fileserver from the list. We need to * check it, create a callback intercept, find its address list and * probe its capabilities before we use it. @@ -354,10 +399,10 @@ use_server: * break request before we've finished decoding the reply and * installing the vnode. 
*/ - fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list, - fc->index); - if (fc->ac.error < 0) - goto failed; + error = afs_register_server_cb_interest(vnode, fc->server_list, + fc->index); + if (error < 0) + goto failed_set_error; fc->cbi = afs_get_cb_interest(vnode->cb_interest); @@ -369,66 +414,88 @@ use_server: memset(&fc->ac, 0, sizeof(fc->ac)); - /* Probe the current fileserver if we haven't done so yet. */ - if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { - fc->ac.alist = afs_get_addrlist(alist); - - if (!afs_probe_fileserver(fc)) { - switch (fc->ac.error) { - case -ENOMEM: - case -ERESTARTSYS: - case -EINTR: - goto failed; - default: - goto next_server; - } - } - } - if (!fc->ac.alist) fc->ac.alist = alist; else afs_put_addrlist(alist); - fc->ac.start = READ_ONCE(alist->index); - fc->ac.index = fc->ac.start; + fc->ac.index = -1; iterate_address: ASSERT(fc->ac.alist); - _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ if (!afs_iterate_addresses(&fc->ac)) goto next_server; + _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs); + _leave(" = t"); return true; next_server: _debug("next"); afs_end_cursor(&fc->ac); - afs_put_cb_interest(afs_v2net(vnode), fc->cbi); - fc->cbi = NULL; - fc->index++; - if (fc->index >= fc->server_list->nr_servers) - fc->index = 0; - if (fc->index != fc->start) - goto use_server; + goto pick_server; +no_more_servers: /* That's all the servers poked to no good effect. Try again if some * of them were busy. 
*/ if (fc->flags & AFS_FS_CURSOR_VBUSY) goto restart_from_beginning; - fc->ac.error = -EDESTADDRREQ; - goto failed; + abort_code = 0; + error = -EDESTADDRREQ; + for (i = 0; i < fc->server_list->nr_servers; i++) { + struct afs_server *s = fc->server_list->servers[i].server; + int probe_error = READ_ONCE(s->probe.error); + + switch (probe_error) { + case 0: + continue; + default: + if (error == -ETIMEDOUT || + error == -ETIME) + continue; + case -ETIMEDOUT: + case -ETIME: + if (error == -ENOMEM || + error == -ENONET) + continue; + case -ENOMEM: + case -ENONET: + if (error == -ENETUNREACH) + continue; + case -ENETUNREACH: + if (error == -EHOSTUNREACH) + continue; + case -EHOSTUNREACH: + if (error == -ECONNREFUSED) + continue; + case -ECONNREFUSED: + if (error == -ECONNRESET) + continue; + case -ECONNRESET: /* Responded, but call expired. */ + if (error == -ECONNABORTED) + continue; + case -ECONNABORTED: + abort_code = s->probe.abort_code; + error = probe_error; + continue; + } + } + + if (error == -ECONNABORTED) + error = afs_abort_to_error(abort_code); +failed_set_error: + fc->error = error; failed: fc->flags |= AFS_FS_CURSOR_STOP; afs_end_cursor(&fc->ac); - _leave(" = f [failed %d]", fc->ac.error); + _leave(" = f [failed %d]", fc->error); return false; } @@ -442,13 +509,14 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) struct afs_vnode *vnode = fc->vnode; struct afs_cb_interest *cbi = vnode->cb_interest; struct afs_addr_list *alist; + int error = fc->ac.error; _enter(""); - switch (fc->ac.error) { + switch (error) { case SHRT_MAX: if (!cbi) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; fc->flags |= AFS_FS_CURSOR_STOP; return false; } @@ -461,25 +529,26 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) afs_get_addrlist(alist); read_unlock(&cbi->server->fs_lock); if (!alist) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; fc->flags |= AFS_FS_CURSOR_STOP; return false; } memset(&fc->ac, 0, sizeof(fc->ac)); fc->ac.alist = alist; - 
fc->ac.start = READ_ONCE(alist->index); - fc->ac.index = fc->ac.start; + fc->ac.index = -1; goto iterate_address; case 0: default: /* Success or local failure. Stop. */ + fc->error = error; fc->flags |= AFS_FS_CURSOR_STOP; - _leave(" = f [okay/local %d]", fc->ac.error); + _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: + fc->error = afs_abort_to_error(fc->ac.abort_code); fc->flags |= AFS_FS_CURSOR_STOP; _leave(" = f [abort]"); return false; @@ -490,6 +559,7 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) case -ETIMEDOUT: case -ETIME: _debug("no conn"); + fc->error = error; goto iterate_address; } @@ -507,12 +577,65 @@ iterate_address: } /* + * Dump cursor state in the case of the error being EDESTADDRREQ. + */ +static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) +{ + static int count; + int i; + + if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) + return; + count++; + + rcu_read_lock(); + + pr_notice("EDESTADDR occurred\n"); + pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", + fc->cb_break, fc->cb_break_2, fc->flags, fc->error); + pr_notice("FC: ut=%lx ix=%d ni=%u\n", + fc->untried, fc->index, fc->nr_iterations); + + if (fc->server_list) { + const struct afs_server_list *sl = fc->server_list; + pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", + sl->nr_servers, sl->preferred, sl->vnovol_mask); + for (i = 0; i < sl->nr_servers; i++) { + const struct afs_server *s = sl->servers[i].server; + pr_notice("FC: server fl=%lx av=%u %pU\n", + s->flags, s->addr_version, &s->uuid); + if (s->addresses) { + const struct afs_addr_list *a = + rcu_dereference(s->addresses); + pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", + a->version, + a->nr_ipv4, a->nr_addrs, a->max_addrs, + a->preferred); + pr_notice("FC: - pr=%lx R=%lx F=%lx\n", + a->probed, a->responded, a->failed); + if (a == fc->ac.alist) + pr_notice("FC: - current\n"); + } + } + } + + pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", + fc->ac.tried, fc->ac.index, 
fc->ac.abort_code, fc->ac.error, + fc->ac.responded, fc->ac.nr_iterations); + rcu_read_unlock(); +} + +/* * Tidy up a filesystem cursor and unlock the vnode. */ int afs_end_vnode_operation(struct afs_fs_cursor *fc) { struct afs_net *net = afs_v2net(fc->vnode); - int ret; + + if (fc->error == -EDESTADDRREQ || + fc->error == -ENETUNREACH || + fc->error == -EHOSTUNREACH) + afs_dump_edestaddrreq(fc); mutex_unlock(&fc->vnode->io_lock); @@ -520,9 +643,8 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc) afs_put_cb_interest(net, fc->cbi); afs_put_serverlist(net, fc->server_list); - ret = fc->ac.error; - if (ret == -ECONNABORTED) - afs_abort_to_error(fc->ac.abort_code); + if (fc->error == -ECONNABORTED) + fc->error = afs_abort_to_error(fc->ac.abort_code); - return fc->ac.error; + return fc->error; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 77a83790a31f..59970886690f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,6 +16,7 @@ #include <net/af_rxrpc.h> #include "internal.h" #include "afs_cm.h" +#include "protocol_yfs.h" struct workqueue_struct *afs_async_calls; @@ -75,6 +76,18 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + srx.srx_service = YFS_CM_SERVICE; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret < 0) + goto error_2; + + /* Ideally, we'd turn on service upgrade here, but we can't because + * OpenAFS is buggy and leaks the userStatus field from packet to + * packet and between FS packets and CB packets - so if we try to do an + * upgrade on an FS packet, OpenAFS will leak that into the CB packet + * it sends back to us. 
+ */ + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, afs_rx_discard_new_call); @@ -143,6 +156,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); + call->_iter = &call->iter; o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_alloc, 1, o, @@ -176,6 +190,7 @@ void afs_put_call(struct afs_call *call) afs_put_server(call->net, call->cm_server); afs_put_cb_interest(call->net, call->cbi); + afs_put_addrlist(call->alist); kfree(call->request); trace_afs_call(call, afs_call_trace_free, 0, o, @@ -189,21 +204,22 @@ void afs_put_call(struct afs_call *call) } /* - * Queue the call for actual work. Returns 0 unconditionally for convenience. + * Queue the call for actual work. */ -int afs_queue_call_work(struct afs_call *call) +static void afs_queue_call_work(struct afs_call *call) { - int u = atomic_inc_return(&call->usage); + if (call->type->work) { + int u = atomic_inc_return(&call->usage); - trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&call->net->nr_outstanding_calls), - __builtin_return_address(0)); + trace_afs_call(call, afs_call_trace_work, u, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); - INIT_WORK(&call->work, call->type->work); + INIT_WORK(&call->work, call->type->work); - if (!queue_work(afs_wq, &call->work)) - afs_put_call(call); - return 0; + if (!queue_work(afs_wq, &call->work)) + afs_put_call(call); + } } /* @@ -233,6 +249,7 @@ struct afs_call *afs_alloc_flat_call(struct afs_net *net, goto nomem_free; } + afs_extract_to_buf(call, call->reply_max); call->operation_ID = type->op; init_waitqueue_head(&call->waitq); return call; @@ -286,7 +303,7 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, offset = 0; } - iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); + iov_iter_bvec(&msg->msg_iter, WRITE, 
bv, nr, bytes); } /* @@ -342,7 +359,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp, bool async) { - struct sockaddr_rxrpc *srx = ac->addr; + struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; @@ -359,6 +376,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, atomic_read(&call->net->nr_outstanding_calls)); call->async = async; + call->addr_ix = ac->index; + call->alist = afs_get_addrlist(ac->alist); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data @@ -390,6 +409,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); + call->error = ret; goto error_kill_call; } @@ -401,8 +421,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, - call->request_size); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->send_pages ? 
MSG_MORE : 0); @@ -432,7 +451,7 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, false, &call->abort_code, &call->service_id); @@ -442,6 +461,8 @@ error_do_abort: call->error = ret; trace_afs_call_done(call); error_kill_call: + if (call->type->done) + call->type->done(call); afs_put_call(call); ac->error = ret; _leave(" = %d", ret); @@ -466,14 +487,12 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { - struct iov_iter iter; - - iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&call->iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, - call->rxcall, &iter, false, - &remote_abort, + call->rxcall, &call->iter, + false, &remote_abort, &call->service_id); - trace_afs_recv_data(call, 0, 0, false, ret); + trace_afs_receive_data(call, &call->iter, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; @@ -485,10 +504,17 @@ static void afs_deliver_to_call(struct afs_call *call) return; } + if (call->want_reply_time && + rxrpc_kernel_get_reply_time(call->net->socket, + call->rxcall, + &call->reply_time)) + call->want_reply_time = false; + ret = call->type->deliver(call); state = READ_ONCE(call->state); switch (ret) { case 0: + afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->cbi) set_bit(AFS_SERVER_FL_MAY_HAVE_CB, @@ -500,7 +526,6 @@ static void afs_deliver_to_call(struct afs_call *call) case -EINPROGRESS: case -EAGAIN: goto out; - case -EIO: case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); goto done; @@ -509,6 +534,10 @@ static void afs_deliver_to_call(struct afs_call *call) rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KIV"); goto local_abort; + 
case -EIO: + pr_err("kAFS: Call %u in bad state %u\n", + call->debug_id, state); + /* Fall through */ case -ENODATA: case -EBADMSG: case -EMSGSIZE: @@ -517,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call) if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, -EBADMSG, "KUM"); + abort_code, ret, "KUM"); goto local_abort; } } done: + if (call->type->done) + call->type->done(call); if (state == AFS_CALL_COMPLETE && call->incoming) afs_put_call(call); out: @@ -728,6 +759,7 @@ void afs_charge_preallocation(struct work_struct *work) call->async = true; call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); + afs_extract_to_tmp(call); } if (rxrpc_kernel_charge_accept(net->socket, @@ -773,18 +805,15 @@ static int afs_deliver_cm_op_id(struct afs_call *call) { int ret; - _enter("{%zu}", call->offset); - - ASSERTCMP(call->offset, <, 4); + _enter("{%zu}", iov_iter_count(call->_iter)); /* the operation ID forms the first four bytes of the request data */ - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->operation_ID = ntohl(call->tmp); afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); - call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -825,7 +854,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -864,7 +893,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); 
msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -888,30 +917,19 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) /* * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, void *buf, size_t count, - bool want_more) +int afs_extract_data(struct afs_call *call, bool want_more) { struct afs_net *net = call->net; - struct iov_iter iter; - struct kvec iov; + struct iov_iter *iter = call->_iter; enum afs_call_state state; u32 remote_abort = 0; int ret; - _enter("{%s,%zu},,%zu,%d", - call->type->name, call->offset, count, want_more); - - ASSERTCMP(call->offset, <=, count); - - iov.iov_base = buf + call->offset; - iov.iov_len = count - call->offset; - iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset); + _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more); - ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter, + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, want_more, &remote_abort, &call->service_id); - call->offset += (count - call->offset) - iov_iter_count(&iter); - trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; @@ -926,7 +944,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, break; case AFS_CALL_COMPLETE: kdebug("prem complete %d", call->error); - return -EIO; + return afs_io_error(call, afs_io_error_extract); default: break; } @@ -940,8 +958,9 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, /* * Log protocol error production. 
*/ -noinline int afs_protocol_error(struct afs_call *call, int error) +noinline int afs_protocol_error(struct afs_call *call, int error, + enum afs_eproto_cause cause) { - trace_afs_protocol_error(call, error, __builtin_return_address(0)); + trace_afs_protocol_error(call, error, cause); return error; } diff --git a/fs/afs/security.c b/fs/afs/security.c index 81dfedb7879f..5f58a9a17e69 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -126,7 +126,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, bool changed = false; int i, j; - _enter("{%x:%u},%x,%x", + _enter("{%llx:%llu},%x,%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access); rcu_read_lock(); @@ -147,7 +147,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, break; } - if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) { + if (afs_cb_is_broken(cb_break, vnode, + vnode->cb_interest)) { changed = true; break; } @@ -177,7 +178,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) + if (afs_cb_is_broken(cb_break, vnode, vnode->cb_interest)) goto someone_else_changed_it; /* We need a ref on any permits list we want to copy as we'll have to @@ -256,7 +257,7 @@ found: spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); - if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) && + if (!afs_cb_is_broken(cb_break, vnode, vnode->cb_interest) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else @@ -289,7 +290,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, bool valid = false; int i, ret; - _enter("{%x:%u},%x", + _enter("{%llx:%llu},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); /* check the permits to see if we've got one yet */ @@ -349,7 +350,7 @@ int afs_permission(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; - _enter("{{%x:%u},%lx},%x,", + 
_enter("{{%llx:%llu},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); key = afs_request_key(vnode->volume->cell); diff --git a/fs/afs/server.c b/fs/afs/server.c index 1d329e6981d5..642afa2e9783 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include "afs_fs.h" #include "internal.h" +#include "protocol_yfs.h" static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */ @@ -230,6 +231,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, rwlock_init(&server->fs_lock); INIT_HLIST_HEAD(&server->cb_volumes); rwlock_init(&server->cb_break_lock); + init_waitqueue_head(&server->probe_wq); + spin_lock_init(&server->probe_lock); afs_inc_servers_outstanding(net); _leave(" = %p", server); @@ -246,41 +249,23 @@ enomem: static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, struct key *key, const uuid_t *uuid) { - struct afs_addr_cursor ac; - struct afs_addr_list *alist; + struct afs_vl_cursor vc; + struct afs_addr_list *alist = NULL; int ret; - ret = afs_set_vl_cursor(&ac, cell); - if (ret < 0) - return ERR_PTR(ret); - - while (afs_iterate_addresses(&ac)) { - if (test_bit(ac.index, &ac.alist->yfs)) - alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid); - else - alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid); - switch (ac.error) { - case 0: - afs_end_cursor(&ac); - return alist; - case -ECONNABORTED: - ac.error = afs_abort_to_error(ac.abort_code); - goto error; - case -ENOMEM: - case -ENONET: - goto error; - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - break; - default: - ac.error = -EIO; - goto error; + ret = -ERESTARTSYS; + if (afs_begin_vlserver_operation(&vc, cell, key)) { + while (afs_select_vlserver(&vc)) { + if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) + alist = afs_yfsvl_get_endpoints(&vc, uuid); + else + alist = 
afs_vl_get_addrs_u(&vc, uuid); } + + ret = afs_end_vlserver_operation(&vc); } -error: - return ERR_PTR(afs_end_cursor(&ac)); + return ret < 0 ? ERR_PTR(ret) : alist; } /* @@ -382,9 +367,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, - .start = alist->index, - .index = 0, - .addr = &alist->addrs[alist->index], + .index = alist->preferred, .error = 0, }; _enter("%p", server); @@ -392,6 +375,9 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + wait_var_event(&server->probe_outstanding, + atomic_read(&server->probe_outstanding) == 0); + call_rcu(&server->rcu, afs_server_rcu); afs_dec_servers_outstanding(net); } @@ -525,99 +511,6 @@ void afs_purge_servers(struct afs_net *net) } /* - * Probe a fileserver to find its capabilities. - * - * TODO: Try service upgrade. 
- */ -static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc) -{ - _enter(""); - - fc->ac.addr = NULL; - fc->ac.start = READ_ONCE(fc->ac.alist->index); - fc->ac.index = fc->ac.start; - fc->ac.error = 0; - fc->ac.begun = false; - - while (afs_iterate_addresses(&fc->ac)) { - afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server, - &fc->ac, fc->key); - switch (fc->ac.error) { - case 0: - afs_end_cursor(&fc->ac); - set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags); - return true; - case -ECONNABORTED: - fc->ac.error = afs_abort_to_error(fc->ac.abort_code); - goto error; - case -ENOMEM: - case -ENONET: - goto error; - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - case -ETIMEDOUT: - case -ETIME: - break; - default: - fc->ac.error = -EIO; - goto error; - } - } - -error: - afs_end_cursor(&fc->ac); - return false; -} - -/* - * If we haven't already, try probing the fileserver to get its capabilities. - * We try not to instigate parallel probes, but it's possible that the parallel - * probes will fail due to authentication failure when ours would succeed. - * - * TODO: Try sending an anonymous probe if an authenticated probe fails. 
- */ -bool afs_probe_fileserver(struct afs_fs_cursor *fc) -{ - bool success; - int ret, retries = 0; - - _enter(""); - -retry: - if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) { - _leave(" = t"); - return true; - } - - if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) { - success = afs_do_probe_fileserver(fc); - clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags); - wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING); - _leave(" = t"); - return success; - } - - _debug("wait"); - ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING, - TASK_INTERRUPTIBLE); - if (ret == -ERESTARTSYS) { - fc->ac.error = ret; - _leave(" = f [%d]", ret); - return false; - } - - retries++; - if (retries == 4) { - fc->ac.error = -ESTALE; - _leave(" = f [stale]"); - return false; - } - _debug("retry"); - goto retry; -} - -/* * Get an update for a server's address list. */ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server) diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 8a5760aa5832..95d0761cdb34 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -118,11 +118,11 @@ bool afs_annotate_server_list(struct afs_server_list *new, return false; changed: - /* Maintain the same current server as before if possible. */ - cur = old->servers[old->index].server; + /* Maintain the same preferred server as before if possible. 
*/ + cur = old->servers[old->preferred].server; for (j = 0; j < new->nr_servers; j++) { if (new->servers[j].server == cur) { - new->index = j; + new->preferred = j; break; } } diff --git a/fs/afs/super.c b/fs/afs/super.c index 4d3e274207fb..dcd07fe99871 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -406,10 +406,11 @@ static int afs_fill_super(struct super_block *sb, inode = afs_iget_pseudo_dir(sb, true); sb->s_flags |= SB_RDONLY; } else { - sprintf(sb->s_id, "%u", as->volume->vid); + sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); fid.vid = as->volume->vid; fid.vnode = 1; + fid.vnode_hi = 0; fid.unique = 1; inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); } @@ -663,7 +664,7 @@ static void afs_destroy_inode(struct inode *inode) { struct afs_vnode *vnode = AFS_FS_I(inode); - _enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode); + _enter("%p{%llx:%llu}", inode, vnode->fid.vid, vnode->fid.vnode); _debug("DESTROY INODE %p", inode); diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c new file mode 100644 index 000000000000..b4f1a84519b9 --- /dev/null +++ b/fs/afs/vl_list.c @@ -0,0 +1,340 @@ +/* AFS vlserver list management. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include "internal.h" + +struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, + unsigned short port) +{ + struct afs_vlserver *vlserver; + + vlserver = kzalloc(struct_size(vlserver, name, name_len + 1), + GFP_KERNEL); + if (vlserver) { + atomic_set(&vlserver->usage, 1); + rwlock_init(&vlserver->lock); + init_waitqueue_head(&vlserver->probe_wq); + spin_lock_init(&vlserver->probe_lock); + vlserver->name_len = name_len; + vlserver->port = port; + memcpy(vlserver->name, name, name_len); + } + return vlserver; +} + +static void afs_vlserver_rcu(struct rcu_head *rcu) +{ + struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu); + + afs_put_addrlist(rcu_access_pointer(vlserver->addresses)); + kfree_rcu(vlserver, rcu); +} + +void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver) +{ + if (vlserver) { + unsigned int u = atomic_dec_return(&vlserver->usage); + //_debug("VL PUT %p{%u}", vlserver, u); + + if (u == 0) + call_rcu(&vlserver->rcu, afs_vlserver_rcu); + } +} + +struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers) +{ + struct afs_vlserver_list *vllist; + + vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL); + if (vllist) { + atomic_set(&vllist->usage, 1); + rwlock_init(&vllist->lock); + } + + return vllist; +} + +void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist) +{ + if (vllist) { + unsigned int u = atomic_dec_return(&vllist->usage); + + //_debug("VLLS PUT %p{%u}", vllist, u); + if (u == 0) { + int i; + + for (i = 0; i < vllist->nr_servers; i++) { + afs_put_vlserver(net, vllist->servers[i].server); + } + kfree_rcu(vllist, rcu); + } + } +} + +static u16 afs_extract_le16(const u8 **_b) +{ + u16 val; + + val = (u16)*(*_b)++ << 0; + val |= (u16)*(*_b)++ << 8; + return val; +} + +/* + * Build a VL server address list from a DNS queried server list. 
+ */ +static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, + u8 nr_addrs, u16 port) +{ + struct afs_addr_list *alist; + const u8 *b = *_b; + int ret = -EINVAL; + + alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port); + if (!alist) + return ERR_PTR(-ENOMEM); + if (nr_addrs == 0) + return alist; + + for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) { + struct dns_server_list_v1_address hdr; + __be32 x[4]; + + hdr.address_type = *b++; + + switch (hdr.address_type) { + case DNS_ADDRESS_IS_IPV4: + if (end - b < 4) { + _leave(" = -EINVAL [short inet]"); + goto error; + } + memcpy(x, b, 4); + afs_merge_fs_addr4(alist, x[0], port); + b += 4; + break; + + case DNS_ADDRESS_IS_IPV6: + if (end - b < 16) { + _leave(" = -EINVAL [short inet6]"); + goto error; + } + memcpy(x, b, 16); + afs_merge_fs_addr6(alist, x, port); + b += 16; + break; + + default: + _leave(" = -EADDRNOTAVAIL [unknown af %u]", + hdr.address_type); + ret = -EADDRNOTAVAIL; + goto error; + } + } + + /* Start with IPv6 if available. */ + if (alist->nr_ipv4 < alist->nr_addrs) + alist->preferred = alist->nr_ipv4; + + *_b = b; + return alist; + +error: + *_b = b; + afs_put_addrlist(alist); + return ERR_PTR(ret); +} + +/* + * Build a VL server list from a DNS queried server list. 
+ */ +struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, + const void *buffer, + size_t buffer_size) +{ + const struct dns_server_list_v1_header *hdr = buffer; + struct dns_server_list_v1_server bs; + struct afs_vlserver_list *vllist, *previous; + struct afs_addr_list *addrs; + struct afs_vlserver *server; + const u8 *b = buffer, *end = buffer + buffer_size; + int ret = -ENOMEM, nr_servers, i, j; + + _enter(""); + + /* Check that it's a server list, v1 */ + if (end - b < sizeof(*hdr) || + hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST || + hdr->hdr.version != 1) { + pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n", + hdr->hdr.content, hdr->hdr.version, end - b); + ret = -EDESTADDRREQ; + goto dump; + } + + nr_servers = hdr->nr_servers; + + vllist = afs_alloc_vlserver_list(nr_servers); + if (!vllist) + return ERR_PTR(-ENOMEM); + + vllist->source = (hdr->source < NR__dns_record_source) ? + hdr->source : NR__dns_record_source; + vllist->status = (hdr->status < NR__dns_lookup_status) ? 
+ hdr->status : NR__dns_lookup_status; + + read_lock(&cell->vl_servers_lock); + previous = afs_get_vlserverlist( + rcu_dereference_protected(cell->vl_servers, + lockdep_is_held(&cell->vl_servers_lock))); + read_unlock(&cell->vl_servers_lock); + + b += sizeof(*hdr); + while (end - b >= sizeof(bs)) { + bs.name_len = afs_extract_le16(&b); + bs.priority = afs_extract_le16(&b); + bs.weight = afs_extract_le16(&b); + bs.port = afs_extract_le16(&b); + bs.source = *b++; + bs.status = *b++; + bs.protocol = *b++; + bs.nr_addrs = *b++; + + _debug("extract %u %u %u %u %u %u %*.*s", + bs.name_len, bs.priority, bs.weight, + bs.port, bs.protocol, bs.nr_addrs, + bs.name_len, bs.name_len, b); + + if (end - b < bs.name_len) + break; + + ret = -EPROTONOSUPPORT; + if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) { + bs.protocol = DNS_SERVER_PROTOCOL_UDP; + } else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) { + _leave(" = [proto %u]", bs.protocol); + goto error; + } + + if (bs.port == 0) + bs.port = AFS_VL_PORT; + if (bs.source > NR__dns_record_source) + bs.source = NR__dns_record_source; + if (bs.status > NR__dns_lookup_status) + bs.status = NR__dns_lookup_status; + + server = NULL; + if (previous) { + /* See if we can update an old server record */ + for (i = 0; i < previous->nr_servers; i++) { + struct afs_vlserver *p = previous->servers[i].server; + + if (p->name_len == bs.name_len && + p->port == bs.port && + strncasecmp(b, p->name, bs.name_len) == 0) { + server = afs_get_vlserver(p); + break; + } + } + } + + if (!server) { + ret = -ENOMEM; + server = afs_alloc_vlserver(b, bs.name_len, bs.port); + if (!server) + goto error; + } + + b += bs.name_len; + + /* Extract the addresses - note that we can't skip this as we + * have to advance the payload pointer. 
+ */ + addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port); + if (IS_ERR(addrs)) { + ret = PTR_ERR(addrs); + goto error_2; + } + + if (vllist->nr_servers >= nr_servers) { + _debug("skip %u >= %u", vllist->nr_servers, nr_servers); + afs_put_addrlist(addrs); + afs_put_vlserver(cell->net, server); + continue; + } + + addrs->source = bs.source; + addrs->status = bs.status; + + if (addrs->nr_addrs == 0) { + afs_put_addrlist(addrs); + if (!rcu_access_pointer(server->addresses)) { + afs_put_vlserver(cell->net, server); + continue; + } + } else { + struct afs_addr_list *old = addrs; + + write_lock(&server->lock); + rcu_swap_protected(server->addresses, old, + lockdep_is_held(&server->lock)); + write_unlock(&server->lock); + afs_put_addrlist(old); + } + + + /* TODO: Might want to check for duplicates */ + + /* Insertion-sort by priority and weight */ + for (j = 0; j < vllist->nr_servers; j++) { + if (bs.priority < vllist->servers[j].priority) + break; /* Lower preferable */ + if (bs.priority == vllist->servers[j].priority && + bs.weight > vllist->servers[j].weight) + break; /* Higher preferable */ + } + + if (j < vllist->nr_servers) { + memmove(vllist->servers + j + 1, + vllist->servers + j, + (vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry)); + } + + clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + + vllist->servers[j].priority = bs.priority; + vllist->servers[j].weight = bs.weight; + vllist->servers[j].server = server; + vllist->nr_servers++; + } + + if (b != end) { + _debug("parse error %zd", b - end); + goto error; + } + + afs_put_vlserverlist(cell->net, previous); + _leave(" = ok [%u]", vllist->nr_servers); + return vllist; + +error_2: + afs_put_vlserver(cell->net, server); +error: + afs_put_vlserverlist(cell->net, vllist); + afs_put_vlserverlist(cell->net, previous); +dump: + if (ret != -ENOMEM) { + printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer); + print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size); + } + return 
ERR_PTR(ret); +} diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c new file mode 100644 index 000000000000..c0f616bd70cb --- /dev/null +++ b/fs/afs/vl_probe.c @@ -0,0 +1,273 @@ +/* AFS vlserver probing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "afs_fs.h" +#include "internal.h" +#include "protocol_yfs.h" + +static bool afs_vl_probe_done(struct afs_vlserver *server) +{ + if (!atomic_dec_and_test(&server->probe_outstanding)) + return false; + + wake_up_var(&server->probe_outstanding); + clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); + wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING); + return true; +} + +/* + * Process the result of probing a vlserver. This is called after successful + * or failed delivery of an VL.GetCapabilities operation. 
+ */ +void afs_vlserver_probe_result(struct afs_call *call) +{ + struct afs_addr_list *alist = call->alist; + struct afs_vlserver *server = call->reply[0]; + unsigned int server_index = (long)call->reply[1]; + unsigned int index = call->addr_ix; + unsigned int rtt = UINT_MAX; + bool have_result = false; + u64 _rtt; + int ret = call->error; + + _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code); + + spin_lock(&server->probe_lock); + + switch (ret) { + case 0: + server->probe.error = 0; + goto responded; + case -ECONNABORTED: + if (!server->probe.responded) { + server->probe.abort_code = call->abort_code; + server->probe.error = ret; + } + goto responded; + case -ENOMEM: + case -ENONET: + server->probe.local_failure = true; + afs_io_error(call, afs_io_error_vl_probe_fail); + goto out; + case -ECONNRESET: /* Responded, but call expired. */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + default: + clear_bit(index, &alist->responded); + set_bit(index, &alist->failed); + if (!server->probe.responded && + (server->probe.error == 0 || + server->probe.error == -ETIMEDOUT || + server->probe.error == -ETIME)) + server->probe.error = ret; + afs_io_error(call, afs_io_error_vl_probe_fail); + goto out; + } + +responded: + set_bit(index, &alist->responded); + clear_bit(index, &alist->failed); + + if (call->service_id == YFS_VL_SERVICE) { + server->probe.is_yfs = true; + set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } else { + server->probe.not_yfs = true; + if (!server->probe.is_yfs) { + clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } + } + + /* Get the RTT and scale it to fit into a 32-bit value that represents + * over a minute of time so that we can access it with one instruction + * on a 32-bit system. 
+ */ + _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); + _rtt /= 64; + rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; + if (rtt < server->probe.rtt) { + server->probe.rtt = rtt; + alist->preferred = index; + have_result = true; + } + + smp_wmb(); /* Set rtt before responded. */ + server->probe.responded = true; + set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); +out: + spin_unlock(&server->probe_lock); + + _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", + server_index, index, &alist->addrs[index].transport, + (unsigned int)rtt, ret); + + have_result |= afs_vl_probe_done(server); + if (have_result) { + server->probe.have_result = true; + wake_up_var(&server->probe.have_result); + wake_up_all(&server->probe_wq); + } +} + +/* + * Probe all of a vlserver's addresses to find out the best route and to + * query its capabilities. + */ +static int afs_do_probe_vlserver(struct afs_net *net, + struct afs_vlserver *server, + struct key *key, + unsigned int server_index) +{ + struct afs_addr_cursor ac = { + .index = 0, + }; + int ret; + + _enter("%s", server->name); + + read_lock(&server->lock); + ac.alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->lock)); + read_unlock(&server->lock); + + atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + memset(&server->probe, 0, sizeof(server->probe)); + server->probe.rtt = UINT_MAX; + + for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { + ret = afs_vl_get_capabilities(net, &ac, key, server, + server_index, true); + if (ret != -EINPROGRESS) { + afs_vl_probe_done(server); + return ret; + } + } + + return 0; +} + +/* + * Send off probes to all unprobed servers. 
+ */ +int afs_send_vl_probes(struct afs_net *net, struct key *key, + struct afs_vlserver_list *vllist) +{ + struct afs_vlserver *server; + int i, ret; + + for (i = 0; i < vllist->nr_servers; i++) { + server = vllist->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) + continue; + + if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags)) { + ret = afs_do_probe_vlserver(net, server, key, i); + if (ret) + return ret; + } + } + + return 0; +} + +/* + * Wait for the first as-yet untried server to respond. + */ +int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, + unsigned long untried) +{ + struct wait_queue_entry *waits; + struct afs_vlserver *server; + unsigned int rtt = UINT_MAX; + bool have_responders = false; + int pref = -1, i; + + _enter("%u,%lx", vllist->nr_servers, untried); + + /* Only wait for servers that have a probe outstanding. */ + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) + __clear_bit(i, &untried); + if (server->probe.responded) + have_responders = true; + } + } + if (have_responders || !untried) + return 0; + + waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL); + if (!waits) + return -ENOMEM; + + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + init_waitqueue_entry(&waits[i], current); + add_wait_queue(&server->probe_wq, &waits[i]); + } + } + + for (;;) { + bool still_probing = false; + + set_current_state(TASK_INTERRUPTIBLE); + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (server->probe.responded) + goto stop; + if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) + still_probing = true; + } + } + + if (!still_probing || unlikely(signal_pending(current))) + goto stop; + schedule(); + } + +stop: + 
set_current_state(TASK_RUNNING); + + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (server->probe.responded && + server->probe.rtt < rtt) { + pref = i; + rtt = server->probe.rtt; + } + + remove_wait_queue(&server->probe_wq, &waits[i]); + } + } + + kfree(waits); + + if (pref == -1 && signal_pending(current)) + return -ERESTARTSYS; + + if (pref >= 0) + vllist->preferred = pref; + + _leave(" = 0 [%u]", pref); + return 0; +} diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c new file mode 100644 index 000000000000..b64a284b99d2 --- /dev/null +++ b/fs/afs/vl_rotate.c @@ -0,0 +1,355 @@ +/* Handle vlserver selection and rotation. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> +#include "internal.h" +#include "afs_vl.h" + +/* + * Begin an operation on a volume location server. + */ +bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell, + struct key *key) +{ + memset(vc, 0, sizeof(*vc)); + vc->cell = cell; + vc->key = key; + vc->error = -EDESTADDRREQ; + vc->ac.error = SHRT_MAX; + + if (signal_pending(current)) { + vc->error = -EINTR; + vc->flags |= AFS_VL_CURSOR_STOP; + return false; + } + + return true; +} + +/* + * Begin iteration through a server list, starting with the last used server if + * possible, or the last recorded good server if not. 
+ */ +static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) +{ + struct afs_cell *cell = vc->cell; + + if (wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET, + TASK_INTERRUPTIBLE)) { + vc->error = -ERESTARTSYS; + return false; + } + + read_lock(&cell->vl_servers_lock); + vc->server_list = afs_get_vlserverlist( + rcu_dereference_protected(cell->vl_servers, + lockdep_is_held(&cell->vl_servers_lock))); + read_unlock(&cell->vl_servers_lock); + if (!vc->server_list || !vc->server_list->nr_servers) + return false; + + vc->untried = (1UL << vc->server_list->nr_servers) - 1; + vc->index = -1; + return true; +} + +/* + * Select the vlserver to use. May be called multiple times to rotate + * through the vlservers. + */ +bool afs_select_vlserver(struct afs_vl_cursor *vc) +{ + struct afs_addr_list *alist; + struct afs_vlserver *vlserver; + u32 rtt; + int error = vc->ac.error, abort_code, i; + + _enter("%lx[%d],%lx[%d],%d,%d", + vc->untried, vc->index, + vc->ac.tried, vc->ac.index, + error, vc->ac.abort_code); + + if (vc->flags & AFS_VL_CURSOR_STOP) { + _leave(" = f [stopped]"); + return false; + } + + vc->nr_iterations++; + + /* Evaluate the result of the previous operation, if there was one. */ + switch (error) { + case SHRT_MAX: + goto start; + + default: + case 0: + /* Success or local failure. Stop. */ + vc->error = error; + vc->flags |= AFS_VL_CURSOR_STOP; + _leave(" = f [okay/local %d]", vc->ac.error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. + */ + switch (vc->ac.abort_code) { + case AFSVL_IO: + case AFSVL_BADVOLOPER: + case AFSVL_NOMEM: + /* The server went weird. 
*/ + vc->error = -EREMOTEIO; + //write_lock(&vc->cell->vl_servers_lock); + //vc->server_list->weird_mask |= 1 << vc->index; + //write_unlock(&vc->cell->vl_servers_lock); + goto next_server; + + default: + vc->error = afs_abort_to_error(vc->ac.abort_code); + goto failed; + } + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn %d", error); + vc->error = error; + goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); + vc->error = error; + vc->flags |= AFS_VL_CURSOR_RETRY; + goto next_server; + } + +restart_from_beginning: + _debug("restart"); + afs_end_cursor(&vc->ac); + afs_put_vlserverlist(vc->cell->net, vc->server_list); + vc->server_list = NULL; + if (vc->flags & AFS_VL_CURSOR_RETRIED) + goto failed; + vc->flags |= AFS_VL_CURSOR_RETRIED; +start: + _debug("start"); + + if (!afs_start_vl_iteration(vc)) + goto failed; + + error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); + if (error < 0) + goto failed_set_error; + +pick_server: + _debug("pick [%lx]", vc->untried); + + error = afs_wait_for_vl_probes(vc->server_list, vc->untried); + if (error < 0) + goto failed_set_error; + + /* Pick the untried server with the lowest RTT. */ + vc->index = vc->server_list->preferred; + if (test_bit(vc->index, &vc->untried)) + goto selected_server; + + vc->index = -1; + rtt = U32_MAX; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + + if (!test_bit(i, &vc->untried) || !s->probe.responded) + continue; + if (s->probe.rtt < rtt) { + vc->index = i; + rtt = s->probe.rtt; + } + } + + if (vc->index == -1) + goto no_more_servers; + +selected_server: + _debug("use %d", vc->index); + __clear_bit(vc->index, &vc->untried); + + /* We're starting on a different vlserver from the list. We need to + * check it, find its address list and probe its capabilities before we + * use it. 
+ */ + ASSERTCMP(vc->ac.alist, ==, NULL); + vlserver = vc->server_list->servers[vc->index].server; + vc->server = vlserver; + + _debug("USING VLSERVER: %s", vlserver->name); + + read_lock(&vlserver->lock); + alist = rcu_dereference_protected(vlserver->addresses, + lockdep_is_held(&vlserver->lock)); + afs_get_addrlist(alist); + read_unlock(&vlserver->lock); + + memset(&vc->ac, 0, sizeof(vc->ac)); + + if (!vc->ac.alist) + vc->ac.alist = alist; + else + afs_put_addrlist(alist); + + vc->ac.index = -1; + +iterate_address: + ASSERT(vc->ac.alist); + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (!afs_iterate_addresses(&vc->ac)) + goto next_server; + + _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + + _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport); + return true; + +next_server: + _debug("next"); + afs_end_cursor(&vc->ac); + goto pick_server; + +no_more_servers: + /* That's all the servers poked to no good effect. Try again if some + * of them were busy. + */ + if (vc->flags & AFS_VL_CURSOR_RETRY) + goto restart_from_beginning; + + abort_code = 0; + error = -EDESTADDRREQ; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + int probe_error = READ_ONCE(s->probe.error); + + switch (probe_error) { + case 0: + continue; + default: + if (error == -ETIMEDOUT || + error == -ETIME) + continue; + case -ETIMEDOUT: + case -ETIME: + if (error == -ENOMEM || + error == -ENONET) + continue; + case -ENOMEM: + case -ENONET: + if (error == -ENETUNREACH) + continue; + case -ENETUNREACH: + if (error == -EHOSTUNREACH) + continue; + case -EHOSTUNREACH: + if (error == -ECONNREFUSED) + continue; + case -ECONNREFUSED: + if (error == -ECONNRESET) + continue; + case -ECONNRESET: /* Responded, but call expired. 
*/ + if (error == -ECONNABORTED) + continue; + case -ECONNABORTED: + abort_code = s->probe.abort_code; + error = probe_error; + continue; + } + } + + if (error == -ECONNABORTED) + error = afs_abort_to_error(abort_code); + +failed_set_error: + vc->error = error; +failed: + vc->flags |= AFS_VL_CURSOR_STOP; + afs_end_cursor(&vc->ac); + _leave(" = f [failed %d]", vc->error); + return false; +} + +/* + * Dump cursor state in the case of the error being EDESTADDRREQ. + */ +static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) +{ + static int count; + int i; + + if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) + return; + count++; + + rcu_read_lock(); + pr_notice("EDESTADDR occurred\n"); + pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", + vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); + + if (vc->server_list) { + const struct afs_vlserver_list *sl = vc->server_list; + pr_notice("VC: SL nr=%u ix=%u\n", + sl->nr_servers, sl->index); + for (i = 0; i < sl->nr_servers; i++) { + const struct afs_vlserver *s = sl->servers[i].server; + pr_notice("VC: server %s+%hu fl=%lx E=%hd\n", + s->name, s->port, s->flags, s->probe.error); + if (s->addresses) { + const struct afs_addr_list *a = + rcu_dereference(s->addresses); + pr_notice("VC: - nr=%u/%u/%u pf=%u\n", + a->nr_ipv4, a->nr_addrs, a->max_addrs, + a->preferred); + pr_notice("VC: - pr=%lx R=%lx F=%lx\n", + a->probed, a->responded, a->failed); + if (a == vc->ac.alist) + pr_notice("VC: - current\n"); + } + } + } + + pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", + vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error, + vc->ac.responded, vc->ac.nr_iterations); + rcu_read_unlock(); +} + +/* + * Tidy up a volume location server cursor and unlock the vnode. 
+ */ +int afs_end_vlserver_operation(struct afs_vl_cursor *vc) +{ + struct afs_net *net = vc->cell->net; + + if (vc->error == -EDESTADDRREQ || + vc->error == -ENETUNREACH || + vc->error == -EHOSTUNREACH) + afs_vl_dump_edestaddrreq(vc); + + afs_end_cursor(&vc->ac); + afs_put_vlserverlist(net, vc->server_list); + + if (vc->error == -ECONNABORTED) + vc->error = afs_abort_to_error(vc->ac.abort_code); + + return vc->error; +} diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index c3b740813fc7..c3d9e5a5f67e 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -128,14 +128,13 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = { * Dispatch a get volume entry by name or ID operation (uuid variant). If the * volname is a decimal number then it's a volume ID not a volume name. */ -struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, - struct afs_addr_cursor *ac, - struct key *key, +struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, const char *volname, int volnamesz) { struct afs_vldb_entry *entry; struct afs_call *call; + struct afs_net *net = vc->cell->net; size_t reqsz, padsz; __be32 *bp; @@ -155,7 +154,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, return ERR_PTR(-ENOMEM); } - call->key = key; + call->key = vc->key; call->reply[0] = entry; call->ret_reply0 = true; @@ -168,7 +167,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, memset((void *)bp + volnamesz, 0, padsz); trace_afs_make_vl_call(call); - return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false); + return (struct afs_vldb_entry *)afs_make_call(&vc->ac, call, GFP_KERNEL, false); } /* @@ -187,19 +186,18 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) u32 uniquifier, nentries, count; int i, ret; - _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->_iter), call->count); -again: switch 
(call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_buf(call, + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); call->unmarshall++; /* Extract the returned uuid, uniquifier, nentries and blkaddrs size */ case 1: - ret = afs_extract_data(call, call->buffer, - sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -216,28 +214,28 @@ again: call->reply[0] = alist; call->count = count; call->count2 = nentries; - call->offset = 0; call->unmarshall++; + more_entries: + count = min(call->count, 4U); + afs_extract_to_buf(call, count * sizeof(__be32)); + /* Extract entries */ case 2: - count = min(call->count, 4U); - ret = afs_extract_data(call, call->buffer, - count * sizeof(__be32), - call->count > 4); + ret = afs_extract_data(call, call->count > 4); if (ret < 0) return ret; alist = call->reply[0]; bp = call->buffer; + count = min(call->count, 4U); for (i = 0; i < count; i++) if (alist->nr_addrs < call->count2) afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); call->count -= count; if (call->count > 0) - goto again; - call->offset = 0; + goto more_entries; call->unmarshall++; break; } @@ -267,14 +265,13 @@ static const struct afs_call_type afs_RXVLGetAddrsU = { * Dispatch an operation to get the addresses for a server, where the server is * nominated by UUID. 
*/ -struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, - struct afs_addr_cursor *ac, - struct key *key, +struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_ListAddrByAttributes__xdr *r; const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; + struct afs_net *net = vc->cell->net; __be32 *bp; int i; @@ -286,7 +283,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, if (!call) return ERR_PTR(-ENOMEM); - call->key = key; + call->key = vc->key; call->reply[0] = NULL; call->ret_reply0 = true; @@ -307,7 +304,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, r->uuid.node[i] = htonl(u->node[i]); trace_afs_make_vl_call(call); - return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); + return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false); } /* @@ -318,54 +315,51 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) u32 count; int ret; - _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->_iter), call->count); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* Extract the capabilities word count */ case 1: - ret = afs_extract_data(call, &call->tmp, - 1 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; count = ntohl(call->tmp); - call->count = count; call->count2 = count; - call->offset = 0; + call->unmarshall++; + afs_extract_discard(call, count * sizeof(__be32)); /* Extract capabilities words */ case 2: - count = min(call->count, 16U); - ret = afs_extract_data(call, call->buffer, - count * sizeof(__be32), - call->count > 16); + ret = afs_extract_data(call, false); if (ret < 0) return ret; /* TODO: Examine capabilities */ - call->count -= count; - if (call->count > 0) - goto again; - call->offset = 0; call->unmarshall++; 
break; } - call->reply[0] = (void *)(unsigned long)call->service_id; - _leave(" = 0 [done]"); return 0; } +static void afs_destroy_vl_get_capabilities(struct afs_call *call) +{ + struct afs_vlserver *server = call->reply[0]; + + afs_put_vlserver(call->net, server); + afs_flat_call_destructor(call); +} + /* * VL.GetCapabilities operation type */ @@ -373,11 +367,12 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { .name = "VL.GetCapabilities", .op = afs_VL_GetCapabilities, .deliver = afs_deliver_vl_get_capabilities, - .destructor = afs_flat_call_destructor, + .done = afs_vlserver_probe_result, + .destructor = afs_destroy_vl_get_capabilities, }; /* - * Probe a fileserver for the capabilities that it supports. This can + * Probe a volume server for the capabilities that it supports. This can * return up to 196 words. * * We use this to probe for service upgrade to determine what the server at the @@ -385,7 +380,10 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { */ int afs_vl_get_capabilities(struct afs_net *net, struct afs_addr_cursor *ac, - struct key *key) + struct key *key, + struct afs_vlserver *server, + unsigned int server_index, + bool async) { struct afs_call *call; __be32 *bp; @@ -397,9 +395,10 @@ int afs_vl_get_capabilities(struct afs_net *net, return -ENOMEM; call->key = key; - call->upgrade = true; /* Let's see if this is a YFS server */ - call->reply[0] = (void *)VLGETCAPABILITIES; - call->ret_reply0 = true; + call->reply[0] = afs_get_vlserver(server); + call->reply[1] = (void *)(long)server_index; + call->upgrade = true; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -407,7 +406,7 @@ int afs_vl_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_vl_call(call); - return afs_make_call(ac, call, GFP_KERNEL, false); + return afs_make_call(ac, call, GFP_KERNEL, async); } /* @@ -426,22 +425,19 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call 
*call) u32 uniquifier, size; int ret; - _enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2); + _enter("{%u,%zu,%u}", + call->unmarshall, iov_iter_count(call->_iter), call->count2); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_buf(call, sizeof(uuid_t) + 3 * sizeof(__be32)); call->unmarshall = 1; /* Extract the returned uuid, uniquifier, fsEndpoints count and * either the first fsEndpoint type or the volEndpoints * count if there are no fsEndpoints. */ case 1: - ret = afs_extract_data(call, call->buffer, - sizeof(uuid_t) + - 3 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -451,22 +447,19 @@ again: call->count2 = ntohl(*bp); /* Type or next count */ if (call->count > YFS_MAXENDPOINTS) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt_num); alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); if (!alist) return -ENOMEM; alist->version = uniquifier; call->reply[0] = alist; - call->offset = 0; if (call->count == 0) goto extract_volendpoints; - call->unmarshall = 2; - - /* Extract fsEndpoints[] entries */ - case 2: + next_fsendpoint: switch (call->count2) { case YFS_ENDPOINT_IPV4: size = sizeof(__be32) * (1 + 1 + 1); @@ -475,11 +468,17 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt_type); } size += sizeof(__be32); - ret = afs_extract_data(call, call->buffer, size, true); + afs_extract_to_buf(call, size); + call->unmarshall = 2; + + /* Extract fsEndpoints[] entries */ + case 2: + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -488,18 +487,21 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + 
afs_eproto_yvl_fsendpt4_len); afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt6_len); afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); bp += 6; break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt_type); } /* Got either the type of the next entry or the count of @@ -507,10 +509,9 @@ again: */ call->count2 = ntohl(*bp++); - call->offset = 0; call->count--; if (call->count > 0) - goto again; + goto next_fsendpoint; extract_volendpoints: /* Extract the list of volEndpoints. */ @@ -518,8 +519,10 @@ again: if (!call->count) goto end; if (call->count > YFS_MAXENDPOINTS) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt_type); + afs_extract_to_buf(call, 1 * sizeof(__be32)); call->unmarshall = 3; /* Extract the type of volEndpoints[0]. Normally we would @@ -527,17 +530,14 @@ again: * data of the current one, but this is the first... 
*/ case 3: - ret = afs_extract_data(call, call->buffer, sizeof(__be32), true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; bp = call->buffer; - call->count2 = ntohl(*bp++); - call->offset = 0; - call->unmarshall = 4; - /* Extract volEndpoints[] entries */ - case 4: + next_volendpoint: + call->count2 = ntohl(*bp++); switch (call->count2) { case YFS_ENDPOINT_IPV4: size = sizeof(__be32) * (1 + 1 + 1); @@ -546,12 +546,18 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt_type); } if (call->count > 1) - size += sizeof(__be32); - ret = afs_extract_data(call, call->buffer, size, true); + size += sizeof(__be32); /* Get next type too */ + afs_extract_to_buf(call, size); + call->unmarshall = 4; + + /* Extract volEndpoints[] entries */ + case 4: + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -559,34 +565,35 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt4_len); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt6_len); bp += 6; break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt_type); } /* Got either the type of the next entry or the count of * volEndpoints if no more fsEndpoints. 
*/ - call->offset = 0; call->count--; - if (call->count > 0) { - call->count2 = ntohl(*bp++); - goto again; - } + if (call->count > 0) + goto next_volendpoint; end: + afs_extract_discard(call, 0); call->unmarshall = 5; /* Done */ case 5: - ret = afs_extract_data(call, call->buffer, 0, false); + ret = afs_extract_data(call, false); if (ret < 0) return ret; call->unmarshall = 6; @@ -596,11 +603,6 @@ again: } alist = call->reply[0]; - - /* Start with IPv6 if available. */ - if (alist->nr_ipv4 < alist->nr_addrs) - alist->index = alist->nr_ipv4; - _leave(" = 0 [done]"); return 0; } @@ -619,12 +621,11 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { * Dispatch an operation to get the addresses for a server, where the server is * nominated by UUID. */ -struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, - struct afs_addr_cursor *ac, - struct key *key, +struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_call *call; + struct afs_net *net = vc->cell->net; __be32 *bp; _enter(""); @@ -635,7 +636,7 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, if (!call) return ERR_PTR(-ENOMEM); - call->key = key; + call->key = vc->key; call->reply[0] = NULL; call->ret_reply0 = true; @@ -646,5 +647,5 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ trace_afs_make_vl_call(call); - return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); + return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 3037bd01f617..00975ed3640f 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -74,55 +74,19 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, const char *volname, size_t volnamesz) { - struct afs_addr_cursor ac; - struct afs_vldb_entry *vldb; + struct afs_vldb_entry *vldb = ERR_PTR(-EDESTADDRREQ); + struct 
afs_vl_cursor vc; int ret; - ret = afs_set_vl_cursor(&ac, cell); - if (ret < 0) - return ERR_PTR(ret); - - while (afs_iterate_addresses(&ac)) { - if (!test_bit(ac.index, &ac.alist->probed)) { - ret = afs_vl_get_capabilities(cell->net, &ac, key); - switch (ret) { - case VL_SERVICE: - clear_bit(ac.index, &ac.alist->yfs); - set_bit(ac.index, &ac.alist->probed); - ac.addr->srx_service = ret; - break; - case YFS_VL_SERVICE: - set_bit(ac.index, &ac.alist->yfs); - set_bit(ac.index, &ac.alist->probed); - ac.addr->srx_service = ret; - break; - } - } - - vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key, - volname, volnamesz); - switch (ac.error) { - case 0: - afs_end_cursor(&ac); - return vldb; - case -ECONNABORTED: - ac.error = afs_abort_to_error(ac.abort_code); - goto error; - case -ENOMEM: - case -ENONET: - goto error; - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - break; - default: - ac.error = -EIO; - goto error; - } + if (!afs_begin_vlserver_operation(&vc, cell, key)) + return ERR_PTR(-ERESTARTSYS); + + while (afs_select_vlserver(&vc)) { + vldb = afs_vl_get_entry_by_name_u(&vc, volname, volnamesz); } -error: - return ERR_PTR(afs_end_cursor(&ac)); + ret = afs_end_vlserver_operation(&vc); + return ret < 0 ? ERR_PTR(ret) : vldb; } /* @@ -270,7 +234,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) /* We look up an ID by passing it as a decimal string in the * operation's name parameter. 
*/ - idsz = sprintf(idbuf, "%u", volume->vid); + idsz = sprintf(idbuf, "%llu", volume->vid); vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); if (IS_ERR(vldb)) { diff --git a/fs/afs/write.c b/fs/afs/write.c index 19c04caf3c01..72efcfcf9f95 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -33,10 +33,21 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, loff_t pos, unsigned int len, struct page *page) { struct afs_read *req; + size_t p; + void *data; int ret; _enter(",,%llu", (unsigned long long)pos); + if (pos >= vnode->vfs_inode.i_size) { + p = pos & ~PAGE_MASK; + ASSERTCMP(p + len, <=, PAGE_SIZE); + data = kmap(page); + memset(data + p, 0, len); + kunmap(page); + return 0; + } + req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *), GFP_KERNEL); if (!req) @@ -81,7 +92,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; int ret; - _enter("{%x:%u},{%lx},%u,%u", + _enter("{%llx:%llu},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); /* We want to store information about how much of a page is altered in @@ -181,7 +192,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, loff_t i_size, maybe_i_size; int ret; - _enter("{%x:%u},{%lx}", + _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); maybe_i_size = pos + copied; @@ -230,7 +241,7 @@ static void afs_kill_pages(struct address_space *mapping, struct pagevec pv; unsigned count, loop; - _enter("{%x:%u},%lx-%lx", + _enter("{%llx:%llu},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv); @@ -272,7 +283,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, struct pagevec pv; unsigned count, loop; - _enter("{%x:%u},%lx-%lx", + _enter("{%llx:%llu},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv); @@ -314,7 +325,7 @@ static int afs_store_data(struct address_space *mapping, struct list_head *p; int ret = 
-ENOKEY, ret2; - _enter("%s{%x:%u.%u},%lx,%lx,%x,%x", + _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -533,6 +544,7 @@ no_more: case -ENOENT: case -ENOMEDIUM: case -ENXIO: + trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); afs_kill_pages(mapping, first, last); mapping_set_error(mapping, ret); break; @@ -675,7 +687,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) unsigned count, loop; pgoff_t first = call->first, last = call->last; - _enter("{%x:%u},{%lx-%lx}", + _enter("{%llx:%llu},{%lx-%lx}", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv); @@ -714,7 +726,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) ssize_t result; size_t count = iov_iter_count(from); - _enter("{%x.%u},{%zu},", + _enter("{%llx:%llu},{%zu},", vnode->fid.vid, vnode->fid.vnode, count); if (IS_SWAPFILE(&vnode->vfs_inode)) { @@ -742,7 +754,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); - _enter("{%x:%u},{n=%pD},%d", + _enter("{%llx:%llu},{n=%pD},%d", vnode->fid.vid, vnode->fid.vnode, file, datasync); @@ -760,7 +772,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; - _enter("{{%x:%u}},{%lx}", + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, vmf->page->index); sb_start_pagefault(inode->i_sb); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index cfcc674e64a5..a2cdf25573e2 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -72,7 +72,7 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler, char text[8 + 1 + 8 + 1 + 8 + 1]; size_t len; - len = sprintf(text, "%x:%x:%x", + len = sprintf(text, "%llx:%llx:%x", vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); if (size == 0) return len; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c new file mode 100644 
index 000000000000..12658c1363ae --- /dev/null +++ b/fs/afs/yfsclient.c @@ -0,0 +1,2184 @@ +/* YFS File Server client stubs + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/circ_buf.h> +#include <linux/iversion.h> +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" +#include "protocol_yfs.h" + +static const struct afs_fid afs_zero_fid; + +static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) +{ + call->cbi = afs_get_cb_interest(cbi); +} + +#define xdr_size(x) (sizeof(*x) / sizeof(__be32)) + +static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid) +{ + const struct yfs_xdr_YFSFid *x = (const void *)*_bp; + + fid->vid = xdr_to_u64(x->volume); + fid->vnode = xdr_to_u64(x->vnode.lo); + fid->vnode_hi = ntohl(x->vnode.hi); + fid->unique = ntohl(x->vnode.unique); + *_bp += xdr_size(x); +} + +static __be32 *xdr_encode_u32(__be32 *bp, u32 n) +{ + *bp++ = htonl(n); + return bp; +} + +static __be32 *xdr_encode_u64(__be32 *bp, u64 n) +{ + struct yfs_xdr_u64 *x = (void *)bp; + + *x = u64_to_xdr(n); + return bp + xdr_size(x); +} + +static __be32 *xdr_encode_YFSFid(__be32 *bp, struct afs_fid *fid) +{ + struct yfs_xdr_YFSFid *x = (void *)bp; + + x->volume = u64_to_xdr(fid->vid); + x->vnode.lo = u64_to_xdr(fid->vnode); + x->vnode.hi = htonl(fid->vnode_hi); + x->vnode.unique = htonl(fid->unique); + return bp + xdr_size(x); +} + +static size_t xdr_strlen(unsigned int len) +{ + return sizeof(__be32) + round_up(len, sizeof(__be32)); +} + +static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned 
int len) +{ + bp = xdr_encode_u32(bp, len); + bp = memcpy(bp, p, len); + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + memset((u8 *)bp + len, 0, pad); + len += pad; + } + + return bp + len / sizeof(__be32); +} + +static s64 linux_to_yfs_time(const struct timespec64 *t) +{ + /* Convert to 100ns intervals. */ + return (u64)t->tv_sec * 10000000 + t->tv_nsec/100; +} + +static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + + x->mask = htonl(AFS_SET_MODE); + x->mode = htonl(mode & S_IALLUGO); + x->mtime_client = u64_to_xdr(0); + x->owner = u64_to_xdr(0); + x->group = u64_to_xdr(0); + return bp + xdr_size(x); +} + +static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + s64 mtime = linux_to_yfs_time(t); + + x->mask = htonl(AFS_SET_MTIME); + x->mode = htonl(0); + x->mtime_client = u64_to_xdr(mtime); + x->owner = u64_to_xdr(0); + x->group = u64_to_xdr(0); + return bp + xdr_size(x); +} + +/* + * Convert a signed 100ns-resolution 64-bit time into a timespec. 
+ */ +static struct timespec64 yfs_time_to_linux(s64 t) +{ + struct timespec64 ts; + u64 abs_t; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + abs_t = -t; + ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -abs_t; + } else { + abs_t = t; + ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100; + ts.tv_sec = abs_t; + } + + return ts; +} + +static struct timespec64 xdr_to_time(const struct yfs_xdr_u64 xdr) +{ + s64 t = xdr_to_u64(xdr); + + return yfs_time_to_linux(t); +} + +static void yfs_check_req(struct afs_call *call, __be32 *bp) +{ + size_t len = (void *)bp - call->request; + + if (len > call->request_size) + pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n", + call->type->name, len, call->request_size); + else if (len < call->request_size) + pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n", + call->type->name, len, call->request_size); +} + +/* + * Dump a bad file status record. 
+ */ +static void xdr_dump_bad(const __be32 *bp) +{ + __be32 x[4]; + int i; + + pr_notice("YFS XDR: Bad status record\n"); + for (i = 0; i < 5 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 4); + pr_notice("0x50: %08x\n", ntohl(x[0])); +} + +/* + * Decode a YFSFetchStatus block + */ +static int xdr_decode_YFSFetchStatus(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp; + u32 type; + u8 flags = 0; + + status->abort_code = ntohl(xdr->abort_code); + if (status->abort_code != 0) { + if (vnode && status->abort_code == VNOVNODE) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + status->nlink = 0; + __afs_break_callback(vnode); + } + return 0; + } + + type = ntohl(xdr->type); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + if (type != status->type && + vnode && + !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + status->type, type); + goto bad; + } + status->type = type; + break; + default: + goto bad; + } + +#define EXTRACT_M4(FIELD) \ + do { \ + u32 x = ntohl(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) + +#define EXTRACT_M8(FIELD) \ + do { \ + u64 x = xdr_to_u64(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) + +#define EXTRACT_D8(FIELD) \ + do { \ + u64 x = xdr_to_u64(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_DATA_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) + + EXTRACT_M4(nlink); + EXTRACT_D8(size); + 
EXTRACT_D8(data_version); + EXTRACT_M8(author); + EXTRACT_M8(owner); + EXTRACT_M8(group); + EXTRACT_M4(mode); + EXTRACT_M4(caller_access); /* call ticket dependent */ + EXTRACT_M4(anon_access); + + status->mtime_client = xdr_to_time(xdr->mtime_client); + status->mtime_server = xdr_to_time(xdr->mtime_server); + status->lock_count = ntohl(xdr->lock_count); + + if (read_req) { + read_req->data_version = status->data_version; + read_req->file_size = status->size; + } + + *_bp += xdr_size(xdr); + + if (vnode) { + if (test_bit(AFS_VNODE_UNSET, &vnode->flags)) + flags |= AFS_VNODE_NOT_YET_SET; + afs_update_inode_from_status(vnode, status, expected_version, + flags); + } + + return 0; + +bad: + xdr_dump_bad(*_bp); + return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); +} + +/* + * Decode the file status. We need to lock the target vnode if we're going to + * update its status so that stat() sees the attributes update atomically. + */ +static int yfs_decode_status(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + int ret; + + if (!vnode) + return xdr_decode_YFSFetchStatus(call, _bp, status, vnode, + expected_version, read_req); + + write_seqlock(&vnode->cb_lock); + ret = xdr_decode_YFSFetchStatus(call, _bp, status, vnode, + expected_version, read_req); + write_sequnlock(&vnode->cb_lock); + return ret; +} + +/* + * Decode a YFSCallBack block + */ +static void xdr_decode_YFSCallBack(struct afs_call *call, + struct afs_vnode *vnode, + const __be32 **_bp) +{ + struct yfs_xdr_YFSCallBack *xdr = (void *)*_bp; + struct afs_cb_interest *old, *cbi = call->cbi; + u64 cb_expiry; + + write_seqlock(&vnode->cb_lock); + + if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) { + cb_expiry = xdr_to_u64(xdr->expiration_time); + do_div(cb_expiry, 10 * 1000 * 1000); + vnode->cb_version = ntohl(xdr->version); + vnode->cb_type = ntohl(xdr->type); + 
vnode->cb_expires_at = cb_expiry + ktime_get_real_seconds(); + old = vnode->cb_interest; + if (old != call->cbi) { + vnode->cb_interest = cbi; + cbi = old; + } + set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } + + write_sequnlock(&vnode->cb_lock); + call->cbi = cbi; + *_bp += xdr_size(xdr); +} + +static void xdr_decode_YFSCallBack_raw(const __be32 **_bp, + struct afs_callback *cb) +{ + struct yfs_xdr_YFSCallBack *x = (void *)*_bp; + u64 cb_expiry; + + cb_expiry = xdr_to_u64(x->expiration_time); + do_div(cb_expiry, 10 * 1000 * 1000); + cb->version = ntohl(x->version); + cb->type = ntohl(x->type); + cb->expires_at = cb_expiry + ktime_get_real_seconds(); + + *_bp += xdr_size(x); +} + +/* + * Decode a YFSVolSync block + */ +static void xdr_decode_YFSVolSync(const __be32 **_bp, + struct afs_volsync *volsync) +{ + struct yfs_xdr_YFSVolSync *x = (void *)*_bp; + u64 creation; + + if (volsync) { + creation = xdr_to_u64(x->vol_creation_date); + do_div(creation, 10 * 1000 * 1000); + volsync->creation = creation; + } + + *_bp += xdr_size(x); +} + +/* + * Encode the requested attributes into a YFSStoreStatus block + */ +static __be32 *xdr_encode_YFS_StoreStatus(__be32 *bp, struct iattr *attr) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + s64 mtime = 0, owner = 0, group = 0; + u32 mask = 0, mode = 0; + + mask = 0; + if (attr->ia_valid & ATTR_MTIME) { + mask |= AFS_SET_MTIME; + mtime = linux_to_yfs_time(&attr->ia_mtime); + } + + if (attr->ia_valid & ATTR_UID) { + mask |= AFS_SET_OWNER; + owner = from_kuid(&init_user_ns, attr->ia_uid); + } + + if (attr->ia_valid & ATTR_GID) { + mask |= AFS_SET_GROUP; + group = from_kgid(&init_user_ns, attr->ia_gid); + } + + if (attr->ia_valid & ATTR_MODE) { + mask |= AFS_SET_MODE; + mode = attr->ia_mode & S_IALLUGO; + } + + x->mask = htonl(mask); + x->mode = htonl(mode); + x->mtime_client = u64_to_xdr(mtime); + x->owner = u64_to_xdr(owner); + x->group = u64_to_xdr(group); + return bp + xdr_size(x); +} + +/* + * Decode a 
YFSFetchVolumeStatus block. + */ +static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp, + struct afs_volume_status *vs) +{ + const struct yfs_xdr_YFSFetchVolumeStatus *x = (const void *)*_bp; + u32 flags; + + vs->vid = xdr_to_u64(x->vid); + vs->parent_id = xdr_to_u64(x->parent_id); + flags = ntohl(x->flags); + vs->online = flags & yfs_FVSOnline; + vs->in_service = flags & yfs_FVSInservice; + vs->blessed = flags & yfs_FVSBlessed; + vs->needs_salvage = flags & yfs_FVSNeedsSalvage; + vs->type = ntohl(x->type); + vs->min_quota = 0; + vs->max_quota = xdr_to_u64(x->max_quota); + vs->blocks_in_use = xdr_to_u64(x->blocks_in_use); + vs->part_blocks_avail = xdr_to_u64(x->part_blocks_avail); + vs->part_max_blocks = xdr_to_u64(x->part_max_blocks); + vs->vol_copy_date = xdr_to_u64(x->vol_copy_date); + vs->vol_backup_date = xdr_to_u64(x->vol_backup_date); + *_bp += sizeof(*x) / sizeof(__be32); +} + +/* + * deliver reply data to an FS.FetchStatus + */ +static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack(call, vnode, &bp); + xdr_decode_YFSVolSync(&bp, call->reply[1]); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.FetchStatus operation type + */ +static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = { + .name = "YFS.FetchStatus(vnode)", + .op = yfs_FS_FetchStatus, + .deliver = yfs_deliver_fs_fetch_status_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a file. 
+ */ +int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync, + bool new_inode) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus_vnode, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = volsync; + call->expected_version = new_inode ? 1 : vnode->status.data_version; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an YFS.FetchData64. 
+ */ +static int yfs_deliver_fs_fetch_data64(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + struct afs_read *req = call->reply[2]; + const __be32 *bp; + unsigned int size; + int ret; + + _enter("{%u,%zu/%llu}", + call->unmarshall, iov_iter_count(&call->iter), req->actual_len); + + switch (call->unmarshall) { + case 0: + req->actual_len = 0; + req->index = 0; + req->offset = req->pos & (PAGE_SIZE - 1); + afs_extract_to_tmp64(call); + call->unmarshall++; + + /* extract the returned data length */ + case 1: + _debug("extract data length"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + req->actual_len = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", req->actual_len); + req->remain = min(req->len, req->actual_len); + if (req->remain == 0) + goto no_more_data; + + call->unmarshall++; + + begin_page: + ASSERTCMP(req->index, <, req->nr_pages); + if (req->remain > PAGE_SIZE - req->offset) + size = PAGE_SIZE - req->offset; + else + size = req->remain; + call->bvec[0].bv_len = size; + call->bvec[0].bv_offset = req->offset; + call->bvec[0].bv_page = req->pages[req->index]; + iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); + ASSERTCMP(size, <=, PAGE_SIZE); + + /* extract the returned data */ + case 2: + _debug("extract data %zu/%llu", + iov_iter_count(&call->iter), req->remain); + + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + req->remain -= call->bvec[0].bv_len; + req->offset += call->bvec[0].bv_len; + ASSERTCMP(req->offset, <=, PAGE_SIZE); + if (req->offset == PAGE_SIZE) { + req->offset = 0; + if (req->page_done) + req->page_done(call, req); + req->index++; + if (req->remain > 0) + goto begin_page; + } + + ASSERTCMP(req->remain, ==, 0); + if (req->actual_len <= req->len) + goto no_more_data; + + /* Discard any excess data the server gave us */ + iov_iter_discard(&call->iter, READ, req->actual_len - req->len); + call->unmarshall = 3; + case 3: + _debug("extract discard %zu/%llu", + 
iov_iter_count(&call->iter), req->actual_len - req->len); + + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + no_more_data: + call->unmarshall = 4; + afs_extract_to_buf(call, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + /* extract the metadata */ + case 4: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack(call, vnode, &bp); + xdr_decode_YFSVolSync(&bp, call->reply[1]); + + call->unmarshall++; + + case 5: + break; + } + + for (; req->index < req->nr_pages; req->index++) { + if (req->offset < PAGE_SIZE) + zero_user_segment(req->pages[req->index], + req->offset, PAGE_SIZE); + if (req->page_done) + req->page_done(call, req); + req->offset = 0; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_fetch_data_destructor(struct afs_call *call) +{ + struct afs_read *req = call->reply[2]; + + afs_put_read(req); + afs_flat_call_destructor(call); +} + +/* + * YFS.FetchData64 operation type + */ +static const struct afs_call_type yfs_RXYFSFetchData64 = { + .name = "YFS.FetchData64", + .op = yfs_FS_FetchData64, + .deliver = yfs_deliver_fs_fetch_data64, + .destructor = yfs_fetch_data_destructor, +}; + +/* + * Fetch data from a file. 
+ */ +int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(",%x,{%llx:%llu},%llx,%llx", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode, + req->pos, req->len); + + call = afs_alloc_flat_call(net, &yfs_RXYFSFetchData64, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_u64) * 2, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = NULL; /* volsync */ + call->reply[2] = req; + call->expected_version = vnode->status.data_version; + call->want_reply_time = true; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_u64(bp, req->pos); + bp = xdr_encode_u64(bp, req->len); + yfs_check_req(call, bp); + + refcount_inc(&req->usage); + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data for YFS.CreateFile or YFS.MakeDir. 
+ */ +static int yfs_deliver_fs_create_vnode(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFid(&bp, call->reply[1]); + ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack_raw(&bp, call->reply[3]); + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.CreateFile and FS.MakeDir operation type + */ +static const struct afs_call_type afs_RXFSCreateFile = { + .name = "YFS.CreateFile", + .op = yfs_FS_CreateFile, + .deliver = yfs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Create a file. 
+ */ +int yfs_fs_create_file(struct afs_fs_cursor *fc, + const char *name, + umode_t mode, + u64 current_data_version, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + size_t namesz, reqsz, rplsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + reqsz = (sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(__be32)); + rplsz = (sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + call = afs_alloc_flat_call(net, &afs_RXFSCreateFile, reqsz, rplsz); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->reply[3] = newcb; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSCREATEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSStoreStatus_mode(bp, mode); + bp = xdr_encode_u32(bp, 0); /* ViceLockType */ + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +static const struct afs_call_type yfs_RXFSMakeDir = { + .name = "YFS.MakeDir", + .op = yfs_FS_MakeDir, + .deliver = yfs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Make a directory. 
+ */ +int yfs_fs_make_dir(struct afs_fs_cursor *fc, + const char *name, + umode_t mode, + u64 current_data_version, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + size_t namesz, reqsz, rplsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + reqsz = (sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + sizeof(struct yfs_xdr_YFSStoreStatus)); + rplsz = (sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + call = afs_alloc_flat_call(net, &yfs_RXFSMakeDir, reqsz, rplsz); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->reply[3] = newcb; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSMAKEDIR); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSStoreStatus_mode(bp, mode); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.RemoveFile2 operation. 
+ */ +static int yfs_deliver_fs_remove_file2(struct afs_call *call) +{ + struct afs_vnode *dvnode = call->reply[0]; + struct afs_vnode *vnode = call->reply[1]; + struct afs_fid fid; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + + xdr_decode_YFSFid(&bp, &fid); + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL); + if (ret < 0) + return ret; + /* Was deleted if vnode->status.abort_code == VNOVNODE. */ + + xdr_decode_YFSVolSync(&bp, NULL); + return 0; +} + +/* + * YFS.RemoveFile2 operation type. + */ +static const struct afs_call_type yfs_RXYFSRemoveFile2 = { + .name = "YFS.RemoveFile2", + .op = yfs_FS_RemoveFile2, + .deliver = yfs_deliver_fs_remove_file2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a file and retrieve new file status. 
+ */ +int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, u64 current_data_version) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(dvnode); + size_t namesz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + + call = afs_alloc_flat_call(net, &yfs_RXYFSRemoveFile2, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSREMOVEFILE2); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.RemoveFile or YFS.RemoveDir operation. + */ +static int yfs_deliver_fs_remove(struct afs_call *call) +{ + struct afs_vnode *dvnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + + xdr_decode_YFSVolSync(&bp, NULL); + return 0; +} + +/* + * FS.RemoveDir and FS.RemoveFile operation types. 
+ */ +static const struct afs_call_type yfs_RXYFSRemoveFile = { + .name = "YFS.RemoveFile", + .op = yfs_FS_RemoveFile, + .deliver = yfs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type yfs_RXYFSRemoveDir = { + .name = "YFS.RemoveDir", + .op = yfs_FS_RemoveDir, + .deliver = yfs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +/* + * remove a file or directory + */ +int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, bool isdir, u64 current_data_version) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(dvnode); + size_t namesz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + call = afs_alloc_flat_call( + net, isdir ? &yfs_RXYFSRemoveDir : &yfs_RXYFSRemoveFile, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, isdir ? YFSREMOVEDIR : YFSREMOVEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.Link operation. 
+ */ +static int yfs_deliver_fs_link(struct afs_call *call) +{ + struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL); + if (ret < 0) + return ret; + ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Link operation type. + */ +static const struct afs_call_type yfs_RXYFSLink = { + .name = "YFS.Link", + .op = yfs_FS_Link, + .deliver = yfs_deliver_fs_link, + .destructor = afs_flat_call_destructor, +}; + +/* + * Make a hard link. + */ +int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, u64 current_data_version) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + size_t namesz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + call = afs_alloc_flat_call(net, &yfs_RXYFSLink, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSLINK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + 
afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.Symlink operation. + */ +static int yfs_deliver_fs_symlink(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFid(&bp, call->reply[1]); + ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Symlink operation type + */ +static const struct afs_call_type yfs_RXYFSSymlink = { + .name = "YFS.Symlink", + .op = yfs_FS_Symlink, + .deliver = yfs_deliver_fs_symlink, + .destructor = afs_flat_call_destructor, +}; + +/* + * Create a symbolic link. 
+ */ +int yfs_fs_symlink(struct afs_fs_cursor *fc, + const char *name, + const char *contents, + u64 current_data_version, + struct afs_fid *newfid, + struct afs_file_status *newstatus) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(dvnode); + size_t namesz, contents_sz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + contents_sz = strlen(contents); + call = afs_alloc_flat_call(net, &yfs_RXYFSSymlink, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + xdr_strlen(contents_sz) + + sizeof(struct yfs_xdr_YFSStoreStatus), + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSYMLINK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_string(bp, contents, contents_sz); + bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.Rename operation. 
+ */ +static int yfs_deliver_fs_rename(struct afs_call *call) +{ + struct afs_vnode *orig_dvnode = call->reply[0]; + struct afs_vnode *new_dvnode = call->reply[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + if (new_dvnode != orig_dvnode) { + ret = yfs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL); + if (ret < 0) + return ret; + } + + xdr_decode_YFSVolSync(&bp, NULL); + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Rename operation type + */ +static const struct afs_call_type yfs_RXYFSRename = { + .name = "FS.Rename", + .op = yfs_FS_Rename, + .deliver = yfs_deliver_fs_rename, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory. 
+ */ +int yfs_fs_rename(struct afs_fs_cursor *fc, + const char *orig_name, + struct afs_vnode *new_dvnode, + const char *new_name, + u64 current_orig_data_version, + u64 current_new_data_version) +{ + struct afs_vnode *orig_dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(orig_dvnode); + size_t o_namesz, n_namesz; + __be32 *bp; + + _enter(""); + + o_namesz = strlen(orig_name); + n_namesz = strlen(new_name); + call = afs_alloc_flat_call(net, &yfs_RXYFSRename, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(o_namesz) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(n_namesz), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = orig_dvnode; + call->reply[1] = new_dvnode; + call->expected_version = current_orig_data_version + 1; + call->expected_version_2 = current_new_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvnode->fid); + bp = xdr_encode_string(bp, orig_name, o_namesz); + bp = xdr_encode_YFSFid(bp, &new_dvnode->fid); + bp = xdr_encode_string(bp, new_name, n_namesz); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &orig_dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.StoreData64 operation. 
+ */ +static int yfs_deliver_fs_store_data(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter(""); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + afs_pages_written_back(vnode, call); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.StoreData64 operation type. + */ +static const struct afs_call_type yfs_RXYFSStoreData64 = { + .name = "YFS.StoreData64", + .op = yfs_FS_StoreData64, + .deliver = yfs_deliver_fs_store_data, + .destructor = afs_flat_call_destructor, +}; + +/* + * Store a set of pages to a large file. + */ +int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + loff_t size, pos, i_size; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + size = (loff_t)to - (loff_t)offset; + if (first != last) + size += (loff_t)(last - first) << PAGE_SHIFT; + pos = (loff_t)first << PAGE_SHIFT; + pos += offset; + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + size > i_size) + i_size = size + pos; + + _debug("size %llx, at %llx, i_size %llx", + (unsigned long long)size, (unsigned long long)pos, + (unsigned long long)i_size); + + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64, + sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(struct yfs_xdr_u64) * 3, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->mapping = mapping; + 
call->reply[0] = vnode; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->expected_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSStoreStatus_mtime(bp, &vnode->vfs_inode.i_mtime); + bp = xdr_encode_u64(bp, pos); + bp = xdr_encode_u64(bp, size); + bp = xdr_encode_u64(bp, i_size); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * deliver reply data to an FS.StoreStatus + */ +static int yfs_deliver_fs_store_status(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter(""); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.StoreStatus operation type + */ +static const struct afs_call_type yfs_RXYFSStoreStatus = { + .name = "YFS.StoreStatus", + .op = yfs_FS_StoreStatus, + .deliver = yfs_deliver_fs_store_status, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = { + .name = "YFS.StoreData64", + .op = yfs_FS_StoreData64, + .deliver = yfs_deliver_fs_store_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Set the attributes on a file, using YFS.StoreData64 rather than + * YFS.StoreStatus so as to alter the file size also. 
+ */ +static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64_as_Status, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(struct yfs_xdr_u64) * 3, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->expected_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFS_StoreStatus(bp, attr); + bp = xdr_encode_u64(bp, 0); /* position of start of write */ + bp = xdr_encode_u64(bp, 0); /* size of write */ + bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Set the attributes on a file, using YFS.StoreData64 if there's a change in + * file size, and YFS.StoreStatus otherwise. 
+ */ +int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + if (attr->ia_valid & ATTR_SIZE) + return yfs_fs_setattr_size(fc, attr); + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->expected_version = vnode->status.data_version; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTORESTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFS_StoreStatus(bp, attr); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.GetVolumeStatus operation. 
+ */ +static int yfs_deliver_fs_get_volume_status(struct afs_call *call) +{ + const __be32 *bp; + char *p; + u32 size; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->unmarshall++; + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus)); + + /* extract the returned status record */ + case 1: + _debug("extract status"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchVolumeStatus(&bp, call->reply[1]); + call->unmarshall++; + afs_extract_to_tmp(call); + + /* extract the volume name length */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("volname length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_volname_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); + call->unmarshall++; + + /* extract the volume name */ + case 3: + _debug("extract volname"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + p = call->reply[2]; + p[call->count] = 0; + _debug("volname '%s'", p); + afs_extract_to_tmp(call); + call->unmarshall++; + + /* extract the offline message length */ + case 4: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("offline msg length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_offline_msg_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); + call->unmarshall++; + + /* extract the offline message */ + case 5: + _debug("extract offline"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + p = call->reply[2]; + p[call->count] = 0; + _debug("offline '%s'", p); + + afs_extract_to_tmp(call); + call->unmarshall++; + + /* extract 
the message of the day length */ + case 6: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("motd length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_motd_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); + call->unmarshall++; + + /* extract the message of the day */ + case 7: + _debug("extract motd"); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + p = call->reply[2]; + p[call->count] = 0; + _debug("motd '%s'", p); + + call->unmarshall++; + + case 8: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * Destroy a YFS.GetVolumeStatus call. + */ +static void yfs_get_volume_status_call_destructor(struct afs_call *call) +{ + kfree(call->reply[2]); + call->reply[2] = NULL; + afs_flat_call_destructor(call); +} + +/* + * YFS.GetVolumeStatus operation type + */ +static const struct afs_call_type yfs_RXYFSGetVolumeStatus = { + .name = "YFS.GetVolumeStatus", + .op = yfs_FS_GetVolumeStatus, + .deliver = yfs_deliver_fs_get_volume_status, + .destructor = yfs_get_volume_status_call_destructor, +}; + +/* + * fetch the status of a volume + */ +int yfs_fs_get_volume_status(struct afs_fs_cursor *fc, + struct afs_volume_status *vs) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + void *tmpbuf; + + _enter(""); + + tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + + call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_u64), + sizeof(struct yfs_xdr_YFSFetchVolumeStatus) + + sizeof(__be32)); + if (!call) { + kfree(tmpbuf); + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = vs; + call->reply[2] = tmpbuf; + + /* marshall the parameters */ + bp = call->request; + bp = 
xdr_encode_u32(bp, YFSGETVOLUMESTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_u64(bp, vnode->fid.vid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an YFS.SetLock, YFS.ExtendLock or YFS.ReleaseLock + */ +static int yfs_deliver_fs_xxxx_lock(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.SetLock operation type + */ +static const struct afs_call_type yfs_RXYFSSetLock = { + .name = "YFS.SetLock", + .op = yfs_FS_SetLock, + .deliver = yfs_deliver_fs_xxxx_lock, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.ExtendLock operation type + */ +static const struct afs_call_type yfs_RXYFSExtendLock = { + .name = "YFS.ExtendLock", + .op = yfs_FS_ExtendLock, + .deliver = yfs_deliver_fs_xxxx_lock, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.ReleaseLock operation type + */ +static const struct afs_call_type yfs_RXYFSReleaseLock = { + .name = "YFS.ReleaseLock", + .op = yfs_FS_ReleaseLock, + .deliver = yfs_deliver_fs_xxxx_lock, + .destructor = afs_flat_call_destructor, +}; + +/* + * Set a lock on a file + */ +int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &yfs_RXYFSSetLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + 
sizeof(__be32), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSETLOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_u32(bp, type); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * extend a lock on a file + */ +int yfs_fs_extend_lock(struct afs_fs_cursor *fc) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &yfs_RXYFSExtendLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSEXTENDLOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * release a lock on a file + */ +int yfs_fs_release_lock(struct afs_fs_cursor *fc) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &yfs_RXYFSReleaseLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + + /* marshall the parameters */ + bp = call->request; + bp = 
xdr_encode_u32(bp, YFSRELEASELOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.FetchStatus with no vnode. + */ +static int yfs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_file_status *status = call->reply[1]; + struct afs_callback *callback = call->reply[2]; + struct afs_volsync *volsync = call->reply[3]; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack_raw(&bp, callback); + xdr_decode_YFSVolSync(&bp, volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.FetchStatus operation type + */ +static const struct afs_call_type yfs_RXYFSFetchStatus = { + .name = "YFS.FetchStatus", + .op = yfs_FS_FetchStatus, + .deliver = yfs_deliver_fs_fetch_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a fid without needing a vnode handle. 
+ */ +int yfs_fs_fetch_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fid, + struct afs_file_status *status, + struct afs_callback *callback, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), fid->vid, fid->vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = status; + call->reply[2] = callback; + call->reply[3] = volsync; + call->expected_version = 1; /* vnode->status.data_version */ + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, fid); + yfs_check_req(call, bp); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an YFS.InlineBulkStatus call + */ +static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, call->count2); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_count); + + 
call->count = 0; + call->unmarshall++; + more_counts: + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus)); + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + statuses = call->reply[1]; + ret = yfs_decode_status(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL); + if (ret < 0) + return ret; + + call->count++; + if (call->count < call->count2) + goto more_counts; + + call->count = 0; + call->unmarshall++; + afs_extract_to_tmp(call); + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_cb_count); + call->count = 0; + call->unmarshall++; + more_cbs: + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack)); + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + bp = call->buffer; + callbacks = call->reply[2]; + xdr_decode_YFSCallBack_raw(&bp, &callbacks[call->count]); + statuses = call->reply[1]; + if (call->count == 0 && vnode && statuses[0].abort_code == 0) { + bp = call->buffer; + xdr_decode_YFSCallBack(call, vnode, &bp); + } + call->count++; + if (call->count < call->count2) + goto more_cbs; + + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); + call->unmarshall++; + + case 5: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSVolSync(&bp, call->reply[3]); + + call->unmarshall++; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type yfs_RXYFSInlineBulkStatus = { + .name = "YFS.InlineBulkStatus", + 
.op = yfs_FS_InlineBulkStatus, + .deliver = yfs_deliver_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 1024 files + */ +int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fids, + struct afs_file_status *statuses, + struct afs_callback *callbacks, + unsigned int nr_fids, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + int i; + + _enter(",%x,{%llx:%llu},%u", + key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); + + call = afs_alloc_flat_call(net, &yfs_RXYFSInlineBulkStatus, + sizeof(__be32) + + sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) * nr_fids, + sizeof(struct yfs_xdr_YFSFetchStatus)); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = statuses; + call->reply[2] = callbacks; + call->reply[3] = volsync; + call->count2 = nr_fids; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPCFlags */ + bp = xdr_encode_u32(bp, nr_fids); + for (i = 0; i < nr_fids; i++) + bp = xdr_encode_YFSFid(bp, &fids[i]); + yfs_check_req(call, bp); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &fids[0]); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 9a69392f1fb3..d81c148682e7 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -350,7 +350,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; - if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || + le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { printf("Superblock is corrupted\n"); goto out1; } @@ -359,9 +360,11 @@ static int 
bfs_fill_super(struct super_block *s, void *data, int silent) sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; imap_len = (info->si_lasti / 8) + 1; - info->si_imap = kzalloc(imap_len, GFP_KERNEL); - if (!info->si_imap) + info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); + if (!info->si_imap) { + printf("Cannot allocate %u bytes\n", imap_len); goto out1; + } for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); diff --git a/fs/block_dev.c b/fs/block_dev.c index 38b8ce05cbc7..a80b4f0ee7c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size = 0; dio->multi_bio = false; - dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + dio->should_dirty = is_read && iter_is_iovec(iter); blk_start_plug(&plug); for (;;) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68ca41dbbef3..80953528572d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3201,9 +3201,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3233,8 +3230,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git 
a/fs/btrfs/file.c b/fs/btrfs/file.c index 97c7a086f7bd..a3c22e16509b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3298,8 +3298,7 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .clone_file_range = btrfs_clone_file_range, - .dedupe_file_range = btrfs_dedupe_file_range, + .remap_file_range = btrfs_remap_file_range, }; void __cold btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a990a9045139..3ca6943827ef 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3629,26 +3629,6 @@ out_unlock: return ret; } -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen) -{ - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. 
- */ - return -EINVAL; - } - - return btrfs_extent_same(src, src_loff, olen, dst, dst_loff); -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -4350,10 +4330,34 @@ out_unlock: return ret; } -int btrfs_clone_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len) +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) { - return btrfs_clone_files(dst_file, src_file, off, len, destoff); + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) { + struct inode *src = file_inode(src_file); + struct inode *dst = file_inode(dst_file); + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + + if (WARN_ON_ONCE(bs < PAGE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + return -EINVAL; + } + + ret = btrfs_extent_same(src, off, len, dst, destoff); + } else { + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + } + return ret < 0 ? 
ret : len; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) diff --git a/fs/buffer.c b/fs/buffer.c index d60d61e8ed7d..1286c2b95498 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,6 +3060,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3079,11 +3084,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - submit_bio(bio); return 0; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f788496fafcc..27cad84dab23 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -615,7 +615,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, more = len < iov_iter_count(to); - if (unlikely(to->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(to))) { ret = iov_iter_get_pages_alloc(to, &pages, len, &page_off); if (ret <= 0) { @@ -662,7 +662,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ret += zlen; } - if (unlikely(to->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(to))) { if (ret > 0) { iov_iter_advance(to, ret); off += ret; @@ -815,7 +815,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, osd_data->bvec_pos.iter.bi_size); iov_iter_advance(&i, rc); @@ -1038,8 +1038,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, - len); + iov_iter_bvec(&i, 
READ, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 3e812428ac8d..ba178b09de0b 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -145,6 +145,58 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); } +static int cifs_debug_files_proc_show(struct seq_file *m, void *v) +{ + struct list_head *stmp, *tmp, *tmp1, *tmp2; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsFileInfo *cfile; + + seq_puts(m, "# Version:1\n"); + seq_puts(m, "# Format:\n"); + seq_puts(m, "# <tree id> <persistent fid> <flags> <count> <pid> <uid>"); +#ifdef CONFIG_CIFS_DEBUG2 + seq_printf(m, " <filename> <mid>\n"); +#else + seq_printf(m, " <filename>\n"); +#endif /* CIFS_DEBUG2 */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(stmp, &cifs_tcp_ses_list) { + server = list_entry(stmp, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + spin_lock(&tcon->open_file_lock); + list_for_each(tmp2, &tcon->openFileList) { + cfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + seq_printf(m, + "0x%x 0x%llx 0x%x %d %d %d %s", + tcon->tid, + cfile->fid.persistent_fid, + cfile->f_flags, + cfile->count, + cfile->pid, + from_kuid(&init_user_ns, cfile->uid), + cfile->dentry->d_name.name); +#ifdef CONFIG_CIFS_DEBUG2 + seq_printf(m, " 0x%llx\n", cfile->fid.mid); +#else + seq_printf(m, "\n"); +#endif /* CIFS_DEBUG2 */ + } + spin_unlock(&tcon->open_file_lock); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + seq_putc(m, '\n'); + return 0; +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp1, *tmp2, *tmp3; @@ -565,6 +617,9 @@ cifs_proc_init(void) 
proc_create_single("DebugData", 0, proc_fs_cifs, cifs_debug_data_proc_show); + proc_create_single("open_files", 0400, proc_fs_cifs, + cifs_debug_files_proc_show); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); @@ -601,6 +656,7 @@ cifs_proc_clean(void) return; remove_proc_entry("DebugData", proc_fs_cifs); + remove_proc_entry("open_files", proc_fs_cifs); remove_proc_entry("cifsFYI", proc_fs_cifs); remove_proc_entry("traceSMB", proc_fs_cifs); remove_proc_entry("Stats", proc_fs_cifs); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index b611fc2e8984..7f01c6e60791 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -147,8 +147,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) sprintf(dp, ";sec=mskrb5"); - else - goto out; + else { + cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); + sprintf(dp, ";sec=krb5"); + } dp = description + strlen(description); sprintf(dp, ";uid=0x%x", diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7de9603c54f1..865706edb307 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -992,17 +992,21 @@ const struct inode_operations cifs_symlink_inode_ops = { .listxattr = cifs_listxattr, }; -static int cifs_clone_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len) +static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); struct cifsFileInfo *smb_file_src = src_file->private_data; - struct cifsFileInfo *smb_file_target = dst_file->private_data; - struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink); + struct cifsFileInfo *smb_file_target; + 
struct cifs_tcon *target_tcon; unsigned int xid; int rc; + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + cifs_dbg(FYI, "clone range\n"); xid = get_xid(); @@ -1013,6 +1017,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, goto out; } + smb_file_target = dst_file->private_data; + target_tcon = tlink_tcon(smb_file_target->tlink); + /* * Note: cifs case is easier than btrfs since server responsible for * checks for proper open modes and file type and if it wants @@ -1042,7 +1049,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, unlock_two_nondirectories(src_inode, target_inode); out: free_xid(xid); - return rc; + return rc < 0 ? rc : len; } ssize_t cifs_file_copychunk_range(unsigned int xid, @@ -1151,7 +1158,7 @@ const struct file_operations cifs_file_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1170,15 +1177,14 @@ const struct file_operations cifs_file_strict_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_direct_ops = { - /* BB reevaluate whether they can be done with directio, no cache */ - .read_iter = cifs_user_readv, - .write_iter = cifs_user_writev, + .read_iter = cifs_direct_readv, + .write_iter = cifs_direct_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -1189,7 +1195,7 @@ const struct file_operations cifs_file_direct_ops = { .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = 
cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1208,7 +1214,7 @@ const struct file_operations cifs_file_nobrl_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1226,15 +1232,14 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_direct_nobrl_ops = { - /* BB reevaluate whether they can be done with directio, no cache */ - .read_iter = cifs_user_readv, - .write_iter = cifs_user_writev, + .read_iter = cifs_direct_readv, + .write_iter = cifs_direct_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_fsync, @@ -1244,7 +1249,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1256,7 +1261,7 @@ const struct file_operations cifs_dir_ops = { .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = generic_file_llseek, .fsync = cifs_dir_fsync, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 24e265a51874..4c3b5cfccc49 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -101,8 +101,10 @@ extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file 
*file); extern int cifs_closedir(struct inode *inode, struct file *file); extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ed1e0fcb69e3..38ab0fca49e1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1125,6 +1125,9 @@ struct cifs_fid { __u8 create_guid[16]; struct cifs_pending_open *pending_open; unsigned int epoch; +#ifdef CONFIG_CIFS_DEBUG2 + __u64 mid; +#endif /* CIFS_DEBUG2 */ bool purge_cache; }; @@ -1183,6 +1186,11 @@ struct cifs_aio_ctx { unsigned int len; unsigned int total_len; bool should_dirty; + /* + * Indicates if this aio_ctx is for direct_io, + * If yes, iter is a copy of the user passed iov_iter + */ + bool direct_io; }; struct cifs_readdata; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 1ce733f3582f..79d842e7240c 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1539,6 +1539,9 @@ struct reparse_symlink_data { char PathBuffer[0]; } __attribute__((packed)); +/* Flag above */ +#define SYMLINK_FLAG_RELATIVE 0x00000001 + /* For IO_REPARSE_TAG_NFS */ #define NFS_SPECFILE_LNK 0x00000000014B4E4C #define NFS_SPECFILE_CHR 0x0000000000524843 diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d82f0cc71755..6f24f129a751 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -589,7 +589,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - 
iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -601,7 +601,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c620d4b5d5d4..74c33d5fafc8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1005,7 +1005,7 @@ cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) * Set the byte-range lock (mandatory style). Returns: * 1) 0, if we set the lock and don't need to request to the server; * 2) 1, if no locks prevent us but we need to request to the server; - * 3) -EACCESS, if there is a lock that prevents us and wait is false. + * 3) -EACCES, if there is a lock that prevents us and wait is false. */ static int cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, @@ -2538,6 +2538,61 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, } static int +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, + struct cifs_aio_ctx *ctx) +{ + int wait_retry = 0; + unsigned int wsize, credits; + int rc; + struct TCP_Server_Info *server = + tlink_tcon(wdata->cfile->tlink)->ses->server; + + /* + * Try to resend this wdata, waiting for credits up to 3 seconds. 
+ * Note: we are attempting to resend the whole wdata not in segments + */ + do { + rc = server->ops->wait_mtu_credits( + server, wdata->bytes, &wsize, &credits); + + if (rc) + break; + + if (wsize < wdata->bytes) { + add_credits_and_wake_if(server, credits, 0); + msleep(1000); + wait_retry++; + } + } while (wsize < wdata->bytes && wait_retry < 3); + + if (wsize < wdata->bytes) { + rc = -EBUSY; + goto out; + } + + rc = -EAGAIN; + while (rc == -EAGAIN) { + rc = 0; + if (wdata->cfile->invalidHandle) + rc = cifs_reopen_file(wdata->cfile, false); + if (!rc) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); + } + + if (!rc) { + list_add_tail(&wdata->list, wdata_list); + return 0; + } + + add_credits_and_wake_if(server, wdata->credits, 0); +out: + kref_put(&wdata->refcount, cifs_uncached_writedata_release); + + return rc; +} + +static int cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, @@ -2551,6 +2606,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; + struct page **pagevec; + size_t start; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -2567,38 +2624,79 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) break; - nr_pages = get_numpages(wsize, len, &cur_len); - wdata = cifs_writedata_alloc(nr_pages, + if (ctx->direct_io) { + ssize_t result; + + result = iov_iter_get_pages_alloc( + from, &pagevec, wsize, &start); + if (result < 0) { + cifs_dbg(VFS, + "direct_writev couldn't get user pages " + "(rc=%zd) iter type %d iov_offset %zd " + "count %zd\n", + result, from->type, + from->iov_offset, from->count); + dump_stack(); + break; + } + cur_len = (size_t)result; + iov_iter_advance(from, cur_len); + + nr_pages = + (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE; + + wdata = 
cifs_writedata_direct_alloc(pagevec, cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - add_credits_and_wake_if(server, credits, 0); - break; - } + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; + } - rc = cifs_write_allocate_pages(wdata->pages, nr_pages); - if (rc) { - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } - num_pages = nr_pages; - rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages); - if (rc) { - for (i = 0; i < nr_pages; i++) - put_page(wdata->pages[i]); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + wdata->page_offset = start; + wdata->tailsz = + nr_pages > 1 ? + cur_len - (PAGE_SIZE - start) - + (nr_pages - 2) * PAGE_SIZE : + cur_len; + } else { + nr_pages = get_numpages(wsize, len, &cur_len); + wdata = cifs_writedata_alloc(nr_pages, + cifs_uncached_writev_complete); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; + } - /* - * Bring nr_pages down to the number of pages we actually used, - * and free any pages that we didn't use. - */ - for ( ; nr_pages > num_pages; nr_pages--) - put_page(wdata->pages[nr_pages - 1]); + rc = cifs_write_allocate_pages(wdata->pages, nr_pages); + if (rc) { + kfree(wdata); + add_credits_and_wake_if(server, credits, 0); + break; + } + + num_pages = nr_pages; + rc = wdata_fill_from_iovec( + wdata, from, &cur_len, &num_pages); + if (rc) { + for (i = 0; i < nr_pages; i++) + put_page(wdata->pages[i]); + kfree(wdata); + add_credits_and_wake_if(server, credits, 0); + break; + } + + /* + * Bring nr_pages down to the number of pages we + * actually used, and free any pages that we didn't use. 
+ */ + for ( ; nr_pages > num_pages; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); + + wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); + } wdata->sync_mode = WB_SYNC_ALL; wdata->nr_pages = nr_pages; @@ -2607,7 +2705,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->pid = pid; wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; - wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); wdata->credits = credits; wdata->ctx = ctx; kref_get(&ctx->refcount); @@ -2682,13 +2779,18 @@ restart_loop: INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); - iov_iter_advance(&tmp_from, + if (ctx->direct_io) + rc = cifs_resend_wdata( + wdata, &tmp_list, ctx); + else { + iov_iter_advance(&tmp_from, wdata->offset - ctx->pos); - rc = cifs_write_from_iter(wdata->offset, + rc = cifs_write_from_iter(wdata->offset, wdata->bytes, &tmp_from, ctx->cfile, cifs_sb, &tmp_list, ctx); + } list_splice(&tmp_list, &ctx->list); @@ -2701,8 +2803,9 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } - for (i = 0; i < ctx->npages; i++) - put_page(ctx->bv[i].bv_page); + if (!ctx->direct_io) + for (i = 0; i < ctx->npages; i++) + put_page(ctx->bv[i].bv_page); cifs_stats_bytes_written(tcon, ctx->total_len); set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); @@ -2717,7 +2820,8 @@ restart_loop: complete(&ctx->done); } -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +static ssize_t __cifs_writev( + struct kiocb *iocb, struct iov_iter *from, bool direct) { struct file *file = iocb->ki_filp; ssize_t total_written = 0; @@ -2726,13 +2830,18 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) struct cifs_sb_info *cifs_sb; struct cifs_aio_ctx *ctx; struct iov_iter saved_from = *from; + size_t len = iov_iter_count(from); int rc; /* - * BB - optimize the way when signing is disabled. 
We can drop this - * extra memory-to-memory copying and use iovec buffers for constructing - * write request. + * iov_iter_get_pages_alloc doesn't work with ITER_KVEC. + * In this case, fall back to non-direct write function. + * this could be improved by getting pages directly in ITER_KVEC */ + if (direct && from->type & ITER_KVEC) { + cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); + direct = false; + } rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2756,10 +2865,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) ctx->pos = iocb->ki_pos; - rc = setup_aio_ctx_iter(ctx, from, WRITE); - if (rc) { - kref_put(&ctx->refcount, cifs_aio_ctx_release); - return rc; + if (direct) { + ctx->direct_io = true; + ctx->iter = *from; + ctx->len = len; + } else { + rc = setup_aio_ctx_iter(ctx, from, WRITE); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } } /* grab a lock here due to read response handlers can access ctx */ @@ -2809,6 +2924,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) return total_written; } +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from) +{ + return __cifs_writev(iocb, from, true); +} + +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +{ + return __cifs_writev(iocb, from, false); +} + static ssize_t cifs_writev(struct kiocb *iocb, struct iov_iter *from) { @@ -2979,7 +3104,6 @@ cifs_uncached_readdata_release(struct kref *refcount) kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < rdata->nr_pages; i++) { put_page(rdata->pages[i]); - rdata->pages[i] = NULL; } cifs_readdata_release(refcount); } @@ -3004,7 +3128,7 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) size_t copy = min_t(size_t, remaining, PAGE_SIZE); size_t written; - if (unlikely(iter->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(iter))) { void *addr = kmap_atomic(page); written = copy_to_iter(addr, copy, 
iter); @@ -3106,6 +3230,67 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, return uncached_fill_pages(server, rdata, iter, iter->count); } +static int cifs_resend_rdata(struct cifs_readdata *rdata, + struct list_head *rdata_list, + struct cifs_aio_ctx *ctx) +{ + int wait_retry = 0; + unsigned int rsize, credits; + int rc; + struct TCP_Server_Info *server = + tlink_tcon(rdata->cfile->tlink)->ses->server; + + /* + * Try to resend this rdata, waiting for credits up to 3 seconds. + * Note: we are attempting to resend the whole rdata not in segments + */ + do { + rc = server->ops->wait_mtu_credits(server, rdata->bytes, + &rsize, &credits); + + if (rc) + break; + + if (rsize < rdata->bytes) { + add_credits_and_wake_if(server, credits, 0); + msleep(1000); + wait_retry++; + } + } while (rsize < rdata->bytes && wait_retry < 3); + + /* + * If we can't find enough credits to send this rdata + * release the rdata and return failure, this will pass + * whatever I/O amount we have finished to VFS. 
+ */ + if (rsize < rdata->bytes) { + rc = -EBUSY; + goto out; + } + + rc = -EAGAIN; + while (rc == -EAGAIN) { + rc = 0; + if (rdata->cfile->invalidHandle) + rc = cifs_reopen_file(rdata->cfile, true); + if (!rc) + rc = server->ops->async_readv(rdata); + } + + if (!rc) { + /* Add to aio pending list */ + list_add_tail(&rdata->list, rdata_list); + return 0; + } + + add_credits_and_wake_if(server, rdata->credits, 0); +out: + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + + return rc; +} + static int cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, @@ -3117,6 +3302,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, int rc; pid_t pid; struct TCP_Server_Info *server; + struct page **pagevec; + size_t start; + struct iov_iter direct_iov = ctx->iter; server = tlink_tcon(open_file->tlink)->ses->server; @@ -3125,6 +3313,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, else pid = current->tgid; + if (ctx->direct_io) + iov_iter_advance(&direct_iov, offset - ctx->pos); + do { rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, &rsize, &credits); @@ -3132,20 +3323,59 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, break; cur_len = min_t(const size_t, len, rsize); - npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); - /* allocate a readdata struct */ - rdata = cifs_readdata_alloc(npages, + if (ctx->direct_io) { + ssize_t result; + + result = iov_iter_get_pages_alloc( + &direct_iov, &pagevec, + cur_len, &start); + if (result < 0) { + cifs_dbg(VFS, + "couldn't get user pages (cur_len=%zd)" + " iter type %d" + " iov_offset %zd count %zd\n", + result, direct_iov.type, + direct_iov.iov_offset, + direct_iov.count); + dump_stack(); + break; + } + cur_len = (size_t)result; + iov_iter_advance(&direct_iov, cur_len); + + rdata = cifs_readdata_direct_alloc( + pagevec, 
cifs_uncached_readv_complete); + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; + } + + npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE; + rdata->page_offset = start; + rdata->tailsz = npages > 1 ? + cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE : + cur_len; + + } else { + + npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); + /* allocate a readdata struct */ + rdata = cifs_readdata_alloc(npages, cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; + } - rc = cifs_read_allocate_pages(rdata, npages); - if (rc) - goto error; + rc = cifs_read_allocate_pages(rdata, npages); + if (rc) + goto error; + + rdata->tailsz = PAGE_SIZE; + } rdata->cfile = cifsFileInfo_get(open_file); rdata->nr_pages = npages; @@ -3153,7 +3383,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->bytes = cur_len; rdata->pid = pid; rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; @@ -3167,9 +3396,11 @@ error: if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - if (rc == -EAGAIN) + cifs_uncached_readdata_release); + if (rc == -EAGAIN) { + iov_iter_revert(&direct_iov, cur_len); continue; + } break; } @@ -3225,45 +3456,62 @@ again: * reading. 
*/ if (got_bytes && got_bytes < rdata->bytes) { - rc = cifs_readdata_to_iov(rdata, to); + rc = 0; + if (!ctx->direct_io) + rc = cifs_readdata_to_iov(rdata, to); if (rc) { kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + cifs_uncached_readdata_release); continue; } } - rc = cifs_send_async_read( + if (ctx->direct_io) { + /* + * Re-use rdata as this is a + * direct I/O + */ + rc = cifs_resend_rdata( + rdata, + &tmp_list, ctx); + } else { + rc = cifs_send_async_read( rdata->offset + got_bytes, rdata->bytes - got_bytes, rdata->cfile, cifs_sb, &tmp_list, ctx); + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + } + list_splice(&tmp_list, &ctx->list); - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); goto again; } else if (rdata->result) rc = rdata->result; - else + else if (!ctx->direct_io) rc = cifs_readdata_to_iov(rdata, to); /* if there was a short read -- discard anything left */ if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) rc = -ENODATA; + + ctx->total_len += rdata->got_bytes; } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); - } + if (!ctx->direct_io) { + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } - ctx->total_len = ctx->len - iov_iter_count(to); + ctx->total_len = ctx->len - iov_iter_count(to); + } cifs_stats_bytes_read(tcon, ctx->total_len); @@ -3281,18 +3529,28 @@ again: complete(&ctx->done); } -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static ssize_t __cifs_readv( + struct kiocb *iocb, struct iov_iter *to, bool direct) { - struct file *file = iocb->ki_filp; - ssize_t rc; size_t len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; + struct file *file = iocb->ki_filp; struct cifs_sb_info *cifs_sb; - struct 
cifs_tcon *tcon; struct cifsFileInfo *cfile; + struct cifs_tcon *tcon; + ssize_t rc, total_read = 0; + loff_t offset = iocb->ki_pos; struct cifs_aio_ctx *ctx; + /* + * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC, + * fall back to data copy read path + * this could be improved by getting pages directly in ITER_KVEC + */ + if (direct && to->type & ITER_KVEC) { + cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); + direct = false; + } + len = iov_iter_count(to); if (!len) return 0; @@ -3316,17 +3574,23 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type == ITER_IOVEC) + if (iter_is_iovec(to)) ctx->should_dirty = true; - rc = setup_aio_ctx_iter(ctx, to, READ); - if (rc) { - kref_put(&ctx->refcount, cifs_aio_ctx_release); - return rc; + if (direct) { + ctx->pos = offset; + ctx->direct_io = true; + ctx->iter = *to; + ctx->len = len; + } else { + rc = setup_aio_ctx_iter(ctx, to, READ); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + len = ctx->len; } - len = ctx->len; - /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); @@ -3368,6 +3632,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) return rc; } +ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to) +{ + return __cifs_readv(iocb, to, true); +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + return __cifs_readv(iocb, to, false); +} + ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1023d78673fb..a81a9df997c1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1320,8 +1320,8 @@ cifs_drop_nlink(struct inode *inode) /* * If d_inode(dentry) is null (usually meaning the cached dentry * is a negative dentry) then we would attempt a standard SMB delete, but - * if that fails we can not attempt the fall back mechanisms 
on EACCESS - * but will return the EACCESS to the caller. Note that the VFS does not call + * if that fails we can not attempt the fall back mechanisms on EACCES + * but will return the EACCES to the caller. Note that the VFS does not call * unlink on negative dentries currently. */ int cifs_unlink(struct inode *dir, struct dentry *dentry) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index fc43d5d25d1d..8a41f4eba726 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -788,7 +788,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) struct page **pages = NULL; struct bio_vec *bv = NULL; - if (iter->type & ITER_KVEC) { + if (iov_iter_is_kvec(iter)) { memcpy(&ctx->iter, iter, sizeof(struct iov_iter)); ctx->len = count; iov_iter_advance(iter, count); @@ -859,7 +859,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) ctx->bv = bv; ctx->len = saved_len - count; ctx->npages = npages; - iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len); + iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len); return 0; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f85fc5aa2710..225fec1cfa67 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -747,6 +747,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, int rc = 0; unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; char *name, *value; + size_t buf_size = dst_size; size_t name_len, value_len, user_name_len; while (src_size > 0) { @@ -782,9 +783,10 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, /* 'user.' 
plus a terminating null */ user_name_len = 5 + 1 + name_len; - rc += user_name_len; - - if (dst_size >= user_name_len) { + if (buf_size == 0) { + /* skip copy - calc size only */ + rc += user_name_len; + } else if (dst_size >= user_name_len) { dst_size -= user_name_len; memcpy(dst, "user.", 5); dst += 5; @@ -792,8 +794,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, dst += name_len; *dst = 0; ++dst; - } else if (dst_size == 0) { - /* skip copy - calc size only */ + rc += user_name_len; } else { /* stop before overrun buffer */ rc = -ERANGE; @@ -1078,6 +1079,9 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) cfile->fid.persistent_fid = fid->persistent_fid; cfile->fid.volatile_fid = fid->volatile_fid; +#ifdef CONFIG_CIFS_DEBUG2 + cfile->fid.mid = fid->mid; +#endif /* CIFS_DEBUG2 */ server->ops->set_oplock_level(cinode, oplock, fid->epoch, &fid->purge_cache); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); @@ -3152,13 +3156,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len); + iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len); + iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7d7b016fe8bb..27f86537a5d1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1512,7 +1512,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct 
smb2_tree_connect_rsp *)rsp_iov.iov_base; - + trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc); if (rc != 0) { if (tcon) { cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE); @@ -1559,6 +1559,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if (tcon->ses->server->ops->validate_negotiate) rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); tcon_exit: + free_rsp_buf(resp_buftype, rsp); kfree(unc_path); return rc; @@ -2308,6 +2309,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, atomic_inc(&tcon->num_remote_opens); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = rsp->VolatileFileId; +#ifdef CONFIG_CIFS_DEBUG2 + oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId); +#endif /* CIFS_DEBUG2 */ if (buf) { memcpy(buf, &rsp->CreationTime, 32); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f753f424d7f1..5671d5ee7f58 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -842,6 +842,41 @@ struct fsctl_get_integrity_information_rsp { /* Integrity flags for above */ #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 +/* Reparse structures - see MS-FSCC 2.1.2 */ + +/* struct fsctl_reparse_info_req is empty, only response structs (see below) */ + +struct reparse_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 DataBuffer[0]; /* Variable Length */ +} __packed; + +struct reparse_guid_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 ReparseGuid[16]; + __u8 DataBuffer[0]; /* Variable Length */ +} __packed; + +struct reparse_mount_point_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __u8 PathBuffer[0]; /* Variable Length */ +} __packed; + +/* See MS-FSCC 2.1.2.4 and cifspdu.h for struct reparse_symlink_data */ + +/* See 
MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ + + /* See MS-DFSC 2.2.2 */ struct fsctl_get_dfs_referral_req { __le16 MaxReferralLevel; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5e282368cc4a..e94a8d1d08a3 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -2054,14 +2054,22 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) info->smbd_recv_pending++; - switch (msg->msg_iter.type) { - case READ | ITER_KVEC: + if (iov_iter_rw(&msg->msg_iter) == WRITE) { + /* It's a bug in upper layer to get there */ + cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n", + iov_iter_rw(&msg->msg_iter)); + rc = -EINVAL; + goto out; + } + + switch (iov_iter_type(&msg->msg_iter)) { + case ITER_KVEC: buf = msg->msg_iter.kvec->iov_base; to_read = msg->msg_iter.kvec->iov_len; rc = smbd_recv_buf(info, buf, to_read); break; - case READ | ITER_BVEC: + case ITER_BVEC: page = msg->msg_iter.bvec->bv_page; page_offset = msg->msg_iter.bvec->bv_offset; to_read = msg->msg_iter.bvec->bv_len; @@ -2071,10 +2079,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) default: /* It's a bug in upper layer to get there */ cifs_dbg(VFS, "CIFS: invalid msg type %d\n", - msg->msg_iter.type); + iov_iter_type(&msg->msg_iter)); rc = -EINVAL; } +out: info->smbd_recv_pending--; wake_up(&info->wait_smbd_recv_pending); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index cce8414fe7ec..fb049809555f 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -374,6 +374,48 @@ DEFINE_SMB3_ENTER_EXIT_EVENT(enter); DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done); /* + * For SMB2/SMB3 tree connect + */ + +DECLARE_EVENT_CLASS(smb3_tcon_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + const char *unc_name, + int rc), + TP_ARGS(xid, tid, sesid, unc_name, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(const char *, unc_name) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + 
__entry->tid = tid; + __entry->sesid = sesid; + __entry->unc_name = unc_name; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->unc_name, __entry->rc) +) + +#define DEFINE_SMB3_TCON_EVENT(name) \ +DEFINE_EVENT(smb3_tcon_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + const char *unc_name, \ + int rc), \ + TP_ARGS(xid, tid, sesid, unc_name, rc)) + +DEFINE_SMB3_TCON_EVENT(tcon); + + +/* * For smb2/smb3 open call */ DECLARE_EVENT_CLASS(smb3_open_err_class, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index f8112433f0c8..83ff0c25710d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -316,8 +316,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov, - 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto uncork; @@ -338,8 +337,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, - iov, n_vec, size); + iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -355,7 +353,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + iov_iter_bvec(&smb_msg.msg_iter, WRITE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) diff --git a/fs/direct-io.c b/fs/direct-io.c index 093fb54cd316..722d17c88edb 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1313,7 +1313,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; - dio->should_dirty = (iter->type == ITER_IOVEC); + 
dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ; sdio.iter = iter; sdio.final_block_in_request = end >> blkbits; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a5e4a221435c..76976d6e50f9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -674,7 +674,7 @@ static int receive_from_sock(struct connection *con) nvec = 2; } len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nvec, len); + iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); if (ret <= 0) diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 41cf2fbee50d..906839a4da8f 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -101,6 +101,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts) token = match_token(p, tokens, args); switch (token) { case Opt_name: + kfree(opts->dev_name); opts->dev_name = match_strdup(&args[0]); if (unlikely(!opts->dev_name)) { EXOFS_ERR("Error allocating dev_name"); @@ -117,7 +118,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts) EXOFS_MIN_PID); return -EINVAL; } - s_pid = 1; + s_pid = true; break; case Opt_to: if (match_int(&args[0], &option)) @@ -866,8 +867,10 @@ static struct dentry *exofs_mount(struct file_system_type *type, int ret; ret = parse_options(data, &opts); - if (ret) + if (ret) { + kfree(opts.dev_name); return ERR_PTR(ret); + } if (!opts.dev_name) opts.dev_name = dev_name; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 12f90d48ba61..3f89d0ab08fc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -45,15 +45,6 @@ #include <linux/compiler.h> -/* Until this gets included into linux/compiler-gcc.h */ -#ifndef __nonstring -#if defined(GCC_VERSION) && (GCC_VERSION >= 80000) -#define __nonstring __attribute__((nonstring)) -#else -#define __nonstring -#endif -#endif - /* * The fourth extended filesystem constants/structures */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c 
index 2addcb8730e1..014f6a698cb7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1216,7 +1216,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) - return (struct inode *) bitmap_bh; + return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 67a38532032a..17adcb16a9c8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1556,7 +1556,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) - return (struct dentry *) bh; + return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1600,7 +1600,7 @@ struct dentry *ext4_get_parent(struct dentry *child) bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); if (IS_ERR(bh)) - return (struct dentry *) bh; + return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2aa62d58d8dd..db7590178dfc 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; - wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 58dbc39fea63..cc2121b37bf5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1275,7 +1275,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ssize_t 
ret = 0; /* Special case for kernel I/O: can copy directly into the buffer */ - if (ii->type & ITER_KVEC) { + if (iov_iter_is_kvec(ii)) { unsigned long user_addr = fuse_get_user_addr(ii); size_t frag_size = fuse_get_frag_size(ii, *nbytesp); diff --git a/fs/ioctl.c b/fs/ioctl.c index 2005529af560..d64f622cac8b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { struct fd src_file = fdget(srcfd); + loff_t cloned; int ret; if (!src_file.file) @@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; - ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, + olen, 0); + if (cloned < 0) + ret = cloned; + else if (olen && cloned != olen) + ret = -EINVAL; + else + ret = 0; fdput: fdput(src_file); return ret; @@ -669,6 +677,9 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, return ioctl_fiemap(filp, arg); case FIGETBSZ: + /* anon_bdev filesystems may not have a block size */ + if (!inode->i_sb->s_blocksize) + return -EINVAL; return put_user(inode->i_sb->s_blocksize, argp); case FICLONE: diff --git a/fs/iomap.c b/fs/iomap.c index 90c2febc93ac..64ce240217a1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -30,7 +30,6 @@ #include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include <linux/sched/signal.h> -#include <linux/swap.h> #include "internal.h" @@ -1795,7 +1794,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; - if (iter->type == ITER_IOVEC) + if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) dio->flags |= IOMAP_DIO_DIRTY; } else { flags |= IOMAP_WRITE; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4288a6ecaf75..46d691ba04bc 100644 --- a/fs/nfs/nfs4file.c 
+++ b/fs/nfs/nfs4file.c @@ -180,8 +180,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_allocate(filep, offset, len); } -static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, - struct file *dst_file, loff_t dst_off, u64 count) +static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t count, + unsigned int remap_flags) { struct inode *dst_inode = file_inode(dst_file); struct nfs_server *server = NFS_SERVER(dst_inode); @@ -190,6 +191,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, bool same_inode = false; int ret; + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { @@ -240,7 +244,7 @@ out_unlock: inode_unlock(src_inode); } out: - return ret; + return ret < 0 ? ret : count; } #endif /* CONFIG_NFS_V4_2 */ @@ -262,7 +266,7 @@ const struct file_operations nfs4_file_operations = { .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, - .clone_file_range = nfs42_clone_file_range, + .remap_file_range = nfs42_remap_file_range, #else .llseek = nfs_file_llseek, #endif diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index db84b4adbc49..867457d6dfbe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3788,7 +3788,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } /* - * -EACCESS could mean that the user doesn't have correct permissions + * -EACCES could mean that the user doesn't have correct permissions * to access the mount. It could also mean that we tried to mount * with a gss auth flavor, but rpc.gssd isn't running. 
Either way, * existing mount programs don't handle -EACCES very well so it should diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2751976704e9..eb67098117b4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -541,8 +541,12 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, - count)); + loff_t cloned; + + cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); + if (count && cloned != count) + cloned = -EINVAL; + return nfserrno(cloned < 0 ? cloned : 0); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, @@ -923,7 +927,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, int host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); + iov_iter_kvec(&iter, READ, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &offset, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } @@ -999,7 +1003,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt); + iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 4690cd75d8d7..3986c7a1f6a8 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -312,7 +312,7 @@ static struct dentry *ntfs_get_parent(struct dentry *child_dent) /* Get the mft record of the inode belonging to the child dentry. */ mrec = map_mft_record(ni); if (IS_ERR(mrec)) - return (struct dentry *)mrec; + return ERR_CAST(mrec); /* Find the first file name attribute in the mft record. 
*/ ctx = ntfs_attr_get_search_ctx(ni, mrec); if (unlikely(!ctx)) { diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1d098c3c00e0..4ebbd57cbf84 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -99,25 +99,34 @@ out: return ret; } +/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it + * will be easier to handle read failure. + */ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]) { int status = 0; unsigned int i; struct buffer_head *bh; + int new_bh = 0; trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); if (!nr) goto bail; + /* Don't put buffer head and re-assign it to NULL if it is allocated + * outside since the caller can't be aware of this alternation! + */ + new_bh = (bhs[0] == NULL); + for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { status = -ENOMEM; mlog_errno(status); - goto bail; + break; } } bh = bhs[i]; @@ -158,9 +167,26 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, submit_bh(REQ_OP_READ, 0, bh); } +read_failure: for (i = nr; i > 0; i--) { bh = bhs[i - 1]; + if (unlikely(status)) { + if (new_bh && bh) { + /* If middle bh fails, let previous bh + * finish its read and then put it to + * aovoid bh leak + */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + put_bh(bh); + bhs[i - 1] = NULL; + } else if (bh && buffer_uptodate(bh)) { + clear_buffer_uptodate(bh); + } + continue; + } + /* No need to wait on the buffer if it's managed by JBD. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -170,8 +196,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, * so we can safely record this and loop back * to cleanup the other buffers. 
*/ status = -EIO; - put_bh(bh); - bhs[i - 1] = NULL; + goto read_failure; } } @@ -179,6 +204,9 @@ bail: return status; } +/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it + * will be easier to handle read failure. + */ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, @@ -188,6 +216,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, int i, ignore_cache = 0; struct buffer_head *bh; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + int new_bh = 0; trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); @@ -213,6 +242,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, goto bail; } + /* Don't put buffer head and re-assign it to NULL if it is allocated + * outside since the caller can't be aware of this alternation! + */ + new_bh = (bhs[0] == NULL); + ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { @@ -221,7 +255,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, ocfs2_metadata_cache_io_unlock(ci); status = -ENOMEM; mlog_errno(status); - goto bail; + /* Don't forget to put previous bh! 
*/ + break; } } bh = bhs[i]; @@ -316,16 +351,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, } } - status = 0; - +read_failure: for (i = (nr - 1); i >= 0; i--) { bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { - if (status) { - /* Clear the rest of the buffers on error */ - put_bh(bh); - bhs[i] = NULL; + if (unlikely(status)) { + /* Clear the buffers on error including those + * ever succeeded in reading + */ + if (new_bh && bh) { + /* If middle bh fails, let previous bh + * finish its read and then put it to + * aovoid bh leak + */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + put_bh(bh); + bhs[i] = NULL; + } else if (bh && buffer_uptodate(bh)) { + clear_buffer_uptodate(bh); + } continue; } /* We know this can't have changed as we hold the @@ -343,9 +389,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, * uptodate. */ status = -EIO; clear_buffer_needs_validate(bh); - put_bh(bh); - bhs[i] = NULL; - continue; + goto read_failure; } if (buffer_needs_validate(bh)) { @@ -355,11 +399,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); status = validate(sb, bh); - if (status) { - put_bh(bh); - bhs[i] = NULL; - continue; - } + if (status) + goto read_failure; } } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 7d9eea7d4a87..e9f236af1927 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -916,7 +916,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b048d4fa3959..c121abbdfc7d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1897,8 +1897,7 @@ 
static int ocfs2_dir_foreach_blk_el(struct inode *inode, /* On error, skip the f_pos to the next block. */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse(bh); - continue; + break; } if (le64_to_cpu(de->inode)) { unsigned char d_type = DT_UNKNOWN; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 933aac5da193..7c835824247e 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2123,10 +2123,10 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, /* LVB only has room for 64 bits of time here so we pack it for * now. */ -static u64 ocfs2_pack_timespec(struct timespec *spec) +static u64 ocfs2_pack_timespec(struct timespec64 *spec) { u64 res; - u64 sec = spec->tv_sec; + u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull); u32 nsec = spec->tv_nsec; res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); @@ -2142,7 +2142,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; - struct timespec ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2163,15 +2162,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); - ts = timespec64_to_timespec(inode->i_atime); lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&ts)); - ts = timespec64_to_timespec(inode->i_ctime); + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&ts)); - ts = timespec64_to_timespec(inode->i_mtime); + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); lvb->lvb_imtime_packed = - cpu_to_be64(ocfs2_pack_timespec(&ts)); + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = 
cpu_to_be32(inode->i_generation); @@ -2180,7 +2176,7 @@ out: mlog_meta_lvb(0, lockres); } -static void ocfs2_unpack_timespec(struct timespec *spec, +static void ocfs2_unpack_timespec(struct timespec64 *spec, u64 packed_time) { spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; @@ -2189,7 +2185,6 @@ static void ocfs2_unpack_timespec(struct timespec *spec, static void ocfs2_refresh_inode_from_lvb(struct inode *inode) { - struct timespec ts; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; @@ -2217,15 +2212,12 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&ts, + ocfs2_unpack_timespec(&inode->i_atime, be64_to_cpu(lvb->lvb_iatime_packed)); - inode->i_atime = timespec_to_timespec64(ts); - ocfs2_unpack_timespec(&ts, + ocfs2_unpack_timespec(&inode->i_mtime, be64_to_cpu(lvb->lvb_imtime_packed)); - inode->i_mtime = timespec_to_timespec64(ts); - ocfs2_unpack_timespec(&ts, + ocfs2_unpack_timespec(&inode->i_ctime, be64_to_cpu(lvb->lvb_ictime_packed)); - inode->i_ctime = timespec_to_timespec64(ts); spin_unlock(&oi->ip_lock); } @@ -3603,7 +3595,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * we can recover correctly from node failure. Otherwise, we may get * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. 
*/ - if (!ocfs2_is_o2cb_active() && + if (ocfs2_userspace_stack(osb) && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lvb = 1; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9fa35cb6f6e0..d640c5f8a85d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2343,7 +2343,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, written = __generic_file_write_iter(iocb, from); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + BUG_ON(written == -EIOCBQUEUED && !direct_io); /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io @@ -2463,7 +2463,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, trace_generic_file_read_iter_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + BUG_ON(ret == -EIOCBQUEUED && !direct_io); /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { @@ -2527,24 +2527,79 @@ out: return offset; } -static int ocfs2_file_clone_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + loff_t remapped = 0; + ssize_t ret; -static int ocfs2_file_dedupe_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + if (remap_flags & ~(REMAP_FILE_DEDUP | 
REMAP_FILE_ADVISORY)) + return -EINVAL; + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + len, PAGE_SIZE) - 1); + + remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, + inode_out, out_bh, pos_out, len); + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (remapped < 0) { + ret = remapped; + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return remapped > 0 ? 
remapped : ret; } const struct inode_operations ocfs2_file_iops = { @@ -2586,8 +2641,7 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, - .clone_file_range = ocfs2_file_clone_range, - .dedupe_file_range = ocfs2_file_dedupe_range, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops = { @@ -2633,8 +2687,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, - .clone_file_range = ocfs2_file_clone_range, - .dedupe_file_range = ocfs2_file_dedupe_range, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index bd3475694e83..b63c97f4318e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1378,15 +1378,23 @@ static int __ocfs2_recovery_thread(void *arg) int rm_quota_used = 0, i; struct ocfs2_quota_recovery *qrec; + /* Whether the quota supported. */ + int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA); + status = ocfs2_wait_on_mount(osb); if (status < 0) { goto bail; } - rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); - if (!rm_quota) { - status = -ENOMEM; - goto bail; + if (quota_enabled) { + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } } restart: status = ocfs2_super_lock(osb, 1); @@ -1422,9 +1430,14 @@ restart: * then quota usage would be out of sync until some node takes * the slot. So we remember which nodes need quota recovery * and when everything else is done, we recover quotas. 
*/ - for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); - if (i == rm_quota_used) - rm_quota[rm_quota_used++] = slot_num; + if (quota_enabled) { + for (i = 0; i < rm_quota_used + && rm_quota[i] != slot_num; i++) + ; + + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + } status = ocfs2_recover_node(osb, node_num, slot_num); skip_recovery: @@ -1452,16 +1465,19 @@ skip_recovery: /* Now it is right time to recover quotas... We have to do this under * superblock lock so that no one can start using the slot (and crash) * before we recover it */ - for (i = 0; i < rm_quota_used; i++) { - qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); - if (IS_ERR(qrec)) { - status = PTR_ERR(qrec); - mlog_errno(status); - continue; + if (quota_enabled) { + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, + rm_quota[i], + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } - ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec, - ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); @@ -1483,7 +1499,8 @@ bail: mutex_unlock(&osb->recovery_lock); - kfree(rm_quota); + if (quota_enabled) + kfree(rm_quota); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). 
And it isn't exported, but diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 7eb3b0a6347e..3f1685d7d43b 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -25,6 +25,7 @@ #include "ocfs2_ioctl.h" #include "alloc.h" +#include "localalloc.h" #include "aops.h" #include "dlmglue.h" #include "extent_map.h" @@ -233,6 +234,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, struct ocfs2_refcount_tree *ref_tree = NULL; u32 new_phys_cpos, new_len; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + int need_free = 0; if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { BUG_ON(!ocfs2_is_refcount_inode(inode)); @@ -308,6 +310,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, if (!partial) { context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; ret = -ENOSPC; + need_free = 1; goto out_commit; } } @@ -332,6 +335,20 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, mlog_errno(ret); out_commit: + if (need_free && context->data_ac) { + struct ocfs2_alloc_context *data_ac = context->data_ac; + + if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + new_phys_cpos, new_len); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos), + new_len); + } + ocfs2_commit_trans(osb, handle); out_unlock_mutex: diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1114ef02e780..a35259eebc56 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4466,9 +4466,9 @@ out: } /* Update destination inode size, if necessary. 
*/ -static int ocfs2_reflink_update_dest(struct inode *dest, - struct buffer_head *d_bh, - loff_t newlen) +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) { handle_t *handle; int ret; @@ -4505,14 +4505,14 @@ out_commit: } /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ -static int ocfs2_reflink_remap_extent(struct inode *s_inode, - struct buffer_head *s_bh, - loff_t pos_in, - struct inode *t_inode, - struct buffer_head *t_bh, - loff_t pos_out, - loff_t len, - struct ocfs2_cached_dealloc_ctxt *dealloc) +static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) { struct ocfs2_extent_tree s_et; struct ocfs2_extent_tree t_et; @@ -4520,8 +4520,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_super *osb; + loff_t remapped_bytes = 0; loff_t pstart, plen; - u32 p_cluster, num_clusters, slast, spos, tpos; + u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0; unsigned int ext_flags; int ret = 0; @@ -4603,30 +4604,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, next_loop: spos += num_clusters; tpos += num_clusters; + remapped_clus += num_clusters; } -out: - return ret; + goto out; out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); - return ret; +out: + remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus); + remapped_bytes = min_t(loff_t, len, remapped_bytes); + + return remapped_bytes > 0 ? remapped_bytes : ret; } /* Set up refcount tree and remap s_inode to t_inode. 
*/ -static int ocfs2_reflink_remap_blocks(struct inode *s_inode, - struct buffer_head *s_bh, - loff_t pos_in, - struct inode *t_inode, - struct buffer_head *t_bh, - loff_t pos_out, - loff_t len) +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb; struct ocfs2_dinode *dis; struct ocfs2_dinode *dit; - int ret; + loff_t ret; osb = OCFS2_SB(s_inode->i_sb); dis = (struct ocfs2_dinode *)s_bh->b_data; @@ -4698,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode, /* Actually remap extents now. */ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, pos_out, len, &dealloc); - if (ret) { + if (ret < 0) { mlog_errno(ret); goto out; } @@ -4713,10 +4718,10 @@ out: } /* Lock an inode and grab a bh pointing to the inode. */ -static int ocfs2_reflink_inodes_lock(struct inode *s_inode, - struct buffer_head **bh1, - struct inode *t_inode, - struct buffer_head **bh2) +int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) { struct inode *inode1; struct inode *inode2; @@ -4801,10 +4806,10 @@ out_i1: } /* Unlock both inodes and release buffers. */ -static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, - struct buffer_head *s_bh, - struct inode *t_inode, - struct buffer_head *t_bh) +void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) { ocfs2_inode_unlock(s_inode, 1); ocfs2_rw_unlock(s_inode, 1); @@ -4816,82 +4821,3 @@ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, } unlock_two_nondirectories(s_inode, t_inode); } - -/* Link a range of blocks from one file to another. 
*/ -int ocfs2_reflink_remap_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); - struct buffer_head *in_bh = NULL, *out_bh = NULL; - bool same_inode = (inode_in == inode_out); - ssize_t ret; - - if (!ocfs2_refcount_tree(osb)) - return -EOPNOTSUPP; - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) - return -EROFS; - - /* Lock both files against IO */ - ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); - if (ret) - return ret; - - /* Check file eligibility and prepare for block sharing. */ - ret = -EINVAL; - if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || - (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) - goto out_unlock; - - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - &len, is_dedupe); - if (ret <= 0) - goto out_unlock; - - /* Lock out changes to the allocation maps and remap. */ - down_write(&OCFS2_I(inode_in)->ip_alloc_sem); - if (!same_inode) - down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, - SINGLE_DEPTH_NESTING); - - ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, - out_bh, pos_out, len); - - /* Zap any page cache for the destination file's range. */ - if (!ret) - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - - up_write(&OCFS2_I(inode_in)->ip_alloc_sem); - if (!same_inode) - up_write(&OCFS2_I(inode_out)->ip_alloc_sem); - if (ret) { - mlog_errno(ret); - goto out_unlock; - } - - /* - * Empty the extent map so that we may get the right extent - * record from the disk. 
- */ - ocfs2_extent_map_trunc(inode_in, 0); - ocfs2_extent_map_trunc(inode_out, 0); - - ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); - if (ret) { - mlog_errno(ret); - goto out_unlock; - } - - ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); - return 0; - -out_unlock: - ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); - return ret; -} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 4af55bf4b35b..e9e862be4a1e 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -115,11 +115,23 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); -int ocfs2_reflink_remap_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe); +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len); +int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2); +void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh); +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d6c350ba25b9..c4b029c43464 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; -inline int ocfs2_is_o2cb_active(void) -{ - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); -} -EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); - static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git 
a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e3036e1790e8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ -int ocfs2_is_o2cb_active(void); - extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5e65d818937b..fe53381b26b1 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -25,7 +25,7 @@ static int read_one_page(struct page *page) struct iov_iter to; struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; - iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&to, READ, &bv, 1, PAGE_SIZE); gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpage called with page %p\n", diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 1cc797a08a5b..9e62dcf06fc4 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) struct file *new_file; loff_t old_pos = 0; loff_t new_pos = 0; + loff_t cloned; int error = 0; if (len == 0) @@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - error = do_clone_file_range(old_file, 0, new_file, 0, len); - if (!error) + cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); + if (cloned == len) goto out; /* Couldn't clone, so now we try to copy the data */ - error = 0; /* FIXME: copy up sparse files efficiently */ while (len) { @@ -395,7 +395,6 @@ struct ovl_copy_up_ctx { struct dentry *destdir; struct qstr destname; struct dentry *workdir; - bool tmpfile; bool origin; bool 
indexed; bool metacopy; @@ -440,63 +439,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; } -static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, - struct dentry **newdentry) -{ - int err; - struct dentry *upper; - struct inode *udir = d_inode(c->destdir); - - upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); - if (IS_ERR(upper)) - return PTR_ERR(upper); - - if (c->tmpfile) - err = ovl_do_link(temp, udir, upper); - else - err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0); - - if (!err) - *newdentry = dget(c->tmpfile ? upper : temp); - dput(upper); - - return err; -} - -static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c) -{ - int err; - struct dentry *temp; - const struct cred *old_creds = NULL; - struct cred *new_creds = NULL; - struct ovl_cattr cattr = { - /* Can't properly set mode on creation because of the umask */ - .mode = c->stat.mode & S_IFMT, - .rdev = c->stat.rdev, - .link = c->link - }; - - err = security_inode_copy_up(c->dentry, &new_creds); - temp = ERR_PTR(err); - if (err < 0) - goto out; - - if (new_creds) - old_creds = override_creds(new_creds); - - if (c->tmpfile) - temp = ovl_do_tmpfile(c->workdir, c->stat.mode); - else - temp = ovl_create_temp(c->workdir, &cattr); -out: - if (new_creds) { - revert_creds(old_creds); - put_cred(new_creds); - } - - return temp; -} - static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; @@ -548,51 +490,148 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } -static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) +struct ovl_cu_creds { + const struct cred *old; + struct cred *new; +}; + +static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) +{ + int err; + + cc->old = cc->new = NULL; + err = security_inode_copy_up(dentry, &cc->new); + if (err < 0) + return err; + + if (cc->new) + cc->old = override_creds(cc->new); + + return 0; +} + 
+static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) +{ + if (cc->new) { + revert_creds(cc->old); + put_cred(cc->new); + } +} + +/* + * Copyup using workdir to prepare temp file. Used when copying up directories, + * special files or when upper fs doesn't support O_TMPFILE. + */ +static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { - struct inode *udir = c->destdir->d_inode; struct inode *inode; - struct dentry *newdentry = NULL; - struct dentry *temp; + struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct dentry *temp, *upper; + struct ovl_cu_creds cc; int err; + struct ovl_cattr cattr = { + /* Can't properly set mode on creation because of the umask */ + .mode = c->stat.mode & S_IFMT, + .rdev = c->stat.rdev, + .link = c->link + }; + + err = ovl_lock_rename_workdir(c->workdir, c->destdir); + if (err) + return err; + + err = ovl_prep_cu_creds(c->dentry, &cc); + if (err) + goto unlock; - temp = ovl_get_tmpfile(c); + temp = ovl_create_temp(c->workdir, &cattr); + ovl_revert_cu_creds(&cc); + + err = PTR_ERR(temp); if (IS_ERR(temp)) - return PTR_ERR(temp); + goto unlock; err = ovl_copy_up_inode(c, temp); if (err) - goto out; + goto cleanup; if (S_ISDIR(c->stat.mode) && c->indexed) { err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); if (err) - goto out; + goto cleanup; } - if (c->tmpfile) { - inode_lock_nested(udir, I_MUTEX_PARENT); - err = ovl_install_temp(c, temp, &newdentry); - inode_unlock(udir); - } else { - err = ovl_install_temp(c, temp, &newdentry); - } + upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto cleanup; + + err = ovl_do_rename(wdir, temp, udir, upper, 0); + dput(upper); if (err) - goto out; + goto cleanup; if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); inode = d_inode(c->dentry); - ovl_inode_update(inode, newdentry); + ovl_inode_update(inode, temp); if (S_ISDIR(inode->i_mode)) ovl_set_flag(OVL_WHITEOUTS, inode); +unlock: 
+ unlock_rename(c->workdir, c->destdir); -out: - if (err && !c->tmpfile) - ovl_cleanup(d_inode(c->workdir), temp); - dput(temp); return err; +cleanup: + ovl_cleanup(wdir, temp); + dput(temp); + goto unlock; +} + +/* Copyup using O_TMPFILE which does not require cross dir locking */ +static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) +{ + struct inode *udir = d_inode(c->destdir); + struct dentry *temp, *upper; + struct ovl_cu_creds cc; + int err; + + err = ovl_prep_cu_creds(c->dentry, &cc); + if (err) + return err; + + temp = ovl_do_tmpfile(c->workdir, c->stat.mode); + ovl_revert_cu_creds(&cc); + + if (IS_ERR(temp)) + return PTR_ERR(temp); + + err = ovl_copy_up_inode(c, temp); + if (err) + goto out_dput; + + inode_lock_nested(udir, I_MUTEX_PARENT); + + upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + err = PTR_ERR(upper); + if (!IS_ERR(upper)) { + err = ovl_do_link(temp, udir, upper); + dput(upper); + } + inode_unlock(udir); + + if (err) + goto out_dput; + + if (!c->metacopy) + ovl_set_upperdata(d_inode(c->dentry)); + ovl_inode_update(d_inode(c->dentry), temp); + + return 0; + +out_dput: + dput(temp); + return err; } /* @@ -646,18 +685,10 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) } /* Should we copyup with O_TMPFILE or with workdir? 
*/ - if (S_ISREG(c->stat.mode) && ofs->tmpfile) { - c->tmpfile = true; - err = ovl_copy_up_locked(c); - } else { - err = ovl_lock_rename_workdir(c->workdir, c->destdir); - if (!err) { - err = ovl_copy_up_locked(c); - unlock_rename(c->workdir, c->destdir); - } - } - - + if (S_ISREG(c->stat.mode) && ofs->tmpfile) + err = ovl_copy_up_tmpfile(c); + else + err = ovl_copy_up_workdir(c); if (err) goto out; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 276914ae3c60..c6289147c787 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -414,13 +414,12 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) return 0; - size = posix_acl_to_xattr(NULL, acl, NULL, 0); + size = posix_acl_xattr_size(acl->a_count); buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return -ENOMEM; - size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - err = size; + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); if (err < 0) goto out_free; @@ -463,6 +462,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(upper)) goto out_unlock; + err = -ESTALE; + if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) + goto out_dput; + newdentry = ovl_create_temp(workdir, cattr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) @@ -652,7 +655,6 @@ static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *new) { int err; - bool locked = false; struct inode *inode; err = ovl_want_write(old); @@ -663,13 +665,17 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) goto out_drop_write; + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (ovl_is_metacopy_dentry(old)) { err = ovl_set_redirect(old, false); if (err) goto out_drop_write; } - err = ovl_nlink_start(old, &locked); + err = ovl_nlink_start(old); if (err) goto out_drop_write; @@ -682,7 +688,7 @@ static int ovl_link(struct dentry *old, struct inode 
*newdir, if (err) iput(inode); - ovl_nlink_end(old, locked); + ovl_nlink_end(old); out_drop_write: ovl_drop_write(old); out: @@ -807,7 +813,6 @@ static bool ovl_pure_upper(struct dentry *dentry) static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; - bool locked = false; const struct cred *old_cred; struct dentry *upperdentry; bool lower_positive = ovl_lower_positive(dentry); @@ -828,7 +833,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (err) goto out_drop_write; - err = ovl_nlink_start(dentry, &locked); + err = ovl_nlink_start(dentry); if (err) goto out_drop_write; @@ -844,7 +849,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) else drop_nlink(dentry->d_inode); } - ovl_nlink_end(dentry, locked); + ovl_nlink_end(dentry); /* * Copy ctime @@ -1008,7 +1013,6 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, unsigned int flags) { int err; - bool locked = false; struct dentry *old_upperdir; struct dentry *new_upperdir; struct dentry *olddentry; @@ -1017,6 +1021,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, bool old_opaque; bool new_opaque; bool cleanup_whiteout = false; + bool update_nlink = false; bool overwrite = !(flags & RENAME_EXCHANGE); bool is_dir = d_is_dir(old); bool new_is_dir = d_is_dir(new); @@ -1074,10 +1079,12 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, err = ovl_copy_up(new); if (err) goto out_drop_write; - } else { - err = ovl_nlink_start(new, &locked); + } else if (d_inode(new)) { + err = ovl_nlink_start(new); if (err) goto out_drop_write; + + update_nlink = true; } old_cred = ovl_override_creds(old->d_sb); @@ -1206,7 +1213,8 @@ out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: revert_creds(old_cred); - ovl_nlink_end(new, locked); + if (update_nlink) + ovl_nlink_end(new); out_drop_write: ovl_drop_write(old); out: diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 986313da0c88..84dd957efa24 100644 --- 
a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -434,14 +434,14 @@ enum ovl_copyop { OVL_DEDUPE, }; -static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, +static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, unsigned int flags, enum ovl_copyop op) + loff_t len, unsigned int flags, enum ovl_copyop op) { struct inode *inode_out = file_inode(file_out); struct fd real_in, real_out; const struct cred *old_cred; - ssize_t ret; + loff_t ret; ret = ovl_real_fdget(file_out, &real_out); if (ret) @@ -462,12 +462,13 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, case OVL_CLONE: ret = vfs_clone_file_range(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, flags); break; case OVL_DEDUPE: ret = vfs_dedupe_file_range_one(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, + flags); break; } revert_creds(old_cred); @@ -489,26 +490,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, OVL_COPY); } -static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_CLONE); -} + enum ovl_copyop op; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + op = OVL_DEDUPE; + else + op = OVL_CLONE; -static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) -{ /* * Don't copy up because of a dedupe request, this wouldn't make sense * most of the time (data would be duplicated instead of deduplicated). 
*/ - if (!ovl_inode_upper(file_inode(file_in)) || - !ovl_inode_upper(file_inode(file_out))) + if (op == OVL_DEDUPE && + (!ovl_inode_upper(file_inode(file_in)) || + !ovl_inode_upper(file_inode(file_out)))) return -EPERM; - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_DEDUPE); + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, + remap_flags, op); } const struct file_operations ovl_file_operations = { @@ -525,6 +531,5 @@ const struct file_operations ovl_file_operations = { .compat_ioctl = ovl_compat_ioctl, .copy_file_range = ovl_copy_file_range, - .clone_file_range = ovl_clone_file_range, - .dedupe_file_range = ovl_dedupe_file_range, + .remap_file_range = ovl_remap_file_range, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b7ed5d2279c..6bcc9dedc342 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -286,13 +286,22 @@ int ovl_permission(struct inode *inode, int mask) if (err) return err; - old_cred = ovl_override_creds(inode->i_sb); - if (!upperinode && - !special_file(realinode->i_mode) && mask & MAY_WRITE) { + /* No need to do any access on underlying for special files */ + if (special_file(realinode->i_mode)) + return 0; + + /* No need to access underlying for execute */ + mask &= ~MAY_EXEC; + if ((mask & (MAY_READ | MAY_WRITE)) == 0) + return 0; + + /* Lower files get copied up, so turn write access into read */ + if (!upperinode && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); - /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } + + old_cred = ovl_override_creds(inode->i_sb); err = inode_permission(realinode, mask); revert_creds(old_cred); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 9c0ca6a7becf..efd372312ef1 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -422,8 +422,10 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, fh = ovl_encode_real_fh(real, is_upper); err = PTR_ERR(fh); - if (IS_ERR(fh)) + if 
(IS_ERR(fh)) { + fh = NULL; goto fail; + } err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index a3c0d9584312..5e45cb3630a0 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -271,8 +271,8 @@ bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); bool ovl_need_index(struct dentry *dentry); -int ovl_nlink_start(struct dentry *dentry, bool *locked); -void ovl_nlink_end(struct dentry *dentry, bool locked); +int ovl_nlink_start(struct dentry *dentry); +void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); @@ -290,6 +290,16 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb) return ofs->xino_bits; } +static inline int ovl_inode_lock(struct inode *inode) +{ + return mutex_lock_interruptible(&OVL_I(inode)->lock); +} + +static inline void ovl_inode_unlock(struct inode *inode) +{ + mutex_unlock(&OVL_I(inode)->lock); +} + /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 30adc9d408a0..0116735cc321 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -472,6 +472,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; int err; + bool metacopy_opt = false, redirect_opt = false; config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); if (!config->redirect_mode) @@ -516,6 +517,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->redirect_mode = match_strdup(&args[0]); if (!config->redirect_mode) return -ENOMEM; + redirect_opt = true; break; case OPT_INDEX_ON: @@ -548,6 +550,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) case 
OPT_METACOPY_ON: config->metacopy = true; + metacopy_opt = true; break; case OPT_METACOPY_OFF: @@ -572,13 +575,32 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) if (err) return err; - /* metacopy feature with upper requires redirect_dir=on */ - if (config->upperdir && config->metacopy && !config->redirect_dir) { - pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=on\", falling back to metacopy=off.\n"); - config->metacopy = false; - } else if (config->metacopy && !config->redirect_follow) { - pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=follow\" on non-upper mount, falling back to metacopy=off.\n"); - config->metacopy = false; + /* + * This is to make the logic below simpler. It doesn't make any other + * difference, since config->redirect_dir is only used for upper. + */ + if (!config->upperdir && config->redirect_follow) + config->redirect_dir = true; + + /* Resolve metacopy -> redirect_dir dependency */ + if (config->metacopy && !config->redirect_dir) { + if (metacopy_opt && redirect_opt) { + pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n", + config->redirect_mode); + return -EINVAL; + } + if (redirect_opt) { + /* + * There was an explicit redirect_dir=... that resulted + * in this conflict. + */ + pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n", + config->redirect_mode); + config->metacopy = false; + } else { + /* Automatically enable redirect otherwise. 
*/ + config->redirect_follow = config->redirect_dir = true; + } } return 0; @@ -1175,9 +1197,29 @@ out: return err; } +static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) +{ + unsigned int i; + + if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + return true; + + for (i = 0; i < ofs->numlowerfs; i++) { + /* + * We use uuid to associate an overlay lower file handle with a + * lower layer, so we can accept lower fs with null uuid as long + * as all lower layers with null uuid are on the same fs. + */ + if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + return false; + } + return true; +} + /* Get a unique fsid for the layer */ -static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) +static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) { + struct super_block *sb = path->mnt->mnt_sb; unsigned int i; dev_t dev; int err; @@ -1191,6 +1233,14 @@ static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) return i + 1; } + if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? 
"null" : "conflicting", + path->dentry); + } + err = get_anon_bdev(&dev); if (err) { pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); @@ -1225,7 +1275,7 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, struct vfsmount *mnt; int fsid; - err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb); + err = fsid = ovl_get_fsid(ofs, &stack[i]); if (err < 0) goto out; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index ace4fe4c39a9..7c01327b1852 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -65,8 +65,7 @@ struct super_block *ovl_same_sb(struct super_block *sb) */ int ovl_can_decode_fh(struct super_block *sb) { - if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry || - uuid_is_null(&sb->s_uuid)) + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry) return 0; return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; @@ -522,13 +521,13 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags) int ovl_copy_up_start(struct dentry *dentry, int flags) { - struct ovl_inode *oi = OVL_I(d_inode(dentry)); + struct inode *inode = d_inode(dentry); int err; - err = mutex_lock_interruptible(&oi->lock); + err = ovl_inode_lock(inode); if (!err && ovl_already_copied_up_locked(dentry, flags)) { err = 1; /* Already copied up */ - mutex_unlock(&oi->lock); + ovl_inode_unlock(inode); } return err; @@ -536,7 +535,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags) void ovl_copy_up_end(struct dentry *dentry) { - mutex_unlock(&OVL_I(d_inode(dentry))->lock); + ovl_inode_unlock(d_inode(dentry)); } bool ovl_check_origin_xattr(struct dentry *dentry) @@ -739,14 +738,14 @@ fail: * Operations that change overlay inode and upper inode nlink need to be * synchronized with copy up for persistent nlink accounting. 
*/ -int ovl_nlink_start(struct dentry *dentry, bool *locked) +int ovl_nlink_start(struct dentry *dentry) { - struct ovl_inode *oi = OVL_I(d_inode(dentry)); + struct inode *inode = d_inode(dentry); const struct cred *old_cred; int err; - if (!d_inode(dentry)) - return 0; + if (WARN_ON(!inode)) + return -ENOENT; /* * With inodes index is enabled, we store the union overlay nlink @@ -768,11 +767,11 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) return err; } - err = mutex_lock_interruptible(&oi->lock); + err = ovl_inode_lock(inode); if (err) return err; - if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) goto out; old_cred = ovl_override_creds(dentry->d_sb); @@ -787,27 +786,24 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) out: if (err) - mutex_unlock(&oi->lock); - else - *locked = true; + ovl_inode_unlock(inode); return err; } -void ovl_nlink_end(struct dentry *dentry, bool locked) +void ovl_nlink_end(struct dentry *dentry) { - if (locked) { - if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) && - d_inode(dentry)->i_nlink == 0) { - const struct cred *old_cred; + struct inode *inode = d_inode(dentry); - old_cred = ovl_override_creds(dentry->d_sb); - ovl_cleanup_index(dentry); - revert_creds(old_cred); - } + if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { + const struct cred *old_cred; - mutex_unlock(&OVL_I(d_inode(dentry))->lock); + old_cred = ovl_override_creds(dentry->d_sb); + ovl_cleanup_index(dentry); + revert_creds(old_cred); } + + ovl_inode_unlock(inode); } int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) diff --git a/fs/proc/base.c b/fs/proc/base.c index 7e9f07bf260d..ce3465479447 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2905,6 +2905,21 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_STACKLEAK_METRICS +static int 
proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long prev_depth = THREAD_SIZE - + (task->prev_lowest_stack & (THREAD_SIZE - 1)); + unsigned long depth = THREAD_SIZE - + (task->lowest_stack & (THREAD_SIZE - 1)); + + seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", + prev_depth, depth); + return 0; +} +#endif /* CONFIG_STACKLEAK_METRICS */ + /* * Thread groups */ @@ -3006,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_STACKLEAK_METRICS + ONE("stack_depth", S_IRUGO, proc_stack_depth), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/read_write.c b/fs/read_write.c index 603794b207eb..bfcb4ced5664 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1407,7 +1407,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; - retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; @@ -1588,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * Try cloning first, this is supported by more file systems, and * more efficient if both clone and copy are supported (e.g. NFS). 
*/ - if (file_in->f_op->clone_file_range) { - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (ret == 0) { - ret = len; + if (file_in->f_op->remap_file_range) { + loff_t cloned; + + cloned = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, + min_t(loff_t, MAX_RW_COUNT, len), + REMAP_FILE_CAN_SHORTEN); + if (cloned > 0) { + ret = cloned; goto done; } } @@ -1686,11 +1689,12 @@ out2: return ret; } -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) { struct inode *inode = file_inode(file); - if (unlikely(pos < 0)) + if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) @@ -1708,22 +1712,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. Assume that the offsets have already been checked for block + * alignment. + * + * For deduplication we always scale down to the previous block because we + * can't meaningfully compare post-EOF contents. + * + * For clone we only link a partial EOF block above the destination file's EOF. + * + * Shorten the request if possible. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; + + if ((*len & blkmask) == 0) + return 0; + + if ((remap_flags & REMAP_FILE_DEDUP) || + pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; + + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? 
-EBADE : -EINVAL; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} /* 
* Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * - * Returns: 0 for "nothing to clone", 1 for "something to clone", or - * the usual negative error code. + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. */ -int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) { - loff_t bs = inode_out->i_sb->s_blocksize; - loff_t blen; - loff_t isize; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; @@ -1740,50 +1872,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) - return 0; - /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { - if (is_dedupe || pos_in == isize) + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; *len = isize - pos_in; + if (*len == 0) + return 0; } - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + *len < pos_in || pos_out + *len < pos_out || - pos_in + *len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + *len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. 
*/ - if (pos_in + *len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = *len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - } + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); @@ -1803,7 +1909,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, /* * Check that the extents are the same. */ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; ret = vfs_dedupe_file_range_compare(inode_in, pos_in, @@ -1814,16 +1920,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return -EBADE; } - return 1; + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) { + /* Update the timestamps, since we can alter file contents. */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + return ret; + } + + /* + * Clear the security bits if the process is not being run by + * root. This keeps people from modifying setuid and setgid + * binaries. 
+ */ + ret = file_remove_privs(file_out); + if (ret) + return ret; + } + + return 0; } -EXPORT_SYMBOL(vfs_clone_file_prep_inodes); +EXPORT_SYMBOL(generic_remap_file_range_prep); -int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - int ret; + loff_t ret; + + WARN_ON_ONCE(remap_flags); if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; @@ -1843,155 +1976,76 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, (file_out->f_flags & O_APPEND)) return -EBADF; - if (!file_in->f_op->clone_file_range) + if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = clone_verify_area(file_in, pos_in, len, false); + ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; - ret = clone_verify_area(file_out, pos_out, len, true); + ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; - if (pos_in + len > i_size_read(inode_in)) - return -EINVAL; - - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (!ret) { - fsnotify_access(file_in); - fsnotify_modify(file_out); - } + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(do_clone_file_range); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - int ret; + loff_t ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, 
pos_out, len); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); file_end_write(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +/* Check whether we are allowed to dedupe the destination file */ +static bool allow_file_dedupe(struct file *file) { - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; + if (capable(CAP_SYS_ADMIN)) + return true; + if (file->f_mode & FMODE_WRITE) + return true; + if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + return true; + if (!inode_permission(file_inode(file), MAY_WRITE)) + return true; + return false; } -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. 
- */ -int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) { - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); + loff_t ret; - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - return error; -} -EXPORT_SYMBOL(vfs_dedupe_file_range_compare); - -int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, u64 len) -{ - s64 ret; + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret) return ret; - ret = clone_verify_area(dst_file, dst_pos, len, true); + ret = remap_verify_area(dst_file, 
dst_pos, len, true); if (ret < 0) goto out_drop_write; - ret = -EINVAL; - if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE))) + ret = -EPERM; + if (!allow_file_dedupe(dst_file)) goto out_drop_write; ret = -EXDEV; @@ -2003,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; ret = -EINVAL; - if (!dst_file->f_op->dedupe_file_range) + if (!dst_file->f_op->remap_file_range) goto out_drop_write; - ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, - dst_file, dst_pos, len); + if (len == 0) { + ret = 0; + goto out_drop_write; + } + + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); @@ -2024,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) int i; int ret; u16 count = same->dest_count; - int deduped; + loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2043,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (!S_ISREG(src->i_mode)) goto out; - ret = clone_verify_area(file, off, len, false); + ret = remap_verify_area(file, off, len, false); if (ret < 0) goto out; ret = 0; @@ -2075,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len); + info->dest_offset, len, + REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) diff --git a/fs/splice.c b/fs/splice.c index b3daa971f597..3553f1956508 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -301,7 +301,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int idx, ret; - iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len); + iov_iter_pipe(&to, READ, pipe, len); idx = to.idx; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; @@ -386,7 +386,7 
@@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, */ offset = *ppos & ~PAGE_MASK; - iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); + iov_iter_pipe(&to, READ, pipe, len + offset); res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); if (res <= 0) @@ -745,8 +745,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, left -= this_len; } - iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, - sd.total_len - left); + iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index bbc78549be4c..529856fbccd0 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -7,6 +7,7 @@ config UBIFS_FS select CRYPTO if UBIFS_FS_ZLIB select CRYPTO_LZO if UBIFS_FS_LZO select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + select CRYPTO_HASH_INFO depends on MTD_UBI help UBIFS is a file system for flash devices which works on top of UBI. @@ -85,3 +86,13 @@ config UBIFS_FS_SECURITY the extended attribute support in advance. If you are not using a security module, say N. + +config UBIFS_FS_AUTHENTICATION + bool "UBIFS authentication support" + select CRYPTO_HMAC + help + Enable authentication support for UBIFS. This feature offers protection + against offline changes for both data and metadata of the filesystem. + If you say yes here you should also select a hashing algorithm such as + sha256, these are not selected automatically since there are many + different options. 
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 6197d7e539e4..5f838319c8d5 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -8,3 +8,4 @@ ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o ubifs-y += misc.o ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o +ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c new file mode 100644 index 000000000000..124e965a28b3 --- /dev/null +++ b/fs/ubifs/auth.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file is part of UBIFS. + * + * Copyright (C) 2018 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de> + */ + +/* + * This file implements various helper functions for UBIFS authentication support + */ + +#include <linux/crypto.h> +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <crypto/algapi.h> +#include <keys/user-type.h> + +#include "ubifs.h" + +/** + * ubifs_node_calc_hash - calculate the hash of a UBIFS node + * @c: UBIFS file-system description object + * @node: the node to calculate a hash for + * @hash: the returned hash + * + * Returns 0 for success or a negative error code otherwise. + */ +int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, + u8 *hash) +{ + const struct ubifs_ch *ch = node; + SHASH_DESC_ON_STACK(shash, c->hash_tfm); + int err; + + shash->tfm = c->hash_tfm; + shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); + if (err < 0) + return err; + return 0; +} + +/** + * ubifs_hash_calc_hmac - calculate a HMAC from a hash + * @c: UBIFS file-system description object + * @hash: the node to calculate a HMAC for + * @hmac: the returned HMAC + * + * Returns 0 for success or a negative error code otherwise. 
+ */ +static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, + u8 *hmac) +{ + SHASH_DESC_ON_STACK(shash, c->hmac_tfm); + int err; + + shash->tfm = c->hmac_tfm; + shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_shash_digest(shash, hash, c->hash_len, hmac); + if (err < 0) + return err; + return 0; +} + +/** + * ubifs_prepare_auth_node - Prepare an authentication node + * @c: UBIFS file-system description object + * @node: the node to calculate a hash for + * @hash: input hash of previous nodes + * + * This function prepares an authentication node for writing onto flash. + * It creates a HMAC from the given input hash and writes it to the node. + * + * Returns 0 for success or a negative error code otherwise. + */ +int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, + struct shash_desc *inhash) +{ + SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); + struct ubifs_auth_node *auth = node; + u8 *hash; + int err; + + hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS); + if (!hash) + return -ENOMEM; + + hash_desc->tfm = c->hash_tfm; + hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ubifs_shash_copy_state(c, inhash, hash_desc); + + err = crypto_shash_final(hash_desc, hash); + if (err) + goto out; + + err = ubifs_hash_calc_hmac(c, hash, auth->hmac); + if (err) + goto out; + + auth->ch.node_type = UBIFS_AUTH_NODE; + ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0); + + err = 0; +out: + kfree(hash); + + return err; +} + +static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c, + struct crypto_shash *tfm) +{ + struct shash_desc *desc; + int err; + + if (!ubifs_authenticated(c)) + return NULL; + + desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL); + if (!desc) + return ERR_PTR(-ENOMEM); + + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_shash_init(desc); + if (err) { + kfree(desc); + return ERR_PTR(err); + } + + return desc; +} + +/** + * __ubifs_hash_get_desc - get a 
descriptor suitable for hashing a node + * @c: UBIFS file-system description object + * + * This function returns a descriptor suitable for hashing a node. Free after use + * with kfree. + */ +struct shash_desc *__ubifs_hash_get_desc(const struct ubifs_info *c) +{ + return ubifs_get_desc(c, c->hash_tfm); +} + +/** + * __ubifs_shash_final - finalize shash + * @c: UBIFS file-system description object + * @desc: the descriptor + * @out: the output hash + * + * Simple wrapper around crypto_shash_final(), safe to be called with + * disabled authentication. + */ +int __ubifs_shash_final(const struct ubifs_info *c, struct shash_desc *desc, + u8 *out) +{ + if (ubifs_authenticated(c)) + return crypto_shash_final(desc, out); + + return 0; +} + +/** + * ubifs_bad_hash - Report hash mismatches + * @c: UBIFS file-system description object + * @node: the node + * @hash: the expected hash + * @lnum: the LEB @node was read from + * @offs: offset in LEB @node was read from + * + * This function reports a hash mismatch when a node has a different hash than + * expected. + */ +void ubifs_bad_hash(const struct ubifs_info *c, const void *node, const u8 *hash, + int lnum, int offs) +{ + int len = min(c->hash_len, 20); + int cropped = len != c->hash_len; + const char *cont = cropped ? "..." : ""; + + u8 calc[UBIFS_HASH_ARR_SZ]; + + __ubifs_node_calc_hash(c, node, calc); + + ubifs_err(c, "hash mismatch on node at LEB %d:%d", lnum, offs); + ubifs_err(c, "hash expected: %*ph%s", len, hash, cont); + ubifs_err(c, "hash calculated: %*ph%s", len, calc, cont); +} + +/** + * __ubifs_node_check_hash - check the hash of a node against given hash + * @c: UBIFS file-system description object + * @node: the node + * @expected: the expected hash + * + * This function calculates a hash over a node and compares it to the given hash. + * Returns 0 if both hashes are equal or authentication is disabled, otherwise a + * negative error code is returned. 
+ */ +int __ubifs_node_check_hash(const struct ubifs_info *c, const void *node, + const u8 *expected) +{ + u8 calc[UBIFS_HASH_ARR_SZ]; + int err; + + err = __ubifs_node_calc_hash(c, node, calc); + if (err) + return err; + + if (ubifs_check_hash(c, expected, calc)) + return -EPERM; + + return 0; +} + +/** + * ubifs_init_authentication - initialize UBIFS authentication support + * @c: UBIFS file-system description object + * + * This function returns 0 for success or a negative error code otherwise. + */ +int ubifs_init_authentication(struct ubifs_info *c) +{ + struct key *keyring_key; + const struct user_key_payload *ukp; + int err; + char hmac_name[CRYPTO_MAX_ALG_NAME]; + + if (!c->auth_hash_name) { + ubifs_err(c, "authentication hash name needed with authentication"); + return -EINVAL; + } + + c->auth_hash_algo = match_string(hash_algo_name, HASH_ALGO__LAST, + c->auth_hash_name); + if ((int)c->auth_hash_algo < 0) { + ubifs_err(c, "Unknown hash algo %s specified", + c->auth_hash_name); + return -EINVAL; + } + + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + c->auth_hash_name); + + keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL); + + if (IS_ERR(keyring_key)) { + ubifs_err(c, "Failed to request key: %ld", + PTR_ERR(keyring_key)); + return PTR_ERR(keyring_key); + } + + down_read(&keyring_key->sem); + + if (keyring_key->type != &key_type_logon) { + ubifs_err(c, "key type must be logon"); + err = -ENOKEY; + goto out; + } + + ukp = user_key_payload_locked(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + err = -EKEYREVOKED; + goto out; + } + + c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(c->hash_tfm)) { + err = PTR_ERR(c->hash_tfm); + ubifs_err(c, "Can not allocate %s: %d", + c->auth_hash_name, err); + goto out; + } + + c->hash_len = crypto_shash_digestsize(c->hash_tfm); + if (c->hash_len > UBIFS_HASH_ARR_SZ) { + ubifs_err(c, "hash %s is bigger than maximum allowed 
hash size (%d > %d)", + c->auth_hash_name, c->hash_len, UBIFS_HASH_ARR_SZ); + err = -EINVAL; + goto out_free_hash; + } + + c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(c->hmac_tfm)) { + err = PTR_ERR(c->hmac_tfm); + ubifs_err(c, "Can not allocate %s: %d", hmac_name, err); + goto out_free_hash; + } + + c->hmac_desc_len = crypto_shash_digestsize(c->hmac_tfm); + if (c->hmac_desc_len > UBIFS_HMAC_ARR_SZ) { + ubifs_err(c, "hmac %s is bigger than maximum allowed hmac size (%d > %d)", + hmac_name, c->hmac_desc_len, UBIFS_HMAC_ARR_SZ); + err = -EINVAL; + goto out_free_hash; + } + + err = crypto_shash_setkey(c->hmac_tfm, ukp->data, ukp->datalen); + if (err) + goto out_free_hmac; + + c->authenticated = true; + + c->log_hash = ubifs_hash_get_desc(c); + if (IS_ERR(c->log_hash)) + goto out_free_hmac; + + err = 0; + +out_free_hmac: + if (err) + crypto_free_shash(c->hmac_tfm); +out_free_hash: + if (err) + crypto_free_shash(c->hash_tfm); +out: + up_read(&keyring_key->sem); + key_put(keyring_key); + + return err; +} + +/** + * __ubifs_exit_authentication - release resource + * @c: UBIFS file-system description object + * + * This function releases the authentication related resources. + */ +void __ubifs_exit_authentication(struct ubifs_info *c) +{ + if (!ubifs_authenticated(c)) + return; + + crypto_free_shash(c->hmac_tfm); + crypto_free_shash(c->hash_tfm); + kfree(c->log_hash); +} + +/** + * ubifs_node_calc_hmac - calculate the HMAC of a UBIFS node + * @c: UBIFS file-system description object + * @node: the node to insert a HMAC into. + * @len: the length of the node + * @ofs_hmac: the offset in the node where the HMAC is inserted + * @hmac: returned HMAC + * + * This function calculates a HMAC of a UBIFS node. The HMAC is expected to be + * embedded into the node, so this area is not covered by the HMAC. Also not + * covered is the UBIFS_NODE_MAGIC and the CRC of the node. 
+ */ +static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node, + int len, int ofs_hmac, void *hmac) +{ + SHASH_DESC_ON_STACK(shash, c->hmac_tfm); + int hmac_len = c->hmac_desc_len; + int err; + + ubifs_assert(c, ofs_hmac > 8); + ubifs_assert(c, ofs_hmac + hmac_len < len); + + shash->tfm = c->hmac_tfm; + shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_shash_init(shash); + if (err) + return err; + + /* behind common node header CRC up to HMAC begin */ + err = crypto_shash_update(shash, node + 8, ofs_hmac - 8); + if (err < 0) + return err; + + /* behind HMAC, if any */ + if (len - ofs_hmac - hmac_len > 0) { + err = crypto_shash_update(shash, node + ofs_hmac + hmac_len, + len - ofs_hmac - hmac_len); + if (err < 0) + return err; + } + + return crypto_shash_final(shash, hmac); +} + +/** + * __ubifs_node_insert_hmac - insert a HMAC into a UBIFS node + * @c: UBIFS file-system description object + * @node: the node to insert a HMAC into. + * @len: the length of the node + * @ofs_hmac: the offset in the node where the HMAC is inserted + * + * This function inserts a HMAC at offset @ofs_hmac into the node given in + * @node. + * + * This function returns 0 for success or a negative error code otherwise. + */ +int __ubifs_node_insert_hmac(const struct ubifs_info *c, void *node, int len, + int ofs_hmac) +{ + return ubifs_node_calc_hmac(c, node, len, ofs_hmac, node + ofs_hmac); +} + +/** + * __ubifs_node_verify_hmac - verify the HMAC of UBIFS node + * @c: UBIFS file-system description object + * @node: the node to insert a HMAC into. + * @len: the length of the node + * @ofs_hmac: the offset in the node where the HMAC is inserted + * + * This function verifies the HMAC at offset @ofs_hmac of the node given in + * @node. Returns 0 if successful or a negative error code otherwise. 
+ */ +int __ubifs_node_verify_hmac(const struct ubifs_info *c, const void *node, + int len, int ofs_hmac) +{ + int hmac_len = c->hmac_desc_len; + u8 *hmac; + int err; + + hmac = kmalloc(hmac_len, GFP_NOFS); + if (!hmac) + return -ENOMEM; + + err = ubifs_node_calc_hmac(c, node, len, ofs_hmac, hmac); + if (err) + return err; + + err = crypto_memneq(hmac, node + ofs_hmac, hmac_len); + + kfree(hmac); + + if (!err) + return 0; + + return -EPERM; +} + +int __ubifs_shash_copy_state(const struct ubifs_info *c, struct shash_desc *src, + struct shash_desc *target) +{ + u8 *state; + int err; + + state = kmalloc(crypto_shash_descsize(src->tfm), GFP_NOFS); + if (!state) + return -ENOMEM; + + err = crypto_shash_export(src, state); + if (err) + goto out; + + err = crypto_shash_import(target, state); + +out: + kfree(state); + + return err; +} + +/** + * ubifs_hmac_wkm - Create a HMAC of the well known message + * @c: UBIFS file-system description object + * @hmac: The HMAC of the well known message + * + * This function creates a HMAC of a well known message. This is used + * to check if the provided key is suitable to authenticate a UBIFS + * image. This is only a convenience to the user to provide a better + * error message when the wrong key is provided. + * + * This function returns 0 for success or a negative error code otherwise. 
+ */ +int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) +{ + SHASH_DESC_ON_STACK(shash, c->hmac_tfm); + int err; + const char well_known_message[] = "UBIFS"; + + if (!ubifs_authenticated(c)) + return 0; + + shash->tfm = c->hmac_tfm; + shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_shash_init(shash); + if (err) + return err; + + err = crypto_shash_update(shash, well_known_message, + sizeof(well_known_message) - 1); + if (err < 0) + return err; + + err = crypto_shash_final(shash, hmac); + if (err) + return err; + return 0; +} diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 564e330d05b1..c49ff50fdceb 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -165,6 +165,8 @@ const char *dbg_ntype(int type) return "commit start node"; case UBIFS_ORPH_NODE: return "orphan node"; + case UBIFS_AUTH_NODE: + return "auth node"; default: return "unknown node"; } @@ -542,6 +544,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) (unsigned long long)le64_to_cpu(orph->inos[i])); break; } + case UBIFS_AUTH_NODE: + { + break; + } default: pr_err("node type %d was not recognized\n", (int)ch->node_type); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d2680e0b4a36..bf75fdc76fc3 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -254,7 +254,8 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, snod->type == UBIFS_DATA_NODE || snod->type == UBIFS_DENT_NODE || snod->type == UBIFS_XENT_NODE || - snod->type == UBIFS_TRUN_NODE); + snod->type == UBIFS_TRUN_NODE || + snod->type == UBIFS_AUTH_NODE); if (snod->type != UBIFS_INO_NODE && snod->type != UBIFS_DATA_NODE && @@ -364,12 +365,13 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) /* Write nodes to their new location. 
Use the first-fit strategy */ while (1) { - int avail; + int avail, moved = 0; struct ubifs_scan_node *snod, *tmp; /* Move data nodes */ list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - avail = c->leb_size - wbuf->offs - wbuf->used; + avail = c->leb_size - wbuf->offs - wbuf->used - + ubifs_auth_node_sz(c); if (snod->len > avail) /* * Do not skip data nodes in order to optimize @@ -377,14 +379,21 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) */ break; + err = ubifs_shash_update(c, c->jheads[GCHD].log_hash, + snod->node, snod->len); + if (err) + goto out; + err = move_node(c, sleb, snod, wbuf); if (err) goto out; + moved = 1; } /* Move non-data nodes */ list_for_each_entry_safe(snod, tmp, &nondata, list) { - avail = c->leb_size - wbuf->offs - wbuf->used; + avail = c->leb_size - wbuf->offs - wbuf->used - + ubifs_auth_node_sz(c); if (avail < min) break; @@ -402,9 +411,41 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) continue; } + err = ubifs_shash_update(c, c->jheads[GCHD].log_hash, + snod->node, snod->len); + if (err) + goto out; + err = move_node(c, sleb, snod, wbuf); if (err) goto out; + moved = 1; + } + + if (ubifs_authenticated(c) && moved) { + struct ubifs_auth_node *auth; + + auth = kmalloc(ubifs_auth_node_sz(c), GFP_NOFS); + if (!auth) { + err = -ENOMEM; + goto out; + } + + err = ubifs_prepare_auth_node(c, auth, + c->jheads[GCHD].log_hash); + if (err) { + kfree(auth); + goto out; + } + + err = ubifs_wbuf_write_nolock(wbuf, auth, + ubifs_auth_node_sz(c)); + if (err) { + kfree(auth); + goto out; + } + + ubifs_add_dirt(c, wbuf->lnum, ubifs_auth_node_sz(c)); } if (list_empty(&sleb->nodes) && list_empty(&nondata)) diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 099bec94b820..d124117efd42 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -365,20 +365,8 @@ static unsigned long long next_sqnum(struct ubifs_info *c) return sqnum; } -/** - * ubifs_prepare_node - prepare node to be written to flash. 
- * @c: UBIFS file-system description object - * @node: the node to pad - * @len: node length - * @pad: if the buffer has to be padded - * - * This function prepares node at @node to be written to the media - it - * calculates node CRC, fills the common header, and adds proper padding up to - * the next minimum I/O unit if @pad is not zero. - */ -void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +void ubifs_init_node(struct ubifs_info *c, void *node, int len, int pad) { - uint32_t crc; struct ubifs_ch *ch = node; unsigned long long sqnum = next_sqnum(c); @@ -389,8 +377,6 @@ void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) ch->group_type = UBIFS_NO_NODE_GROUP; ch->sqnum = cpu_to_le64(sqnum); ch->padding[0] = ch->padding[1] = 0; - crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); - ch->crc = cpu_to_le32(crc); if (pad) { len = ALIGN(len, 8); @@ -399,6 +385,68 @@ void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) } } +void ubifs_crc_node(struct ubifs_info *c, void *node, int len) +{ + struct ubifs_ch *ch = node; + uint32_t crc; + + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * ubifs_prepare_node_hmac - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @hmac_offs: offset of the HMAC in the node + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. if @hmac_offs is positive then + * a HMAC is inserted into the node at the given offset. + * + * This function returns 0 for success or a negative error code otherwise. 
+ */ +int ubifs_prepare_node_hmac(struct ubifs_info *c, void *node, int len, + int hmac_offs, int pad) +{ + int err; + + ubifs_init_node(c, node, len, pad); + + if (hmac_offs > 0) { + err = ubifs_node_insert_hmac(c, node, len, hmac_offs); + if (err) + return err; + } + + ubifs_crc_node(c, node, len); + + return 0; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + /* + * Deliberately ignore return value since this function can only fail + * when a hmac offset is given. + */ + ubifs_prepare_node_hmac(c, node, len, 0, pad); +} + /** * ubifs_prep_grp_node - prepare node of a group to be written to flash. * @c: UBIFS file-system description object @@ -849,12 +897,13 @@ out: } /** - * ubifs_write_node - write node to the media. + * ubifs_write_node_hmac - write node to the media. * @c: UBIFS file-system description object * @buf: the node to write * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock + * @hmac_offs: offset of the HMAC within the node * * This function automatically fills node magic number, assigns sequence * number, and calculates node CRC checksum. The length of the @buf buffer has @@ -862,8 +911,8 @@ out: * appends padding node and padding bytes if needed. Returns zero in case of * success and a negative error code in case of failure. 
*/ -int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, - int offs) +int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int hmac_offs) { int err, buf_len = ALIGN(len, c->min_io_size); @@ -878,7 +927,10 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, if (c->ro_error) return -EROFS; - ubifs_prepare_node(c, buf, len, 1); + err = ubifs_prepare_node_hmac(c, buf, len, hmac_offs, 1); + if (err) + return err; + err = ubifs_leb_write(c, lnum, buf, offs, buf_len); if (err) ubifs_dump_node(c, buf); @@ -887,6 +939,26 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, } /** + * ubifs_write_node - write node to the media. + * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs) +{ + return ubifs_write_node_hmac(c, buf, len, lnum, offs, -1); +} + +/** * ubifs_read_node_wbuf - read node from the media or write-buffer. 
* @wbuf: wbuf to check for un-written data * @buf: buffer to read to diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 802565a17733..729dc76c83df 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -90,6 +90,12 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) memset(trun->padding, 0, 12); } +static void ubifs_add_auth_dirt(struct ubifs_info *c, int lnum) +{ + if (ubifs_authenticated(c)) + ubifs_add_dirt(c, lnum, ubifs_auth_node_sz(c)); +} + /** * reserve_space - reserve space in the journal. * @c: UBIFS file-system description object @@ -228,34 +234,33 @@ out_return: return err; } -/** - * write_node - write node to a journal head. - * @c: UBIFS file-system description object - * @jhead: journal head - * @node: node to write - * @len: node length - * @lnum: LEB number written is returned here - * @offs: offset written is returned here - * - * This function writes a node to reserved space of journal head @jhead. - * Returns zero in case of success and a negative error code in case of - * failure. 
- */ -static int write_node(struct ubifs_info *c, int jhead, void *node, int len, - int *lnum, int *offs) +static int ubifs_hash_nodes(struct ubifs_info *c, void *node, + int len, struct shash_desc *hash) { - struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + int auth_node_size = ubifs_auth_node_sz(c); + int err; - ubifs_assert(c, jhead != GCHD); + while (1) { + const struct ubifs_ch *ch = node; + int nodelen = le32_to_cpu(ch->len); - *lnum = c->jheads[jhead].wbuf.lnum; - *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + ubifs_assert(c, len >= auth_node_size); - dbg_jnl("jhead %s, LEB %d:%d, len %d", - dbg_jhead(jhead), *lnum, *offs, len); - ubifs_prepare_node(c, node, len, 0); + if (len == auth_node_size) + break; + + ubifs_assert(c, len > nodelen); + ubifs_assert(c, ch->magic == cpu_to_le32(UBIFS_NODE_MAGIC)); - return ubifs_wbuf_write_nolock(wbuf, node, len); + err = ubifs_shash_update(c, hash, (void *)node, nodelen); + if (err) + return err; + + node += ALIGN(nodelen, 8); + len -= ALIGN(nodelen, 8); + } + + return ubifs_prepare_auth_node(c, node, hash); } /** @@ -268,9 +273,9 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len, * @offs: offset written is returned here * @sync: non-zero if the write-buffer has to by synchronized * - * This function is the same as 'write_node()' but it does not assume the - * buffer it is writing is a node, so it does not prepare it (which means - * initializing common header and calculating CRC). + * This function writes data to the reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. 
*/ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, int *lnum, int *offs, int sync) @@ -285,6 +290,12 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, dbg_jnl("jhead %s, LEB %d:%d, len %d", dbg_jhead(jhead), *lnum, *offs, len); + if (ubifs_authenticated(c)) { + err = ubifs_hash_nodes(c, buf, len, c->jheads[jhead].log_hash); + if (err) + return err; + } + err = ubifs_wbuf_write_nolock(wbuf, buf, len); if (err) return err; @@ -548,6 +559,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, struct ubifs_dent_node *dent; struct ubifs_ino_node *ino; union ubifs_key dent_key, ino_key; + u8 hash_dent[UBIFS_HASH_ARR_SZ]; + u8 hash_ino[UBIFS_HASH_ARR_SZ]; + u8 hash_ino_host[UBIFS_HASH_ARR_SZ]; ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); @@ -570,7 +584,10 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; /* Make sure to also account for extended attributes */ - len += host_ui->data_len; + if (ubifs_authenticated(c)) + len += ALIGN(host_ui->data_len, 8) + ubifs_auth_node_sz(c); + else + len += host_ui->data_len; dent = kzalloc(len, GFP_NOFS); if (!dent) @@ -602,11 +619,21 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen, 0); + err = ubifs_node_calc_hash(c, dent, hash_dent); + if (err) + goto out_release; ino = (void *)dent + aligned_dlen; pack_inode(c, ino, inode, 0); + err = ubifs_node_calc_hash(c, ino, hash_ino); + if (err) + goto out_release; + ino = (void *)ino + aligned_ilen; pack_inode(c, ino, dir, 1); + err = ubifs_node_calc_hash(c, ino, hash_ino_host); + if (err) + goto out_release; if (last_reference) { err = ubifs_add_orphan(c, inode->i_ino); @@ -628,6 +655,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, } release_head(c, BASEHD); kfree(dent); + ubifs_add_auth_dirt(c, lnum); if (deletion) { if 
(nm->hash) @@ -638,7 +666,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, goto out_ro; err = ubifs_add_dirt(c, lnum, dlen); } else - err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, + hash_dent, nm); if (err) goto out_ro; @@ -650,14 +679,14 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, */ ino_key_init(c, &ino_key, inode->i_ino); ino_offs = dent_offs + aligned_dlen; - err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen, hash_ino); if (err) goto out_ro; ino_key_init(c, &ino_key, dir->i_ino); ino_offs += aligned_ilen; err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, - UBIFS_INO_NODE_SZ + host_ui->data_len); + UBIFS_INO_NODE_SZ + host_ui->data_len, hash_ino_host); if (err) goto out_ro; @@ -706,10 +735,12 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len) { struct ubifs_data_node *data; - int err, lnum, offs, compr_type, out_len, compr_len; + int err, lnum, offs, compr_type, out_len, compr_len, auth_len; int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; + int write_len; struct ubifs_inode *ui = ubifs_inode(inode); bool encrypted = ubifs_crypt_is_encrypted(inode); + u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", (unsigned long)key_inum(c, key), key_block(c, key), len); @@ -718,7 +749,9 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, if (encrypted) dlen += UBIFS_CIPHER_BLOCK_SIZE; - data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + auth_len = ubifs_auth_node_sz(c); + + data = kmalloc(dlen + auth_len, GFP_NOFS | __GFP_NOWARN); if (!data) { /* * Fall-back to the write reserve buffer. 
Note, we might be @@ -757,20 +790,33 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, } dlen = UBIFS_DATA_NODE_SZ + out_len; + if (ubifs_authenticated(c)) + write_len = ALIGN(dlen, 8) + auth_len; + else + write_len = dlen; + data->compr_type = cpu_to_le16(compr_type); /* Make reservation before allocating sequence numbers */ - err = make_reservation(c, DATAHD, dlen); + err = make_reservation(c, DATAHD, write_len); if (err) goto out_free; - err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + ubifs_prepare_node(c, data, dlen, 0); + err = write_head(c, DATAHD, data, write_len, &lnum, &offs, 0); + if (err) + goto out_release; + + err = ubifs_node_calc_hash(c, data, hash); if (err) goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); release_head(c, DATAHD); - err = ubifs_tnc_add(c, key, lnum, offs, dlen); + ubifs_add_auth_dirt(c, lnum); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen, hash); if (err) goto out_ro; @@ -808,7 +854,9 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) int err, lnum, offs; struct ubifs_ino_node *ino; struct ubifs_inode *ui = ubifs_inode(inode); - int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; + int sync = 0, write_len, ilen = UBIFS_INO_NODE_SZ; + int last_reference = !inode->i_nlink; + u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); @@ -817,20 +865,30 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) * need to synchronize the write-buffer either. 
*/ if (!last_reference) { - len += ui->data_len; + ilen += ui->data_len; sync = IS_SYNC(inode); } - ino = kmalloc(len, GFP_NOFS); + + if (ubifs_authenticated(c)) + write_len = ALIGN(ilen, 8) + ubifs_auth_node_sz(c); + else + write_len = ilen; + + ino = kmalloc(write_len, GFP_NOFS); if (!ino) return -ENOMEM; /* Make reservation before allocating sequence numbers */ - err = make_reservation(c, BASEHD, len); + err = make_reservation(c, BASEHD, write_len); if (err) goto out_free; pack_inode(c, ino, inode, 1); - err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + err = ubifs_node_calc_hash(c, ino, hash); + if (err) + goto out_release; + + err = write_head(c, BASEHD, ino, write_len, &lnum, &offs, sync); if (err) goto out_release; if (!sync) @@ -838,17 +896,19 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) inode->i_ino); release_head(c, BASEHD); + ubifs_add_auth_dirt(c, lnum); + if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) goto out_ro; ubifs_delete_orphan(c, inode->i_ino); - err = ubifs_add_dirt(c, lnum, len); + err = ubifs_add_dirt(c, lnum, ilen); } else { union ubifs_key key; ino_key_init(c, &key, inode->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs, len); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash); } if (err) goto out_ro; @@ -958,6 +1018,10 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, int aligned_dlen1, aligned_dlen2; int twoparents = (fst_dir != snd_dir); void *p; + u8 hash_dent1[UBIFS_HASH_ARR_SZ]; + u8 hash_dent2[UBIFS_HASH_ARR_SZ]; + u8 hash_p1[UBIFS_HASH_ARR_SZ]; + u8 hash_p2[UBIFS_HASH_ARR_SZ]; ubifs_assert(c, ubifs_inode(fst_dir)->data_len == 0); ubifs_assert(c, ubifs_inode(snd_dir)->data_len == 0); @@ -973,6 +1037,8 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, if (twoparents) len += plen; + len += ubifs_auth_node_sz(c); + dent1 = kzalloc(len, GFP_NOFS); if (!dent1) return -ENOMEM; @@ -993,6 +1059,9 @@ int 
ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, set_dent_cookie(c, dent1); zero_dent_node_unused(dent1); ubifs_prep_grp_node(c, dent1, dlen1, 0); + err = ubifs_node_calc_hash(c, dent1, hash_dent1); + if (err) + goto out_release; /* Make new dent for 2nd entry */ dent2 = (void *)dent1 + aligned_dlen1; @@ -1006,14 +1075,26 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, set_dent_cookie(c, dent2); zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); + err = ubifs_node_calc_hash(c, dent2, hash_dent2); + if (err) + goto out_release; p = (void *)dent2 + aligned_dlen2; - if (!twoparents) + if (!twoparents) { pack_inode(c, p, fst_dir, 1); - else { + err = ubifs_node_calc_hash(c, p, hash_p1); + if (err) + goto out_release; + } else { pack_inode(c, p, fst_dir, 0); + err = ubifs_node_calc_hash(c, p, hash_p1); + if (err) + goto out_release; p += ALIGN(plen, 8); pack_inode(c, p, snd_dir, 1); + err = ubifs_node_calc_hash(c, p, hash_p2); + if (err) + goto out_release; } err = write_head(c, BASEHD, dent1, len, &lnum, &offs, sync); @@ -1027,28 +1108,30 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, } release_head(c, BASEHD); + ubifs_add_auth_dirt(c, lnum); + dent_key_init(c, &key, snd_dir->i_ino, snd_nm); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, hash_dent1, snd_nm); if (err) goto out_ro; offs += aligned_dlen1; dent_key_init(c, &key, fst_dir->i_ino, fst_nm); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, fst_nm); if (err) goto out_ro; offs += aligned_dlen2; ino_key_init(c, &key, fst_dir->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs, plen); + err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_p1); if (err) goto out_ro; if (twoparents) { offs += ALIGN(plen, 8); ino_key_init(c, &key, snd_dir->i_ino); - err = ubifs_tnc_add(c, &key, 
lnum, offs, plen); + err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_p2); if (err) goto out_ro; } @@ -1101,6 +1184,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); struct ubifs_inode *uninitialized_var(new_ui); + u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; + u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; + u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_dent1[UBIFS_HASH_ARR_SZ]; + u8 hash_dent2[UBIFS_HASH_ARR_SZ]; ubifs_assert(c, ubifs_inode(old_dir)->data_len == 0); ubifs_assert(c, ubifs_inode(new_dir)->data_len == 0); @@ -1123,6 +1211,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); if (move) len += plen; + + len += ubifs_auth_node_sz(c); + dent = kzalloc(len, GFP_NOFS); if (!dent) return -ENOMEM; @@ -1143,6 +1234,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, set_dent_cookie(c, dent); zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen1, 0); + err = ubifs_node_calc_hash(c, dent, hash_dent1); + if (err) + goto out_release; dent2 = (void *)dent + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; @@ -1162,19 +1256,36 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, set_dent_cookie(c, dent2); zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); + err = ubifs_node_calc_hash(c, dent2, hash_dent2); + if (err) + goto out_release; p = (void *)dent2 + aligned_dlen2; if (new_inode) { pack_inode(c, p, new_inode, 0); + err = ubifs_node_calc_hash(c, p, hash_new_inode); + if (err) + goto out_release; + p += ALIGN(ilen, 8); } - if (!move) + if (!move) { pack_inode(c, p, old_dir, 1); - else { + err = ubifs_node_calc_hash(c, p, hash_old_dir); + if (err) + goto out_release; + } else { pack_inode(c, p, old_dir, 0); + err = ubifs_node_calc_hash(c, p, hash_old_dir); + if (err) + goto 
out_release; + p += ALIGN(plen, 8); pack_inode(c, p, new_dir, 1); + err = ubifs_node_calc_hash(c, p, hash_new_dir); + if (err) + goto out_release; } if (last_reference) { @@ -1200,15 +1311,17 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } release_head(c, BASEHD); + ubifs_add_auth_dirt(c, lnum); + dent_key_init(c, &key, new_dir->i_ino, new_nm); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, hash_dent1, new_nm); if (err) goto out_ro; offs += aligned_dlen1; if (whiteout) { dent_key_init(c, &key, old_dir->i_ino, old_nm); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; @@ -1227,21 +1340,21 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += aligned_dlen2; if (new_inode) { ino_key_init(c, &key, new_inode->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs, ilen); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash_new_inode); if (err) goto out_ro; offs += ALIGN(ilen, 8); } ino_key_init(c, &key, old_dir->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs, plen); + err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) goto out_ro; if (move) { offs += ALIGN(plen, 8); ino_key_init(c, &key, new_dir->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs, plen); + err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_new_dir); if (err) goto out_ro; } @@ -1360,6 +1473,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; + u8 hash_ino[UBIFS_HASH_ARR_SZ]; + u8 hash_dn[UBIFS_HASH_ARR_SZ]; dbg_jnl("ino %lu, size %lld -> %lld", (unsigned long)inum, old_size, new_size); @@ -1369,6 +1484,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + 
UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + + sz += ubifs_auth_node_sz(c); + ino = kmalloc(sz, GFP_NOFS); if (!ino) return -ENOMEM; @@ -1414,16 +1532,28 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, /* Must make reservation before allocating sequence numbers */ len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ; - if (dlen) + + if (ubifs_authenticated(c)) + len += ALIGN(dlen, 8) + ubifs_auth_node_sz(c); + else len += dlen; + err = make_reservation(c, BASEHD, len); if (err) goto out_free; pack_inode(c, ino, inode, 0); + err = ubifs_node_calc_hash(c, ino, hash_ino); + if (err) + goto out_release; + ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); - if (dlen) + if (dlen) { ubifs_prep_grp_node(c, dn, dlen, 1); + err = ubifs_node_calc_hash(c, dn, hash_dn); + if (err) + goto out_release; + } err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); if (err) @@ -1432,15 +1562,17 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum); release_head(c, BASEHD); + ubifs_add_auth_dirt(c, lnum); + if (dlen) { sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ; - err = ubifs_tnc_add(c, &key, lnum, sz, dlen); + err = ubifs_tnc_add(c, &key, lnum, sz, dlen, hash_dn); if (err) goto out_ro; } ino_key_init(c, &key, inum); - err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ); + err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ, hash_ino); if (err) goto out_ro; @@ -1495,12 +1627,13 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, const struct inode *inode, const struct fscrypt_name *nm) { - int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; + int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen, write_len; struct ubifs_dent_node *xent; struct ubifs_ino_node *ino; union ubifs_key xent_key, key1, key2; int sync = IS_DIRSYNC(host); struct ubifs_inode *host_ui = ubifs_inode(host); + u8 hash[UBIFS_HASH_ARR_SZ]; 
ubifs_assert(c, inode->i_nlink == 0); ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); @@ -1514,12 +1647,14 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); - xent = kzalloc(len, GFP_NOFS); + write_len = len + ubifs_auth_node_sz(c); + + xent = kzalloc(write_len, GFP_NOFS); if (!xent) return -ENOMEM; /* Make reservation before allocating sequence numbers */ - err = make_reservation(c, BASEHD, len); + err = make_reservation(c, BASEHD, write_len); if (err) { kfree(xent); return err; @@ -1540,11 +1675,16 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, pack_inode(c, ino, inode, 0); ino = (void *)ino + UBIFS_INO_NODE_SZ; pack_inode(c, ino, host, 1); + err = ubifs_node_calc_hash(c, ino, hash); + if (err) + goto out_release; - err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); + err = write_head(c, BASEHD, xent, write_len, &lnum, &xent_offs, sync); if (!sync && !err) ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino); release_head(c, BASEHD); + + ubifs_add_auth_dirt(c, lnum); kfree(xent); if (err) goto out_ro; @@ -1572,7 +1712,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, /* And update TNC with the new host inode position */ ino_key_init(c, &key1, host->i_ino); - err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen); + err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen, hash); if (err) goto out_ro; @@ -1583,6 +1723,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, mark_inode_clean(c, host_ui); return 0; +out_release: + kfree(xent); + release_head(c, BASEHD); out_ro: ubifs_ro_mode(c, err); finish_reservation(c); @@ -1610,6 +1753,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, struct ubifs_ino_node *ino; union ubifs_key key; int sync = IS_DIRSYNC(host); + u8 
hash_host[UBIFS_HASH_ARR_SZ]; + u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); ubifs_assert(c, host->i_nlink > 0); @@ -1621,6 +1766,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, aligned_len1 = ALIGN(len1, 8); aligned_len = aligned_len1 + ALIGN(len2, 8); + aligned_len += ubifs_auth_node_sz(c); + ino = kzalloc(aligned_len, GFP_NOFS); if (!ino) return -ENOMEM; @@ -1631,7 +1778,13 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, goto out_free; pack_inode(c, ino, host, 0); + err = ubifs_node_calc_hash(c, ino, hash_host); + if (err) + goto out_release; pack_inode(c, (void *)ino + aligned_len1, inode, 1); + err = ubifs_node_calc_hash(c, (void *)ino + aligned_len1, hash); + if (err) + goto out_release; err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); if (!sync && !err) { @@ -1644,13 +1797,15 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, if (err) goto out_ro; + ubifs_add_auth_dirt(c, lnum); + ino_key_init(c, &key, host->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs, len1); + err = ubifs_tnc_add(c, &key, lnum, offs, len1, hash_host); if (err) goto out_ro; ino_key_init(c, &key, inode->i_ino); - err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); + err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2, hash); if (err) goto out_ro; @@ -1662,6 +1817,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, kfree(ino); return 0; +out_release: + release_head(c, BASEHD); out_ro: ubifs_ro_mode(c, err); finish_reservation(c); diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 86b0828f5499..15fd854149bb 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -236,6 +236,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) bud->lnum = lnum; bud->start = offs; bud->jhead = jhead; + bud->log_hash = NULL; ref->ch.node_type = UBIFS_REF_NODE; ref->lnum = 
cpu_to_le32(bud->lnum); @@ -275,6 +276,14 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) if (err) goto out_unlock; + err = ubifs_shash_update(c, c->log_hash, ref, UBIFS_REF_NODE_SZ); + if (err) + goto out_unlock; + + err = ubifs_shash_copy_state(c, c->log_hash, c->jheads[jhead].log_hash); + if (err) + goto out_unlock; + c->lhead_offs += c->ref_node_alsz; ubifs_add_bud(c, bud); @@ -377,6 +386,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) cs->cmt_no = cpu_to_le64(c->cmt_no); ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); + err = ubifs_shash_init(c, c->log_hash); + if (err) + goto out; + + err = ubifs_shash_update(c, c->log_hash, cs, UBIFS_CS_NODE_SZ); + if (err < 0) + goto out; + /* * Note, we do not lock 'c->log_mutex' because this is the commit start * phase and we are exclusively using the log. And we do not lock @@ -402,6 +419,12 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0); len += UBIFS_REF_NODE_SZ; + + err = ubifs_shash_update(c, c->log_hash, ref, + UBIFS_REF_NODE_SZ); + if (err) + goto out; + ubifs_shash_copy_state(c, c->log_hash, c->jheads[i].log_hash); } ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len); @@ -516,6 +539,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) if (err) return err; list_del(&bud->list); + kfree(bud->log_hash); kfree(bud); } mutex_lock(&c->log_mutex); diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 31393370e334..d1d5e96350dd 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -604,11 +604,12 @@ static int calc_pnode_num_from_parent(const struct ubifs_info *c, * @lpt_first: LEB number of first LPT LEB * @lpt_lebs: number of LEBs for LPT is passed and returned here * @big_lpt: use big LPT model is passed and returned here + * @hash: hash of the LPT is returned here * * This function returns %0 on success and a negative error code on failure. 
*/ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, - int *lpt_lebs, int *big_lpt) + int *lpt_lebs, int *big_lpt, u8 *hash) { int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; int blnum, boffs, bsz, bcnt; @@ -617,6 +618,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, void *buf = NULL, *p; struct ubifs_lpt_lprops *ltab = NULL; int *lsave = NULL; + struct shash_desc *desc; err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); if (err) @@ -630,6 +632,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Needed by 'ubifs_pack_lsave()' */ c->main_first = c->leb_cnt - *main_lebs; + desc = ubifs_hash_get_desc(c); + if (IS_ERR(desc)) + return PTR_ERR(desc); + lsave = kmalloc_array(c->lsave_cnt, sizeof(int), GFP_KERNEL); pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); @@ -677,6 +683,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Add first pnode */ ubifs_pack_pnode(c, p, pnode); + err = ubifs_shash_update(c, desc, p, c->pnode_sz); + if (err) + goto out; + p += c->pnode_sz; len = c->pnode_sz; pnode->num += 1; @@ -711,6 +721,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, len = 0; } ubifs_pack_pnode(c, p, pnode); + err = ubifs_shash_update(c, desc, p, c->pnode_sz); + if (err) + goto out; + p += c->pnode_sz; len += c->pnode_sz; /* @@ -830,6 +844,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, if (err) goto out; + err = ubifs_shash_final(c, desc, hash); + if (err) + goto out; + c->nhead_lnum = lnum; c->nhead_offs = ALIGN(len, c->min_io_size); @@ -853,6 +871,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); out: c->ltab = NULL; + kfree(desc); kfree(lsave); vfree(ltab); vfree(buf); @@ -1439,26 +1458,25 
@@ struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, } /** - * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * ubifs_pnode_lookup - lookup a pnode in the LPT. * @c: UBIFS file-system description object - * @lnum: LEB number to lookup + * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT) * - * This function returns a pointer to the LEB properties on success or a - * negative error code on failure. + * This function returns a pointer to the pnode on success or a negative + * error code on failure. */ -struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +struct ubifs_pnode *ubifs_pnode_lookup(struct ubifs_info *c, int i) { - int err, i, h, iip, shft; + int err, h, iip, shft; struct ubifs_nnode *nnode; - struct ubifs_pnode *pnode; if (!c->nroot) { err = ubifs_read_nnode(c, NULL, 0); if (err) return ERR_PTR(err); } + i <<= UBIFS_LPT_FANOUT_SHIFT; nnode = c->nroot; - i = lnum - c->main_first; shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; for (h = 1; h < c->lpt_hght; h++) { iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); @@ -1468,7 +1486,24 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - pnode = ubifs_get_pnode(c, nnode, iip); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. 
+ */ +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +{ + int i, iip; + struct ubifs_pnode *pnode; + + i = lnum - c->main_first; + pnode = ubifs_pnode_lookup(c, i >> UBIFS_LPT_FANOUT_SHIFT); if (IS_ERR(pnode)) return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); @@ -1620,6 +1655,131 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) } /** + * ubifs_lpt_calc_hash - Calculate hash of the LPT pnodes + * @c: UBIFS file-system description object + * @hash: the returned hash of the LPT pnodes + * + * This function iterates over the LPT pnodes and creates a hash over them. + * Returns 0 for success or a negative error code otherwise. + */ +int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash) +{ + struct ubifs_nnode *nnode, *nn; + struct ubifs_cnode *cnode; + struct shash_desc *desc; + int iip = 0, i; + int bufsiz = max_t(int, c->nnode_sz, c->pnode_sz); + void *buf; + int err; + + if (!ubifs_authenticated(c)) + return 0; + + desc = ubifs_hash_get_desc(c); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + buf = kmalloc(bufsiz, GFP_NOFS); + if (!buf) { + err = -ENOMEM; + goto out; + } + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + + cnode = (struct ubifs_cnode *)c->nroot; + + while (cnode) { + nnode = cnode->parent; + nn = (struct ubifs_nnode *)cnode; + if (cnode->level > 1) { + while (iip < UBIFS_LPT_FANOUT) { + if (nn->nbranch[iip].lnum == 0) { + /* Go right */ + iip++; + continue; + } + + nnode = ubifs_get_nnode(c, nn, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + + /* Go down */ + iip = 0; + cnode = (struct ubifs_cnode *)nnode; + break; + } + if (iip < UBIFS_LPT_FANOUT) + continue; + } else { + struct ubifs_pnode *pnode; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nn->nbranch[i].lnum == 0) + continue; + pnode = ubifs_get_pnode(c, nn, i); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + + ubifs_pack_pnode(c, buf, pnode); + 
err = ubifs_shash_update(c, desc, buf, + c->pnode_sz); + if (err) + goto out; + } + } + /* Go up and to the right */ + iip = cnode->iip + 1; + cnode = (struct ubifs_cnode *)nnode; + } + + err = ubifs_shash_final(c, desc, hash); +out: + kfree(desc); + kfree(buf); + + return err; +} + +/** + * lpt_check_hash - check the hash of the LPT. + * @c: UBIFS file-system description object + * + * This function calculates a hash over all pnodes in the LPT and compares it with + * the hash stored in the master node. Returns %0 on success and a negative error + * code on failure. + */ +static int lpt_check_hash(struct ubifs_info *c) +{ + int err; + u8 hash[UBIFS_HASH_ARR_SZ]; + + if (!ubifs_authenticated(c)) + return 0; + + err = ubifs_lpt_calc_hash(c, hash); + if (err) + return err; + + if (ubifs_check_hash(c, c->mst_node->hash_lpt, hash)) { + err = -EPERM; + ubifs_err(c, "Failed to authenticate LPT"); + } else { + err = 0; + } + + return err; +} + +/** * lpt_init_rd - initialize the LPT for reading. * @c: UBIFS file-system description object * @@ -1660,6 +1820,10 @@ static int lpt_init_rd(struct ubifs_info *c) if (err) return err; + err = lpt_check_hash(c); + if (err) + return err; + dbg_lp("space_bits %d", c->space_bits); dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 7ce30994bbba..1f88caffdf2a 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -619,38 +619,6 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, } /** - * pnode_lookup - lookup a pnode in the LPT. - * @c: UBIFS file-system description object - * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT)) - * - * This function returns a pointer to the pnode on success or a negative - * error code on failure. 
- */ -static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) -{ - int err, h, iip, shft; - struct ubifs_nnode *nnode; - - if (!c->nroot) { - err = ubifs_read_nnode(c, NULL, 0); - if (err) - return ERR_PTR(err); - } - i <<= UBIFS_LPT_FANOUT_SHIFT; - nnode = c->nroot; - shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; - for (h = 1; h < c->lpt_hght; h++) { - iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; - nnode = ubifs_get_nnode(c, nnode, iip); - if (IS_ERR(nnode)) - return ERR_CAST(nnode); - } - iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - return ubifs_get_pnode(c, nnode, iip); -} - -/** * add_pnode_dirt - add dirty space to LPT LEB properties. * @c: UBIFS file-system description object * @pnode: pnode for which to add dirt @@ -702,7 +670,7 @@ static int make_tree_dirty(struct ubifs_info *c) { struct ubifs_pnode *pnode; - pnode = pnode_lookup(c, 0); + pnode = ubifs_pnode_lookup(c, 0); if (IS_ERR(pnode)) return PTR_ERR(pnode); @@ -956,7 +924,7 @@ static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, struct ubifs_pnode *pnode; struct ubifs_nbranch *branch; - pnode = pnode_lookup(c, node_num); + pnode = ubifs_pnode_lookup(c, node_num); if (IS_ERR(pnode)) return PTR_ERR(pnode); branch = &pnode->parent->nbranch[pnode->iip]; @@ -1279,6 +1247,10 @@ int ubifs_lpt_start_commit(struct ubifs_info *c) if (err) goto out; + err = ubifs_lpt_calc_hash(c, c->mst_node->hash_lpt); + if (err) + goto out; + /* Copy the LPT's own lprops for end commit to write */ memcpy(c->ltab_cmt, c->ltab, sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); @@ -1558,7 +1530,7 @@ static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) struct ubifs_nbranch *branch; cond_resched(); - pnode = pnode_lookup(c, i); + pnode = ubifs_pnode_lookup(c, i); if (IS_ERR(pnode)) return PTR_ERR(pnode); branch = &pnode->parent->nbranch[pnode->iip]; @@ -1710,7 +1682,7 @@ int dbg_check_ltab(struct ubifs_info *c) for (i = 0; i < cnt; i++) { struct 
ubifs_pnode *pnode; - pnode = pnode_lookup(c, i); + pnode = ubifs_pnode_lookup(c, i); if (IS_ERR(pnode)) return PTR_ERR(pnode); cond_resched(); diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 9df4a41bba52..5ea51bbd14c7 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -25,6 +25,42 @@ #include "ubifs.h" /** + * ubifs_compare_master_node - compare two UBIFS master nodes + * @c: UBIFS file-system description object + * @m1: the first node + * @m2: the second node + * + * This function compares two UBIFS master nodes. Returns 0 if they are equal + * and nonzero if not. + */ +int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2) +{ + int ret; + int behind; + int hmac_offs = offsetof(struct ubifs_mst_node, hmac); + + /* + * Do not compare the common node header since the sequence number and + * hence the CRC are different. + */ + ret = memcmp(m1 + UBIFS_CH_SZ, m2 + UBIFS_CH_SZ, + hmac_offs - UBIFS_CH_SZ); + if (ret) + return ret; + + /* + * Do not compare the embedded HMAC aswell which also must be different + * due to the different common node header. + */ + behind = hmac_offs + UBIFS_MAX_HMAC_LEN; + + if (UBIFS_MST_NODE_SZ > behind) + return memcmp(m1 + behind, m2 + behind, UBIFS_MST_NODE_SZ - behind); + + return 0; +} + +/** * scan_for_master - search the valid master node. 
* @c: UBIFS file-system description object * @@ -37,7 +73,7 @@ static int scan_for_master(struct ubifs_info *c) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - int lnum, offs = 0, nodes_cnt; + int lnum, offs = 0, nodes_cnt, err; lnum = UBIFS_MST_LNUM; @@ -69,12 +105,23 @@ static int scan_for_master(struct ubifs_info *c) goto out_dump; if (snod->offs != offs) goto out; - if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, - (void *)snod->node + UBIFS_CH_SZ, - UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + if (ubifs_compare_master_node(c, c->mst_node, snod->node)) goto out; + c->mst_offs = offs; ubifs_scan_destroy(sleb); + + if (!ubifs_authenticated(c)) + return 0; + + err = ubifs_node_verify_hmac(c, c->mst_node, + sizeof(struct ubifs_mst_node), + offsetof(struct ubifs_mst_node, hmac)); + if (err) { + ubifs_err(c, "Failed to verify master node HMAC"); + return -EPERM; + } + return 0; out: @@ -305,6 +352,8 @@ int ubifs_read_master(struct ubifs_info *c) c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + ubifs_copy_hash(c, c->mst_node->hash_root_idx, c->zroot.hash); + c->calc_idx_sz = c->bi.old_idx_sz; if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) @@ -378,7 +427,9 @@ int ubifs_write_master(struct ubifs_info *c) c->mst_offs = offs; c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); - err = ubifs_write_node(c, c->mst_node, len, lnum, offs); + ubifs_copy_hash(c, c->zroot.hash, c->mst_node->hash_root_idx); + err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs, + offsetof(struct ubifs_mst_node, hmac)); if (err) return err; @@ -389,7 +440,8 @@ int ubifs_write_master(struct ubifs_info *c) if (err) return err; } - err = ubifs_write_node(c, c->mst_node, len, lnum, offs); + err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs, + offsetof(struct ubifs_mst_node, hmac)); return err; } diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 21d35d7dd975..6f87237fdbf4 100644 --- 
a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -197,7 +197,8 @@ static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) */ static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) { - return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; + return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len + c->hash_len) + * child_cnt; } /** @@ -212,7 +213,7 @@ struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, int bnum) { return (struct ubifs_branch *)((void *)idx->branches + - (UBIFS_BRANCH_SZ + c->key_len) * bnum); + (UBIFS_BRANCH_SZ + c->key_len + c->hash_len) * bnum); } /** diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 984e30e83c0b..8526b7ec4707 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -212,7 +212,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c, save_flags = mst->flags; mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); - ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubifs_prepare_node_hmac(c, mst, UBIFS_MST_NODE_SZ, + offsetof(struct ubifs_mst_node, hmac), 1); + if (err) + goto out; err = ubifs_leb_change(c, lnum, mst, sz); if (err) goto out; @@ -264,9 +267,7 @@ int ubifs_recover_master_node(struct ubifs_info *c) offs2 = (void *)mst2 - buf2; if (offs1 == offs2) { /* Same offset, so must be the same */ - if (memcmp((void *)mst1 + UBIFS_CH_SZ, - (void *)mst2 + UBIFS_CH_SZ, - UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + if (ubifs_compare_master_node(c, mst1, mst2)) goto out_err; mst = mst1; } else if (offs2 + sz == offs1) { @@ -1462,15 +1463,81 @@ out: } /** + * inode_fix_size - fix inode size + * @c: UBIFS file-system description object + * @e: inode size information for recovery + */ +static int inode_fix_size(struct ubifs_info *c, struct size_entry *e) +{ + struct inode *inode; + struct ubifs_inode *ui; + int err; + + if (c->ro_mount) + ubifs_assert(c, !e->inode); + + if (e->inode) { + /* Remounting rw, pick up inode we stored earlier */ + inode = e->inode; + } else { 
+ inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (inode->i_size >= e->d_size) { + /* + * The original inode in the index already has a size + * big enough, nothing to do + */ + iput(inode); + return 0; + } + + dbg_rcvry("ino %lu size %lld -> %lld", + (unsigned long)e->inum, + inode->i_size, e->d_size); + + ui = ubifs_inode(inode); + + inode->i_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; + + e->inode = inode; + } + + /* + * In readonly mode just keep the inode pinned in memory until we go + * readwrite. In readwrite mode write the inode to the journal with the + * fixed size. + */ + if (c->ro_mount) + return 0; + + err = ubifs_jnl_write_inode(c, inode); + + iput(inode); + + if (err) + return err; + + rb_erase(&e->rb, &c->size_tree); + kfree(e); + + return 0; +} + +/** * ubifs_recover_size - recover inode size. * @c: UBIFS file-system description object + * @in_place: If true, do a in-place size fixup * * This function attempts to fix inode size discrepancies identified by the * 'ubifs_recover_size_accum()' function. * * This functions returns %0 on success and a negative error code on failure. 
*/ -int ubifs_recover_size(struct ubifs_info *c) +int ubifs_recover_size(struct ubifs_info *c, bool in_place) { struct rb_node *this = rb_first(&c->size_tree); @@ -1479,6 +1546,9 @@ int ubifs_recover_size(struct ubifs_info *c) int err; e = rb_entry(this, struct size_entry, rb); + + this = rb_next(this); + if (!e->exists) { union ubifs_key key; @@ -1502,40 +1572,26 @@ int ubifs_recover_size(struct ubifs_info *c) } if (e->exists && e->i_size < e->d_size) { - if (c->ro_mount) { - /* Fix the inode size and pin it in memory */ - struct inode *inode; - struct ubifs_inode *ui; - - ubifs_assert(c, !e->inode); - - inode = ubifs_iget(c->vfs_sb, e->inum); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - ui = ubifs_inode(inode); - if (inode->i_size < e->d_size) { - dbg_rcvry("ino %lu size %lld -> %lld", - (unsigned long)e->inum, - inode->i_size, e->d_size); - inode->i_size = e->d_size; - ui->ui_size = e->d_size; - ui->synced_i_size = e->d_size; - e->inode = inode; - this = rb_next(this); - continue; - } - iput(inode); - } else { - /* Fix the size in place */ + ubifs_assert(c, !(c->ro_mount && in_place)); + + /* + * We found data that is outside the found inode size, + * fixup the inode size + */ + + if (in_place) { err = fix_size_in_place(c, e); if (err) return err; iput(e->inode); + } else { + err = inode_fix_size(c, e); + if (err) + return err; + continue; } } - this = rb_next(this); rb_erase(&e->rb, &c->size_tree); kfree(e); } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 4844538eb926..75f961c4c044 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -34,6 +34,8 @@ #include "ubifs.h" #include <linux/list_sort.h> +#include <crypto/hash.h> +#include <crypto/algapi.h> /** * struct replay_entry - replay list entry. 
@@ -56,6 +58,7 @@ struct replay_entry { int lnum; int offs; int len; + u8 hash[UBIFS_HASH_ARR_SZ]; unsigned int deletion:1; unsigned long long sqnum; struct list_head list; @@ -228,7 +231,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); else err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, - r->len, &r->nm); + r->len, r->hash, &r->nm); } else { if (r->deletion) switch (key_type(c, &r->key)) { @@ -248,7 +251,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) } else err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs, - r->len); + r->len, r->hash); if (err) return err; @@ -352,9 +355,9 @@ static void destroy_replay_list(struct ubifs_info *c) * in case of success and a negative error code in case of failure. */ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, - union ubifs_key *key, unsigned long long sqnum, - int deletion, int *used, loff_t old_size, - loff_t new_size) + const u8 *hash, union ubifs_key *key, + unsigned long long sqnum, int deletion, int *used, + loff_t old_size, loff_t new_size) { struct replay_entry *r; @@ -372,6 +375,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + ubifs_copy_hash(c, hash, r->hash); r->deletion = !!deletion; r->sqnum = sqnum; key_copy(c, key, &r->key); @@ -400,8 +404,9 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, * negative error code in case of failure. 
*/ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, - union ubifs_key *key, const char *name, int nlen, - unsigned long long sqnum, int deletion, int *used) + const u8 *hash, union ubifs_key *key, + const char *name, int nlen, unsigned long long sqnum, + int deletion, int *used) { struct replay_entry *r; char *nbuf; @@ -425,6 +430,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + ubifs_copy_hash(c, hash, r->hash); r->deletion = !!deletion; r->sqnum = sqnum; key_copy(c, key, &r->key); @@ -528,6 +534,105 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) } /** + * authenticate_sleb - authenticate one scan LEB + * @c: UBIFS file-system description object + * @sleb: the scan LEB to authenticate + * @log_hash: + * @is_last: if true, this is is the last LEB + * + * This function iterates over the buds of a single LEB authenticating all buds + * with the authentication nodes on this LEB. Authentication nodes are written + * after some buds and contain a HMAC covering the authentication node itself + * and the buds between the last authentication node and the current + * authentication node. It can happen that the last buds cannot be authenticated + * because a powercut happened when some nodes were written but not the + * corresponding authentication node. This function returns the number of nodes + * that could be authenticated or a negative error code. 
+ */ +static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct shash_desc *log_hash, int is_last) +{ + int n_not_auth = 0; + struct ubifs_scan_node *snod; + int n_nodes = 0; + int err; + u8 *hash, *hmac; + + if (!ubifs_authenticated(c)) + return sleb->nodes_cnt; + + hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS); + hmac = kmalloc(c->hmac_desc_len, GFP_NOFS); + if (!hash || !hmac) { + err = -ENOMEM; + goto out; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + + n_nodes++; + + if (snod->type == UBIFS_AUTH_NODE) { + struct ubifs_auth_node *auth = snod->node; + SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); + SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); + + hash_desc->tfm = c->hash_tfm; + hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ubifs_shash_copy_state(c, log_hash, hash_desc); + err = crypto_shash_final(hash_desc, hash); + if (err) + goto out; + + hmac_desc->tfm = c->hmac_tfm; + hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + err = crypto_shash_digest(hmac_desc, hash, c->hash_len, + hmac); + if (err) + goto out; + + err = ubifs_check_hmac(c, auth->hmac, hmac); + if (err) { + err = -EPERM; + goto out; + } + n_not_auth = 0; + } else { + err = crypto_shash_update(log_hash, snod->node, + snod->len); + if (err) + goto out; + n_not_auth++; + } + } + + /* + * A powercut can happen when some nodes were written, but not yet + * the corresponding authentication node. This may only happen on + * the last bud though. + */ + if (n_not_auth) { + if (is_last) { + dbg_mnt("%d unauthenticated nodes found on LEB %d, Ignoring them", + n_not_auth, sleb->lnum); + err = 0; + } else { + dbg_mnt("%d unauthenticated nodes found on non-last LEB %d", + n_not_auth, sleb->lnum); + err = -EPERM; + } + } else { + err = 0; + } +out: + kfree(hash); + kfree(hmac); + + return err ? err : n_nodes - n_not_auth; +} + +/** * replay_bud - replay a bud logical eraseblock. 
* @c: UBIFS file-system description object * @b: bud entry which describes the bud @@ -540,6 +645,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) { int is_last = is_last_bud(c, b->bud); int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; + int n_nodes, n = 0; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; @@ -559,6 +665,15 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) if (IS_ERR(sleb)) return PTR_ERR(sleb); + n_nodes = authenticate_sleb(c, sleb, b->bud->log_hash, is_last); + if (n_nodes < 0) { + err = n_nodes; + goto out; + } + + ubifs_shash_copy_state(c, b->bud->log_hash, + c->jheads[b->bud->jhead].log_hash); + /* * The bud does not have to start from offset zero - the beginning of * the 'lnum' LEB may contain previously committed data. One of the @@ -582,6 +697,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) */ list_for_each_entry(snod, &sleb->nodes, list) { + u8 hash[UBIFS_HASH_ARR_SZ]; int deletion = 0; cond_resched(); @@ -591,6 +707,8 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) goto out_dump; } + ubifs_node_calc_hash(c, snod->node, hash); + if (snod->sqnum > c->max_sqnum) c->max_sqnum = snod->sqnum; @@ -602,7 +720,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) if (le32_to_cpu(ino->nlink) == 0) deletion = 1; - err = insert_node(c, lnum, snod->offs, snod->len, + err = insert_node(c, lnum, snod->offs, snod->len, hash, &snod->key, snod->sqnum, deletion, &used, 0, new_size); break; @@ -614,7 +732,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) key_block(c, &snod->key) * UBIFS_BLOCK_SIZE; - err = insert_node(c, lnum, snod->offs, snod->len, + err = insert_node(c, lnum, snod->offs, snod->len, hash, &snod->key, snod->sqnum, deletion, &used, 0, new_size); break; @@ -628,7 +746,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) if (err) goto out_dump; - err = insert_dent(c, lnum, snod->offs, 
snod->len, + err = insert_dent(c, lnum, snod->offs, snod->len, hash, &snod->key, dent->name, le16_to_cpu(dent->nlen), snod->sqnum, !le64_to_cpu(dent->inum), &used); @@ -654,11 +772,13 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) * functions which expect nodes to have keys. */ trun_key_init(c, &key, le32_to_cpu(trun->inum)); - err = insert_node(c, lnum, snod->offs, snod->len, + err = insert_node(c, lnum, snod->offs, snod->len, hash, &key, snod->sqnum, 1, &used, old_size, new_size); break; } + case UBIFS_AUTH_NODE: + break; default: ubifs_err(c, "unexpected node type %d in bud LEB %d:%d", snod->type, lnum, snod->offs); @@ -667,6 +787,10 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) } if (err) goto out; + + n++; + if (n == n_nodes) + break; } ubifs_assert(c, ubifs_search_bud(c, lnum)); @@ -745,6 +869,7 @@ static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, { struct ubifs_bud *bud; struct bud_entry *b; + int err; dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); @@ -754,13 +879,21 @@ static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); if (!b) { - kfree(bud); - return -ENOMEM; + err = -ENOMEM; + goto out; } bud->lnum = lnum; bud->start = offs; bud->jhead = jhead; + bud->log_hash = ubifs_hash_get_desc(c); + if (IS_ERR(bud->log_hash)) { + err = PTR_ERR(bud->log_hash); + goto out; + } + + ubifs_shash_copy_state(c, c->log_hash, bud->log_hash); + ubifs_add_bud(c, bud); b->bud = bud; @@ -768,6 +901,11 @@ static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, list_add_tail(&b->list, &c->replay_buds); return 0; +out: + kfree(bud); + kfree(b); + + return err; } /** @@ -873,6 +1011,14 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) c->cs_sqnum = le64_to_cpu(node->ch.sqnum); dbg_mnt("commit start sqnum %llu", c->cs_sqnum); + + err = ubifs_shash_init(c, 
c->log_hash); + if (err) + goto out; + + err = ubifs_shash_update(c, c->log_hash, node, UBIFS_CS_NODE_SZ); + if (err < 0) + goto out; } if (snod->sqnum < c->cs_sqnum) { @@ -920,6 +1066,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) if (err) goto out_dump; + err = ubifs_shash_update(c, c->log_hash, ref, + UBIFS_REF_NODE_SZ); + if (err) + goto out; + err = add_replay_bud(c, le32_to_cpu(ref->lnum), le32_to_cpu(ref->offs), le32_to_cpu(ref->jhead), diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index bf17f58908ff..75a69dd26d6e 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -82,10 +82,13 @@ static int create_default_filesystem(struct ubifs_info *c) int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; int min_leb_cnt = UBIFS_MIN_LEB_CNT; + int idx_node_size; long long tmp64, main_bytes; __le64 tmp_le64; __le32 tmp_le32; struct timespec64 ts; + u8 hash[UBIFS_HASH_ARR_SZ]; + u8 hash_lpt[UBIFS_HASH_ARR_SZ]; /* Some functions called from here depend on the @c->key_len filed */ c->key_len = UBIFS_SK_LEN; @@ -147,7 +150,7 @@ static int create_default_filesystem(struct ubifs_info *c) c->lsave_cnt = DEFAULT_LSAVE_CNT; c->max_leb_cnt = c->leb_cnt; err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, - &big_lpt); + &big_lpt, hash_lpt); if (err) return err; @@ -156,17 +159,35 @@ static int create_default_filesystem(struct ubifs_info *c) main_first = c->leb_cnt - main_lebs; + sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL); + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + idx_node_size = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL); + cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL); + + if (!sup || !mst || !idx || !ino || !cs) { + err = -ENOMEM; + goto out; + } + /* Create default superblock */ - tmp = 
ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); - sup = kzalloc(tmp, GFP_KERNEL); - if (!sup) - return -ENOMEM; tmp64 = (long long)max_buds * c->leb_size; if (big_lpt) sup_flags |= UBIFS_FLG_BIGLPT; sup_flags |= UBIFS_FLG_DOUBLE_HASH; + if (ubifs_authenticated(c)) { + sup_flags |= UBIFS_FLG_AUTHENTICATION; + sup->hash_algo = cpu_to_le16(c->auth_hash_algo); + err = ubifs_hmac_wkm(c, sup->hmac_wkm); + if (err) + goto out; + } else { + sup->hash_algo = 0xffff; + } + sup->ch.node_type = UBIFS_SB_NODE; sup->key_hash = UBIFS_KEY_HASH_R5; sup->flags = cpu_to_le32(sup_flags); @@ -197,17 +218,9 @@ static int create_default_filesystem(struct ubifs_info *c) sup->rp_size = cpu_to_le64(tmp64); sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); - err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0); - kfree(sup); - if (err) - return err; - dbg_gen("default superblock created at LEB 0:0"); /* Create default master node */ - mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); - if (!mst) - return -ENOMEM; mst->ch.node_type = UBIFS_MST_NODE; mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); @@ -233,6 +246,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->empty_lebs = cpu_to_le32(main_lebs - 2); mst->idx_lebs = cpu_to_le32(1); mst->leb_cnt = cpu_to_le32(c->leb_cnt); + ubifs_copy_hash(c, hash_lpt, mst->hash_lpt); /* Calculate lprops statistics */ tmp64 = main_bytes; @@ -253,24 +267,9 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0); - if (err) { - kfree(mst); - return err; - } - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, - 0); - kfree(mst); - if (err) - return err; - dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); /* Create the root indexing node */ - tmp = ubifs_idx_node_sz(c, 1); - idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); - if (!idx) - return -ENOMEM; c->key_fmt = 
UBIFS_SIMPLE_KEY_FMT; c->key_hash = key_r5_hash; @@ -282,19 +281,11 @@ static int create_default_filesystem(struct ubifs_info *c) key_write_idx(c, &key, &br->key); br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0); - kfree(idx); - if (err) - return err; dbg_gen("default root indexing node created LEB %d:0", main_first + DEFAULT_IDX_LEB); /* Create default root inode */ - tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); - ino = kzalloc(tmp, GFP_KERNEL); - if (!ino) - return -ENOMEM; ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); ino->ch.node_type = UBIFS_INO_NODE; @@ -317,12 +308,6 @@ static int create_default_filesystem(struct ubifs_info *c) /* Set compression enabled by default */ ino->flags = cpu_to_le32(UBIFS_COMPR_FL); - err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, - main_first + DEFAULT_DATA_LEB, 0); - kfree(ino); - if (err) - return err; - dbg_gen("root inode created at LEB %d:0", main_first + DEFAULT_DATA_LEB); @@ -331,19 +316,54 @@ static int create_default_filesystem(struct ubifs_info *c) * always the case during normal file-system operation. Write a fake * commit start node to the log. 
*/ - tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); - cs = kzalloc(tmp, GFP_KERNEL); - if (!cs) - return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; + + err = ubifs_write_node_hmac(c, sup, UBIFS_SB_NODE_SZ, 0, 0, + offsetof(struct ubifs_sb_node, hmac)); + if (err) + goto out; + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0); + if (err) + goto out; + + ubifs_node_calc_hash(c, ino, hash); + ubifs_copy_hash(c, hash, ubifs_branch_hash(c, br)); + + err = ubifs_write_node(c, idx, idx_node_size, main_first + DEFAULT_IDX_LEB, 0); + if (err) + goto out; + + ubifs_node_calc_hash(c, idx, hash); + ubifs_copy_hash(c, hash, mst->hash_root_idx); + + err = ubifs_write_node_hmac(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, + offsetof(struct ubifs_mst_node, hmac)); + if (err) + goto out; + + err = ubifs_write_node_hmac(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, + 0, offsetof(struct ubifs_mst_node, hmac)); + if (err) + goto out; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); - kfree(cs); if (err) - return err; + goto out; ubifs_msg(c, "default file-system created"); - return 0; + + err = 0; +out: + kfree(sup); + kfree(mst); + kfree(idx); + kfree(ino); + kfree(cs); + + return err; } /** @@ -498,7 +518,7 @@ failed: * code. Note, the user of this function is responsible of kfree()'ing the * returned superblock buffer. 
*/ -struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +static struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) { struct ubifs_sb_node *sup; int err; @@ -517,6 +537,65 @@ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) return sup; } +static int authenticate_sb_node(struct ubifs_info *c, + const struct ubifs_sb_node *sup) +{ + unsigned int sup_flags = le32_to_cpu(sup->flags); + u8 hmac_wkm[UBIFS_HMAC_ARR_SZ]; + int authenticated = !!(sup_flags & UBIFS_FLG_AUTHENTICATION); + int hash_algo; + int err; + + if (c->authenticated && !authenticated) { + ubifs_err(c, "authenticated FS forced, but found FS without authentication"); + return -EINVAL; + } + + if (!c->authenticated && authenticated) { + ubifs_err(c, "authenticated FS found, but no key given"); + return -EINVAL; + } + + ubifs_msg(c, "Mounting in %sauthenticated mode", + c->authenticated ? "" : "un"); + + if (!c->authenticated) + return 0; + + if (!IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) + return -EOPNOTSUPP; + + hash_algo = le16_to_cpu(sup->hash_algo); + if (hash_algo >= HASH_ALGO__LAST) { + ubifs_err(c, "superblock uses unknown hash algo %d", + hash_algo); + return -EINVAL; + } + + if (strcmp(hash_algo_name[hash_algo], c->auth_hash_name)) { + ubifs_err(c, "This filesystem uses %s for hashing," + " but %s is specified", hash_algo_name[hash_algo], + c->auth_hash_name); + return -EINVAL; + } + + err = ubifs_hmac_wkm(c, hmac_wkm); + if (err) + return err; + + if (ubifs_check_hmac(c, hmac_wkm, sup->hmac_wkm)) { + ubifs_err(c, "provided key does not fit"); + return -ENOKEY; + } + + err = ubifs_node_verify_hmac(c, sup, sizeof(*sup), + offsetof(struct ubifs_sb_node, hmac)); + if (err) + ubifs_err(c, "Failed to authenticate superblock: %d", err); + + return err; +} + /** * ubifs_write_sb_node - write superblock node. 
* @c: UBIFS file-system description object @@ -527,8 +606,13 @@ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) { int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + int err; + + err = ubifs_prepare_node_hmac(c, sup, UBIFS_SB_NODE_SZ, + offsetof(struct ubifs_sb_node, hmac), 1); + if (err) + return err; - ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len); } @@ -555,6 +639,8 @@ int ubifs_read_superblock(struct ubifs_info *c) if (IS_ERR(sup)) return PTR_ERR(sup); + c->sup_node = sup; + c->fmt_version = le32_to_cpu(sup->fmt_version); c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); @@ -603,7 +689,7 @@ int ubifs_read_superblock(struct ubifs_info *c) c->key_hash = key_test_hash; c->key_hash_type = UBIFS_KEY_HASH_TEST; break; - }; + } c->key_fmt = sup->key_fmt; @@ -640,6 +726,10 @@ int ubifs_read_superblock(struct ubifs_info *c) c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH); c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION); + err = authenticate_sb_node(c, sup); + if (err) + goto out; + if ((sup_flags & ~UBIFS_FLG_MASK) != 0) { ubifs_err(c, "Unknown feature flags found: %#x", sup_flags & ~UBIFS_FLG_MASK); @@ -686,7 +776,6 @@ int ubifs_read_superblock(struct ubifs_info *c) err = validate_sb(c, sup); out: - kfree(sup); return err; } @@ -815,7 +904,7 @@ out: int ubifs_fixup_free_space(struct ubifs_info *c) { int err; - struct ubifs_sb_node *sup; + struct ubifs_sb_node *sup = c->sup_node; ubifs_assert(c, c->space_fixup); ubifs_assert(c, !c->ro_mount); @@ -826,16 +915,11 @@ int ubifs_fixup_free_space(struct ubifs_info *c) if (err) return err; - sup = ubifs_read_sb_node(c); - if (IS_ERR(sup)) - return PTR_ERR(sup); - /* Free-space fixup is no longer required */ c->space_fixup = 0; sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); err = ubifs_write_sb_node(c, sup); - kfree(sup); if (err) return err; @@ -846,7 +930,7 @@ 
int ubifs_fixup_free_space(struct ubifs_info *c) int ubifs_enable_encryption(struct ubifs_info *c) { int err; - struct ubifs_sb_node *sup; + struct ubifs_sb_node *sup = c->sup_node; if (c->encrypted) return 0; @@ -859,16 +943,11 @@ int ubifs_enable_encryption(struct ubifs_info *c) return -EINVAL; } - sup = ubifs_read_sb_node(c); - if (IS_ERR(sup)) - return PTR_ERR(sup); - sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION); err = ubifs_write_sb_node(c, sup); if (!err) c->encrypted = 1; - kfree(sup); return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index fec62e9dfbe6..1fac1133dadd 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -579,6 +579,9 @@ static int init_constants_early(struct ubifs_info *c) c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; + c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ; + c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ + + UBIFS_MAX_HMAC_LEN; c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; @@ -816,6 +819,9 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; c->jheads[i].grouped = 1; + c->jheads[i].log_hash = ubifs_hash_get_desc(c); + if (IS_ERR(c->jheads[i].log_hash)) + goto out; } /* @@ -826,6 +832,12 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[GCHD].grouped = 0; return 0; + +out: + while (i--) + kfree(c->jheads[i].log_hash); + + return err; } /** @@ -840,6 +852,7 @@ static void free_wbufs(struct ubifs_info *c) for (i = 0; i < c->jhead_cnt; i++) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); + kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; @@ -924,6 +937,8 @@ static int check_volume_empty(struct ubifs_info *c) * Opt_no_chk_data_crc: do not check CRCs when reading data nodes * Opt_override_compr: override 
default compressor * Opt_assert: set ubifs_assert() action + * Opt_auth_key: The key name used for authentication + * Opt_auth_hash_name: The hash type used for authentication * Opt_err: just end of array marker */ enum { @@ -935,6 +950,8 @@ enum { Opt_no_chk_data_crc, Opt_override_compr, Opt_assert, + Opt_auth_key, + Opt_auth_hash_name, Opt_ignore, Opt_err, }; @@ -947,6 +964,8 @@ static const match_table_t tokens = { {Opt_chk_data_crc, "chk_data_crc"}, {Opt_no_chk_data_crc, "no_chk_data_crc"}, {Opt_override_compr, "compr=%s"}, + {Opt_auth_key, "auth_key=%s"}, + {Opt_auth_hash_name, "auth_hash_name=%s"}, {Opt_ignore, "ubi=%s"}, {Opt_ignore, "vol=%s"}, {Opt_assert, "assert=%s"}, @@ -1070,6 +1089,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, kfree(act); break; } + case Opt_auth_key: + c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL); + if (!c->auth_key_name) + return -ENOMEM; + break; + case Opt_auth_hash_name: + c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL); + if (!c->auth_hash_name) + return -ENOMEM; + break; case Opt_ignore: break; default: @@ -1249,6 +1278,19 @@ static int mount_ubifs(struct ubifs_info *c) c->mounting = 1; + if (c->auth_key_name) { + if (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) { + err = ubifs_init_authentication(c); + if (err) + goto out_free; + } else { + ubifs_err(c, "auth_key_name, but UBIFS is built without" + " authentication support"); + err = -EINVAL; + goto out_free; + } + } + err = ubifs_read_superblock(c); if (err) goto out_free; @@ -1367,12 +1409,21 @@ static int mount_ubifs(struct ubifs_info *c) } if (c->need_recovery) { - err = ubifs_recover_size(c); - if (err) - goto out_orphans; + if (!ubifs_authenticated(c)) { + err = ubifs_recover_size(c, true); + if (err) + goto out_orphans; + } + err = ubifs_rcvry_gc_commit(c); if (err) goto out_orphans; + + if (ubifs_authenticated(c)) { + err = ubifs_recover_size(c, false); + if (err) + goto out_orphans; + } } else { err = take_gc_lnum(c); if (err) 
@@ -1391,7 +1442,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_orphans; } else if (c->need_recovery) { - err = ubifs_recover_size(c); + err = ubifs_recover_size(c, false); if (err) goto out_orphans; } else { @@ -1557,7 +1608,10 @@ static void ubifs_umount(struct ubifs_info *c) free_wbufs(c); free_orphans(c); ubifs_lpt_free(c, 0); + ubifs_exit_authentication(c); + kfree(c->auth_key_name); + kfree(c->auth_hash_name); kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); @@ -1605,16 +1659,10 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; if (c->old_leb_cnt != c->leb_cnt) { - struct ubifs_sb_node *sup; + struct ubifs_sb_node *sup = c->sup_node; - sup = ubifs_read_sb_node(c); - if (IS_ERR(sup)) { - err = PTR_ERR(sup); - goto out; - } sup->leb_cnt = cpu_to_le32(c->leb_cnt); err = ubifs_write_sb_node(c, sup); - kfree(sup); if (err) goto out; } @@ -1624,9 +1672,11 @@ static int ubifs_remount_rw(struct ubifs_info *c) err = ubifs_write_rcvrd_mst_node(c); if (err) goto out; - err = ubifs_recover_size(c); - if (err) - goto out; + if (!ubifs_authenticated(c)) { + err = ubifs_recover_size(c, true); + if (err) + goto out; + } err = ubifs_clean_lebs(c, c->sbuf); if (err) goto out; @@ -1692,10 +1742,19 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } - if (c->need_recovery) + if (c->need_recovery) { err = ubifs_rcvry_gc_commit(c); - else + if (err) + goto out; + + if (ubifs_authenticated(c)) { + err = ubifs_recover_size(c, false); + if (err) + goto out; + } + } else { err = ubifs_leb_unmap(c, c->gc_lnum); + } if (err) goto out; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index bf416e512743..25572ffea163 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -35,7 +35,7 @@ #include "ubifs.h" static int try_read_node(const struct ubifs_info *c, void *buf, int type, - int len, int lnum, int offs); + struct ubifs_zbranch *zbr); static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_zbranch 
*zbr, void *node); @@ -433,9 +433,7 @@ static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, * @c: UBIFS file-system description object * @buf: buffer to read to * @type: node type - * @len: node length (not aligned) - * @lnum: LEB number of node to read - * @offs: offset of node to read + * @zbr: the zbranch describing the node to read * * This function tries to read a node of known type and length, checks it and * stores it in @buf. This function returns %1 if a node is present and %0 if @@ -453,8 +451,11 @@ static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, * journal nodes may potentially be corrupted, so checking is required. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, - int len, int lnum, int offs) + struct ubifs_zbranch *zbr) { + int len = zbr->len; + int lnum = zbr->lnum; + int offs = zbr->offs; int err, node_len; struct ubifs_ch *ch = buf; uint32_t crc, node_crc; @@ -487,6 +488,12 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (crc != node_crc) return 0; + err = ubifs_node_check_hash(c, buf, zbr->hash); + if (err) { + ubifs_bad_hash(c, buf, zbr->hash, lnum, offs); + return 0; + } + return 1; } @@ -507,8 +514,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs); - ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, - zbr->offs); + ret = try_read_node(c, node, key_type(c, key), zbr); if (ret == 1) { union ubifs_key node_key; struct ubifs_dent_node *dent = node; @@ -1713,6 +1719,12 @@ static int validate_data_node(struct ubifs_info *c, void *buf, goto out; } + err = ubifs_node_check_hash(c, buf, zbr->hash); + if (err) { + ubifs_bad_hash(c, buf, zbr->hash, zbr->lnum, zbr->offs); + return err; + } + len = le32_to_cpu(ch->len); if (len != zbr->len) { ubifs_err(c, "bad node length %d, expected %d", len, zbr->len); @@ -2260,13 +2272,14 @@ 
do_split: * @lnum: LEB number of node * @offs: node offset * @len: node length + * @hash: The hash over the node * * This function adds a node with key @key to TNC. The node may be new or it may * obsolete some existing one. Returns %0 on success or negative error code on * failure. */ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, - int offs, int len) + int offs, int len, const u8 *hash) { int found, n, err = 0; struct ubifs_znode *znode; @@ -2281,6 +2294,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, zbr.lnum = lnum; zbr.offs = offs; zbr.len = len; + ubifs_copy_hash(c, hash, zbr.hash); key_copy(c, key, &zbr.key); err = tnc_insert(c, znode, &zbr, n + 1); } else if (found == 1) { @@ -2291,6 +2305,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, zbr->lnum = lnum; zbr->offs = offs; zbr->len = len; + ubifs_copy_hash(c, hash, zbr->hash); } else err = found; if (!err) @@ -2392,13 +2407,14 @@ out_unlock: * @lnum: LEB number of node * @offs: node offset * @len: node length + * @hash: The hash over the node * @nm: node name * * This is the same as 'ubifs_tnc_add()' but it should be used with keys which * may have collisions, like directory entry keys. 
*/ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, + int lnum, int offs, int len, const u8 *hash, const struct fscrypt_name *nm) { int found, n, err = 0; @@ -2441,6 +2457,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, zbr->lnum = lnum; zbr->offs = offs; zbr->len = len; + ubifs_copy_hash(c, hash, zbr->hash); goto out_unlock; } } @@ -2452,6 +2469,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, zbr.lnum = lnum; zbr.offs = offs; zbr.len = len; + ubifs_copy_hash(c, hash, zbr.hash); key_copy(c, key, &zbr.key); err = tnc_insert(c, znode, &zbr, n + 1); if (err) diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index dba87d09b989..dbcd2c350b65 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -38,6 +38,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, struct ubifs_znode *znode, int lnum, int offs, int len) { struct ubifs_znode *zp; + u8 hash[UBIFS_HASH_ARR_SZ]; int i, err; /* Make index node */ @@ -52,6 +53,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, br->lnum = cpu_to_le32(zbr->lnum); br->offs = cpu_to_le32(zbr->offs); br->len = cpu_to_le32(zbr->len); + ubifs_copy_hash(c, zbr->hash, ubifs_branch_hash(c, br)); if (!zbr->lnum || !zbr->len) { ubifs_err(c, "bad ref in znode"); ubifs_dump_znode(c, znode); @@ -62,6 +64,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, } } ubifs_prepare_node(c, idx, len, 0); + ubifs_node_calc_hash(c, idx, hash); znode->lnum = lnum; znode->offs = offs; @@ -78,10 +81,12 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, zbr->lnum = lnum; zbr->offs = offs; zbr->len = len; + ubifs_copy_hash(c, hash, zbr->hash); } else { c->zroot.lnum = lnum; c->zroot.offs = offs; c->zroot.len = len; + ubifs_copy_hash(c, hash, c->zroot.hash); } c->calc_idx_sz += ALIGN(len, 8); @@ -647,6 +652,8 @@ static int 
get_znodes_to_commit(struct ubifs_info *c) znode->cnext = c->cnext; break; } + znode->cparent = znode->parent; + znode->ciip = znode->iip; znode->cnext = cnext; znode = cnext; cnt += 1; @@ -840,6 +847,8 @@ static int write_index(struct ubifs_info *c) } while (1) { + u8 hash[UBIFS_HASH_ARR_SZ]; + cond_resched(); znode = cnext; @@ -857,6 +866,7 @@ static int write_index(struct ubifs_info *c) br->lnum = cpu_to_le32(zbr->lnum); br->offs = cpu_to_le32(zbr->offs); br->len = cpu_to_le32(zbr->len); + ubifs_copy_hash(c, zbr->hash, ubifs_branch_hash(c, br)); if (!zbr->lnum || !zbr->len) { ubifs_err(c, "bad ref in znode"); ubifs_dump_znode(c, znode); @@ -868,6 +878,23 @@ static int write_index(struct ubifs_info *c) } len = ubifs_idx_node_sz(c, znode->child_cnt); ubifs_prepare_node(c, idx, len, 0); + ubifs_node_calc_hash(c, idx, hash); + + mutex_lock(&c->tnc_mutex); + + if (znode->cparent) + ubifs_copy_hash(c, hash, + znode->cparent->zbranch[znode->ciip].hash); + + if (znode->parent) { + if (!ubifs_zn_obsolete(znode)) + ubifs_copy_hash(c, hash, + znode->parent->zbranch[znode->iip].hash); + } else { + ubifs_copy_hash(c, hash, c->zroot.hash); + } + + mutex_unlock(&c->tnc_mutex); /* Determine the index node position */ if (lnum == -1) { diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index d90ee01076a9..d1815e959007 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -265,9 +265,7 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, /** * read_znode - read an indexing node from flash and fill znode. * @c: UBIFS file-system description object - * @lnum: LEB of the indexing node to read - * @offs: node offset - * @len: node length + * @zzbr: the zbranch describing the node to read * @znode: znode to read to * * This function reads an indexing node from the flash media and fills znode @@ -276,9 +274,12 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, * is wrong with it, this function prints complaint messages and returns * %-EINVAL. 
*/ -static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, +static int read_znode(struct ubifs_info *c, struct ubifs_zbranch *zzbr, struct ubifs_znode *znode) { + int lnum = zzbr->lnum; + int offs = zzbr->offs; + int len = zzbr->len; int i, err, type, cmp; struct ubifs_idx_node *idx; @@ -292,6 +293,12 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, return err; } + err = ubifs_node_check_hash(c, idx, zzbr->hash); + if (err) { + ubifs_bad_hash(c, idx, zzbr->hash, lnum, offs); + return err; + } + znode->child_cnt = le16_to_cpu(idx->child_cnt); znode->level = le16_to_cpu(idx->level); @@ -308,13 +315,14 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, } for (i = 0; i < znode->child_cnt; i++) { - const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); struct ubifs_zbranch *zbr = &znode->zbranch[i]; key_read(c, &br->key, &zbr->key); zbr->lnum = le32_to_cpu(br->lnum); zbr->offs = le32_to_cpu(br->offs); zbr->len = le32_to_cpu(br->len); + ubifs_copy_hash(c, ubifs_branch_hash(c, br), zbr->hash); zbr->znode = NULL; /* Validate branch */ @@ -425,7 +433,7 @@ struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, if (!znode) return ERR_PTR(-ENOMEM); - err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + err = read_znode(c, zbr, znode); if (err) goto out; @@ -496,5 +504,11 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, return -EINVAL; } + err = ubifs_node_check_hash(c, node, zbr->hash); + if (err) { + ubifs_bad_hash(c, node, zbr->hash, zbr->lnum, zbr->offs); + return err; + } + return 0; } diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index e8c23c9d4f4a..8b7c1844014f 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -286,6 +286,7 @@ enum { #define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) #define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) #define UBIFS_ORPH_NODE_SZ 
sizeof(struct ubifs_orph_node) +#define UBIFS_AUTH_NODE_SZ sizeof(struct ubifs_auth_node) /* Extended attribute entry nodes are identical to directory entry nodes */ #define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ /* Only this does not have to be multiple of 8 bytes */ @@ -300,6 +301,12 @@ enum { /* The largest UBIFS node */ #define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ +/* The maximum size of a hash, enough for sha512 */ +#define UBIFS_MAX_HASH_LEN 64 + +/* The maximum size of a hmac, enough for hmac(sha512) */ +#define UBIFS_MAX_HMAC_LEN 64 + /* * xattr name of UBIFS encryption context, we don't use a prefix * nor a long name to not waste space on the flash. @@ -365,6 +372,7 @@ enum { * UBIFS_IDX_NODE: index node * UBIFS_CS_NODE: commit start node * UBIFS_ORPH_NODE: orphan node + * UBIFS_AUTH_NODE: authentication node * UBIFS_NODE_TYPES_CNT: count of supported node types * * Note, we index arrays by these numbers, so keep them low and contiguous. @@ -384,6 +392,7 @@ enum { UBIFS_IDX_NODE, UBIFS_CS_NODE, UBIFS_ORPH_NODE, + UBIFS_AUTH_NODE, UBIFS_NODE_TYPES_CNT, }; @@ -421,15 +430,19 @@ enum { * UBIFS_FLG_DOUBLE_HASH: store a 32bit cookie in directory entry nodes to * support 64bit cookies for lookups by hash * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files + * UBIFS_FLG_AUTHENTICATION: this filesystem contains hashes for authentication */ enum { UBIFS_FLG_BIGLPT = 0x02, UBIFS_FLG_SPACE_FIXUP = 0x04, UBIFS_FLG_DOUBLE_HASH = 0x08, UBIFS_FLG_ENCRYPTION = 0x10, + UBIFS_FLG_AUTHENTICATION = 0x20, }; -#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION) +#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT | UBIFS_FLG_SPACE_FIXUP | \ + UBIFS_FLG_DOUBLE_HASH | UBIFS_FLG_ENCRYPTION | \ + UBIFS_FLG_AUTHENTICATION) /** * struct ubifs_ch - common header node. 
@@ -633,6 +646,10 @@ struct ubifs_pad_node { * @time_gran: time granularity in nanoseconds * @uuid: UUID generated when the file system image was created * @ro_compat_version: UBIFS R/O compatibility version + * @hmac: HMAC to authenticate the superblock node + * @hmac_wkm: HMAC of a well known message (the string "UBIFS") as a convenience + * to the user to check if the correct key is passed. + * @hash_algo: The hash algo used for this filesystem (one of enum hash_algo) */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -660,7 +677,10 @@ struct ubifs_sb_node { __le32 time_gran; __u8 uuid[16]; __le32 ro_compat_version; - __u8 padding2[3968]; + __u8 hmac[UBIFS_MAX_HMAC_LEN]; + __u8 hmac_wkm[UBIFS_MAX_HMAC_LEN]; + __le16 hash_algo; + __u8 padding2[3838]; } __packed; /** @@ -695,6 +715,9 @@ struct ubifs_sb_node { * @empty_lebs: number of empty logical eraseblocks * @idx_lebs: number of indexing logical eraseblocks * @leb_cnt: count of LEBs used by file-system + * @hash_root_idx: the hash of the root index node + * @hash_lpt: the hash of the LPT + * @hmac: HMAC to authenticate the master node * @padding: reserved for future, zeroes */ struct ubifs_mst_node { @@ -727,7 +750,10 @@ struct ubifs_mst_node { __le32 empty_lebs; __le32 idx_lebs; __le32 leb_cnt; - __u8 padding[344]; + __u8 hash_root_idx[UBIFS_MAX_HASH_LEN]; + __u8 hash_lpt[UBIFS_MAX_HASH_LEN]; + __u8 hmac[UBIFS_MAX_HMAC_LEN]; + __u8 padding[152]; } __packed; /** @@ -747,11 +773,25 @@ struct ubifs_ref_node { } __packed; /** + * struct ubifs_auth_node - node for authenticating other nodes + * @ch: common header + * @hmac: The HMAC + */ +struct ubifs_auth_node { + struct ubifs_ch ch; + __u8 hmac[]; +} __packed; + +/** * struct ubifs_branch - key/reference/length branch * @lnum: LEB number of the target node * @offs: offset within @lnum * @len: target node length * @key: key + * + * In an authenticated UBIFS we have the hash of the referenced node after @key. 
+ * This can't be added to the struct type definition because @key is a + * dynamically sized element already. */ struct ubifs_branch { __le32 lnum; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4368cde476b0..38401adaa00d 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -39,6 +39,9 @@ #include <linux/security.h> #include <linux/xattr.h> #include <linux/random.h> +#include <crypto/hash_info.h> +#include <crypto/hash.h> +#include <crypto/algapi.h> #define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION) #include <linux/fscrypt.h> @@ -157,6 +160,14 @@ /* Maximum number of data nodes to bulk-read */ #define UBIFS_MAX_BULK_READ 32 +#ifdef CONFIG_UBIFS_FS_AUTHENTICATION +#define UBIFS_HASH_ARR_SZ UBIFS_MAX_HASH_LEN +#define UBIFS_HMAC_ARR_SZ UBIFS_MAX_HMAC_LEN +#else +#define UBIFS_HASH_ARR_SZ 0 +#define UBIFS_HMAC_ARR_SZ 0 +#endif + /* * Lockdep classes for UBIFS inode @ui_mutex. */ @@ -706,6 +717,7 @@ struct ubifs_wbuf { * @jhead: journal head number this bud belongs to * @list: link in the list buds belonging to the same journal head * @rb: link in the tree of all buds + * @log_hash: the log hash from the commit start node up to this bud */ struct ubifs_bud { int lnum; @@ -713,6 +725,7 @@ struct ubifs_bud { int jhead; struct list_head list; struct rb_node rb; + struct shash_desc *log_hash; }; /** @@ -720,6 +733,7 @@ struct ubifs_bud { * @wbuf: head's write-buffer * @buds_list: list of bud LEBs belonging to this journal head * @grouped: non-zero if UBIFS groups nodes when writing to this journal head + * @log_hash: the log hash from the commit start node up to this journal head * * Note, the @buds list is protected by the @c->buds_lock. 
*/ @@ -727,6 +741,7 @@ struct ubifs_jhead { struct ubifs_wbuf wbuf; struct list_head buds_list; unsigned int grouped:1; + struct shash_desc *log_hash; }; /** @@ -736,6 +751,7 @@ struct ubifs_jhead { * @lnum: LEB number of the target node (indexing node or data node) * @offs: target node offset within @lnum * @len: target node length + * @hash: the hash of the target node */ struct ubifs_zbranch { union ubifs_key key; @@ -746,12 +762,15 @@ struct ubifs_zbranch { int lnum; int offs; int len; + u8 hash[UBIFS_HASH_ARR_SZ]; }; /** * struct ubifs_znode - in-memory representation of an indexing node. * @parent: parent znode or NULL if it is the root * @cnext: next znode to commit + * @cparent: parent node for this commit + * @ciip: index in cparent's zbranch array * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) * @time: last access time (seconds) * @level: level of the entry in the TNC tree @@ -769,6 +788,8 @@ struct ubifs_zbranch { struct ubifs_znode { struct ubifs_znode *parent; struct ubifs_znode *cnext; + struct ubifs_znode *cparent; + int ciip; unsigned long flags; time64_t time; int level; @@ -983,6 +1004,7 @@ struct ubifs_debug_info; * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). 
* @vfs_sb: VFS @struct super_block object + * @sup_node: The super block node as read from the device * * @highest_inum: highest used inode number * @max_sqnum: current global sequence number @@ -1028,6 +1050,7 @@ struct ubifs_debug_info; * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) * @rw_incompat: the media is not R/W compatible * @assert_action: action to take when a ubifs_assert() fails + * @authenticated: flag indicating the FS is mounted in authenticated mode * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -1075,6 +1098,7 @@ struct ubifs_debug_info; * @key_hash: direntry key hash function * @key_fmt: key format * @key_len: key length + * @hash_len: The length of the index node hashes * @fanout: fanout of the index tree (number of links per indexing node) * * @min_io_size: minimal input/output unit size @@ -1210,6 +1234,15 @@ struct ubifs_debug_info; * @rp_uid: reserved pool user ID * @rp_gid: reserved pool group ID * + * @hash_tfm: the hash transformation used for hashing nodes + * @hmac_tfm: the HMAC transformation for this filesystem + * @hmac_desc_len: length of the HMAC used for authentication + * @auth_key_name: the authentication key name + * @auth_hash_name: the name of the hash algorithm used for authentication + * @auth_hash_algo: the authentication hash used for this fs + * @log_hash: the log hash from the commit start node up to the latest reference + * node. 
+ * * @empty: %1 if the UBI device is empty * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay @@ -1230,6 +1263,7 @@ struct ubifs_debug_info; */ struct ubifs_info { struct super_block *vfs_sb; + struct ubifs_sb_node *sup_node; ino_t highest_inum; unsigned long long max_sqnum; @@ -1270,6 +1304,7 @@ struct ubifs_info { unsigned int default_compr:2; unsigned int rw_incompat:1; unsigned int assert_action:2; + unsigned int authenticated:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1314,6 +1349,7 @@ struct ubifs_info { uint32_t (*key_hash)(const char *str, int len); int key_fmt; int key_len; + int hash_len; int fanout; int min_io_size; @@ -1441,6 +1477,15 @@ struct ubifs_info { kuid_t rp_uid; kgid_t rp_gid; + struct crypto_shash *hash_tfm; + struct crypto_shash *hmac_tfm; + int hmac_desc_len; + char *auth_key_name; + char *auth_hash_name; + enum hash_algo auth_hash_algo; + + struct shash_desc *log_hash; + /* The below fields are used only during mounting and re-mounting */ unsigned int empty:1; unsigned int need_recovery:1; @@ -1471,6 +1516,195 @@ extern const struct inode_operations ubifs_dir_inode_operations; extern const struct inode_operations ubifs_symlink_inode_operations; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; +/* auth.c */ +static inline int ubifs_authenticated(const struct ubifs_info *c) +{ + return (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) && c->authenticated; +} + +struct shash_desc *__ubifs_hash_get_desc(const struct ubifs_info *c); +static inline struct shash_desc *ubifs_hash_get_desc(const struct ubifs_info *c) +{ + return ubifs_authenticated(c) ? 
__ubifs_hash_get_desc(c) : NULL; +} + +static inline int ubifs_shash_init(const struct ubifs_info *c, + struct shash_desc *desc) +{ + if (ubifs_authenticated(c)) + return crypto_shash_init(desc); + else + return 0; +} + +static inline int ubifs_shash_update(const struct ubifs_info *c, + struct shash_desc *desc, const void *buf, + unsigned int len) +{ + int err = 0; + + if (ubifs_authenticated(c)) { + err = crypto_shash_update(desc, buf, len); + if (err < 0) + return err; + } + + return 0; +} + +static inline int ubifs_shash_final(const struct ubifs_info *c, + struct shash_desc *desc, u8 *out) +{ + return ubifs_authenticated(c) ? crypto_shash_final(desc, out) : 0; +} + +int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *buf, + u8 *hash); +static inline int ubifs_node_calc_hash(const struct ubifs_info *c, + const void *buf, u8 *hash) +{ + if (ubifs_authenticated(c)) + return __ubifs_node_calc_hash(c, buf, hash); + else + return 0; +} + +int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, + struct shash_desc *inhash); + +/** + * ubifs_check_hash - compare two hashes + * @c: UBIFS file-system description object + * @expected: first hash + * @got: second hash + * + * Compare two hashes @expected and @got. Returns 0 when they are equal, a + * non-zero value otherwise. + */ +static inline int ubifs_check_hash(const struct ubifs_info *c, + const u8 *expected, const u8 *got) +{ + return crypto_memneq(expected, got, c->hash_len); +} + +/** + * ubifs_check_hmac - compare two HMACs + * @c: UBIFS file-system description object + * @expected: first HMAC + * @got: second HMAC + * + * Compare two HMACs @expected and @got. Returns 0 when they are equal, a + * non-zero value otherwise. 
+ */ +static inline int ubifs_check_hmac(const struct ubifs_info *c, + const u8 *expected, const u8 *got) +{ + return crypto_memneq(expected, got, c->hmac_desc_len); +} + +void ubifs_bad_hash(const struct ubifs_info *c, const void *node, + const u8 *hash, int lnum, int offs); + +int __ubifs_node_check_hash(const struct ubifs_info *c, const void *buf, + const u8 *expected); +static inline int ubifs_node_check_hash(const struct ubifs_info *c, + const void *buf, const u8 *expected) +{ + if (ubifs_authenticated(c)) + return __ubifs_node_check_hash(c, buf, expected); + else + return 0; +} + +int ubifs_init_authentication(struct ubifs_info *c); +void __ubifs_exit_authentication(struct ubifs_info *c); +static inline void ubifs_exit_authentication(struct ubifs_info *c) +{ + if (ubifs_authenticated(c)) + __ubifs_exit_authentication(c); +} + +/** + * ubifs_branch_hash - returns a pointer to the hash of a branch + * @c: UBIFS file-system description object + * @br: branch to get the hash from + * + * This returns a pointer to the hash of a branch. Since the key already is a + * dynamically sized object we cannot use a struct member here. + */ +static inline u8 *ubifs_branch_hash(struct ubifs_info *c, + struct ubifs_branch *br) +{ + return (void *)br + sizeof(*br) + c->key_len; +} + +/** + * ubifs_copy_hash - copy a hash + * @c: UBIFS file-system description object + * @from: source hash + * @to: destination hash + * + * With authentication this copies a hash, otherwise does nothing. 
+ */ +static inline void ubifs_copy_hash(const struct ubifs_info *c, const u8 *from, + u8 *to) +{ + if (ubifs_authenticated(c)) + memcpy(to, from, c->hash_len); +} + +int __ubifs_node_insert_hmac(const struct ubifs_info *c, void *buf, + int len, int ofs_hmac); +static inline int ubifs_node_insert_hmac(const struct ubifs_info *c, void *buf, + int len, int ofs_hmac) +{ + if (ubifs_authenticated(c)) + return __ubifs_node_insert_hmac(c, buf, len, ofs_hmac); + else + return 0; +} + +int __ubifs_node_verify_hmac(const struct ubifs_info *c, const void *buf, + int len, int ofs_hmac); +static inline int ubifs_node_verify_hmac(const struct ubifs_info *c, + const void *buf, int len, int ofs_hmac) +{ + if (ubifs_authenticated(c)) + return __ubifs_node_verify_hmac(c, buf, len, ofs_hmac); + else + return 0; +} + +/** + * ubifs_auth_node_sz - returns the size of an authentication node + * @c: UBIFS file-system description object + * + * This function returns the size of an authentication node which can + * be 0 for unauthenticated filesystems or the real size of an auth node + * authentication is enabled. 
+ */ +static inline int ubifs_auth_node_sz(const struct ubifs_info *c) +{ + if (ubifs_authenticated(c)) + return sizeof(struct ubifs_auth_node) + c->hmac_desc_len; + else + return 0; +} + +int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac); + +int __ubifs_shash_copy_state(const struct ubifs_info *c, struct shash_desc *src, + struct shash_desc *target); +static inline int ubifs_shash_copy_state(const struct ubifs_info *c, + struct shash_desc *src, + struct shash_desc *target) +{ + if (ubifs_authenticated(c)) + return __ubifs_shash_copy_state(c, src, target); + else + return 0; +} + /* io.c */ void ubifs_ro_mode(struct ubifs_info *c, int err); int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, @@ -1490,9 +1724,15 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int lnum, int offs); int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs); +int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int hmac_offs); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, int offs, int quiet, int must_chk_crc); +void ubifs_init_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_crc_node(struct ubifs_info *c, void *buf, int len); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +int ubifs_prepare_node_hmac(struct ubifs_info *c, void *node, int len, + int hmac_offs, int pad); void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); int ubifs_io_init(struct ubifs_info *c); void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); @@ -1592,11 +1832,12 @@ int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs); int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, - int offs, int len); + int offs, int len, const u8 *hash); 
int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, int old_lnum, int old_offs, int lnum, int offs, int len); int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct fscrypt_name *nm); + int lnum, int offs, int len, const u8 *hash, + const struct fscrypt_name *nm); int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, const struct fscrypt_name *nm); @@ -1659,12 +1900,12 @@ int ubifs_gc_should_commit(struct ubifs_info *c); void ubifs_wait_for_commit(struct ubifs_info *c); /* master.c */ +int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2); int ubifs_read_master(struct ubifs_info *c); int ubifs_write_master(struct ubifs_info *c); /* sb.c */ int ubifs_read_superblock(struct ubifs_info *c); -struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); int ubifs_fixup_free_space(struct ubifs_info *c); int ubifs_enable_encryption(struct ubifs_info *c); @@ -1693,7 +1934,7 @@ int ubifs_clear_orphans(struct ubifs_info *c); /* lpt.c */ int ubifs_calc_lpt_geom(struct ubifs_info *c); int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, - int *lpt_lebs, int *big_lpt); + int *lpt_lebs, int *big_lpt, u8 *hash); int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); @@ -1712,6 +1953,7 @@ struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +struct ubifs_pnode *ubifs_pnode_lookup(struct ubifs_info *c, int i); int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); void 
ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); @@ -1720,6 +1962,7 @@ struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); /* Needed only in debugging code in lpt_commit.c */ int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, struct ubifs_nnode *nnode); +int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash); /* lpt_commit.c */ int ubifs_lpt_start_commit(struct ubifs_info *c); @@ -1807,7 +2050,7 @@ int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf); int ubifs_rcvry_gc_commit(struct ubifs_info *c); int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, int deletion, loff_t new_size); -int ubifs_recover_size(struct ubifs_info *c); +int ubifs_recover_size(struct ubifs_info *c, bool in_place); void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 61a5ad2600e8..53c9ab8fb777 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -919,28 +919,67 @@ out_unlock: return error; } -STATIC int -xfs_file_clone_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} -STATIC int -xfs_file_dedupe_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +loff_t +xfs_file_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + loff_t len, + unsigned int remap_flags) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + loff_t remapped = 0; + xfs_extlen_t cowextsize; + int ret; + + if (remap_flags & 
~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + + ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, + &remapped); + if (ret) + goto out_unlock; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (pos_in == 0 && len == i_size_read(inode_in) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + pos_out == 0 && len >= i_size_read(inode_out) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + remap_flags); + +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return remapped > 0 ? remapped : ret; } STATIC int @@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .clone_file_range = xfs_file_clone_range, - .dedupe_file_range = xfs_file_dedupe_range, + .remap_file_range = xfs_file_remap_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 8eaeec9d58ed..ecdb086bc23e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -913,18 +913,18 @@ out_error: /* * Update destination inode size & cowextsize hint, if necessary. 
*/ -STATIC int +int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, - bool is_dedupe) + unsigned int remap_flags) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -945,10 +945,6 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } - if (!is_dedupe) { - xfs_trans_ichgtime(tp, dest, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1112,19 +1108,28 @@ out: /* * Iteratively remap one file's extents (and holes) to another's. */ -STATIC int +int xfs_reflink_remap_blocks( struct xfs_inode *src, - xfs_fileoff_t srcoff, + loff_t pos_in, struct xfs_inode *dest, - xfs_fileoff_t destoff, - xfs_filblks_t len, - xfs_off_t new_isize) + loff_t pos_out, + loff_t remap_len, + loff_t *remapped) { struct xfs_bmbt_irec imap; + xfs_fileoff_t srcoff; + xfs_fileoff_t destoff; + xfs_filblks_t len; + xfs_filblks_t range_len; + xfs_filblks_t remapped_len = 0; + xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - xfs_filblks_t range_len; + + destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); + srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); + len = XFS_B_TO_FSB(src->i_mount, remap_len); /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { @@ -1139,7 +1144,7 @@ xfs_reflink_remap_blocks( error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); xfs_iunlock(src, lock_mode); if (error) - goto err; + break; ASSERT(nimaps == 1); trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, @@ -1153,23 +1158,24 @@ xfs_reflink_remap_blocks( error = xfs_reflink_remap_extent(dest, &imap, destoff, new_isize); if (error) - goto err; + break; if (fatal_signal_pending(current)) 
{ error = -EINTR; - goto err; + break; } /* Advance drange/srange */ srcoff += range_len; destoff += range_len; len -= range_len; + remapped_len += range_len; } - return 0; - -err: - trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + *remapped = min_t(loff_t, remap_len, + XFS_FSB_TO_B(src->i_mount, remapped_len)); return error; } @@ -1218,7 +1224,7 @@ retry: } /* Unlock both inodes after they've been prepped for a range clone. */ -STATIC void +void xfs_reflink_remap_unlock( struct file *file_in, struct file *file_out) @@ -1286,21 +1292,20 @@ xfs_reflink_zero_posteof( * stale data in the destination file. Hence we reject these clone attempts with * -EINVAL in this case. */ -STATIC int +int xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, - bool is_dedupe) + loff_t *len, + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); bool same_inode = (inode_in == inode_out); - u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; /* Lock both files against IO */ @@ -1323,29 +1328,11 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - len, is_dedupe); - if (ret <= 0) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) goto out_unlock; - /* - * If the dedupe data matches, chop off the partial EOF block - * from the source file so we don't try to dedupe the partial - * EOF block. - */ - if (is_dedupe) { - *len &= ~blkmask; - } else if (*len & blkmask) { - /* - * The user is attempting to share a partial EOF block, - * if it's inside the destination EOF then reject it. 
- */ - if (pos_out + *len < i_size_read(inode_out)) { - ret = -EINVAL; - goto out_unlock; - } - } - /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) @@ -1365,31 +1352,9 @@ xfs_reflink_remap_prep( goto out_unlock; /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + *len) - 1); - - /* If we're altering the file contents... */ - if (!is_dedupe) { - /* - * ...update the timestamps (which will grab the ilock again - * from xfs_fs_dirty_inode, so we have to call it before we - * take the ilock). - */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - goto out_unlock; - } - - /* - * ...clear the security bits if the process is not being run - * by root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - goto out_unlock; - } + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + *len, PAGE_SIZE) - 1); return 1; out_unlock: @@ -1398,72 +1363,6 @@ out_unlock: } /* - * Link a range of blocks from one file to another. - */ -int -xfs_reflink_remap_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in = file_inode(file_in); - struct xfs_inode *src = XFS_I(inode_in); - struct inode *inode_out = file_inode(file_out); - struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; - ssize_t ret; - - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - /* Prepare and then clone file data. 
*/ - ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, - &len, is_dedupe); - if (ret <= 0) - return ret; - - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - - dfsbno = XFS_B_TO_FSBT(mp, pos_out); - sfsbno = XFS_B_TO_FSBT(mp, pos_in); - fsblen = XFS_B_TO_FSB(mp, len); - ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - pos_out + len); - if (ret) - goto out_unlock; - - /* - * Carry the cowextsize hint from src to dest if we're sharing the - * entire source file to the entire destination file, the source file - * has a cowextsize hint, and the destination file does not. - */ - cowextsize = 0; - if (pos_in == 0 && len == i_size_read(inode_in) && - (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && - pos_out == 0 && len >= i_size_read(inode_out) && - !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) - cowextsize = src->i_d.di_cowextsize; - - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, - is_dedupe); - -out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); - if (ret) - trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); - return ret; -} - -/* * The user wants to preemptively CoW all shared blocks in this file, * which enables us to turn off the reflink flag. 
Iterate all * extents which are not prealloc/delalloc to see which ranges are diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 7f47202b5639..6d73daef1f13 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len, + unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t *len, + unsigned int remap_flags); +extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, + struct xfs_inode *dest, loff_t pos_out, loff_t remap_len, + loff_t *remapped); +extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, + xfs_extlen_t cowextsize, unsigned int remap_flags); +extern void xfs_reflink_remap_unlock(struct file *file_in, + struct file *file_out); #endif /* __XFS_REFLINK_H */ diff --git a/include/crypto/asym_tpm_subtype.h b/include/crypto/asym_tpm_subtype.h new file mode 100644 index 000000000000..48198c36d6b9 --- /dev/null +++ b/include/crypto/asym_tpm_subtype.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _LINUX_ASYM_TPM_SUBTYPE_H +#define _LINUX_ASYM_TPM_SUBTYPE_H + +#include <linux/keyctl.h> + +struct tpm_key { + void *blob; + 
u32 blob_len; + uint16_t key_len; /* Size in bits of the key */ + const void *pub_key; /* pointer inside blob to the public key bytes */ + uint16_t pub_key_len; /* length of the public key */ +}; + +struct tpm_key *tpm_key_create(const void *blob, uint32_t blob_len); + +extern struct asymmetric_key_subtype asym_tpm_subtype; + +#endif /* _LINUX_ASYM_TPM_SUBTYPE_H */ diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index e0b681a717ba..be626eac9113 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -14,6 +14,8 @@ #ifndef _LINUX_PUBLIC_KEY_H #define _LINUX_PUBLIC_KEY_H +#include <linux/keyctl.h> + /* * Cryptographic data for the public-key subtype of the asymmetric key type. * @@ -23,6 +25,7 @@ struct public_key { void *key; u32 keylen; + bool key_is_private; const char *id_type; const char *pkey_algo; }; @@ -40,6 +43,7 @@ struct public_key_signature { u8 digest_size; /* Number of bytes in digest */ const char *pkey_algo; const char *hash_algo; + const char *encoding; }; extern void public_key_signature_free(struct public_key_signature *sig); @@ -65,8 +69,14 @@ extern int restrict_link_by_key_or_keyring_chain(struct key *trust_keyring, const union key_payload *payload, struct key *trusted); -extern int verify_signature(const struct key *key, - const struct public_key_signature *sig); +extern int query_asymmetric_key(const struct kernel_pkey_params *, + struct kernel_pkey_query *); + +extern int encrypt_blob(struct kernel_pkey_params *, const void *, void *); +extern int decrypt_blob(struct kernel_pkey_params *, const void *, void *); +extern int create_signature(struct kernel_pkey_params *, const void *, void *); +extern int verify_signature(const struct key *, + const struct public_key_signature *); int public_key_verify_signature(const struct public_key *pkey, const struct public_key_signature *sig); diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 91a877fa00cb..9ccad6b062f2 100644 --- 
a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -82,6 +82,53 @@ enum drm_connector_status { connector_status_unknown = 3, }; +/** + * enum drm_connector_registration_status - userspace registration status for + * a &drm_connector + * + * This enum is used to track the status of initializing a connector and + * registering it with userspace, so that DRM can prevent bogus modesets on + * connectors that no longer exist. + */ +enum drm_connector_registration_state { + /** + * @DRM_CONNECTOR_INITIALIZING: The connector has just been created, + * but has yet to be exposed to userspace. There should be no + * additional restrictions to how the state of this connector may be + * modified. + */ + DRM_CONNECTOR_INITIALIZING = 0, + + /** + * @DRM_CONNECTOR_REGISTERED: The connector has been fully initialized + * and registered with sysfs, as such it has been exposed to + * userspace. There should be no additional restrictions to how the + * state of this connector may be modified. + */ + DRM_CONNECTOR_REGISTERED = 1, + + /** + * @DRM_CONNECTOR_UNREGISTERED: The connector has either been exposed + * to userspace and has since been unregistered and removed from + * userspace, or the connector was unregistered before it had a chance + * to be exposed to userspace (e.g. still in the + * @DRM_CONNECTOR_INITIALIZING state). When a connector is + * unregistered, there are additional restrictions to how its state + * may be modified: + * + * - An unregistered connector may only have its DPMS changed from + * On->Off. Once DPMS is changed to Off, it may not be switched back + * to On. + * - Modesets are not allowed on unregistered connectors, unless they + * would result in disabling its assigned CRTCs. This means + * disabling a CRTC on an unregistered connector is OK, but enabling + * one is not. + * - Removing a CRTC from an unregistered connector is OK, but new + * CRTCs may never be assigned to an unregistered connector. 
+ */ + DRM_CONNECTOR_UNREGISTERED = 2, +}; + enum subpixel_order { SubPixelUnknown = 0, SubPixelHorizontalRGB, @@ -853,10 +900,12 @@ struct drm_connector { bool ycbcr_420_allowed; /** - * @registered: Is this connector exposed (registered) with userspace? + * @registration_state: Is this connector initializing, exposed + * (registered) with userspace, or unregistered? + * * Protected by @mutex. */ - bool registered; + enum drm_connector_registration_state registration_state; /** * @modes: @@ -1166,6 +1215,24 @@ static inline void drm_connector_unreference(struct drm_connector *connector) drm_connector_put(connector); } +/** + * drm_connector_is_unregistered - has the connector been unregistered from + * userspace? + * @connector: DRM connector + * + * Checks whether or not @connector has been unregistered from userspace. + * + * Returns: + * True if the connector was unregistered, false if the connector is + * registered or has not yet been registered with userspace. + */ +static inline bool +drm_connector_is_unregistered(struct drm_connector *connector) +{ + return READ_ONCE(connector->registration_state) == + DRM_CONNECTOR_UNREGISTERED; +} + const char *drm_get_connector_status_name(enum drm_connector_status status); const char *drm_get_subpixel_order_name(enum subpixel_order order); const char *drm_get_dpms_name(int val); diff --git a/include/keys/asymmetric-subtype.h b/include/keys/asymmetric-subtype.h index e0a9c2368872..9ce2f0fae57e 100644 --- a/include/keys/asymmetric-subtype.h +++ b/include/keys/asymmetric-subtype.h @@ -17,6 +17,8 @@ #include <linux/seq_file.h> #include <keys/asymmetric-type.h> +struct kernel_pkey_query; +struct kernel_pkey_params; struct public_key_signature; /* @@ -34,6 +36,13 @@ struct asymmetric_key_subtype { /* Destroy a key of this subtype */ void (*destroy)(void *payload_crypto, void *payload_auth); + int (*query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); + + /* Encrypt/decrypt/sign data */ + int 
(*eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); + /* Verify the signature on a key of this subtype (optional) */ int (*verify_signature)(const struct key *key, const struct public_key_signature *sig); diff --git a/security/keys/trusted.h b/include/keys/trusted.h index 8d5fe9eafb22..adbcb6817826 100644 --- a/security/keys/trusted.h +++ b/include/keys/trusted.h @@ -3,7 +3,7 @@ #define __TRUSTED_KEY_H /* implementation specific TPM constants */ -#define MAX_BUF_SIZE 512 +#define MAX_BUF_SIZE 1024 #define TPM_GETRANDOM_SIZE 14 #define TPM_OSAP_SIZE 36 #define TPM_OIAP_SIZE 10 @@ -36,6 +36,18 @@ enum { SRK_keytype = 4 }; +int TSS_authhmac(unsigned char *digest, const unsigned char *key, + unsigned int keylen, unsigned char *h1, + unsigned char *h2, unsigned char h3, ...); +int TSS_checkhmac1(unsigned char *buffer, + const uint32_t command, + const unsigned char *ononce, + const unsigned char *key, + unsigned int keylen, ...); + +int trusted_tpm_send(unsigned char *cmd, size_t buflen); +int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); + #define TPM_DEBUG 0 #if TPM_DEBUG diff --git a/include/linux/adxl.h b/include/linux/adxl.h index 2a629acb4c3f..2d29f55923e3 100644 --- a/include/linux/adxl.h +++ b/include/linux/adxl.h @@ -7,7 +7,12 @@ #ifndef _LINUX_ADXL_H #define _LINUX_ADXL_H +#ifdef CONFIG_ACPI_ADXL const char * const *adxl_get_component_names(void); int adxl_decode(u64 addr, u64 component_values[]); +#else +static inline const char * const *adxl_get_component_names(void) { return NULL; } +static inline int adxl_decode(u64 addr, u64 component_values[]) { return -EOPNOTSUPP; } +#endif #endif /* _LINUX_ADXL_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index b47c7f716731..056fb627edb3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -503,31 +503,23 @@ do { \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -int bio_associate_blkg_from_page(struct bio *bio, struct 
page *page); +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); #else -static inline int bio_associate_blkg_from_page(struct bio *bio, - struct page *page) { return 0; } +static inline int bio_associate_blkcg_from_page(struct bio *bio, + struct page *page) { return 0; } #endif #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); -int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css); -int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); -int bio_reassociate_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkg_association(struct bio *dst, struct bio *src); +void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ return 0; } -static inline int bio_associate_create_blkg(struct request_queue *q, - struct bio *bio) { return 0; } -static inline int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) -{ return 0; } +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkcg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1e76ceebeb5d..6d766a19f2bb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -126,7 +126,7 @@ struct blkcg_gq { struct request_list rl; /* reference count */ - struct percpu_ref refcnt; + atomic_t refcnt; /* is this blkg online? 
protected by both blkcg and q locks */ bool online; @@ -184,8 +184,6 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -232,59 +230,22 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -/** - * __bio_blkcg - internal version of bio_blkcg for bfq and cfq - * - * DO NOT USE. - * There is a flaw using this version of the function. In particular, this was - * used in a broken paradigm where association was called on the given css. It - * is possible though that the returned css from task_css() is in the process - * of dying due to migration of the current task. So it is improper to assume - * *_get() is going to succeed. Both BFQ and CFQ rely on this logic and will - * take additional work to handle more gracefully. 
- */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - -/** - * bio_blkcg - grab the blkcg associated with a bio - * @bio: target bio - * - * This returns the blkcg associated with a bio, NULL if not associated. - * Callers are expected to either handle NULL or know association has been - * done prior to calling this. - */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return NULL; + struct cgroup_subsys_state *css; + + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } static inline bool blk_cgroup_congested(void) @@ -490,35 +451,26 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) */ static inline void blkg_get(struct blkcg_gq *blkg) { - percpu_ref_get(&blkg->refcnt); + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); } /** - * blkg_tryget - try and get a blkg reference + * blkg_try_get - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ -static inline bool blkg_tryget(struct blkcg_gq *blkg) +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) { - return percpu_ref_tryget(&blkg->refcnt); + if (atomic_inc_not_zero(&blkg->refcnt)) + return blkg; + return NULL; } -/** - * blkg_tryget_closest - try and get a blkg ref on the closet blkg - * @blkg: blkg to get - * - * This walks up the blkg tree to find the closest non-dying blkg and returns - * the blkg that it did association with as it may not be the passed in blkg. 
- */ -static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) -{ - while (!percpu_ref_tryget(&blkg->refcnt)) - blkg = blkg->parent; - return blkg; -} +void __blkg_release_rcu(struct rcu_head *rcu); /** * blkg_put - put a blkg reference @@ -526,7 +478,9 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) */ static inline void blkg_put(struct blkcg_gq *blkg) { - percpu_ref_put(&blkg->refcnt); + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); } /** @@ -579,36 +533,25 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); - if (bio && bio->bi_blkg) { - blkcg = bio->bi_blkg->blkcg; - if (blkcg == &blkcg_root) - goto rl_use_root; - - blkg_get(bio->bi_blkg); - rcu_read_unlock(); - return &bio->bi_blkg->rl; - } + blkcg = bio_blkcg(bio); - blkcg = css_to_blkcg(blkcg_css()); + /* bypass blkg lookup and use @q->root_rl directly for root */ if (blkcg == &blkcg_root) - goto rl_use_root; + goto root_rl; + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) - blkg = __blkg_lookup_create(blkcg, q); - - if (blkg->blkcg == &blkcg_root || !blkg_tryget(blkg)) - goto rl_use_root; + goto root_rl; + blkg_get(blkg); rcu_read_unlock(); return &blkg->rl; - - /* - * Each blkg has its own request_list, however, the root blkcg - * uses the request_queue's root_rl. This is to avoid most - * overhead for the root blkcg. 
- */ -rl_use_root: +root_rl: rcu_read_unlock(); return &q->root_rl; } @@ -854,26 +797,32 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif - -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { + struct blkcg *blkcg; struct blkcg_gq *blkg; bool throtl = false; rcu_read_lock(); + blkcg = bio_blkcg(bio); + + /* associate blkcg if bio hasn't attached one */ + bio_associate_blkcg(bio, &blkcg->css); - bio_associate_create_blkg(q, bio); - blkg = bio->bi_blkg; + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + blkg = blkg ?: q->root_blkg; /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the @@ -885,8 +834,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } - blkcg_bio_issue_init(bio); - rcu_read_unlock(); return !throtl; } @@ -983,7 +930,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, @@ -999,7 +945,6 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } -static inline void 
blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 093a818c5b68..1dcf652ba0aa 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -178,6 +178,7 @@ struct bio { * release. Read comment on top of bio_associate_current(). */ struct io_context *bi_ioc; + struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9968332cceed..9d12757a65b0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,8 +93,6 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); -struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, - struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, diff --git a/include/linux/compat.h b/include/linux/compat.h index 06e77473f175..88720b443cd6 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -1032,9 +1032,9 @@ int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) -#ifndef in_compat_syscall +/* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */ +#define in_compat_syscall in_compat_syscall static inline bool in_compat_syscall(void) { return false; } -#endif #endif /* CONFIG_COMPAT */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index b1ce500fe8b3..3e7dafb3ea80 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -21,8 +21,6 @@ #define __SANITIZE_ADDRESS__ #endif -#define __no_sanitize_address 
__attribute__((no_sanitize("address"))) - /* * Not all versions of clang implement the the type-generic versions * of the builtin overflow checkers. Fortunately, clang implements @@ -41,6 +39,3 @@ * compilers, like ICC. */ #define barrier() __asm__ __volatile__("" : : : "memory") -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) -#define __assume_aligned(a, ...) \ - __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 90ddfefb6c2b..2010493e1040 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -68,31 +68,20 @@ */ #define uninitialized_var(x) x = x -#ifdef __CHECKER__ -#define __must_be_array(a) 0 -#else -/* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) -#endif - #ifdef RETPOLINE -#define __noretpoline __attribute__((indirect_branch("keep"))) +#define __noretpoline __attribute__((__indirect_branch__("keep"))) #endif #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) -#define __optimize(level) __attribute__((__optimize__(level))) - #define __compiletime_object_size(obj) __builtin_object_size(obj, 0) -#ifndef __CHECKER__ -#define __compiletime_warning(message) __attribute__((warning(message))) -#define __compiletime_error(message) __attribute__((error(message))) +#define __compiletime_warning(message) __attribute__((__warning__(message))) +#define __compiletime_error(message) __attribute__((__error__(message))) -#ifdef LATENT_ENTROPY_PLUGIN +#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) #define __latent_entropy __attribute__((latent_entropy)) #endif -#endif /* __CHECKER__ */ /* * calling noreturn functions, __builtin_unreachable() and __builtin_trap() @@ -107,10 +96,6 @@ * Mark a position in code as unreachable. 
This can be used to * suppress control flow warnings after asm blocks that transfer * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. */ #define unreachable() \ do { \ @@ -119,9 +104,6 @@ __builtin_unreachable(); \ } while (0) -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) - #if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout __attribute__((no_randomize_layout)) @@ -131,32 +113,6 @@ #endif /* - * When used with Link Time Optimization, gcc can optimize away C functions or - * variables which are referenced only from assembly code. __visible tells the - * optimizer that something else uses this function or variable, thus preventing - * this. - */ -#define __visible __attribute__((externally_visible)) - -/* gcc version specific checks */ - -#if GCC_VERSION >= 40900 && !defined(__CHECKER__) -/* - * __assume_aligned(n, k): Tell the optimizer that the returned - * pointer can be assumed to be k modulo n. The second argument is - * optional (default 0), so we use a variadic macro to make the - * shorthand. - * - * Beware: Do not apply this to functions which may return - * ERR_PTRs. Also, it is probably unwise to apply it to functions - * returning extra information in the low bits (but in that case the - * compiler should see some alignment anyway, when the return value is - * massaged by 'flags = ptr & 3; ptr &= ~3;'). - */ -#define __assume_aligned(a, ...) 
__attribute__((__assume_aligned__(a, ## __VA_ARGS__))) -#endif - -/* * GCC 'asm goto' miscompiles certain code sequences: * * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 @@ -187,39 +143,10 @@ #define KASAN_ABI_VERSION 3 #endif -#if GCC_VERSION >= 40902 -/* - * Tell the compiler that address safety instrumentation (KASAN) - * should not be applied to that function. - * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 - */ -#define __no_sanitize_address __attribute__((no_sanitize_address)) -#ifdef CONFIG_KASAN -#define __no_sanitize_address_or_inline \ - __no_sanitize_address __maybe_unused notrace -#else -#define __no_sanitize_address_or_inline inline -#endif -#endif - #if GCC_VERSION >= 50100 -/* - * Mark structures as requiring designated initializers. - * https://gcc.gnu.org/onlinedocs/gcc/Designated-Inits.html - */ -#define __designated_init __attribute__((designated_init)) #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -#if !defined(__noclone) -#define __noclone /* not needed */ -#endif - -#if !defined(__no_sanitize_address) -#define __no_sanitize_address -#define __no_sanitize_address_or_inline inline -#endif - /* * Turn individual warnings and errors on and off locally, depending * on version. diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index 4c7f9befa9f6..517bd14e1222 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -29,17 +29,8 @@ */ #define OPTIMIZER_HIDE_VAR(var) barrier() -/* Intel ECC compiler doesn't support __builtin_types_compatible_p() */ -#define __must_be_array(a) 0 - #endif /* icc has this, but it's called _bswap16 */ #define __HAVE_BUILTIN_BSWAP16__ #define __builtin_bswap16 _bswap16 - -/* The following are for compatibility with GCC, from compiler-gcc.h, - * and may be redefined here because they should not be shared with other - * compilers, like clang. 
- */ -#define __visible __attribute__((externally_visible)) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 4170fcee5adb..06396c1cf127 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -23,8 +23,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __branch_check__(x, expect, is_constant) ({ \ long ______r; \ static struct ftrace_likely_data \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_annotated_branch"))) \ + __aligned(4) \ + __section("_ftrace_annotated_branch") \ ______f = { \ .data.func = __func__, \ .data.file = __FILE__, \ @@ -59,8 +59,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, ({ \ int ______r; \ static struct ftrace_branch_data \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_branch"))) \ + __aligned(4) \ + __section("_ftrace_branch") \ ______f = { \ .func = __func__, \ .file = __FILE__, \ @@ -115,7 +115,10 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define ASM_UNREACHABLE #endif #ifndef unreachable -# define unreachable() do { annotate_reachable(); do { } while (1); } while (0) +# define unreachable() do { \ + annotate_unreachable(); \ + __builtin_unreachable(); \ +} while (0) #endif /* @@ -137,7 +140,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, extern typeof(sym) sym; \ static const unsigned long __kentry_##sym \ __used \ - __attribute__((section("___kentry" "+" #sym ), used)) \ + __section("___kentry" "+" #sym ) \ = (unsigned long)&sym; #endif @@ -186,7 +189,7 @@ void __read_once_size(const volatile void *p, void *res, int size) * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 * '__maybe_unused' allows us to avoid defined-but-not-used warnings. 
*/ -# define __no_kasan_or_inline __no_sanitize_address __maybe_unused +# define __no_kasan_or_inline __no_sanitize_address notrace __maybe_unused #else # define __no_kasan_or_inline __always_inline #endif @@ -278,7 +281,7 @@ unsigned long read_word_at_a_time(const void *addr) * visible to the compiler. */ #define __ADDRESSABLE(sym) \ - static void * __attribute__((section(".discard.addressable"), used)) \ + static void * __section(".discard.addressable") __used \ __PASTE(__addressable_##sym, __LINE__) = (void *)&sym; /** @@ -331,10 +334,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ -#ifndef __optimize -# define __optimize(level) -#endif - /* Compile time object size, -1 for unknown */ #ifndef __compiletime_object_size # define __compiletime_object_size(obj) -1 @@ -376,4 +375,7 @@ static inline void *offset_to_ptr(const int *off) compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") +/* &a[0] degrades to a pointer: a different type from an array */ +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) + #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h new file mode 100644 index 000000000000..6b28c1b7310c --- /dev/null +++ b/include/linux/compiler_attributes.h @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_COMPILER_ATTRIBUTES_H +#define __LINUX_COMPILER_ATTRIBUTES_H + +/* + * The attributes in this file are unconditionally defined and they directly + * map to compiler attribute(s) -- except those that are optional. + * + * Any other "attributes" (i.e. those that depend on a configuration option, + * on a compiler, on an architecture, on plugins, on other attributes...) + * should be defined elsewhere (e.g. compiler_types.h or compiler-*.h). + * + * This file is meant to be sorted (by actual attribute name, + * not by #define identifier). 
Use the __attribute__((__name__)) syntax + * (i.e. with underscores) to avoid future collisions with other macros. + * If an attribute is optional, state the reason in the comment. + */ + +/* + * To check for optional attributes, we use __has_attribute, which is supported + * on gcc >= 5, clang >= 2.9 and icc >= 17. In the meantime, to support + * 4.6 <= gcc < 5, we implement __has_attribute by hand. + * + * sparse does not support __has_attribute (yet) and defines __GNUC_MINOR__ + * depending on the compiler used to build it; however, these attributes have + * no semantic effects for sparse, so it does not matter. Also note that, + * in order to avoid sparse's warnings, even the unsupported ones must be + * defined to 0. + */ +#ifndef __has_attribute +# define __has_attribute(x) __GCC4_has_attribute_##x +# define __GCC4_has_attribute___assume_aligned__ (__GNUC_MINOR__ >= 9) +# define __GCC4_has_attribute___designated_init__ 0 +# define __GCC4_has_attribute___externally_visible__ 1 +# define __GCC4_has_attribute___noclone__ 1 +# define __GCC4_has_attribute___optimize__ 1 +# define __GCC4_has_attribute___nonstring__ 0 +# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alias-function-attribute + */ +#define __alias(symbol) __attribute__((__alias__(#symbol))) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-aligned-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-aligned-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-aligned-variable-attribute + */ +#define __aligned(x) __attribute__((__aligned__(x))) +#define __aligned_largest __attribute__((__aligned__)) + +/* + * Note: users of __always_inline currently do not write "inline" themselves, + * which seems to be required by gcc to apply the attribute according + * 
to its docs (and also "warning: always_inline function might not be + * inlinable [-Wattributes]" is emitted). + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-always_005finline-function-attribute + * clang: mentioned + */ +#define __always_inline inline __attribute__((__always_inline__)) + +/* + * The second argument is optional (default 0), so we use a variadic macro + * to make the shorthand. + * + * Beware: Do not apply this to functions which may return + * ERR_PTRs. Also, it is probably unwise to apply it to functions + * returning extra information in the low bits (but in that case the + * compiler should see some alignment anyway, when the return value is + * massaged by 'flags = ptr & 3; ptr &= ~3;'). + * + * Optional: only supported since gcc >= 4.9 + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-assume_005faligned-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#assume-aligned + */ +#if __has_attribute(__assume_aligned__) +# define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) +#else +# define __assume_aligned(a, ...) +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-cold-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-cold-label-attribute + */ +#define __cold __attribute__((__cold__)) + +/* + * Note the long name. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-const-function-attribute + */ +#define __attribute_const__ __attribute__((__const__)) + +/* + * Don't. Just don't. See commit 771c035372a0 ("deprecate the '__deprecated' + * attribute warnings entirely and for good") for more information. 
+ * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-deprecated-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-deprecated-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-deprecated-variable-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Enumerator-Attributes.html#index-deprecated-enumerator-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#deprecated + */ +#define __deprecated + +/* + * Optional: only supported since gcc >= 5.1 + * Optional: not supported by clang + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-designated_005finit-type-attribute + */ +#if __has_attribute(__designated_init__) +# define __designated_init __attribute__((__designated_init__)) +#else +# define __designated_init +#endif + +/* + * Optional: not supported by clang + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-externally_005fvisible-function-attribute + */ +#if __has_attribute(__externally_visible__) +# define __visible __attribute__((__externally_visible__)) +#else +# define __visible +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-format-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#format + */ +#define __printf(a, b) __attribute__((__format__(printf, a, b))) +#define __scanf(a, b) __attribute__((__format__(scanf, a, b))) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-gnu_005finline-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#gnu-inline + */ +#define __gnu_inline __attribute__((__gnu_inline__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute + */ +#define __malloc __attribute__((__malloc__)) + +/* 
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-mode-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-mode-variable-attribute + */ +#define __mode(x) __attribute__((__mode__(x))) + +/* + * Optional: not supported by clang + * Note: icc does not recognize gcc's no-tracer + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noclone-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-optimize-function-attribute + */ +#if __has_attribute(__noclone__) +# if __has_attribute(__optimize__) +# define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) +# else +# define __noclone __attribute__((__noclone__)) +# endif +#else +# define __noclone +#endif + +/* + * Note the missing underscores. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noinline-function-attribute + * clang: mentioned + */ +#define noinline __attribute__((__noinline__)) + +/* + * Optional: only supported since gcc >= 8 + * Optional: not supported by clang + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-nonstring-variable-attribute + */ +#if __has_attribute(__nonstring__) +# define __nonstring __attribute__((__nonstring__)) +#else +# define __nonstring +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noreturn-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#noreturn + * clang: https://clang.llvm.org/docs/AttributeReference.html#id1 + */ +#define __noreturn __attribute__((__noreturn__)) + +/* + * Optional: only supported since gcc >= 4.8 + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-no_005fsanitize_005faddress-function-attribute + * clang: 
https://clang.llvm.org/docs/AttributeReference.html#no-sanitize-address-no-address-safety-analysis + */ +#if __has_attribute(__no_sanitize_address__) +# define __no_sanitize_address __attribute__((__no_sanitize_address__)) +#else +# define __no_sanitize_address +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-packed-type-attribute + * clang: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-packed-variable-attribute + */ +#define __packed __attribute__((__packed__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-pure-function-attribute + */ +#define __pure __attribute__((__pure__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-section-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-section-variable-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate + */ +#define __section(S) __attribute__((__section__(#S))) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-unused-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-unused-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-unused-variable-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-unused-label-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#maybe-unused-unused + */ +#define __always_unused __attribute__((__unused__)) +#define __maybe_unused __attribute__((__unused__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-used-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-used-variable-attribute + */ +#define __used __attribute__((__used__)) + +/* + * gcc: 
https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute + */ +#define __weak __attribute__((__weak__)) + +#endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 97cfe29b3f0a..3439d7d0249a 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPILER_TYPES_H #define __LINUX_COMPILER_TYPES_H @@ -54,6 +55,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); #ifdef __KERNEL__ +/* Attributes */ +#include <linux/compiler_attributes.h> + /* Compiler specific macros. */ #ifdef __clang__ #include <linux/compiler-clang.h> @@ -78,12 +82,6 @@ extern void __chk_io_ptr(const volatile void __iomem *); #include <asm/compiler.h> #endif -/* - * Generic compiler-independent macros required for kernel - * build go below this comment. Actual compiler/compiler version - * specific implementations come from the above header files - */ - struct ftrace_branch_data { const char *func; const char *file; @@ -106,10 +104,6 @@ struct ftrace_likely_data { unsigned long constant; }; -/* Don't. Just don't. */ -#define __deprecated -#define __deprecated_for_modules - #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ @@ -119,10 +113,6 @@ struct ftrace_likely_data { * compilers. We don't consider that to be an error, so set them to nothing. * For example, some of them are for compiler specific plugins. */ -#ifndef __designated_init -# define __designated_init -#endif - #ifndef __latent_entropy # define __latent_entropy #endif @@ -140,17 +130,6 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif -#ifndef __visible -#define __visible -#endif - -/* - * Assume alignment of return value. - */ -#ifndef __assume_aligned -#define __assume_aligned(a, ...) 
-#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) @@ -159,14 +138,6 @@ struct ftrace_likely_data { (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) -#ifndef __attribute_const__ -#define __attribute_const__ __attribute__((__const__)) -#endif - -#ifndef __noclone -#define __noclone -#endif - /* Helpers for emitting diagnostics in pragmas. */ #ifndef __diag #define __diag(string) @@ -186,43 +157,16 @@ struct ftrace_likely_data { #define __diag_error(compiler, version, option, comment) \ __diag_ ## compiler(version, error, option) -/* - * From the GCC manual: - * - * Many functions have no effects except the return value and their - * return value depends only on the parameters and/or global - * variables. Such a function can be subject to common subexpression - * elimination and loop optimization just as an arithmetic operator - * would be. - * [...] 
- */ -#define __pure __attribute__((pure)) -#define __aligned(x) __attribute__((aligned(x))) -#define __printf(a, b) __attribute__((format(printf, a, b))) -#define __scanf(a, b) __attribute__((format(scanf, a, b))) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) -#define __mode(x) __attribute__((mode(x))) -#define __malloc __attribute__((__malloc__)) -#define __used __attribute__((__used__)) -#define __noreturn __attribute__((noreturn)) -#define __packed __attribute__((packed)) -#define __weak __attribute__((weak)) -#define __alias(symbol) __attribute__((alias(#symbol))) -#define __cold __attribute__((cold)) -#define __section(S) __attribute__((__section__(#S))) - - #ifdef CONFIG_ENABLE_MUST_CHECK -#define __must_check __attribute__((warn_unused_result)) +#define __must_check __attribute__((__warn_unused_result__)) #else #define __must_check #endif -#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) +#if defined(CC_USING_HOTPATCH) #define notrace __attribute__((hotpatch(0, 0))) #else -#define notrace __attribute__((no_instrument_function)) +#define notrace __attribute__((__no_instrument_function__)) #endif /* @@ -231,23 +175,11 @@ struct ftrace_likely_data { * stack and frame pointer being set up and there is no chance to * restore the lr register to the value before mcount was called. */ -#define __naked __attribute__((naked)) notrace +#define __naked __attribute__((__naked__)) notrace #define __compiler_offsetof(a, b) __builtin_offsetof(a, b) /* - * Feature detection for gnu_inline (gnu89 extern inline semantics). Either - * __GNUC_STDC_INLINE__ is defined (not using gnu89 extern inline semantics, - * and we opt in to the gnu89 semantics), or __GNUC_STDC_INLINE__ is not - * defined so the gnu89 semantics are the default. 
- */ -#ifdef __GNUC_STDC_INLINE__ -# define __gnu_inline __attribute__((gnu_inline)) -#else -# define __gnu_inline -#endif - -/* * Force always-inline if the user requests it so via the .config. * GCC does not warn about unused static inline functions for * -Wunused-function. This turns out to avoid the need for complex #ifdef @@ -258,22 +190,20 @@ struct ftrace_likely_data { * semantics rather than c99. This prevents multiple symbol definition errors * of extern inline functions at link time. * A lot of inline functions can cause havoc with function tracing. + * Do not use __always_inline here, since currently it expands to inline again + * (which would break users of __always_inline). */ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) -#define inline \ - inline __attribute__((always_inline, unused)) notrace __gnu_inline +#define inline inline __attribute__((__always_inline__)) __gnu_inline \ + __maybe_unused notrace #else -#define inline inline __attribute__((unused)) notrace __gnu_inline +#define inline inline __gnu_inline \ + __maybe_unused notrace #endif #define __inline__ inline -#define __inline inline -#define noinline __attribute__((noinline)) - -#ifndef __always_inline -#define __always_inline inline __attribute__((always_inline)) -#endif +#define __inline inline /* * Rather then using noinline to prevent stack consumption, use diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index caf40ad0bbc6..e0cd2baa8380 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -126,6 +126,7 @@ enum cpuhp_state { CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, + CPUHP_AP_CSKY_TIMER_STARTING, CPUHP_AP_KVM_STARTING, CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, CPUHP_AP_KVM_ARM_VGIC_STARTING, diff --git a/include/linux/fs.h b/include/linux/fs.h index 8252df30b9a1..c95c0807471f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1752,6 
+1752,25 @@ struct block_device_operations; #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) +/* + * These flags control the behavior of the remap_file_range function pointer. + * If it is called with len == 0 that means "remap to end of source file". + * See Documentation/filesystems/vfs.txt for more details about this call. + * + * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) + * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request + */ +#define REMAP_FILE_DEDUP (1 << 0) +#define REMAP_FILE_CAN_SHORTEN (1 << 1) + +/* + * These flags signal that the caller is ok with altering various aspects of + * the behavior of the remap operation. The changes must be made by the + * implementation; the vfs remap helper functions can take advantage of them. + * Flags in this category exist to preserve the quirky behavior of the hoisted + * btrfs clone/dedupe ioctls. + */ +#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; @@ -1790,10 +1809,9 @@ struct file_operations { #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, - u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, - u64); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; @@ -1856,21 +1874,21 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, loff_t *, rwf_t); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe); -extern int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct 
file *file_out, loff_t pos_out, u64 len); -extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); -extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same); +extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, + unsigned int remap_flags); +extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); +extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); -extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, - u64 len); +extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags); struct super_operations { @@ -2998,6 +3016,9 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); +extern int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 24bcc5eec6b4..76f8db0b0e71 100644 --- a/include/linux/gfp.h +++ 
b/include/linux/gfp.h @@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ - alloc_pages(gfp_mask, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) + alloc_pages_vma(gfp_mask, 0, vma, addr, node) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 05d8fb5a06c4..bc9af551fc83 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -17,6 +17,9 @@ #ifdef CONFIG_KEYS +struct kernel_pkey_query; +struct kernel_pkey_params; + /* * key under-construction record * - passed to the request_key actor if supplied @@ -155,6 +158,14 @@ struct key_type { */ struct key_restriction *(*lookup_restriction)(const char *params); + /* Asymmetric key accessor functions. 
*/ + int (*asym_query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); + int (*asym_eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); + int (*asym_verify_signature)(struct kernel_pkey_params *params, + const void *in, const void *in2); + /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h new file mode 100644 index 000000000000..c7c48c79ce0e --- /dev/null +++ b/include/linux/keyctl.h @@ -0,0 +1,46 @@ +/* keyctl kernel bits + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef __LINUX_KEYCTL_H +#define __LINUX_KEYCTL_H + +#include <uapi/linux/keyctl.h> + +struct kernel_pkey_query { + __u32 supported_ops; /* Which ops are supported */ + __u32 key_size; /* Size of the key in bits */ + __u16 max_data_size; /* Maximum size of raw data to sign in bytes */ + __u16 max_sig_size; /* Maximum size of signature in bytes */ + __u16 max_enc_size; /* Maximum size of encrypted blob in bytes */ + __u16 max_dec_size; /* Maximum size of decrypted blob in bytes */ +}; + +enum kernel_pkey_operation { + kernel_pkey_encrypt, + kernel_pkey_decrypt, + kernel_pkey_sign, + kernel_pkey_verify, +}; + +struct kernel_pkey_params { + struct key *key; + const char *encoding; /* Encoding (eg. "oaep" or "raw" for none) */ + const char *hash_algo; /* Digest algorithm used (eg. 
"sha1") or NULL if N/A */ + char *info; /* Modified info string to be released later */ + __u32 in_len; /* Input data size */ + union { + __u32 out_len; /* Output buffer size (enc/dec/sign) */ + __u32 in2_len; /* 2nd input data size (verify) */ + }; + enum kernel_pkey_operation op : 8; +}; + +#endif /* __LINUX_KEYCTL_H */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5228c62af416..bac395f1d00a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/include/linux/notifier.h b/include/linux/notifier.h index f35c7bf76143..0096a05395e3 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -122,8 +122,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #ifdef CONFIG_TREE_SRCU #define _SRCU_NOTIFIER_HEAD(name, mod) \ - static DEFINE_PER_CPU(struct srcu_data, \ - name##_head_srcu_data); \ + static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) diff --git a/include/linux/sched.h b/include/linux/sched.h index 8f8a5418b627..a51c13c2b1a0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1200,6 +1200,11 @@ struct task_struct { void *security; #endif +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + unsigned long lowest_stack; + unsigned long prev_lowest_stack; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h new file mode 100644 index 000000000000..3d5c3271a9a8 --- /dev/null +++ b/include/linux/stackleak.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_STACKLEAK_H +#define _LINUX_STACKLEAK_H + +#include <linux/sched.h> +#include <linux/sched/task_stack.h> + +/* + * Check that the poison value points to the unused hole in the + * virtual memory map for your platform. + */ +#define STACKLEAK_POISON -0xBEEF +#define STACKLEAK_SEARCH_DEPTH 128 + +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#include <asm/stacktrace.h> + +static inline void stackleak_task_init(struct task_struct *t) +{ + t->lowest_stack = (unsigned long)end_of_stack(t) + sizeof(unsigned long); +# ifdef CONFIG_STACKLEAK_METRICS + t->prev_lowest_stack = t->lowest_stack; +# endif +} + +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + +#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ +static inline void stackleak_task_init(struct task_struct *t) { } +#endif + +#endif diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index 131424cefc6a..02c0412e368c 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -107,8 +107,8 @@ struct krb5_ctx { u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ u8 cksum[GSS_KRB5_MAX_KEYLEN]; s32 endtime; - u32 seq_send; - u64 seq_send64; + atomic_t seq_send; + atomic64_t seq_send64; struct xdr_netobj mech_used; u8 initiator_sign[GSS_KRB5_MAX_KEYLEN]; u8 acceptor_sign[GSS_KRB5_MAX_KEYLEN]; @@ -118,9 +118,6 @@ struct krb5_ctx { u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; }; -extern u32 gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx); -extern u64 gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx); - /* The length of the Kerberos GSS token header */ #define GSS_KRB5_TOK_HDR_LEN (16) diff --git a/include/linux/uio.h b/include/linux/uio.h index 
422b1c01ee0d..55ce99ddb912 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -21,15 +21,16 @@ struct kvec { size_t iov_len; }; -enum { +enum iter_type { ITER_IOVEC = 0, ITER_KVEC = 2, ITER_BVEC = 4, ITER_PIPE = 8, + ITER_DISCARD = 16, }; struct iov_iter { - int type; + unsigned int type; size_t iov_offset; size_t count; union { @@ -47,6 +48,41 @@ struct iov_iter { }; }; +static inline enum iter_type iov_iter_type(const struct iov_iter *i) +{ + return i->type & ~(READ | WRITE); +} + +static inline bool iter_is_iovec(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_IOVEC; +} + +static inline bool iov_iter_is_kvec(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_KVEC; +} + +static inline bool iov_iter_is_bvec(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_BVEC; +} + +static inline bool iov_iter_is_pipe(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_PIPE; +} + +static inline bool iov_iter_is_discard(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_DISCARD; +} + +static inline unsigned char iov_iter_rw(const struct iov_iter *i) +{ + return i->type & (READ | WRITE); +} + /* * Total number of bytes covered by an iovec. 
* @@ -74,7 +110,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) } #define iov_for_each(iov, iter, start) \ - if (!((start).type & (ITER_BVEC | ITER_PIPE))) \ + if (iov_iter_type(start) == ITER_IOVEC || \ + iov_iter_type(start) == ITER_KVEC) \ for (iter = (start); \ (iter).count && \ ((iov = iov_iter_iovec(&(iter))), 1); \ @@ -181,14 +218,15 @@ size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i) size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); -void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, +void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); -void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, +void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); -void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec, +void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); -void iov_iter_pipe(struct iov_iter *i, int direction, struct pipe_inode_info *pipe, +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count); +void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, @@ -202,19 +240,6 @@ static inline size_t iov_iter_count(const struct iov_iter *i) return i->count; } -static inline bool iter_is_iovec(const struct iov_iter *i) -{ - return !(i->type & (ITER_BVEC | ITER_KVEC | ITER_PIPE)); -} - -/* - * Get one of READ or WRITE out of 
iter->type without any other flags OR'd in - * with it. - * - * The ?: is just for type safety. - */ -#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & (READ | WRITE)) - /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 738a0c24874f..fdfd04e348f6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,8 +246,7 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup - * writeback context. Must be called after the bio has been associated with - * a device. + * writeback context. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -258,7 +257,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. 
*/ if (wbc->wb) - bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); + bio_associate_blkcg(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index d0a341bc4540..33d291888ba9 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -54,6 +54,35 @@ enum afs_fs_operation { afs_FS_StoreData64 = 65538, /* AFS Store file data */ afs_FS_GiveUpAllCallBacks = 65539, /* AFS Give up all our callbacks on a server */ afs_FS_GetCapabilities = 65540, /* AFS Get FS server capabilities */ + + yfs_FS_FetchData = 130, /* YFS Fetch file data */ + yfs_FS_FetchACL = 64131, /* YFS Fetch file ACL */ + yfs_FS_FetchStatus = 64132, /* YFS Fetch file status */ + yfs_FS_StoreACL = 64134, /* YFS Store file ACL */ + yfs_FS_StoreStatus = 64135, /* YFS Store file status */ + yfs_FS_RemoveFile = 64136, /* YFS Remove a file */ + yfs_FS_CreateFile = 64137, /* YFS Create a file */ + yfs_FS_Rename = 64138, /* YFS Rename or move a file or directory */ + yfs_FS_Symlink = 64139, /* YFS Create a symbolic link */ + yfs_FS_Link = 64140, /* YFS Create a hard link */ + yfs_FS_MakeDir = 64141, /* YFS Create a directory */ + yfs_FS_RemoveDir = 64142, /* YFS Remove a directory */ + yfs_FS_GetVolumeStatus = 64149, /* YFS Get volume status information */ + yfs_FS_SetVolumeStatus = 64150, /* YFS Set volume status information */ + yfs_FS_SetLock = 64156, /* YFS Request a file lock */ + yfs_FS_ExtendLock = 64157, /* YFS Extend a file lock */ + yfs_FS_ReleaseLock = 64158, /* YFS Release a file lock */ + yfs_FS_Lookup = 64161, /* YFS lookup file in directory */ + yfs_FS_FlushCPS = 64165, + yfs_FS_FetchOpaqueACL = 64168, + yfs_FS_WhoAmI = 64170, + yfs_FS_RemoveACL = 64171, + yfs_FS_RemoveFile2 = 64173, + yfs_FS_StoreOpaqueACL2 = 64174, + yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */ + yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */ + yfs_FS_StoreData64 = 64538, /* YFS Store 
file data */ + yfs_FS_UpdateSymlink = 64540, }; enum afs_vl_operation { @@ -84,6 +113,44 @@ enum afs_edit_dir_reason { afs_edit_dir_for_unlink, }; +enum afs_eproto_cause { + afs_eproto_bad_status, + afs_eproto_cb_count, + afs_eproto_cb_fid_count, + afs_eproto_file_type, + afs_eproto_ibulkst_cb_count, + afs_eproto_ibulkst_count, + afs_eproto_motd_len, + afs_eproto_offline_msg_len, + afs_eproto_volname_len, + afs_eproto_yvl_fsendpt4_len, + afs_eproto_yvl_fsendpt6_len, + afs_eproto_yvl_fsendpt_num, + afs_eproto_yvl_fsendpt_type, + afs_eproto_yvl_vlendpt4_len, + afs_eproto_yvl_vlendpt6_len, + afs_eproto_yvl_vlendpt_type, +}; + +enum afs_io_error { + afs_io_error_cm_reply, + afs_io_error_extract, + afs_io_error_fs_probe_fail, + afs_io_error_vl_lookup_fail, + afs_io_error_vl_probe_fail, +}; + +enum afs_file_error { + afs_file_error_dir_bad_magic, + afs_file_error_dir_big, + afs_file_error_dir_missing_page, + afs_file_error_dir_over_end, + afs_file_error_dir_small, + afs_file_error_dir_unmarked_ext, + afs_file_error_mntpt, + afs_file_error_writeback_fail, +}; + #endif /* end __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY */ /* @@ -119,7 +186,34 @@ enum afs_edit_dir_reason { EM(afs_FS_FetchData64, "FS.FetchData64") \ EM(afs_FS_StoreData64, "FS.StoreData64") \ EM(afs_FS_GiveUpAllCallBacks, "FS.GiveUpAllCallBacks") \ - E_(afs_FS_GetCapabilities, "FS.GetCapabilities") + EM(afs_FS_GetCapabilities, "FS.GetCapabilities") \ + EM(yfs_FS_FetchACL, "YFS.FetchACL") \ + EM(yfs_FS_FetchStatus, "YFS.FetchStatus") \ + EM(yfs_FS_StoreACL, "YFS.StoreACL") \ + EM(yfs_FS_StoreStatus, "YFS.StoreStatus") \ + EM(yfs_FS_RemoveFile, "YFS.RemoveFile") \ + EM(yfs_FS_CreateFile, "YFS.CreateFile") \ + EM(yfs_FS_Rename, "YFS.Rename") \ + EM(yfs_FS_Symlink, "YFS.Symlink") \ + EM(yfs_FS_Link, "YFS.Link") \ + EM(yfs_FS_MakeDir, "YFS.MakeDir") \ + EM(yfs_FS_RemoveDir, "YFS.RemoveDir") \ + EM(yfs_FS_GetVolumeStatus, "YFS.GetVolumeStatus") \ + EM(yfs_FS_SetVolumeStatus, "YFS.SetVolumeStatus") \ + EM(yfs_FS_SetLock, 
"YFS.SetLock") \ + EM(yfs_FS_ExtendLock, "YFS.ExtendLock") \ + EM(yfs_FS_ReleaseLock, "YFS.ReleaseLock") \ + EM(yfs_FS_Lookup, "YFS.Lookup") \ + EM(yfs_FS_FlushCPS, "YFS.FlushCPS") \ + EM(yfs_FS_FetchOpaqueACL, "YFS.FetchOpaqueACL") \ + EM(yfs_FS_WhoAmI, "YFS.WhoAmI") \ + EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \ + EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \ + EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \ + EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \ + EM(yfs_FS_FetchData64, "YFS.FetchData64") \ + EM(yfs_FS_StoreData64, "YFS.StoreData64") \ + E_(yfs_FS_UpdateSymlink, "YFS.UpdateSymlink") #define afs_vl_operations \ EM(afs_VL_GetEntryByNameU, "VL.GetEntryByNameU") \ @@ -146,6 +240,40 @@ enum afs_edit_dir_reason { EM(afs_edit_dir_for_symlink, "Symlnk") \ E_(afs_edit_dir_for_unlink, "Unlink") +#define afs_eproto_causes \ + EM(afs_eproto_bad_status, "BadStatus") \ + EM(afs_eproto_cb_count, "CbCount") \ + EM(afs_eproto_cb_fid_count, "CbFidCount") \ + EM(afs_eproto_file_type, "FileTYpe") \ + EM(afs_eproto_ibulkst_cb_count, "IBS.CbCount") \ + EM(afs_eproto_ibulkst_count, "IBS.FidCount") \ + EM(afs_eproto_motd_len, "MotdLen") \ + EM(afs_eproto_offline_msg_len, "OfflineMsgLen") \ + EM(afs_eproto_volname_len, "VolNameLen") \ + EM(afs_eproto_yvl_fsendpt4_len, "YVL.FsEnd4Len") \ + EM(afs_eproto_yvl_fsendpt6_len, "YVL.FsEnd6Len") \ + EM(afs_eproto_yvl_fsendpt_num, "YVL.FsEndCount") \ + EM(afs_eproto_yvl_fsendpt_type, "YVL.FsEndType") \ + EM(afs_eproto_yvl_vlendpt4_len, "YVL.VlEnd4Len") \ + EM(afs_eproto_yvl_vlendpt6_len, "YVL.VlEnd6Len") \ + E_(afs_eproto_yvl_vlendpt_type, "YVL.VlEndType") + +#define afs_io_errors \ + EM(afs_io_error_cm_reply, "CM_REPLY") \ + EM(afs_io_error_extract, "EXTRACT") \ + EM(afs_io_error_fs_probe_fail, "FS_PROBE_FAIL") \ + EM(afs_io_error_vl_lookup_fail, "VL_LOOKUP_FAIL") \ + E_(afs_io_error_vl_probe_fail, "VL_PROBE_FAIL") + +#define afs_file_errors \ + EM(afs_file_error_dir_bad_magic, "DIR_BAD_MAGIC") \ + EM(afs_file_error_dir_big, 
"DIR_BIG") \ + EM(afs_file_error_dir_missing_page, "DIR_MISSING_PAGE") \ + EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \ + EM(afs_file_error_dir_small, "DIR_SMALL") \ + EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \ + EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \ + E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED") /* * Export enum symbols via userspace. @@ -160,6 +288,9 @@ afs_fs_operations; afs_vl_operations; afs_edit_dir_ops; afs_edit_dir_reasons; +afs_eproto_causes; +afs_io_errors; +afs_file_errors; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -170,17 +301,16 @@ afs_edit_dir_reasons; #define EM(a, b) { a, b }, #define E_(a, b) { a, b } -TRACE_EVENT(afs_recv_data, - TP_PROTO(struct afs_call *call, unsigned count, unsigned offset, +TRACE_EVENT(afs_receive_data, + TP_PROTO(struct afs_call *call, struct iov_iter *iter, bool want_more, int ret), - TP_ARGS(call, count, offset, want_more, ret), + TP_ARGS(call, iter, want_more, ret), TP_STRUCT__entry( + __field(loff_t, remain ) __field(unsigned int, call ) __field(enum afs_call_state, state ) - __field(unsigned int, count ) - __field(unsigned int, offset ) __field(unsigned short, unmarshall ) __field(bool, want_more ) __field(int, ret ) @@ -190,17 +320,18 @@ TRACE_EVENT(afs_recv_data, __entry->call = call->debug_id; __entry->state = call->state; __entry->unmarshall = call->unmarshall; - __entry->count = count; - __entry->offset = offset; + __entry->remain = iov_iter_count(iter); __entry->want_more = want_more; __entry->ret = ret; ), - TP_printk("c=%08x s=%u u=%u %u/%u wm=%u ret=%d", + TP_printk("c=%08x r=%llu u=%u w=%u s=%u ret=%d", __entry->call, - __entry->state, __entry->unmarshall, - __entry->offset, __entry->count, - __entry->want_more, __entry->ret) + __entry->remain, + __entry->unmarshall, + __entry->want_more, + __entry->state, + __entry->ret) ); TRACE_EVENT(afs_notify_call, @@ -301,7 +432,7 @@ TRACE_EVENT(afs_make_fs_call, } ), - TP_printk("c=%08x 
%06x:%06x:%06x %s", + TP_printk("c=%08x %06llx:%06llx:%06x %s", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -555,24 +686,70 @@ TRACE_EVENT(afs_edit_dir, ); TRACE_EVENT(afs_protocol_error, - TP_PROTO(struct afs_call *call, int error, const void *where), + TP_PROTO(struct afs_call *call, int error, enum afs_eproto_cause cause), + + TP_ARGS(call, error, cause), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(int, error ) + __field(enum afs_eproto_cause, cause ) + ), + + TP_fast_assign( + __entry->call = call ? call->debug_id : 0; + __entry->error = error; + __entry->cause = cause; + ), + + TP_printk("c=%08x r=%d %s", + __entry->call, __entry->error, + __print_symbolic(__entry->cause, afs_eproto_causes)) + ); + +TRACE_EVENT(afs_io_error, + TP_PROTO(unsigned int call, int error, enum afs_io_error where), TP_ARGS(call, error, where), TP_STRUCT__entry( __field(unsigned int, call ) __field(int, error ) - __field(const void *, where ) + __field(enum afs_io_error, where ) ), TP_fast_assign( - __entry->call = call ? 
call->debug_id : 0; + __entry->call = call; + __entry->error = error; + __entry->where = where; + ), + + TP_printk("c=%08x r=%d %s", + __entry->call, __entry->error, + __print_symbolic(__entry->where, afs_io_errors)) + ); + +TRACE_EVENT(afs_file_error, + TP_PROTO(struct afs_vnode *vnode, int error, enum afs_file_error where), + + TP_ARGS(vnode, error, where), + + TP_STRUCT__entry( + __field_struct(struct afs_fid, fid ) + __field(int, error ) + __field(enum afs_file_error, where ) + ), + + TP_fast_assign( + __entry->fid = vnode->fid; __entry->error = error; __entry->where = where; ), - TP_printk("c=%08x r=%d sp=%pSR", - __entry->call, __entry->error, __entry->where) + TP_printk("%llx:%llx:%x r=%d %s", + __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, + __entry->error, + __print_symbolic(__entry->where, afs_file_errors)) ); TRACE_EVENT(afs_cm_no_server, diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 0f3cb13db8e9..f45ee0f69c0c 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -61,6 +61,11 @@ #define KEYCTL_INVALIDATE 21 /* invalidate a key */ #define KEYCTL_GET_PERSISTENT 22 /* get a user's persistent keyring */ #define KEYCTL_DH_COMPUTE 23 /* Compute Diffie-Hellman values */ +#define KEYCTL_PKEY_QUERY 24 /* Query public key parameters */ +#define KEYCTL_PKEY_ENCRYPT 25 /* Encrypt a blob using a public key */ +#define KEYCTL_PKEY_DECRYPT 26 /* Decrypt a blob using a public key */ +#define KEYCTL_PKEY_SIGN 27 /* Create a public key signature */ +#define KEYCTL_PKEY_VERIFY 28 /* Verify a public key signature */ #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ /* keyctl structures */ @@ -82,4 +87,29 @@ struct keyctl_kdf_params { __u32 __spare[8]; }; +#define KEYCTL_SUPPORTS_ENCRYPT 0x01 +#define KEYCTL_SUPPORTS_DECRYPT 0x02 +#define KEYCTL_SUPPORTS_SIGN 0x04 +#define KEYCTL_SUPPORTS_VERIFY 0x08 + +struct keyctl_pkey_query { + __u32 supported_ops; /* Which ops are 
supported */ + __u32 key_size; /* Size of the key in bits */ + __u16 max_data_size; /* Maximum size of raw data to sign in bytes */ + __u16 max_sig_size; /* Maximum size of signature in bytes */ + __u16 max_enc_size; /* Maximum size of encrypted blob in bytes */ + __u16 max_dec_size; /* Maximum size of decrypted blob in bytes */ + __u32 __spare[10]; +}; + +struct keyctl_pkey_params { + __s32 key_id; /* Serial no. of public key to use */ + __u32 in_len; /* Input data size */ + union { + __u32 out_len; /* Output buffer size (encrypt/decrypt/sign) */ + __u32 in2_len; /* 2nd input data size (verify) */ + }; + __u32 __spare[7]; +}; + #endif /* _LINUX_KEYCTL_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index f35eb72739c0..9de8780ac8d9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -646,10 +646,12 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* * These PERF_RECORD_MISC_* flags below are safely reused diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 13b8cb563892..a1966cd7b677 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -34,15 +34,23 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ +#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ /* 
Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 +#define VIRTIO_BALLOON_CMD_ID_STOP 0 +#define VIRTIO_BALLOON_CMD_ID_DONE 1 struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ __u32 num_pages; /* Number of pages we've actually got in balloon. */ __u32 actual; + /* Free page report command id, readonly by guest */ + __u32 free_page_report_cmd_id; + /* Stores PAGE_POISON if page poisoning is in use */ + __u32 poison_val; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ diff --git a/kernel/Makefile b/kernel/Makefile index 7a63d567fdb5..7343b3a9bff0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -117,6 +117,10 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o +KASAN_SANITIZE_stackleak.o := n +KCOV_INSTRUMENT_stackleak.o := n + $(obj)/configs.o: $(obj)/config_data.h targets += config_data.gz diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8b79318810ad..6aaf5dd5383b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -493,7 +493,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -502,8 +502,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. 
*/ -static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -524,35 +524,6 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, } /** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get the effective css of @cgrp for @ss. The effective css is - * defined as the matching css of the nearest ancestor including self which - * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, - * the root css is returned, so this function always returns a valid css. - * - * The returned css is not guaranteed to be online, and therefore it is the - * callers responsiblity to tryget a reference for it. - */ -struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - do { - css = cgroup_css(cgrp, ss); - - if (css) - return css; - cgrp = cgroup_parent(cgrp); - } while (cgrp); - - return init_css_set.subsys[ss->id]; -} - -/** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest @@ -634,11 +605,10 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. 
*/ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1037,7 +1007,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ - template[i] = cgroup_e_css_by_mask(cgrp, ss); + template[i] = cgroup_e_css(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -3054,7 +3024,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css_by_mask() results reflect the new csses + * At this point, cgroup_e_css() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. 
*/ diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 108fecc20fc1..208481d91090 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -20,6 +20,7 @@ CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y CONFIG_S390_GUEST=y CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y CONFIG_VIRTIO_CONSOLE=y diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c490130c4fb..84530ab358c3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -750,7 +750,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) __update_cgrp_time(event->cgrp); } diff --git a/kernel/fork.c b/kernel/fork.c index 8f82a3bdcb8f..07cddff89c7b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> +#include <linux/stackleak.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -1926,6 +1927,8 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + stackleak_task_init(p); + if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 6e6d467f3dec..1f0985adf193 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -8,7 +8,7 @@ #include <linux/cpu.h> #include <linux/irq.h> -#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long)) +#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS)) struct cpumap { unsigned int available; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c6a3b6851372..35cf0ad29718 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -25,8 +25,6 @@ #include <linux/elf.h> #include 
<linux/elfcore.h> #include <linux/kernel.h> -#include <linux/kexec.h> -#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2e2955a8cf8f..a21ea6021929 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1561,7 +1561,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * We may dequeue prev's rt_rq in put_prev_task(). - * So, we update time before rt_nr_running check. + * So, we update time before rt_queued check. */ if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9d74371e4aad..8d7f15ba5916 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1337,7 +1337,7 @@ void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); if (!sched_domains_numa_distance) return; diff --git a/kernel/stackleak.c b/kernel/stackleak.c new file mode 100644 index 000000000000..e42892926244 --- /dev/null +++ b/kernel/stackleak.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code fills the used part of the kernel stack with a poison value + * before returning to userspace. It's part of the STACKLEAK feature + * ported from grsecurity/PaX. + * + * Author: Alexander Popov <alex.popov@linux.com> + * + * STACKLEAK reduces the information which kernel stack leak bugs can + * reveal and blocks some uninitialized stack variable attacks. 
+ */ + +#include <linux/stackleak.h> + +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#include <linux/jump_label.h> +#include <linux/sysctl.h> + +static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); + +int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + int state = !static_branch_unlikely(&stack_erasing_bypass); + int prev_state = state; + + table->data = &state; + table->maxlen = sizeof(int); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + state = !!state; + if (ret || !write || state == prev_state) + return ret; + + if (state) + static_branch_disable(&stack_erasing_bypass); + else + static_branch_enable(&stack_erasing_bypass); + + pr_warn("stackleak: kernel stack erasing is %s\n", + state ? "enabled" : "disabled"); + return ret; +} + +#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) +#else +#define skip_erasing() false +#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ + +asmlinkage void stackleak_erase(void) +{ + /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ + unsigned long kstack_ptr = current->lowest_stack; + unsigned long boundary = (unsigned long)end_of_stack(current); + unsigned int poison_count = 0; + const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); + + if (skip_erasing()) + return; + + /* Check that 'lowest_stack' value is sane */ + if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) + kstack_ptr = boundary; + + /* Search for the poison value in the kernel stack */ + while (kstack_ptr > boundary && poison_count <= depth) { + if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON) + poison_count++; + else + poison_count = 0; + + kstack_ptr -= sizeof(unsigned long); + } + + /* + * One 'long int' at the bottom of the thread stack is reserved and + * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y). 
+ */ + if (kstack_ptr == boundary) + kstack_ptr += sizeof(unsigned long); + +#ifdef CONFIG_STACKLEAK_METRICS + current->prev_lowest_stack = kstack_ptr; +#endif + + /* + * Now write the poison value to the kernel stack. Start from + * 'kstack_ptr' and move up till the new 'boundary'. We assume that + * the stack pointer doesn't change when we write poison. + */ + if (on_thread_stack()) + boundary = current_stack_pointer; + else + boundary = current_top_of_stack(); + + while (kstack_ptr < boundary) { + *(unsigned long *)kstack_ptr = STACKLEAK_POISON; + kstack_ptr += sizeof(unsigned long); + } + + /* Reset the 'lowest_stack' value for the next syscall */ + current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; +} + +void __used stackleak_track_stack(void) +{ + /* + * N.B. stackleak_erase() fills the kernel stack with the poison value, + * which has the register width. That code assumes that the value + * of 'lowest_stack' is aligned on the register width boundary. + * + * That is true for x86 and x86_64 because of the kernel stack + * alignment on these platforms (for details, see 'cc_stack_align' in + * arch/x86/Makefile). Take care of that when you port STACKLEAK to + * new platforms. + */ + unsigned long sp = (unsigned long)&sp; + + /* + * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than + * STACKLEAK_SEARCH_DEPTH makes the poison search in + * stackleak_erase() unreliable. Let's prevent that. 
+ */ + BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + + if (sp < current->lowest_stack && + sp >= (unsigned long)task_stack_page(current) + + sizeof(unsigned long)) { + current->lowest_stack = sp; + } +} +EXPORT_SYMBOL(stackleak_track_stack); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc02050fd0c4..5fc724e4e454 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,7 +66,6 @@ #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> -#include <linux/pipe_fs_i.h> #include <linux/uaccess.h> #include <asm/processor.h> @@ -91,7 +90,9 @@ #ifdef CONFIG_CHR_DEV_SG #include <scsi/sg.h> #endif - +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#include <linux/stackleak.h> +#endif #ifdef CONFIG_LOCKUP_DETECTOR #include <linux/nmi.h> #endif @@ -1233,6 +1234,17 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/kernel/time/time.c b/kernel/time/time.c index e3a7f7fd3abc..ad204cf6d001 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -842,7 +842,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..2868d85f1fb1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_blkg) + if (!bio->bi_css) return NULL; - return 
cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_get_kernfs_id(bio->bi_css->cgroup); } #else static union kernfs_node_id * diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 8be175df3075..7ebccb5c1637 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -83,6 +83,7 @@ const struct kvec *kvec; \ struct kvec v; \ iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else if (unlikely(i->type & ITER_DISCARD)) { \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -114,6 +115,8 @@ } \ i->nr_segs -= kvec - i->kvec; \ i->kvec = kvec; \ + } else if (unlikely(i->type & ITER_DISCARD)) { \ + skip += n; \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -428,17 +431,19 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) } EXPORT_SYMBOL(iov_iter_fault_in_readable); -void iov_iter_init(struct iov_iter *i, int direction, +void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { + WARN_ON(direction & ~(READ | WRITE)); + direction &= READ | WRITE; + /* It will get better. Eventually... 
*/ if (uaccess_kernel()) { - direction |= ITER_KVEC; - i->type = direction; + i->type = ITER_KVEC | direction; i->kvec = (struct kvec *)iov; } else { - i->type = direction; + i->type = ITER_IOVEC | direction; i->iov = iov; } i->nr_segs = nr_segs; @@ -558,7 +563,7 @@ static size_t copy_pipe_to_iter(const void *addr, size_t bytes, size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); @@ -658,7 +663,7 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) const char *from = addr; unsigned long rem, curr_addr, s_addr = (unsigned long) addr; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter_mcsafe(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); @@ -692,7 +697,7 @@ EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -712,7 +717,7 @@ EXPORT_SYMBOL(_copy_from_iter); bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -739,7 +744,7 @@ EXPORT_SYMBOL(_copy_from_iter_full); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -773,7 +778,7 @@ EXPORT_SYMBOL(_copy_from_iter_nocache); size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -794,7 +799,7 @@ 
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -836,7 +841,9 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else if (likely(!(i->type & ITER_PIPE))) + } else if (unlikely(iov_iter_is_discard(i))) + return bytes; + else if (likely(!iov_iter_is_pipe(i))) return copy_page_to_iter_iovec(page, offset, bytes, i); else return copy_page_to_iter_pipe(page, offset, bytes, i); @@ -848,7 +855,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, { if (unlikely(!page_copy_sane(page, offset, bytes))) return 0; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return 0; } @@ -888,7 +895,7 @@ static size_t pipe_zero(size_t bytes, struct iov_iter *i) size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, clear_user(v.iov_base, v.iov_len), @@ -908,7 +915,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, kunmap_atomic(kaddr); return 0; } - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { kunmap_atomic(kaddr); WARN_ON(1); return 0; @@ -972,10 +979,14 @@ static void pipe_advance(struct iov_iter *i, size_t size) void iov_iter_advance(struct iov_iter *i, size_t size) { - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { pipe_advance(i, size); return; } + if (unlikely(iov_iter_is_discard(i))) { + i->count -= size; + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -987,7 +998,7 @@ void 
iov_iter_revert(struct iov_iter *i, size_t unroll) if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; int idx = i->idx; size_t off = i->iov_offset; @@ -1011,12 +1022,14 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) pipe_truncate(i); return; } + if (unlikely(iov_iter_is_discard(i))) + return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; - if (i->type & ITER_BVEC) { + if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; @@ -1049,23 +1062,25 @@ EXPORT_SYMBOL(iov_iter_revert); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; - else if (i->type & ITER_BVEC) + if (unlikely(iov_iter_is_discard(i))) + return i->count; + else if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); else return min(i->count, i->iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); -void iov_iter_kvec(struct iov_iter *i, int direction, +void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { - BUG_ON(!(direction & ITER_KVEC)); - i->type = direction; + WARN_ON(direction & ~(READ | WRITE)); + i->type = ITER_KVEC | (direction & (READ | WRITE)); i->kvec = kvec; i->nr_segs = nr_segs; i->iov_offset = 0; @@ -1073,12 +1088,12 @@ void iov_iter_kvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_kvec); -void iov_iter_bvec(struct iov_iter *i, int direction, +void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { - BUG_ON(!(direction & ITER_BVEC)); - i->type = direction; + WARN_ON(direction & 
~(READ | WRITE)); + i->type = ITER_BVEC | (direction & (READ | WRITE)); i->bvec = bvec; i->nr_segs = nr_segs; i->iov_offset = 0; @@ -1086,13 +1101,13 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); -void iov_iter_pipe(struct iov_iter *i, int direction, +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count) { - BUG_ON(direction != ITER_PIPE); + BUG_ON(direction != READ); WARN_ON(pipe->nrbufs == pipe->buffers); - i->type = direction; + i->type = ITER_PIPE | READ; i->pipe = pipe; i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); i->iov_offset = 0; @@ -1101,12 +1116,30 @@ void iov_iter_pipe(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_pipe); +/** + * iov_iter_discard - Initialise an I/O iterator that discards data + * @i: The iterator to initialise. + * @direction: The direction of the transfer. + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator that just discards everything that's written to it. + * It's only available as a READ iterator. 
+ */ +void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) +{ + BUG_ON(direction != READ); + i->type = ITER_DISCARD | READ; + i->count = count; + i->iov_offset = 0; +} +EXPORT_SYMBOL(iov_iter_discard); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; size_t size = i->count; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx])) return size | i->iov_offset; return size; @@ -1125,7 +1158,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return ~0U; } @@ -1193,8 +1226,11 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages(i, pages, maxsize, maxpages, start); + if (unlikely(iov_iter_is_discard(i))) + return -EFAULT; + iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -1205,7 +1241,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); if (unlikely(res < 0)) return res; return (res == n ? 
len : res * PAGE_SIZE) - *start; @@ -1270,8 +1306,11 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages_alloc(i, pages, maxsize, start); + if (unlikely(iov_iter_is_discard(i))) + return -EFAULT; + iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -1283,7 +1322,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); if (unlikely(res < 0)) { kvfree(p); return res; @@ -1313,7 +1352,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return 0; } @@ -1355,7 +1394,7 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return false; } @@ -1400,7 +1439,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); /* for now */ return 0; } @@ -1442,8 +1481,10 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; + if (unlikely(iov_iter_is_discard(i))) + return 0; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; size_t off; int idx; @@ -1481,11 +1522,13 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct 
iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; - if (unlikely(new->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(new))) { WARN_ON(1); return NULL; } - if (new->type & ITER_BVEC) + if (unlikely(iov_iter_is_discard(new))) + return NULL; + if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); diff --git a/mm/filemap.c b/mm/filemap.c index 218d0b2ec82d..81adec8ee02c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2049,7 +2049,7 @@ find_page: !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; /* pipes can't handle partially uptodate pages */ - if (unlikely(iter->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(iter))) goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; @@ -2825,6 +2825,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping, EXPORT_SYMBOL(read_cache_page_gfp); /* + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. + */ +static int generic_access_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; + + if (unlikely(pos >= max_size)) + return -EFBIG; + *count = min(*count, max_size - pos); + return 0; +} + +static int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + loff_t limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, limit - pos); + } + + return generic_access_check_limits(file, pos, count); +} + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. 
@@ -2835,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - unsigned long limit = rlimit(RLIMIT_FSIZE); - loff_t pos; + loff_t count; + int ret; if (!iov_iter_count(from)) return 0; @@ -2845,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - pos = iocb->ki_pos; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - if (limit != RLIM_INFINITY) { - if (iocb->ki_pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - iov_iter_truncate(from, limit - (unsigned long)pos); - } + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); + if (ret) + return ret; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} +EXPORT_SYMBOL(generic_write_checks); + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. 
*/ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_access_check_limits(file_in, pos_in, &count); + if (ret) + return ret; + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; /* - * LFS rule + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. */ - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && - !(file->f_flags & O_LARGEFILE))) { - if (pos >= MAX_NON_LFS) - return -EFBIG; - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; } + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + /* - * Are we about to exceed the fs block limit ? - * - * If we have written data it becomes a short write. If we have - * exceeded without writing data we send a signal and return EFBIG. - * Linus frestrict idea will clean these up nicely.. + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. 
*/ - if (unlikely(pos >= inode->i_sb->s_maxbytes)) - return -EFBIG; + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); - return iov_iter_count(from); + *req_count = count; + return 0; } -EXPORT_SYMBOL(generic_write_checks); int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4e4ef8fa479d..55478ab3c83b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -629,21 +629,40 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + gfp_t this_node = 0; + +#ifdef CONFIG_NUMA + struct mempolicy *pol; + /* + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not + * specified, to express a general desire to stay on the current + * node for optimistic allocation attempts. If the defrag mode + * and/or madvise hint requires the direct reclaim then we prefer + * to fallback to other node rather than node reclaim because that + * can lead to excessive reclaim even though there is free memory + * on other nodes. We expect that NUMA preferences are specified + * by memory policies. + */ + pol = get_vma_policy(vma, addr); + if (pol->mode != MPOL_BIND) + this_node = __GFP_THISNODE; + mpol_cond_put(pol); +#endif if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 
0 : __GFP_NORETRY); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + __GFP_KSWAPD_RECLAIM | this_node); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - 0); - return GFP_TRANSHUGE_LIGHT; + this_node); + return GFP_TRANSHUGE_LIGHT | this_node; } /* Caller must hold page table lock. */ @@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, + haddr, numa_node_id()); } else new_page = NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 54920cbc46bf..6e1469b80cb7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2593,7 +2593,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) struct mem_cgroup *memcg; int ret = 0; - if (memcg_kmem_bypass()) + if (mem_cgroup_disabled() || memcg_kmem_bypass()) return 0; memcg = 
get_mem_cgroup_from_current(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 61972da38d93..2b2b3ccbbfb5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -586,6 +586,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + cond_resched(); ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, altmap); map_offset = 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cfd26d7e61a1..5837a067124d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, - HPAGE_PMD_ORDER); + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, + address, numa_node_id()); if (!thp) return NULL; prep_transhuge_page(thp); @@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); @@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. 
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, int node) { struct mempolicy *pol; struct page *page; @@ -2040,32 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; - - /* - * For hugepage allocation and non-interleave policy which - * allows the current node (or other explicitly preferred - * node) we only try to allocate from the current/preferred - * node and don't fall back to other nodes, as the cost of - * remote accesses would likely offset THP benefits. - * - * If the policy is interleave, or does not allow the current - * node in its nodemask, we allocate the standard way. - */ - if (pol->mode == MPOL_PREFERRED && - !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); - page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); - goto out; - } - } - nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/page_io.c b/mm/page_io.c index a451ffa9491c..d4d1c89bcddd 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -294,7 +294,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, }; struct iov_iter from; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); @@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); - 
bio_associate_blkg_from_page(bio, page); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); diff --git a/mm/page_poison.c b/mm/page_poison.c index f7e2a676365a..f0c15e9017c0 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -17,6 +17,11 @@ static int __init early_page_poison_param(char *buf) } early_param("page_poison", early_page_poison_param); +/** + * page_poisoning_enabled - check if page poisoning is enabled + * + * Return true if page poisoning is enabled, or false if not. + */ bool page_poisoning_enabled(void) { /* @@ -29,6 +34,7 @@ bool page_poisoning_enabled(void) (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } +EXPORT_SYMBOL_GPL(page_poisoning_enabled); static void poison_page(struct page *page) { diff --git a/mm/shmem.c b/mm/shmem.c index 56bf122e0bb4..ea26d7a0342d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); diff --git a/net/9p/client.c b/net/9p/client.c index 5f23e18eecc0..2c9a17b9b46b 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -2066,7 +2066,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) struct kvec kv = {.iov_base = data, .iov_len = count}; struct iov_iter to; - iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count); + iov_iter_kvec(&to, READ, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, (unsigned long long) offset, count); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index eb596c2ed546..b1d39cabf125 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -329,7 +329,7 @@ static int 
p9_get_mapped_pages(struct virtio_chan *chan, if (!iov_iter_count(data)) return 0; - if (!(data->type & ITER_KVEC)) { + if (!iov_iter_is_kvec(data)) { int n; /* * We allow only p9_max_pages pinned. We wait for the diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4e2576fc0c59..828e87fe8027 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -467,7 +467,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, skb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 51c2cf2d8923..58fc6333d412 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -63,7 +63,7 @@ static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *dat memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, total_len); + iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, total_len); l2cap_chan_send(chan, &msg, total_len); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a1c1b7e8a45c..c822e626761b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -622,7 +622,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iv, 2, 1 + len); + iov_iter_kvec(&msg.msg_iter, WRITE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 88e35830198c..57fcc6b4bf6e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -513,7 +513,7 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) if (!buf) msg.msg_flags |= MSG_TRUNC; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); r = 
sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -532,7 +532,7 @@ static int ceph_tcp_recvpage(struct socket *sock, struct page *page, int r; BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -594,7 +594,7 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size); + iov_iter_bvec(&msg.msg_iter, WRITE, &bvec, 1, size); ret = sock_sendmsg(sock, &msg); if (ret == -EAGAIN) ret = 0; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d4020c5e831d..2526be6b3d90 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1616,7 +1616,7 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) EnterFunction(7); /* Receive a packet */ - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, buflen); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 52241d679cc9..89c3a8c7859a 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -286,7 +286,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, */ krflags = MSG_PEEK | MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { @@ -325,7 +325,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, 
&vec, 1, datlen); + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, datlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { diff --git a/net/socket.c b/net/socket.c index 99c96851469f..593826e11a53 100644 --- a/net/socket.c +++ b/net/socket.c @@ -635,7 +635,7 @@ EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -648,7 +648,7 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } @@ -823,7 +823,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, mm_segment_t oldfs = get_fs(); int result; - iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); set_fs(KERNEL_DS); result = sock_recvmsg(sock, msg, flags); set_fs(oldfs); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 7f0424dfa8f6..eab71fc7af3e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -274,6 +274,7 @@ out_err: static int gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) { + u32 seq_send; int tmp; p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); @@ -315,9 +316,10 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err; - p = simple_get_bytes(p, end, &ctx->seq_send, 
sizeof(ctx->seq_send)); + p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); if (IS_ERR(p)) goto out_err; + atomic_set(&ctx->seq_send, seq_send); p = simple_get_netobj(p, end, &ctx->mech_used); if (IS_ERR(p)) goto out_err; @@ -607,6 +609,7 @@ static int gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, gfp_t gfp_mask) { + u64 seq_send64; int keylen; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); @@ -617,14 +620,15 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err; - p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); + p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64)); if (IS_ERR(p)) goto out_err; + atomic64_set(&ctx->seq_send64, seq_send64); /* set seq_send for use by "older" enctypes */ - ctx->seq_send = ctx->seq_send64; - if (ctx->seq_send64 != ctx->seq_send) { - dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, - (unsigned long)ctx->seq_send64, ctx->seq_send); + atomic_set(&ctx->seq_send, seq_send64); + if (seq_send64 != atomic_read(&ctx->seq_send)) { + dprintk("%s: seq_send64 %llx, seq_send %x overflow?\n", __func__, + seq_send64, atomic_read(&ctx->seq_send)); p = ERR_PTR(-EINVAL); goto out_err; } diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index b4adeb06660b..48fe4a591b54 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -123,30 +123,6 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) return krb5_hdr; } -u32 -gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx) -{ - u32 old, seq_send = READ_ONCE(ctx->seq_send); - - do { - old = seq_send; - seq_send = cmpxchg(&ctx->seq_send, old, old + 1); - } while (old != seq_send); - return seq_send; -} - -u64 -gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx) -{ - u64 old, seq_send = 
READ_ONCE(ctx->seq_send); - - do { - old = seq_send; - seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1); - } while (old != seq_send); - return seq_send; -} - static u32 gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) @@ -177,7 +153,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - seq_send = gss_seq_send_fetch_and_inc(ctx); + seq_send = atomic_fetch_inc(&ctx->seq_send); if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) @@ -205,7 +181,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, /* Set up the sequence number. Now 64-bits in clear * text and w/o direction indicator */ - seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx)); + seq_send_be64 = cpu_to_be64(atomic64_fetch_inc(&ctx->seq_send64)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 962fa84e6db1..5cdde6cb703a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -228,7 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - seq_send = gss_seq_send_fetch_and_inc(kctx); + seq_send = atomic_fetch_inc(&kctx->seq_send); /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ @@ -475,7 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; - *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx)); + *be64ptr = cpu_to_be64(atomic64_fetch_inc(&kctx->seq_send64)); err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 3b525accaa68..986f3ed7d1a2 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -336,7 
+336,7 @@ static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen); + iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); if (base != 0) { iov_iter_advance(&msg.msg_iter, base); buflen -= base; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1b51e04d3566..ae77c71c1f64 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -361,7 +361,7 @@ static ssize_t xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, struct kvec *kvec, size_t count, size_t seek) { - iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, kvec, 1, count); + iov_iter_kvec(&msg->msg_iter, READ, kvec, 1, count); return xs_sock_recvmsg(sock, msg, flags, seek); } @@ -370,7 +370,7 @@ xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, struct bio_vec *bvec, unsigned long nr, size_t count, size_t seek) { - iov_iter_bvec(&msg->msg_iter, READ | ITER_BVEC, bvec, nr, count); + iov_iter_bvec(&msg->msg_iter, READ, bvec, nr, count); return xs_sock_recvmsg(sock, msg, flags, seek); } diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 4bdea0057171..efb16f69bd2c 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -394,7 +394,7 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) iov.iov_base = &s; iov.iov_len = sizeof(s); msg.msg_name = NULL; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret == -EWOULDBLOCK) return -EWOULDBLOCK; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 276edbc04f38..d753e362d2d9 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -489,7 +489,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, iov.iov_base = kaddr + offset; iov.iov_len = size; - iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, 
size); + iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); rc = tls_push_data(sk, &msg_iter, size, flags, TLS_RECORD_TYPE_DATA); kunmap(page); @@ -538,7 +538,7 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter msg_iter; - iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5cd88ba8acd1..7b1af8b59cd2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -799,7 +799,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send); bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; unsigned char record_type = TLS_RECORD_TYPE_DATA; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; struct sk_msg *msg_pl, *msg_en; @@ -1457,7 +1457,7 @@ int tls_sw_recvmsg(struct sock *sk, bool cmsg = false; int target, err = 0; long timeo; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); int num_async = 0; flags |= nonblock; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b669262682c9..dc4a9f1fb941 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2077,10 +2077,8 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; -#ifdef CONFIG_COMPAT if (in_compat_syscall()) return -EOPNOTSUPP; -#endif if (!optval && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ca7a207b81a9..c9a84e22f5d5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2621,10 +2621,8 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, 
const struct xfrm_link *link; int type, err; -#ifdef CONFIG_COMPAT if (in_compat_syscall()) return -EOPNOTSUPP; -#endif type = nlh->nlmsg_type; if (type > XFRM_MSG_MAX) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index ca21a35fa244..bb015551c2d9 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -140,17 +140,9 @@ cc-option-yn = $(call try-run,\ cc-disable-warning = $(call try-run,\ $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) -# cc-name -# Expands to either gcc or clang -cc-name = $(shell $(CC) -v 2>&1 | grep -q "clang version" && echo clang || echo gcc) - # cc-version cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) -# cc-fullversion -cc-fullversion = $(shell $(CONFIG_SHELL) \ - $(srctree)/scripts/gcc-version.sh -p $(CC)) - # cc-ifversion # Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1) cc-ifversion = $(shell [ $(cc-version) $(1) $(2) ] && echo $(3) || echo $(4)) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 24b2fb1d1297..768306add591 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -29,6 +29,7 @@ warning-1 += $(call cc-option, -Wmissing-include-dirs) warning-1 += $(call cc-option, -Wunused-but-set-variable) warning-1 += $(call cc-option, -Wunused-const-variable) warning-1 += $(call cc-option, -Wpacked-not-aligned) +warning-1 += $(call cc-option, -Wstringop-truncation) warning-1 += $(call cc-disable-warning, missing-field-initializers) warning-1 += $(call cc-disable-warning, sign-compare) @@ -64,7 +65,7 @@ endif KBUILD_CFLAGS += $(warning) else -ifeq ($(cc-name),clang) +ifdef CONFIG_CC_IS_CLANG KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides) KBUILD_CFLAGS += $(call cc-disable-warning, unused-value) KBUILD_CFLAGS += $(call cc-disable-warning, format) diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 
0a482f341576..46c5c6809806 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -26,6 +26,16 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_RANDSTRUCT) \ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_RANDSTRUCT_PERFORMANCE) \ += -fplugin-arg-randomize_layout_plugin-performance-mode +gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak_plugin.so +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -DSTACKLEAK_PLUGIN +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE) +ifdef CONFIG_GCC_PLUGIN_STACKLEAK + DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable +endif +export DISABLE_STACKLEAK_PLUGIN + # All the plugin CFLAGS are collected here in case a build target needs to # filter them out of the KBUILD_CFLAGS. GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y)) diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index cb0c889e13aa..0d5c799688f0 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -139,4 +139,55 @@ config GCC_PLUGIN_RANDSTRUCT_PERFORMANCE in structures. This reduces the performance hit of RANDSTRUCT at the cost of weakened randomization. +config GCC_PLUGIN_STACKLEAK + bool "Erase the kernel stack before returning from syscalls" + depends on GCC_PLUGINS + depends on HAVE_ARCH_STACKLEAK + help + This option makes the kernel erase the kernel stack before + returning from system calls. That reduces the information which + kernel stack leak bugs can reveal and blocks some uninitialized + stack variable attacks. + + The tradeoff is the performance impact: on a single CPU system kernel + compilation sees a 1% slowdown, other systems and workloads may vary + and you are advised to test this feature on your expected workload + before deploying it. + + This plugin was ported from grsecurity/PaX. 
More information at: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + +config STACKLEAK_TRACK_MIN_SIZE + int "Minimum stack frame size of functions tracked by STACKLEAK" + default 100 + range 0 4096 + depends on GCC_PLUGIN_STACKLEAK + help + The STACKLEAK gcc plugin instruments the kernel code for tracking + the lowest border of the kernel stack (and for some other purposes). + It inserts the stackleak_track_stack() call for the functions with + a stack frame size greater than or equal to this parameter. + If unsure, leave the default value 100. + +config STACKLEAK_METRICS + bool "Show STACKLEAK metrics in the /proc file system" + depends on GCC_PLUGIN_STACKLEAK + depends on PROC_FS + help + If this is set, STACKLEAK metrics for every task are available in + the /proc file system. In particular, /proc/<pid>/stack_depth + shows the maximum kernel stack consumption for the current and + previous syscalls. Although this information is not precise, it + can be useful for estimating the STACKLEAK performance impact for + your workloads. + +config STACKLEAK_RUNTIME_DISABLE + bool "Allow runtime disabling of kernel stack erasing" + depends on GCC_PLUGIN_STACKLEAK + help + This option provides 'stack_erasing' sysctl, which can be used in + runtime to control kernel stack erasing for kernels built with + CONFIG_GCC_PLUGIN_STACKLEAK. 
+ endif diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c new file mode 100644 index 000000000000..2f48da98b5d4 --- /dev/null +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -0,0 +1,427 @@ +/* + * Copyright 2011-2017 by the PaX Team <pageexec@freemail.hu> + * Modified by Alexander Popov <alex.popov@linux.com> + * Licensed under the GPL v2 + * + * Note: the choice of the license means that the compilation process is + * NOT 'eligible' as defined by gcc's library exception to the GPL v3, + * but for the kernel it doesn't matter since it doesn't link against + * any of the gcc libraries + * + * This gcc plugin is needed for tracking the lowest border of the kernel stack. + * It instruments the kernel code inserting stackleak_track_stack() calls: + * - after alloca(); + * - for the functions with a stack frame size greater than or equal + * to the "track-min-size" plugin parameter. + * + * This plugin is ported from grsecurity/PaX. For more information see: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + * + * Debugging: + * - use fprintf() to stderr, debug_generic_expr(), debug_gimple_stmt(), + * print_rtl() and print_simple_rtl(); + * - add "-fdump-tree-all -fdump-rtl-all" to the plugin CFLAGS in + * Makefile.gcc-plugins to see the verbose dumps of the gcc passes; + * - use gcc -E to understand the preprocessing shenanigans; + * - use gcc with enabled CFG/GIMPLE/SSA verification (--enable-checking). + */ + +#include "gcc-common.h" + +__visible int plugin_is_GPL_compatible; + +static int track_frame_size = -1; +static const char track_function[] = "stackleak_track_stack"; + +/* + * Mark these global variables (roots) for gcc garbage collector since + * they point to the garbage-collected memory. 
+ */ +static GTY(()) tree track_function_decl; + +static struct plugin_info stackleak_plugin_info = { + .version = "201707101337", + .help = "track-min-size=nn\ttrack stack for functions with a stack frame size >= nn bytes\n" + "disable\t\tdo not activate the plugin\n" +}; + +static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after) +{ + gimple stmt; + gcall *stackleak_track_stack; + cgraph_node_ptr node; + int frequency; + basic_block bb; + + /* Insert call to void stackleak_track_stack(void) */ + stmt = gimple_build_call(track_function_decl, 0); + stackleak_track_stack = as_a_gcall(stmt); + if (after) { + gsi_insert_after(gsi, stackleak_track_stack, + GSI_CONTINUE_LINKING); + } else { + gsi_insert_before(gsi, stackleak_track_stack, GSI_SAME_STMT); + } + + /* Update the cgraph */ + bb = gimple_bb(stackleak_track_stack); + node = cgraph_get_create_node(track_function_decl); + gcc_assert(node); + frequency = compute_call_stmt_bb_frequency(current_function_decl, bb); + cgraph_create_edge(cgraph_get_node(current_function_decl), node, + stackleak_track_stack, bb->count, frequency); +} + +static bool is_alloca(gimple stmt) +{ + if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA)) + return true; + +#if BUILDING_GCC_VERSION >= 4007 + if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA_WITH_ALIGN)) + return true; +#endif + + return false; +} + +/* + * Work with the GIMPLE representation of the code. Insert the + * stackleak_track_stack() call after alloca() and into the beginning + * of the function if it is not instrumented. + */ +static unsigned int stackleak_instrument_execute(void) +{ + basic_block bb, entry_bb; + bool prologue_instrumented = false, is_leaf = true; + gimple_stmt_iterator gsi; + + /* + * ENTRY_BLOCK_PTR is a basic block which represents possible entry + * point of a function. This block does not contain any code and + * has a CFG edge to its successor. 
+ */ + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + entry_bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + + /* + * Loop through the GIMPLE statements in each of cfun basic blocks. + * cfun is a global variable which represents the function that is + * currently processed. + */ + FOR_EACH_BB_FN(bb, cfun) { + for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) { + gimple stmt; + + stmt = gsi_stmt(gsi); + + /* Leaf function is a function which makes no calls */ + if (is_gimple_call(stmt)) + is_leaf = false; + + if (!is_alloca(stmt)) + continue; + + /* Insert stackleak_track_stack() call after alloca() */ + stackleak_add_track_stack(&gsi, true); + if (bb == entry_bb) + prologue_instrumented = true; + } + } + + if (prologue_instrumented) + return 0; + + /* + * Special cases to skip the instrumentation. + * + * Taking the address of static inline functions materializes them, + * but we mustn't instrument some of them as the resulting stack + * alignment required by the function call ABI will break other + * assumptions regarding the expected (but not otherwise enforced) + * register clobbering ABI. + * + * Case in point: native_save_fl on amd64 when optimized for size + * clobbers rdx if it were instrumented here. + * + * TODO: any more special cases? 
+ */ + if (is_leaf && + !TREE_PUBLIC(current_function_decl) && + DECL_DECLARED_INLINE_P(current_function_decl)) { + return 0; + } + + if (is_leaf && + !strncmp(IDENTIFIER_POINTER(DECL_NAME(current_function_decl)), + "_paravirt_", 10)) { + return 0; + } + + /* Insert stackleak_track_stack() call at the function beginning */ + bb = entry_bb; + if (!single_pred_p(bb)) { + /* gcc_assert(bb_loop_depth(bb) || + (bb->flags & BB_IRREDUCIBLE_LOOP)); */ + split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + } + gsi = gsi_after_labels(bb); + stackleak_add_track_stack(&gsi, false); + + return 0; +} + +static bool large_stack_frame(void) +{ +#if BUILDING_GCC_VERSION >= 8000 + return maybe_ge(get_frame_size(), track_frame_size); +#else + return (get_frame_size() >= track_frame_size); +#endif +} + +/* + * Work with the RTL representation of the code. + * Remove the unneeded stackleak_track_stack() calls from the functions + * which don't call alloca() and don't have a large enough stack frame size. + */ +static unsigned int stackleak_cleanup_execute(void) +{ + rtx_insn *insn, *next; + + if (cfun->calls_alloca) + return 0; + + if (large_stack_frame()) + return 0; + + /* + * Find stackleak_track_stack() calls. Loop through the chain of insns, + * which is an RTL representation of the code for a function. 
+ * + * The example of a matching insn: + * (call_insn 8 4 10 2 (call (mem (symbol_ref ("stackleak_track_stack") + * [flags 0x41] <function_decl 0x7f7cd3302a80 stackleak_track_stack>) + * [0 stackleak_track_stack S1 A8]) (0)) 675 {*call} (expr_list + * (symbol_ref ("stackleak_track_stack") [flags 0x41] <function_decl + * 0x7f7cd3302a80 stackleak_track_stack>) (expr_list (0) (nil))) (nil)) + */ + for (insn = get_insns(); insn; insn = next) { + rtx body; + + next = NEXT_INSN(insn); + + /* Check the expression code of the insn */ + if (!CALL_P(insn)) + continue; + + /* + * Check the expression code of the insn body, which is an RTL + * Expression (RTX) describing the side effect performed by + * that insn. + */ + body = PATTERN(insn); + + if (GET_CODE(body) == PARALLEL) + body = XVECEXP(body, 0, 0); + + if (GET_CODE(body) != CALL) + continue; + + /* + * Check the first operand of the call expression. It should + * be a mem RTX describing the needed subroutine with a + * symbol_ref RTX. + */ + body = XEXP(body, 0); + if (GET_CODE(body) != MEM) + continue; + + body = XEXP(body, 0); + if (GET_CODE(body) != SYMBOL_REF) + continue; + + if (SYMBOL_REF_DECL(body) != track_function_decl) + continue; + + /* Delete the stackleak_track_stack() call */ + delete_insn_and_edges(insn); +#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION < 8000 + if (GET_CODE(next) == NOTE && + NOTE_KIND(next) == NOTE_INSN_CALL_ARG_LOCATION) { + insn = next; + next = NEXT_INSN(insn); + delete_insn_and_edges(insn); + } +#endif + } + + return 0; +} + +static bool stackleak_gate(void) +{ + tree section; + + section = lookup_attribute("section", + DECL_ATTRIBUTES(current_function_decl)); + if (section && TREE_VALUE(section)) { + section = TREE_VALUE(TREE_VALUE(section)); + + if (!strncmp(TREE_STRING_POINTER(section), ".init.text", 10)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".devinit.text", 13)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".cpuinit.text", 13)) 
+ return false; + if (!strncmp(TREE_STRING_POINTER(section), ".meminit.text", 13)) + return false; + } + + return track_frame_size >= 0; +} + +/* Build the function declaration for stackleak_track_stack() */ +static void stackleak_start_unit(void *gcc_data __unused, + void *user_data __unused) +{ + tree fntype; + + /* void stackleak_track_stack(void) */ + fntype = build_function_type_list(void_type_node, NULL_TREE); + track_function_decl = build_fn_decl(track_function, fntype); + DECL_ASSEMBLER_NAME(track_function_decl); /* for LTO */ + TREE_PUBLIC(track_function_decl) = 1; + TREE_USED(track_function_decl) = 1; + DECL_EXTERNAL(track_function_decl) = 1; + DECL_ARTIFICIAL(track_function_decl) = 1; + DECL_PRESERVE_P(track_function_decl) = 1; +} + +/* + * Pass gate function is a predicate function that gets executed before the + * corresponding pass. If the return value is 'true' the pass gets executed, + * otherwise, it is skipped. + */ +static bool stackleak_instrument_gate(void) +{ + return stackleak_gate(); +} + +#define PASS_NAME stackleak_instrument +#define PROPERTIES_REQUIRED PROP_gimple_leh | PROP_cfg +#define TODO_FLAGS_START TODO_verify_ssa | TODO_verify_flow | TODO_verify_stmts +#define TODO_FLAGS_FINISH TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func \ + | TODO_update_ssa | TODO_rebuild_cgraph_edges +#include "gcc-generate-gimple-pass.h" + +static bool stackleak_cleanup_gate(void) +{ + return stackleak_gate(); +} + +#define PASS_NAME stackleak_cleanup +#define TODO_FLAGS_FINISH TODO_dump_func +#include "gcc-generate-rtl-pass.h" + +/* + * Every gcc plugin exports a plugin_init() function that is called right + * after the plugin is loaded. This function is responsible for registering + * the plugin callbacks and doing other required initialization. 
+ */ +__visible int plugin_init(struct plugin_name_args *plugin_info, + struct plugin_gcc_version *version) +{ + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + int i = 0; + + /* Extra GGC root tables describing our GTY-ed data */ + static const struct ggc_root_tab gt_ggc_r_gt_stackleak[] = { + { + .base = &track_function_decl, + .nelt = 1, + .stride = sizeof(track_function_decl), + .cb = &gt_ggc_mx_tree_node, + .pchw = &gt_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + /* + * The stackleak_instrument pass should be executed before the + * "optimized" pass, which is the control flow graph cleanup that is + * performed just before expanding gcc trees to the RTL. In former + * versions of the plugin this new pass was inserted before the + * "tree_profile" pass, which is currently called "profile". + */ + PASS_INFO(stackleak_instrument, "optimized", 1, + PASS_POS_INSERT_BEFORE); + + /* + * The stackleak_cleanup pass should be executed after the + * "reload" pass, when the stack frame size is final.
+ */ + PASS_INFO(stackleak_cleanup, "reload", 1, PASS_POS_INSERT_AFTER); + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + /* Parse the plugin arguments */ + for (i = 0; i < argc; i++) { + if (!strcmp(argv[i].key, "disable")) + return 0; + + if (!strcmp(argv[i].key, "track-min-size")) { + if (!argv[i].value) { + error(G_("no value supplied for option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + + track_frame_size = atoi(argv[i].value); + if (track_frame_size < 0) { + error(G_("invalid option argument '-fplugin-arg-%s-%s=%s'"), + plugin_name, argv[i].key, argv[i].value); + return 1; + } + } else { + error(G_("unknown option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + } + + /* Give the information about the plugin */ + register_callback(plugin_name, PLUGIN_INFO, NULL, + &stackleak_plugin_info); + + /* Register to be called before processing a translation unit */ + register_callback(plugin_name, PLUGIN_START_UNIT, + &stackleak_start_unit, NULL); + + /* Register an extra GCC garbage collector (GGC) root table */ + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, NULL, + (void *)&gt_ggc_r_gt_stackleak); + + /* + * Hook into the Pass Manager to register new gcc passes. + * + * The stack frame size info is available only at the last RTL pass, + * when it's too late to insert complex code like a function call. + * So we register two gcc passes to instrument every function at first + * and remove the unneeded instrumentation later.
+ */ + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &stackleak_instrument_pass_info); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &stackleak_cleanup_pass_info); + + return 0; +} diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 67ed9f6ccdf8..63b609243d03 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -68,21 +68,7 @@ PHONY += $(simple-targets) $(simple-targets): $(obj)/conf $< $(silent) --$@ $(Kconfig) -PHONY += oldnoconfig silentoldconfig savedefconfig defconfig - -# oldnoconfig is an alias of olddefconfig, because people already are dependent -# on its behavior (sets new symbols to their default value but not 'n') with the -# counter-intuitive name. -oldnoconfig: olddefconfig - @echo " WARNING: \"oldnoconfig\" target will be removed after Linux 4.19" - @echo " Please use \"olddefconfig\" instead, which is an alias." - -# We do not expect manual invokcation of "silentoldcofig" (or "syncconfig"). -silentoldconfig: syncconfig - @echo " WARNING: \"silentoldconfig\" has been renamed to \"syncconfig\"" - @echo " and is now an internal implementation detail." - @echo " What you want is probably \"oldconfig\"." - @echo " \"silentoldconfig\" will be removed after Linux 4.19" +PHONY += savedefconfig defconfig savedefconfig: $(obj)/conf $< $(silent) --$@=defconfig $(Kconfig) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 7b2b37260669..98e0c7a34699 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -460,12 +460,6 @@ static struct option long_opts[] = { {"randconfig", no_argument, NULL, randconfig}, {"listnewconfig", no_argument, NULL, listnewconfig}, {"olddefconfig", no_argument, NULL, olddefconfig}, - /* - * oldnoconfig is an alias of olddefconfig, because people already - * are dependent on its behavior(sets new symbols to their default - * value but not 'n') with the counter-intuitive name. 
- */ - {"oldnoconfig", no_argument, NULL, olddefconfig}, {NULL, 0, NULL, 0} }; @@ -480,7 +474,6 @@ static void conf_usage(const char *progname) printf(" --syncconfig Similar to oldconfig but generates configuration in\n" " include/{generated/,config/}\n"); printf(" --olddefconfig Same as oldconfig but sets new symbols to their default value\n"); - printf(" --oldnoconfig An alias of olddefconfig\n"); printf(" --defconfig <file> New config with default defined in <file>\n"); printf(" --savedefconfig <file> Save the minimal current configuration to <file>\n"); printf(" --allnoconfig New config where all options are answered with no\n"); diff --git a/scripts/kconfig/merge_config.sh b/scripts/kconfig/merge_config.sh index 67d131447631..da66e7742282 100755 --- a/scripts/kconfig/merge_config.sh +++ b/scripts/kconfig/merge_config.sh @@ -33,12 +33,15 @@ usage() { echo " -n use allnoconfig instead of alldefconfig" echo " -r list redundant entries when merging fragments" echo " -O dir to put generated output files. Consider setting \$KCONFIG_CONFIG instead." + echo + echo "Used prefix: '$CONFIG_PREFIX'. You can redefine it with \$CONFIG_ environment variable." } RUNMAKE=true ALLTARGET=alldefconfig WARNREDUN=false OUTPUT=. +CONFIG_PREFIX=${CONFIG_-CONFIG_} while true; do case $1 in @@ -99,7 +102,8 @@ if [ ! 
-r "$INITFILE" ]; then fi MERGE_LIST=$* -SED_CONFIG_EXP="s/^\(# \)\{0,1\}\(CONFIG_[a-zA-Z0-9_]*\)[= ].*/\2/p" +SED_CONFIG_EXP="s/^\(# \)\{0,1\}\(${CONFIG_PREFIX}[a-zA-Z0-9_]*\)[= ].*/\2/p" + TMP_FILE=$(mktemp ./.tmp.config.XXXXXXXXXX) echo "Using $INITFILE as base" diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index e09fe4d7307c..8963203319ea 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1742,7 +1742,7 @@ static int ns_rmdir_op(struct inode *dir, struct dentry *dentry) if (error) return error; - parent = aa_get_ns(dir->i_private); + parent = aa_get_ns(dir->i_private); /* rmdir calls the generic securityfs functions to remove files * from the apparmor dir. It is up to the apparmor ns locking * to avoid races. diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 4285943f7260..d0afed9ebd0e 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -496,7 +496,7 @@ static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label, /* update caching of label on file_ctx */ spin_lock(&fctx->lock); old = rcu_dereference_protected(fctx->label, - spin_is_locked(&fctx->lock)); + lockdep_is_held(&fctx->lock)); l = aa_label_merge(old, label, GFP_ATOMIC); if (l) { if (l != old) { diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h index e287b7d0d4be..265ae6641a06 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -151,6 +151,8 @@ static inline struct aa_label *begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); + might_sleep(); + if (label_is_stale(label)) { label = aa_get_newest_label(label); if (aa_replace_current_label(label) == 0) diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h index ec7228e857a9..7334ac966d01 100644 --- a/security/apparmor/include/net.h +++ b/security/apparmor/include/net.h @@ -83,6 +83,13 @@ struct aa_sk_ctx { 
__e; \ }) +struct aa_secmark { + u8 audit; + u8 deny; + u32 secid; + char *label; +}; + extern struct aa_sfs_entry aa_sfs_entry_network[]; void audit_net_cb(struct audit_buffer *ab, void *va); @@ -103,4 +110,7 @@ int aa_sk_perm(const char *op, u32 request, struct sock *sk); int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, struct socket *sock); +int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, + u32 secid, struct sock *sk); + #endif /* __AA_NET_H */ diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index ab64c6b5db5a..8e6707c837be 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -155,6 +155,9 @@ struct aa_profile { struct aa_rlimit rlimits; + int secmark_count; + struct aa_secmark *secmark; + struct aa_loaddata *rawdata; unsigned char *hash; char *dirname; diff --git a/security/apparmor/include/secid.h b/security/apparmor/include/secid.h index dee6fa3b6081..fa2062711b63 100644 --- a/security/apparmor/include/secid.h +++ b/security/apparmor/include/secid.h @@ -22,6 +22,9 @@ struct aa_label; /* secid value that will not be allocated */ #define AA_SECID_INVALID 0 +/* secid value that matches any other secid */ +#define AA_SECID_WILDCARD 1 + struct aa_label *aa_secid_to_label(u32 secid); int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 974affe50531..76491e7f4177 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -90,10 +90,12 @@ const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name, const char *end = fqname + n; const char *name = skipn_spaces(fqname, n); - if (!name) - return NULL; *ns_name = NULL; *ns_len = 0; + + if (!name) + return NULL; + if (name[0] == ':') { char *split = strnchr(&name[1], end - &name[1], ':'); *ns_name = 
skipn_spaces(&name[1], end - &name[1]); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index aa35939443c4..42446a216f3b 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -23,6 +23,8 @@ #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> #include <net/sock.h> #include "include/apparmor.h" @@ -114,13 +116,13 @@ static int apparmor_ptrace_access_check(struct task_struct *child, struct aa_label *tracer, *tracee; int error; - tracer = begin_current_label_crit_section(); + tracer = __begin_current_label_crit_section(); tracee = aa_get_task_label(child); error = aa_may_ptrace(tracer, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); aa_put_label(tracee); - end_current_label_crit_section(tracer); + __end_current_label_crit_section(tracer); return error; } @@ -130,11 +132,11 @@ static int apparmor_ptrace_traceme(struct task_struct *parent) struct aa_label *tracer, *tracee; int error; - tracee = begin_current_label_crit_section(); + tracee = __begin_current_label_crit_section(); tracer = aa_get_task_label(parent); error = aa_may_ptrace(tracer, tracee, AA_PTRACE_TRACE); aa_put_label(tracer); - end_current_label_crit_section(tracee); + __end_current_label_crit_section(tracee); return error; } @@ -1020,6 +1022,7 @@ static int apparmor_socket_shutdown(struct socket *sock, int how) return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } +#ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_recv_skb - check perms before associating skb to sk * @@ -1030,8 +1033,15 @@ static int apparmor_socket_shutdown(struct socket *sock, int how) */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { - return 0; + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (!skb->secmark) + return 0; + + return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, + skb->secmark, sk); } +#endif static struct 
aa_label *sk_peer_label(struct sock *sk) @@ -1126,6 +1136,20 @@ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) ctx->label = aa_get_current_label(); } +#ifdef CONFIG_NETWORK_SECMARK +static int apparmor_inet_conn_request(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (!skb->secmark) + return 0; + + return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, + skb->secmark, sk); +} +#endif + static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -1177,12 +1201,17 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), +#ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), +#endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), +#ifdef CONFIG_NETWORK_SECMARK + LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), +#endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), @@ -1538,6 +1567,97 @@ static inline int apparmor_init_sysctl(void) } #endif /* CONFIG_SYSCTL */ +#if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) +static unsigned int apparmor_ip_postroute(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct aa_sk_ctx *ctx; + struct sock *sk; + + if (!skb->secmark) + return NF_ACCEPT; + + sk = skb_to_full_sk(skb); + if (sk == NULL) + return NF_ACCEPT; + + ctx = SK_CTX(sk); + if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, 
AA_MAY_SEND, + skb->secmark, sk)) + return NF_ACCEPT; + + return NF_DROP_ERR(-ECONNREFUSED); + +} + +static unsigned int apparmor_ipv4_postroute(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return apparmor_ip_postroute(priv, skb, state); +} + +static unsigned int apparmor_ipv6_postroute(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return apparmor_ip_postroute(priv, skb, state); +} + +static const struct nf_hook_ops apparmor_nf_ops[] = { + { + .hook = apparmor_ipv4_postroute, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_SELINUX_FIRST, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .hook = apparmor_ipv6_postroute, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_SELINUX_FIRST, + }, +#endif +}; + +static int __net_init apparmor_nf_register(struct net *net) +{ + int ret; + + ret = nf_register_net_hooks(net, apparmor_nf_ops, + ARRAY_SIZE(apparmor_nf_ops)); + return ret; +} + +static void __net_exit apparmor_nf_unregister(struct net *net) +{ + nf_unregister_net_hooks(net, apparmor_nf_ops, + ARRAY_SIZE(apparmor_nf_ops)); +} + +static struct pernet_operations apparmor_net_ops = { + .init = apparmor_nf_register, + .exit = apparmor_nf_unregister, +}; + +static int __init apparmor_nf_ip_init(void) +{ + int err; + + if (!apparmor_enabled) + return 0; + + err = register_pernet_subsys(&apparmor_net_ops); + if (err) + panic("Apparmor: register_pernet_subsys: error %d\n", err); + + return 0; +} +__initcall(apparmor_nf_ip_init); +#endif + static int __init apparmor_init(void) { int error; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index bb24cfa0a164..c07fde444792 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -18,6 +18,7 @@ #include "include/label.h" #include "include/net.h" #include "include/policy.h" +#include "include/secid.h" #include "net_names.h" @@ -146,17 +147,20 @@ int aa_af_perm(struct aa_label *label, 
const char *op, u32 request, u16 family, static int aa_label_sk_perm(struct aa_label *label, const char *op, u32 request, struct sock *sk) { - struct aa_profile *profile; - DEFINE_AUDIT_SK(sa, op, sk); + int error = 0; AA_BUG(!label); AA_BUG(!sk); - if (unconfined(label)) - return 0; + if (!unconfined(label)) { + struct aa_profile *profile; + DEFINE_AUDIT_SK(sa, op, sk); - return fn_for_each_confined(label, profile, - aa_profile_af_sk_perm(profile, &sa, request, sk)); + error = fn_for_each_confined(label, profile, + aa_profile_af_sk_perm(profile, &sa, request, sk)); + } + + return error; } int aa_sk_perm(const char *op, u32 request, struct sock *sk) @@ -185,3 +189,70 @@ int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, return aa_label_sk_perm(label, op, request, sock->sk); } + +#ifdef CONFIG_NETWORK_SECMARK +static int apparmor_secmark_init(struct aa_secmark *secmark) +{ + struct aa_label *label; + + if (secmark->label[0] == '*') { + secmark->secid = AA_SECID_WILDCARD; + return 0; + } + + label = aa_label_strn_parse(&root_ns->unconfined->label, + secmark->label, strlen(secmark->label), + GFP_ATOMIC, false, false); + + if (IS_ERR(label)) + return PTR_ERR(label); + + secmark->secid = label->secid; + + return 0; +} + +static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, + struct common_audit_data *sa, struct sock *sk) +{ + int i, ret; + struct aa_perms perms = { }; + + if (profile->secmark_count == 0) + return 0; + + for (i = 0; i < profile->secmark_count; i++) { + if (!profile->secmark[i].secid) { + ret = apparmor_secmark_init(&profile->secmark[i]); + if (ret) + return ret; + } + + if (profile->secmark[i].secid == secid || + profile->secmark[i].secid == AA_SECID_WILDCARD) { + if (profile->secmark[i].deny) + perms.deny = ALL_PERMS_MASK; + else + perms.allow = ALL_PERMS_MASK; + + if (profile->secmark[i].audit) + perms.audit = ALL_PERMS_MASK; + } + } + + aa_apply_modes_to_perms(profile, &perms); + + return 
aa_check_perms(profile, &perms, request, sa, audit_net_cb); +} + +int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, + u32 secid, struct sock *sk) +{ + struct aa_profile *profile; + DEFINE_AUDIT_SK(sa, op, sk); + + return fn_for_each_confined(label, profile, + aa_secmark_perm(profile, request, secid, + &sa, sk)); +} +#endif diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 1590e2de4e84..df9c5890a878 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -231,6 +231,9 @@ void aa_free_profile(struct aa_profile *profile) for (i = 0; i < profile->xattr_count; i++) kzfree(profile->xattrs[i]); kzfree(profile->xattrs); + for (i = 0; i < profile->secmark_count; i++) + kzfree(profile->secmark[i].label); + kzfree(profile->secmark); kzfree(profile->dirname); aa_put_dfa(profile->xmatch); aa_put_dfa(profile->policy.dfa); diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 21cb384d712a..379682e2a8d5 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -292,6 +292,19 @@ fail: return 0; } +static bool unpack_u8(struct aa_ext *e, u8 *data, const char *name) +{ + if (unpack_nameX(e, AA_U8, name)) { + if (!inbounds(e, sizeof(u8))) + return 0; + if (data) + *data = get_unaligned((u8 *)e->pos); + e->pos += sizeof(u8); + return 1; + } + return 0; +} + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) { if (unpack_nameX(e, AA_U32, name)) { @@ -529,6 +542,49 @@ fail: return 0; } +static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) +{ + void *pos = e->pos; + int i, size; + + if (unpack_nameX(e, AA_STRUCT, "secmark")) { + size = unpack_array(e, NULL); + + profile->secmark = kcalloc(size, sizeof(struct aa_secmark), + GFP_KERNEL); + if (!profile->secmark) + goto fail; + + profile->secmark_count = size; + + for (i = 0; i < size; i++) { + if (!unpack_u8(e, &profile->secmark[i].audit, NULL)) + goto fail; + if 
(!unpack_u8(e, &profile->secmark[i].deny, NULL)) + goto fail; + if (!unpack_strdup(e, &profile->secmark[i].label, NULL)) + goto fail; + } + if (!unpack_nameX(e, AA_ARRAYEND, NULL)) + goto fail; + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + goto fail; + } + + return 1; + +fail: + if (profile->secmark) { + for (i = 0; i < size; i++) + kfree(profile->secmark[i].label); + kfree(profile->secmark); + profile->secmark_count = 0; + } + + e->pos = pos; + return 0; +} + static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) { void *pos = e->pos; @@ -727,6 +783,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + if (!unpack_secmark(e, profile)) { + info = "failed to unpack profile secmark rules"; + goto fail; + } + if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ info = "failed to unpack policydb"; diff --git a/security/apparmor/secid.c b/security/apparmor/secid.c index 4ccec1bcf6f5..05373d9a3d6a 100644 --- a/security/apparmor/secid.c +++ b/security/apparmor/secid.c @@ -32,8 +32,7 @@ * secids - do not pin labels with a refcount. 
They rely on the label * properly updating/freeing them */ - -#define AA_FIRST_SECID 1 +#define AA_FIRST_SECID 2 static DEFINE_IDR(aa_secids); static DEFINE_SPINLOCK(secid_lock); diff --git a/security/keys/Makefile b/security/keys/Makefile index ef1581b337a3..9cef54064f60 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_PERSISTENT_KEYRINGS) += persistent.o obj-$(CONFIG_KEY_DH_OPERATIONS) += dh.o +obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += keyctl_pkey.o # # Key types diff --git a/security/keys/compat.c b/security/keys/compat.c index e87c89c0177c..9482df601dc3 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -141,6 +141,24 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, return keyctl_restrict_keyring(arg2, compat_ptr(arg3), compat_ptr(arg4)); + case KEYCTL_PKEY_QUERY: + if (arg3 != 0) + return -EINVAL; + return keyctl_pkey_query(arg2, + compat_ptr(arg4), + compat_ptr(arg5)); + + case KEYCTL_PKEY_ENCRYPT: + case KEYCTL_PKEY_DECRYPT: + case KEYCTL_PKEY_SIGN: + return keyctl_pkey_e_d_s(option, + compat_ptr(arg2), compat_ptr(arg3), + compat_ptr(arg4), compat_ptr(arg5)); + + case KEYCTL_PKEY_VERIFY: + return keyctl_pkey_verify(compat_ptr(arg2), compat_ptr(arg3), + compat_ptr(arg4), compat_ptr(arg5)); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 9f8208dc0e55..74cb0ff42fed 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -298,6 +298,45 @@ static inline long compat_keyctl_dh_compute( #endif #endif +#ifdef CONFIG_ASYMMETRIC_KEY_TYPE +extern long keyctl_pkey_query(key_serial_t, + const char __user *, + struct keyctl_pkey_query __user *); + +extern long keyctl_pkey_verify(const struct keyctl_pkey_params __user *, + const char __user *, + const void __user *, const void __user *); + +extern long keyctl_pkey_e_d_s(int, + const struct keyctl_pkey_params __user *, + const char 
__user *, + const void __user *, void __user *); +#else +static inline long keyctl_pkey_query(key_serial_t id, + const char __user *_info, + struct keyctl_pkey_query __user *_res) +{ + return -EOPNOTSUPP; +} + +static inline long keyctl_pkey_verify(const struct keyctl_pkey_params __user *params, + const char __user *_info, + const void __user *_in, + const void __user *_in2) +{ + return -EOPNOTSUPP; +} + +static inline long keyctl_pkey_e_d_s(int op, + const struct keyctl_pkey_params __user *params, + const char __user *_info, + const void __user *_in, + void __user *_out) +{ + return -EOPNOTSUPP; +} +#endif + /* * Debugging key validation */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 1ffe60bb2845..18619690ce77 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1747,6 +1747,30 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (const char __user *) arg3, (const char __user *) arg4); + case KEYCTL_PKEY_QUERY: + if (arg3 != 0) + return -EINVAL; + return keyctl_pkey_query((key_serial_t)arg2, + (const char __user *)arg4, + (struct keyctl_pkey_query *)arg5); + + case KEYCTL_PKEY_ENCRYPT: + case KEYCTL_PKEY_DECRYPT: + case KEYCTL_PKEY_SIGN: + return keyctl_pkey_e_d_s( + option, + (const struct keyctl_pkey_params __user *)arg2, + (const char __user *)arg3, + (const void __user *)arg4, + (void __user *)arg5); + + case KEYCTL_PKEY_VERIFY: + return keyctl_pkey_verify( + (const struct keyctl_pkey_params __user *)arg2, + (const char __user *)arg3, + (const void __user *)arg4, + (const void __user *)arg5); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyctl_pkey.c b/security/keys/keyctl_pkey.c new file mode 100644 index 000000000000..783978842f13 --- /dev/null +++ b/security/keys/keyctl_pkey.c @@ -0,0 +1,323 @@ +/* Public-key operation keyctls + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/parser.h> +#include <linux/uaccess.h> +#include <keys/user-type.h> +#include "internal.h" + +static void keyctl_pkey_params_free(struct kernel_pkey_params *params) +{ + kfree(params->info); + key_put(params->key); +} + +enum { + Opt_err = -1, + Opt_enc, /* "enc=<encoding>" eg. "enc=oaep" */ + Opt_hash, /* "hash=<digest-name>" eg. "hash=sha1" */ +}; + +static const match_table_t param_keys = { + { Opt_enc, "enc=%s" }, + { Opt_hash, "hash=%s" }, + { Opt_err, NULL } +}; + +/* + * Parse the information string which consists of key=val pairs. + */ +static int keyctl_pkey_params_parse(struct kernel_pkey_params *params) +{ + unsigned long token_mask = 0; + substring_t args[MAX_OPT_ARGS]; + char *c = params->info, *p, *q; + int token; + + while ((p = strsep(&c, " \t"))) { + if (*p == '\0' || *p == ' ' || *p == '\t') + continue; + token = match_token(p, param_keys, args); + if (__test_and_set_bit(token, &token_mask)) + return -EINVAL; + q = args[0].from; + if (!q[0]) + return -EINVAL; + + switch (token) { + case Opt_enc: + params->encoding = q; + break; + + case Opt_hash: + params->hash_algo = q; + break; + + default: + return -EINVAL; + } + } + + return 0; +} + +/* + * Interpret parameters. Callers must always call the free function + * on params, even if an error is returned. 
+ */ +static int keyctl_pkey_params_get(key_serial_t id, + const char __user *_info, + struct kernel_pkey_params *params) +{ + key_ref_t key_ref; + void *p; + int ret; + + memset(params, 0, sizeof(*params)); + params->encoding = "raw"; + + p = strndup_user(_info, PAGE_SIZE); + if (IS_ERR(p)) + return PTR_ERR(p); + params->info = p; + + ret = keyctl_pkey_params_parse(params); + if (ret < 0) + return ret; + + key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + params->key = key_ref_to_ptr(key_ref); + + if (!params->key->type->asym_query) + return -EOPNOTSUPP; + + return 0; +} + +/* + * Get parameters from userspace. Callers must always call the free function + * on params, even if an error is returned. + */ +static int keyctl_pkey_params_get_2(const struct keyctl_pkey_params __user *_params, + const char __user *_info, + int op, + struct kernel_pkey_params *params) +{ + struct keyctl_pkey_params uparams; + struct kernel_pkey_query info; + int ret; + + memset(params, 0, sizeof(*params)); + params->encoding = "raw"; + + if (copy_from_user(&uparams, _params, sizeof(uparams)) != 0) + return -EFAULT; + + ret = keyctl_pkey_params_get(uparams.key_id, _info, params); + if (ret < 0) + return ret; + + ret = params->key->type->asym_query(params, &info); + if (ret < 0) + return ret; + + switch (op) { + case KEYCTL_PKEY_ENCRYPT: + case KEYCTL_PKEY_DECRYPT: + if (uparams.in_len > info.max_enc_size || + uparams.out_len > info.max_dec_size) + return -EINVAL; + break; + case KEYCTL_PKEY_SIGN: + case KEYCTL_PKEY_VERIFY: + if (uparams.in_len > info.max_sig_size || + uparams.out_len > info.max_data_size) + return -EINVAL; + break; + default: + BUG(); + } + + params->in_len = uparams.in_len; + params->out_len = uparams.out_len; + return 0; +} + +/* + * Query information about an asymmetric key. 
+ */ +long keyctl_pkey_query(key_serial_t id, + const char __user *_info, + struct keyctl_pkey_query __user *_res) +{ + struct kernel_pkey_params params; + struct kernel_pkey_query res; + long ret; + + memset(&params, 0, sizeof(params)); + + ret = keyctl_pkey_params_get(id, _info, &params); + if (ret < 0) + goto error; + + ret = params.key->type->asym_query(&params, &res); + if (ret < 0) + goto error; + + ret = -EFAULT; + if (copy_to_user(_res, &res, sizeof(res)) == 0 && + clear_user(_res->__spare, sizeof(_res->__spare)) == 0) + ret = 0; + +error: + keyctl_pkey_params_free(&params); + return ret; +} + +/* + * Encrypt/decrypt/sign + * + * Encrypt data, decrypt data or sign data using a public key. + * + * _info is a string of supplementary information in key=val format. For + * instance, it might contain: + * + * "enc=pkcs1 hash=sha256" + * + * where enc= specifies the encoding and hash= selects the OID to go in that + * particular encoding if required. If enc= isn't supplied, it's assumed that + * the caller is supplying raw values. + * + * If successful, the amount of data written into the output buffer is + * returned.
+ */ +long keyctl_pkey_e_d_s(int op, + const struct keyctl_pkey_params __user *_params, + const char __user *_info, + const void __user *_in, + void __user *_out) +{ + struct kernel_pkey_params params; + void *in, *out; + long ret; + + ret = keyctl_pkey_params_get_2(_params, _info, op, &params); + if (ret < 0) + goto error_params; + + ret = -EOPNOTSUPP; + if (!params.key->type->asym_eds_op) + goto error_params; + + switch (op) { + case KEYCTL_PKEY_ENCRYPT: + params.op = kernel_pkey_encrypt; + break; + case KEYCTL_PKEY_DECRYPT: + params.op = kernel_pkey_decrypt; + break; + case KEYCTL_PKEY_SIGN: + params.op = kernel_pkey_sign; + break; + default: + BUG(); + } + + in = memdup_user(_in, params.in_len); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto error_params; + } + + ret = -ENOMEM; + out = kmalloc(params.out_len, GFP_KERNEL); + if (!out) + goto error_in; + + ret = params.key->type->asym_eds_op(&params, in, out); + if (ret < 0) + goto error_out; + + if (copy_to_user(_out, out, ret) != 0) + ret = -EFAULT; + +error_out: + kfree(out); +error_in: + kfree(in); +error_params: + keyctl_pkey_params_free(&params); + return ret; +} + +/* + * Verify a signature. + * + * Verify a public key signature using the given key, or if not given, search + * for a matching key. + * + * _info is a string of supplementary information in key=val format. For + * instance, it might contain: + * + * "enc=pkcs1 hash=sha256" + * + * where enc= specifies the signature blob encoding and hash= selects the OID + * to go in that particular encoding. If enc= isn't supplied, it's assumed + * that the caller is supplying raw values. + * + * If successful, 0 is returned.
+ */ +long keyctl_pkey_verify(const struct keyctl_pkey_params __user *_params, + const char __user *_info, + const void __user *_in, + const void __user *_in2) +{ + struct kernel_pkey_params params; + void *in, *in2; + long ret; + + ret = keyctl_pkey_params_get_2(_params, _info, KEYCTL_PKEY_VERIFY, + &params); + if (ret < 0) + goto error_params; + + ret = -EOPNOTSUPP; + if (!params.key->type->asym_verify_signature) + goto error_params; + + in = memdup_user(_in, params.in_len); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto error_params; + } + + in2 = memdup_user(_in2, params.in2_len); + if (IS_ERR(in2)) { + ret = PTR_ERR(in2); + goto error_in; + } + + params.op = kernel_pkey_verify; + ret = params.key->type->asym_verify_signature(&params, in, in2); + + kfree(in2); +error_in: + kfree(in); +error_params: + keyctl_pkey_params_free(&params); + return ret; +} diff --git a/security/keys/trusted.c b/security/keys/trusted.c index b69d3b1777c2..ff6789365a12 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -30,7 +30,7 @@ #include <linux/tpm.h> #include <linux/tpm_command.h> -#include "trusted.h" +#include <keys/trusted.h> static const char hmac_alg[] = "hmac(sha1)"; static const char hash_alg[] = "sha1"; @@ -121,7 +121,7 @@ out: /* * calculate authorization info fields to send to TPM */ -static int TSS_authhmac(unsigned char *digest, const unsigned char *key, +int TSS_authhmac(unsigned char *digest, const unsigned char *key, unsigned int keylen, unsigned char *h1, unsigned char *h2, unsigned char h3, ...)
{ @@ -168,11 +168,12 @@ out: kzfree(sdesc); return ret; } +EXPORT_SYMBOL_GPL(TSS_authhmac); /* * verify the AUTH1_COMMAND (Seal) result from TPM */ -static int TSS_checkhmac1(unsigned char *buffer, +int TSS_checkhmac1(unsigned char *buffer, const uint32_t command, const unsigned char *ononce, const unsigned char *key, @@ -249,6 +250,7 @@ out: kzfree(sdesc); return ret; } +EXPORT_SYMBOL_GPL(TSS_checkhmac1); /* * verify the AUTH2_COMMAND (unseal) result from TPM @@ -355,7 +357,7 @@ out: * For key specific tpm requests, we will generate and send our * own TPM command packets using the drivers send function. */ -static int trusted_tpm_send(unsigned char *cmd, size_t buflen) +int trusted_tpm_send(unsigned char *cmd, size_t buflen) { int rc; @@ -367,6 +369,7 @@ static int trusted_tpm_send(unsigned char *cmd, size_t buflen) rc = -EPERM; return rc; } +EXPORT_SYMBOL_GPL(trusted_tpm_send); /* * Lock a trusted key, by extending a selected PCR. @@ -425,7 +428,7 @@ static int osap(struct tpm_buf *tb, struct osapsess *s, /* * Create an object independent authorisation protocol (oiap) session */ -static int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) +int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) { int ret; @@ -442,6 +445,7 @@ static int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) TPM_NONCE_SIZE); return 0; } +EXPORT_SYMBOL_GPL(oiap); struct tpm_digests { unsigned char encauth[SHA1_DIGEST_SIZE]; diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index fcd965f1d69e..9be76c808fcc 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -146,53 +146,22 @@ static int apply_constraint_to_size(struct snd_pcm_hw_params *params, struct snd_interval *s = hw_param_interval(params, rule->var); const struct snd_interval *r = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_RATE); - struct snd_interval t = { - .min = s->min, .max = s->max, .integer = 1, - }; + struct snd_interval t 
= {0}; + unsigned int step = 0; int i; for (i = 0; i < CIP_SFC_COUNT; ++i) { - unsigned int rate = amdtp_rate_table[i]; - unsigned int step = amdtp_syt_intervals[i]; - - if (!snd_interval_test(r, rate)) - continue; - - t.min = roundup(t.min, step); - t.max = rounddown(t.max, step); + if (snd_interval_test(r, amdtp_rate_table[i])) + step = max(step, amdtp_syt_intervals[i]); } - if (snd_interval_checkempty(&t)) - return -EINVAL; + t.min = roundup(s->min, step); + t.max = rounddown(s->max, step); + t.integer = 1; return snd_interval_refine(s, &t); } -static int apply_constraint_to_rate(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule) -{ - struct snd_interval *r = - hw_param_interval(params, SNDRV_PCM_HW_PARAM_RATE); - const struct snd_interval *s = hw_param_interval_c(params, rule->deps[0]); - struct snd_interval t = { - .min = UINT_MAX, .max = 0, .integer = 1, - }; - int i; - - for (i = 0; i < CIP_SFC_COUNT; ++i) { - unsigned int step = amdtp_syt_intervals[i]; - unsigned int rate = amdtp_rate_table[i]; - - if (s->min % step || s->max % step) - continue; - - t.min = min(t.min, rate); - t.max = max(t.max, rate); - } - - return snd_interval_refine(r, &t); -} - /** * amdtp_stream_add_pcm_hw_constraints - add hw constraints for PCM substream * @s: the AMDTP stream, which must be initialized. 
@@ -250,24 +219,16 @@ int amdtp_stream_add_pcm_hw_constraints(struct amdtp_stream *s, */ err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, apply_constraint_to_size, NULL, + SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) goto end; - err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, - apply_constraint_to_rate, NULL, - SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); - if (err < 0) - goto end; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, apply_constraint_to_size, NULL, + SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) goto end; - err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, - apply_constraint_to_rate, NULL, - SNDRV_PCM_HW_PARAM_BUFFER_SIZE, -1); - if (err < 0) - goto end; end: return err; } diff --git a/sound/firewire/dice/dice.c b/sound/firewire/dice/dice.c index 0f6dbcffe711..ed50b222d36e 100644 --- a/sound/firewire/dice/dice.c +++ b/sound/firewire/dice/dice.c @@ -240,8 +240,8 @@ static void dice_remove(struct fw_unit *unit) cancel_delayed_work_sync(&dice->dwork); if (dice->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(dice->card); + // Block till all of ALSA character devices are released. 
+ snd_card_free(dice->card); } mutex_destroy(&dice->mutex); diff --git a/sound/pci/ca0106/ca0106.h b/sound/pci/ca0106/ca0106.h index 04402c14cb23..9847b669cf3c 100644 --- a/sound/pci/ca0106/ca0106.h +++ b/sound/pci/ca0106/ca0106.h @@ -582,7 +582,7 @@ #define SPI_PL_BIT_R_R (2<<7) /* right channel = right */ #define SPI_PL_BIT_R_C (3<<7) /* right channel = (L+R)/2 */ #define SPI_IZD_REG 2 -#define SPI_IZD_BIT (1<<4) /* infinite zero detect */ +#define SPI_IZD_BIT (0<<4) /* infinite zero detect */ #define SPI_FMT_REG 3 #define SPI_FMT_BIT_RJ (0<<0) /* right justified mode */ diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h index 5072cbd15c82..dae1584cf017 100644 --- a/tools/arch/arm64/include/uapi/asm/unistd.h +++ b/tools/arch/arm64/include/uapi/asm/unistd.h @@ -16,5 +16,6 @@ */ #define __ARCH_WANT_RENAMEAT +#define __ARCH_WANT_NEW_STAT #include <asm-generic/unistd.h> diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 1b32b56a03d3..8c876c166ef2 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) +#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 9a50f02b9894..16511d97e8dc 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +#define KVM_S390_VM_CRYPTO_ENABLE_APIE 4 +#define 
KVM_S390_VM_CRYPTO_DISABLE_APIE 5 /* kvm attributes for migration mode */ #define KVM_S390_VM_MIGRATION_STOP 0 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 8a6eff9c27f3..dabfcf7c3941 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -300,10 +300,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - union { - __u8 pad; - __u8 pending; - }; + __u8 pending; __u32 error_code; } exception; struct { @@ -387,6 +384,7 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index df4bedb9b01c..538546edbfbd 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -242,10 +242,12 @@ __SYSCALL(__NR_tee, sys_tee) /* fs/stat.c */ #define __NR_readlinkat 78 __SYSCALL(__NR_readlinkat, sys_readlinkat) +#if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64) #define __NR3264_fstatat 79 __SC_3264(__NR3264_fstatat, sys_fstatat64, sys_newfstatat) #define __NR3264_fstat 80 __SC_3264(__NR3264_fstat, sys_fstat64, sys_newfstat) +#endif /* fs/sync.c */ #define __NR_sync 81 diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h new file mode 100644 index 000000000000..a441ea1bfe6d --- /dev/null +++ b/tools/include/uapi/linux/fs.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FS_H +#define _UAPI_LINUX_FS_H + +/* + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. 
Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. + */ + +#include <linux/limits.h> +#include <linux/ioctl.h> +#include <linux/types.h> + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) + +#define SEEK_SET 0 /* seek relative to beginning of file */ +#define SEEK_CUR 1 /* seek relative to current file position */ +#define SEEK_END 2 /* seek relative to end of file */ +#define SEEK_DATA 3 /* seek to the next data */ +#define SEEK_HOLE 4 /* seek to the next hole */ +#define SEEK_MAX SEEK_HOLE + +#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ +#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ + +struct file_clone_range { + __s64 src_fd; + __u64 src_offset; + __u64 src_length; + __u64 dest_offset; +}; + +struct fstrim_range { + __u64 start; + __u64 len; + __u64 minlen; +}; + +/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ +#define FILE_DEDUPE_RANGE_SAME 0 +#define FILE_DEDUPE_RANGE_DIFFERS 1 + +/* from struct btrfs_ioctl_file_extent_same_info */ +struct file_dedupe_range_info { + __s64 dest_fd; /* in - destination file */ + __u64 dest_offset; /* in - start of extent in destination */ + __u64 bytes_deduped; /* out - total # of bytes we were able + * to dedupe from this file. 
*/ + /* status of this dedupe operation: + * < 0 for error + * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds + * == FILE_DEDUPE_RANGE_DIFFERS if data differs + */ + __s32 status; /* out - see above description */ + __u32 reserved; /* must be zero */ +}; + +/* from struct btrfs_ioctl_file_extent_same_args */ +struct file_dedupe_range { + __u64 src_offset; /* in - start of extent in source */ + __u64 src_length; /* in - length of extent */ + __u16 dest_count; /* in - total elements in info array */ + __u16 reserved1; /* must be zero */ + __u32 reserved2; /* must be zero */ + struct file_dedupe_range_info info[0]; +}; + +/* And dynamically-tunable limits and defaults: */ +struct files_stat_struct { + unsigned long nr_files; /* read only */ + unsigned long nr_free_files; /* read only */ + unsigned long max_files; /* tunable */ +}; + +struct inodes_stat_t { + long nr_inodes; + long nr_unused; + long dummy[5]; /* padding for sysctl ABI compatibility */ +}; + + +#define NR_FILE 8192 /* this can well be larger on a larger system */ + + +/* + * These are the fs-independent mount-flags: up to 32 flags are supported + */ +#define MS_RDONLY 1 /* Mount read-only */ +#define MS_NOSUID 2 /* Ignore suid and sgid bits */ +#define MS_NODEV 4 /* Disallow access to device special files */ +#define MS_NOEXEC 8 /* Disallow program execution */ +#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ +#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ +#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOATIME 1024 /* Do not update access times. */ +#define MS_NODIRATIME 2048 /* Do not update directory access times */ +#define MS_BIND 4096 +#define MS_MOVE 8192 +#define MS_REC 16384 +#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. + MS_VERBOSE is deprecated. 
*/ +#define MS_SILENT 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define MS_UNBINDABLE (1<<17) /* change to unbindable */ +#define MS_PRIVATE (1<<18) /* change to private */ +#define MS_SLAVE (1<<19) /* change to slave */ +#define MS_SHARED (1<<20) /* change to shared */ +#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ +#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ +#define MS_I_VERSION (1<<23) /* Update inode I_version field */ +#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) +#define MS_NOREMOTELOCK (1<<27) +#define MS_NOSEC (1<<28) +#define MS_BORN (1<<29) +#define MS_ACTIVE (1<<30) +#define MS_NOUSER (1<<31) + +/* + * Superblock flags that can be altered by MS_REMOUNT + */ +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) + +/* + * Old magic mount flag and mask + */ +#define MS_MGC_VAL 0xC0ED0000 +#define MS_MGC_MSK 0xffff0000 + +/* + * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR. 
+ */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + __u32 fsx_cowextsize; /* CoW extsize field value (get/set)*/ + unsigned char fsx_pad[8]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ +#define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + +/* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. 
*/ + +#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ +#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ +#define BLKRRPART _IO(0x12,95) /* re-read partition table */ +#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ +#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ +#define BLKRASET _IO(0x12,98) /* set read ahead for block device */ +#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ +#define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ +#define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ +#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ +#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ +#define BLKSSZGET _IO(0x12,104)/* get block device sector size */ +#if 0 +#define BLKPG _IO(0x12,105)/* See blkpg.h */ + +/* Some people are morons. Do not use sizeof! */ + +#define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ +#define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ +/* This was here just to show that the number is taken - + probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ +#endif +/* A jump here: 108-111 have been used for various private purposes. 
*/ +#define BLKBSZGET _IOR(0x12,112,size_t) +#define BLKBSZSET _IOW(0x12,113,size_t) +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ +#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) +#define BLKTRACESTART _IO(0x12,116) +#define BLKTRACESTOP _IO(0x12,117) +#define BLKTRACETEARDOWN _IO(0x12,118) +#define BLKDISCARD _IO(0x12,119) +#define BLKIOMIN _IO(0x12,120) +#define BLKIOOPT _IO(0x12,121) +#define BLKALIGNOFF _IO(0x12,122) +#define BLKPBSZGET _IO(0x12,123) +#define BLKDISCARDZEROES _IO(0x12,124) +#define BLKSECDISCARD _IO(0x12,125) +#define BLKROTATIONAL _IO(0x12,126) +#define BLKZEROOUT _IO(0x12,127) +/* + * A jump here: 130-131 are reserved for zoned block devices + * (see uapi/linux/blkzoned.h) + */ + +#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ +#define FIBMAP _IO(0x00,1) /* bmap access */ +#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ +#define FIFREEZE _IOWR('X', 119, int) /* Freeze */ +#define FITHAW _IOWR('X', 120, int) /* Thaw */ +#define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */ +#define FICLONE _IOW(0x94, 9, int) +#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range) +#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range) + +#define FSLABEL_MAX 256 /* Max chars for the interface; each fs may differ */ + +#define FS_IOC_GETFLAGS _IOR('f', 1, long) +#define FS_IOC_SETFLAGS _IOW('f', 2, long) +#define FS_IOC_GETVERSION _IOR('v', 1, long) +#define FS_IOC_SETVERSION _IOW('v', 2, long) +#define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap) +#define FS_IOC32_GETFLAGS _IOR('f', 1, int) +#define FS_IOC32_SETFLAGS _IOW('f', 2, int) +#define FS_IOC32_GETVERSION _IOR('v', 1, int) +#define FS_IOC32_SETVERSION _IOW('v', 2, int) +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) +#define FS_IOC_SETFSLABEL _IOW(0x94, 50, 
char[FSLABEL_MAX]) + +/* + * File system encryption support + */ +/* Policy provided via an ioctl on the topmost directory */ +#define FS_KEY_DESCRIPTOR_SIZE 8 + +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 +#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */ +#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */ + +struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) + +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + +/* + * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) + * + * Note: for historical reasons, these flags were originally used and + * defined for use by ext2/ext3, and then other file systems started + * using these flags so they wouldn't need to write their own version + * of chattr/lsattr (which was shipped as part of e2fsprogs). 
You + * should think twice before trying to use these flags in new + * contexts, or trying to assign these flags, since they are used both + * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are + * almost out of 32-bit flags. :-) + * + * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from + * XFS to the generic FS level interface. This uses a structure that + * has padding and hence has more room to grow, so it may be more + * appropriate for many new use cases. + * + * Please do not change these flags or interfaces before checking with + * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org. + */ +#define FS_SECRM_FL 0x00000001 /* Secure deletion */ +#define FS_UNRM_FL 0x00000002 /* Undelete */ +#define FS_COMPR_FL 0x00000004 /* Compress file */ +#define FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define FS_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define FS_DIRTY_FL 0x00000100 +#define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define FS_NOCOMP_FL 0x00000400 /* Don't compress */ +/* End compression flags --- maybe not all used */ +#define FS_ENCRYPT_FL 0x00000800 /* Encrypted file */ +#define FS_BTREE_FL 0x00001000 /* btree format dir */ +#define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define FS_IMAGIC_FL 0x00002000 /* AFS directory */ +#define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ +#define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ +#define FS_EXTENT_FL 0x00080000 /* Extents */ +#define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ +#define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ + +#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + + +#define SYNC_FILE_RANGE_WAIT_BEFORE 1 +#define SYNC_FILE_RANGE_WRITE 2 +#define SYNC_FILE_RANGE_WAIT_AFTER 4 + +/* + * Flags for preadv2/pwritev2: + */ + +typedef int __bitwise __kernel_rwf_t; + +/* high priority request, poll if possible */ +#define RWF_HIPRI ((__force __kernel_rwf_t)0x00000001) + +/* per-IO O_DSYNC */ +#define RWF_DSYNC ((__force __kernel_rwf_t)0x00000002) + +/* per-IO O_SYNC */ +#define RWF_SYNC ((__force __kernel_rwf_t)0x00000004) + +/* per-IO, return -EAGAIN if operation would block */ +#define RWF_NOWAIT ((__force __kernel_rwf_t)0x00000008) + +/* per-IO O_APPEND */ +#define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) + +/* 
mask of flags supported by the kernel */ +#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ + RWF_APPEND) + +#endif /* _UAPI_LINUX_FS_H */ diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 58faab897201..1debfa42cba1 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -287,6 +287,7 @@ enum { IFLA_BR_MCAST_STATS_ENABLED, IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, __IFLA_BR_MAX, }; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 2875ce85b322..2b7a652c9fa4 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -420,13 +420,19 @@ struct kvm_run { struct kvm_coalesced_mmio_zone { __u64 addr; __u32 size; - __u32 pad; + union { + __u32 pad; + __u32 pio; + }; }; struct kvm_coalesced_mmio { __u64 phys_addr; __u32 len; - __u32 pad; + union { + __u32 pad; + __u32 pio; + }; __u8 data[8]; }; @@ -752,6 +758,15 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 /* + * On arm64, machine type can be used to request the physical + * address size for the VM. Bits[7-0] are reserved for the guest + * PA size shift (i.e, log2(PA_Size)). For backward compatibility, + * value 0 implies the default IPA size, 40bits. 
+ */ +#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL +#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ + ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) +/* * ioctls for /dev/kvm fds: */ #define KVM_GET_API_VERSION _IO(KVMIO, 0x00) @@ -958,6 +973,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_SEND_IPI 161 #define KVM_CAP_COALESCED_PIO 162 #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 +#define KVM_CAP_EXCEPTION_PAYLOAD 164 +#define KVM_CAP_ARM_VM_IPA_SIZE 165 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index bfd5938fede6..d0f515d53299 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -28,7 +28,9 @@ #define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/tools/include/uapi/linux/netlink.h b/tools/include/uapi/linux/netlink.h index 776bc92e9118..486ed1f0c0bc 100644 --- a/tools/include/uapi/linux/netlink.h +++ b/tools/include/uapi/linux/netlink.h @@ -155,6 +155,7 @@ enum nlmsgerr_attrs { #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 #define NETLINK_EXT_ACK 11 +#define NETLINK_DUMP_STRICT_CHK 12 struct nl_pktinfo { __u32 group; diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index f35eb72739c0..9de8780ac8d9 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -646,10 +646,12 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) * 
PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* * These PERF_RECORD_MISC_* flags below are safely reused diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index ed0a120d4f08..404d4b9ffe76 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -752,7 +752,7 @@ struct snd_timer_info { #define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ struct snd_timer_params { - unsigned int flags; /* flags - SNDRV_MIXER_PSFLG_* */ + unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ unsigned int ticks; /* requested resolution in ticks */ unsigned int queue_size; /* total size of queue (32-1024) */ unsigned int reserved0; /* reserved, was: failure locations */ diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index cb7154eccbdc..dbb9efbf718a 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -116,6 +116,7 @@ static int get_value(struct parse_opt_ctx_t *p, case OPTION_INTEGER: case OPTION_UINTEGER: case OPTION_LONG: + case OPTION_ULONG: case OPTION_U64: default: break; @@ -166,6 +167,7 @@ static int get_value(struct parse_opt_ctx_t *p, case OPTION_INTEGER: case OPTION_UINTEGER: case OPTION_LONG: + case OPTION_ULONG: case OPTION_U64: default: break; @@ -295,6 +297,22 @@ static int get_value(struct parse_opt_ctx_t *p, return opterror(opt, "expects a numerical value", flags); return 0; + case OPTION_ULONG: + if (unset) { + *(unsigned long *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(unsigned long *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(unsigned long *)opt->value = strtoul(arg, (char **)&s, 10); + if (*s) + return 
opterror(opt, "expects a numerical value", flags); + return 0; + case OPTION_U64: if (unset) { *(u64 *)opt->value = 0; @@ -703,6 +721,7 @@ static void print_option_help(const struct option *opts, int full) case OPTION_ARGUMENT: break; case OPTION_LONG: + case OPTION_ULONG: case OPTION_U64: case OPTION_INTEGER: case OPTION_UINTEGER: diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index 92fdbe1519f6..6ca2a8bfe716 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -25,6 +25,7 @@ enum parse_opt_type { OPTION_STRING, OPTION_INTEGER, OPTION_LONG, + OPTION_ULONG, OPTION_CALLBACK, OPTION_U64, OPTION_UINTEGER, @@ -133,6 +134,7 @@ struct option { #define OPT_INTEGER(s, l, v, h) { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } #define OPT_UINTEGER(s, l, v, h) { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) } #define OPT_LONG(s, l, v, h) { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) } +#define OPT_ULONG(s, l, v, h) { .type = OPTION_ULONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned long *), .help = (h) } #define OPT_U64(s, l, v, h) { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) } #define OPT_STRING(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), .argh = (a), .help = (h) } #define OPT_STRING_OPTARG(s, l, v, a, h, d) \ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2928939b98ec..0414a0d52262 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -836,7 +836,7 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn, struct symbol *pfunc = insn->func->pfunc; unsigned int prev_offset = 0; - list_for_each_entry_from(rela, 
&file->rodata->rela->rela_list, list) { + list_for_each_entry_from(rela, &table->rela_sec->rela_list, list) { if (rela == next_table) break; @@ -926,6 +926,7 @@ static struct rela *find_switch_table(struct objtool_file *file, { struct rela *text_rela, *rodata_rela; struct instruction *orig_insn = insn; + struct section *rodata_sec; unsigned long table_offset; /* @@ -953,10 +954,13 @@ static struct rela *find_switch_table(struct objtool_file *file, /* look for a relocation which references .rodata */ text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len); - if (!text_rela || text_rela->sym != file->rodata->sym) + if (!text_rela || text_rela->sym->type != STT_SECTION || + !text_rela->sym->sec->rodata) continue; table_offset = text_rela->addend; + rodata_sec = text_rela->sym->sec; + if (text_rela->type == R_X86_64_PC32) table_offset += 4; @@ -964,10 +968,10 @@ static struct rela *find_switch_table(struct objtool_file *file, * Make sure the .rodata address isn't associated with a * symbol. gcc jump tables are anonymous data. */ - if (find_symbol_containing(file->rodata, table_offset)) + if (find_symbol_containing(rodata_sec, table_offset)) continue; - rodata_rela = find_rela_by_dest(file->rodata, table_offset); + rodata_rela = find_rela_by_dest(rodata_sec, table_offset); if (rodata_rela) { /* * Use of RIP-relative switch jumps is quite rare, and @@ -1052,7 +1056,7 @@ static int add_switch_table_alts(struct objtool_file *file) struct symbol *func; int ret; - if (!file->rodata || !file->rodata->rela) + if (!file->rodata) return 0; for_each_sec(file, sec) { @@ -1198,10 +1202,33 @@ static int read_retpoline_hints(struct objtool_file *file) return 0; } +static void mark_rodata(struct objtool_file *file) +{ + struct section *sec; + bool found = false; + + /* + * This searches for the .rodata section or multiple .rodata.func_name + * sections if -fdata-sections is being used. 
The .str.1.1 and .str.1.8 + * rodata sections are ignored as they don't contain jump tables. + */ + for_each_sec(file, sec) { + if (!strncmp(sec->name, ".rodata", 7) && + !strstr(sec->name, ".str1.")) { + sec->rodata = true; + found = true; + } + } + + file->rodata = found; +} + static int decode_sections(struct objtool_file *file) { int ret; + mark_rodata(file); + ret = decode_instructions(file); if (ret) return ret; @@ -2171,7 +2198,6 @@ int check(const char *_objname, bool orc) INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard"); - file.rodata = find_section_by_name(file.elf, ".rodata"); file.c_file = find_section_by_name(file.elf, ".comment"); file.ignore_unreachables = no_unreachable; file.hints = false; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 95700a2bcb7c..e6e8a655b556 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -60,8 +60,8 @@ struct objtool_file { struct elf *elf; struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 16); - struct section *rodata, *whitelist; - bool ignore_unreachables, c_file, hints; + struct section *whitelist; + bool ignore_unreachables, c_file, hints, rodata; }; int check(const char *objname, bool orc); diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7ec85d567598..6dbb9fae0f9d 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -301,7 +301,7 @@ static int read_symbols(struct elf *elf) if (sym->type != STT_FUNC) continue; sym->pfunc = sym->cfunc = sym; - coldstr = strstr(sym->name, ".cold."); + coldstr = strstr(sym->name, ".cold"); if (!coldstr) continue; @@ -379,6 +379,7 @@ static int read_relas(struct elf *elf) rela->offset = rela->rela.r_offset; symndx = GELF_R_SYM(rela->rela.r_info); rela->sym = find_symbol_by_index(elf, symndx); + rela->rela_sec = sec; if (!rela->sym) { WARN("can't find rela entry symbol %d for %s", symndx, sec->name); diff --git 
a/tools/objtool/elf.h b/tools/objtool/elf.h index de5cd2ddded9..bc97ed86b9cd 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -48,7 +48,7 @@ struct section { char *name; int idx; unsigned int len; - bool changed, text; + bool changed, text, rodata; }; struct symbol { @@ -68,6 +68,7 @@ struct rela { struct list_head list; struct hlist_node hash; GElf_Rela rela; + struct section *rela_sec; struct symbol *sym; unsigned int type; unsigned long offset; diff --git a/tools/perf/Documentation/build-xed.txt b/tools/perf/Documentation/build-xed.txt new file mode 100644 index 000000000000..6222c1e7231f --- /dev/null +++ b/tools/perf/Documentation/build-xed.txt @@ -0,0 +1,19 @@ + +For --xed the xed tool is needed. Here is how to install it: + + $ git clone https://github.com/intelxed/mbuild.git mbuild + $ git clone https://github.com/intelxed/xed + $ cd xed + $ ./mfile.py --share + $ ./mfile.py examples + $ sudo ./mfile.py --prefix=/usr/local install + $ sudo ldconfig + $ sudo cp obj/examples/xed /usr/local/bin + +Basic xed testing: + + $ xed | head -3 + ERROR: required argument(s) were missing + Copyright (C) 2017, Intel Corporation. All rights reserved. + XED version: [v10.0-328-g7d62c8c49b7b] + $ diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index 76971d2e4164..115eaacc455f 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -106,7 +106,7 @@ in transaction, respectively. While it is possible to create scripts to analyze the data, an alternative approach is available to export the data to a sqlite or postgresql database. Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, -and to script call-graph-from-sql.py for an example of using the database. +and to script exported-sql-viewer.py for an example of using the database. There is also script intel-pt-events.py which provides an example of how to unpack the raw data for power events and PTWRITE. 
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index a3abe04c779d..c2182cbabde3 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -11,10 +11,11 @@ l synthesize last branch entries (use with i or x) s skip initial number of events - The default is all events i.e. the same as --itrace=ibxwpe + The default is all events i.e. the same as --itrace=ibxwpe, + except for perf script where it is --itrace=ce - In addition, the period (default 100000) for instructions events - can be specified in units of: + In addition, the period (default 100000, except for perf script where it is 1) + for instructions events can be specified in units of: i instructions t ticks diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index afdafe2110a1..a2b37ce48094 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -383,6 +383,24 @@ include::itrace.txt[] will be printed. Each entry has function name and file/line. Enabled by default, disable with --no-inline. +--insn-trace:: + Show instruction stream for intel_pt traces. Combine with --xed to + show disassembly. + +--xed:: + Run xed disassembler on output. Requires installing the xed disassembler. + +--call-trace:: + Show call stream for intel_pt traces. The CPUs are interleaved, but + can be filtered with -C. + +--call-ret-trace:: + Show call and return stream for intel_pt traces. + +--graph-function:: + For itrace only show specified functions and their callees for + itrace. Multiple functions can be separated by comma. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 114fda12aa49..808b664343c9 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -242,6 +242,16 @@ Default is to monitor all CPUS. 
--hierarchy:: Enable hierarchy output. +--overwrite:: + Enable this to use just the most recent records, which helps in high core count + machines such as Knights Landing/Mill, but right now is disabled by default as + the pausing used in this technique is leading to loss of metadata events such + as PERF_RECORD_MMAP which makes 'perf top' unable to resolve samples, leading + to lots of unknown samples appearing on the UI. Enable this if you are in such + machines and profiling a workload that doesn't creates short lived threads and/or + doesn't uses many executable mmap operations. Work is being planed to solve + this situation, till then, this will remain disabled by default. + --force:: Don't do ownership validation. diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 115db9e06ecd..e113450503d2 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -171,6 +171,11 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --kernel-syscall-graph:: Show the kernel callchains on the syscall exit path. +--max-events=N:: + Stop after processing N events. Note that strace-like events are considered + only at exit time or when a syscall is interrupted, i.e. in those cases this + option is equivalent to the number of lines printed. + --max-stack:: Set the stack depth limit when parsing the callchain, anything beyond the specified depth will be ignored. Note that at this point @@ -238,6 +243,68 @@ Trace syscalls, major and minor pagefaults: As you can see, there was major pagefault in python process, from CRYPTO_push_info_ routine which faulted somewhere in libcrypto.so. 
+Trace the first 4 open, openat or open_by_handle_at syscalls (in the future more syscalls may match here): + + $ perf trace -e open* --max-events 4 + [root@jouet perf]# trace -e open* --max-events 4 + 2272.992 ( 0.037 ms): gnome-shell/1370 openat(dfd: CWD, filename: /proc/self/stat) = 31 + 2277.481 ( 0.139 ms): gnome-shell/3039 openat(dfd: CWD, filename: /proc/self/stat) = 65 + 3026.398 ( 0.076 ms): gnome-shell/3039 openat(dfd: CWD, filename: /proc/self/stat) = 65 + 4294.665 ( 0.015 ms): sed/15879 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC) = 3 + $ + +Trace the first minor page fault when running a workload: + + # perf trace -F min --max-stack=7 --max-events 1 sleep 1 + 0.000 ( 0.000 ms): sleep/18006 minfault [__clear_user+0x1a] => 0x5626efa56080 (?k) + __clear_user ([kernel.kallsyms]) + load_elf_binary ([kernel.kallsyms]) + search_binary_handler ([kernel.kallsyms]) + __do_execve_file.isra.33 ([kernel.kallsyms]) + __x64_sys_execve ([kernel.kallsyms]) + do_syscall_64 ([kernel.kallsyms]) + entry_SYSCALL_64 ([kernel.kallsyms]) + # + +Trace the next min page page fault to take place on the first CPU: + + # perf trace -F min --call-graph=dwarf --max-events 1 --cpu 0 + 0.000 ( 0.000 ms): Web Content/17136 minfault [js::gc::Chunk::fetchNextDecommittedArena+0x4b] => 0x7fbe6181b000 (?.) 
+ js::gc::FreeSpan::initAsEmpty (inlined) + js::gc::Arena::setAsNotAllocated (inlined) + js::gc::Chunk::fetchNextDecommittedArena (/usr/lib64/firefox/libxul.so) + js::gc::Chunk::allocateArena (/usr/lib64/firefox/libxul.so) + js::gc::GCRuntime::allocateArena (/usr/lib64/firefox/libxul.so) + js::gc::ArenaLists::allocateFromArena (/usr/lib64/firefox/libxul.so) + js::gc::GCRuntime::tryNewTenuredThing<JSString, (js::AllowGC)1> (inlined) + js::AllocateString<JSString, (js::AllowGC)1> (/usr/lib64/firefox/libxul.so) + js::Allocate<JSThinInlineString, (js::AllowGC)1> (inlined) + JSThinInlineString::new_<(js::AllowGC)1> (inlined) + AllocateInlineString<(js::AllowGC)1, unsigned char> (inlined) + js::ConcatStrings<(js::AllowGC)1> (/usr/lib64/firefox/libxul.so) + [0x18b26e6bc2bd] (/tmp/perf-17136.map) + # + +Trace the next two sched:sched_switch events, four block:*_plug events, the +next block:*_unplug and the next three net:*dev_queue events, this last one +with a backtrace of at most 16 entries, system wide: + + # perf trace -e sched:*switch/nr=2/,block:*_plug/nr=4/,block:*_unplug/nr=1/,net:*dev_queue/nr=3,max-stack=16/ + 0.000 :0/0 sched:sched_switch:swapper/2:0 [120] S ==> rcu_sched:10 [120] + 0.015 rcu_sched/10 sched:sched_switch:rcu_sched:10 [120] R ==> swapper/2:0 [120] + 254.198 irq/50-iwlwifi/680 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051f600 len=66 + __dev_queue_xmit ([kernel.kallsyms]) + 273.977 :0/0 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051f600 len=78 + __dev_queue_xmit ([kernel.kallsyms]) + 274.007 :0/0 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051ff00 len=78 + __dev_queue_xmit ([kernel.kallsyms]) + 2930.140 kworker/u16:58/2722 block:block_plug:[kworker/u16:58] + 2930.162 kworker/u16:58/2722 block:block_unplug:[kworker/u16:58] 1 + 4466.094 jbd2/dm-2-8/748 block:block_plug:[jbd2/dm-2-8] + 8050.123 kworker/u16:30/2694 block:block_plug:[kworker/u16:30] + 8050.271 kworker/u16:30/2694 block:block_plug:[kworker/u16:30] + # + SEE ALSO -------- 
linkperf:perf-record[1], linkperf:perf-script[1] diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 2f3bf025e305..3ccb4f0bf088 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1,4 +1,5 @@ include ../scripts/Makefile.include +include ../scripts/Makefile.arch # The default target of this Makefile is... all: @@ -385,6 +386,8 @@ export INSTALL SHELL_PATH SHELL = $(SHELL_PATH) linux_uapi_dir := $(srctree)/tools/include/uapi/linux +asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic +arch_asm_uapi_dir := $(srctree)/tools/arch/$(ARCH)/include/uapi/asm/ beauty_outdir := $(OUTPUT)trace/beauty/generated beauty_ioctl_outdir := $(beauty_outdir)/ioctl @@ -460,6 +463,18 @@ madvise_behavior_tbl := $(srctree)/tools/perf/trace/beauty/madvise_behavior.sh $(madvise_behavior_array): $(madvise_hdr_dir)/mman-common.h $(madvise_behavior_tbl) $(Q)$(SHELL) '$(madvise_behavior_tbl)' $(madvise_hdr_dir) > $@ +mmap_flags_array := $(beauty_outdir)/mmap_flags_array.c +mmap_flags_tbl := $(srctree)/tools/perf/trace/beauty/mmap_flags.sh + +$(mmap_flags_array): $(asm_generic_uapi_dir)/mman.h $(asm_generic_uapi_dir)/mman-common.h $(arch_asm_uapi_dir)/mman.h $(mmap_flags_tbl) + $(Q)$(SHELL) '$(mmap_flags_tbl)' $(asm_generic_uapi_dir) $(arch_asm_uapi_dir) > $@ + +mount_flags_array := $(beauty_outdir)/mount_flags_array.c +mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/mount_flags.sh + +$(mount_flags_array): $(linux_uapi_dir)/fs.h $(mount_flags_tbl) + $(Q)$(SHELL) '$(mount_flags_tbl)' $(linux_uapi_dir) > $@ + prctl_option_array := $(beauty_outdir)/prctl_option_array.c prctl_hdr_dir := $(srctree)/tools/include/uapi/linux/ prctl_option_tbl := $(srctree)/tools/perf/trace/beauty/prctl_option.sh @@ -577,6 +592,8 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc $(socket_ipproto_array) \ $(vhost_virtio_ioctl_array) \ $(madvise_behavior_array) \ + $(mmap_flags_array) \ + $(mount_flags_array) \ $(perf_ioctl_array) 
\ $(prctl_option_array) \ $(arch_errno_name_array) @@ -863,6 +880,8 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ $(OUTPUT)pmu-events/pmu-events.c \ $(OUTPUT)$(madvise_behavior_array) \ + $(OUTPUT)$(mmap_flags_array) \ + $(OUTPUT)$(mount_flags_array) \ $(OUTPUT)$(drm_ioctl_array) \ $(OUTPUT)$(pkey_alloc_access_rights_array) \ $(OUTPUT)$(sndrv_ctl_ioctl_array) \ diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl index 2dbb8cade048..c88fd32563eb 100755 --- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl @@ -23,7 +23,7 @@ create_table_from_c() { local sc nr last_sc - create_table_exe=`mktemp /tmp/create-table-XXXXXX` + create_table_exe=`mktemp ${TMPDIR:-/tmp}/create-table-XXXXXX` { diff --git a/tools/perf/arch/sparc/Makefile b/tools/perf/arch/sparc/Makefile index 7fbca175099e..275dea7ff59a 100644 --- a/tools/perf/arch/sparc/Makefile +++ b/tools/perf/arch/sparc/Makefile @@ -1,3 +1,5 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif + +PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/arch/sparc/annotate/instructions.c new file mode 100644 index 000000000000..2614c010c235 --- /dev/null +++ b/tools/perf/arch/sparc/annotate/instructions.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +static int is_branch_cond(const char *cond) +{ + if (cond[0] == '\0') + return 1; + + if (cond[0] == 'a' && cond[1] == '\0') + return 1; + + if (cond[0] == 'c' && + (cond[1] == 'c' || cond[1] == 's') && + cond[2] == '\0') + return 1; + + if (cond[0] == 'e' && + (cond[1] == '\0' || + (cond[1] == 'q' && cond[2] == '\0'))) + return 1; + + if (cond[0] == 'g' && + (cond[1] == '\0' || + (cond[1] == 't' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) 
+ return 1; + + if (cond[0] == 'l' && + (cond[1] == '\0' || + (cond[1] == 't' && cond[2] == '\0') || + (cond[1] == 'u' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'n' && + (cond[1] == '\0' || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'z' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'g' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'b' && + cond[1] == 'p' && + cond[2] == 'o' && + cond[3] == 's' && + cond[4] == '\0') + return 1; + + if (cond[0] == 'v' && + (cond[1] == 'c' || cond[1] == 's') && + cond[2] == '\0') + return 1; + + if (cond[0] == 'b' && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + return 0; +} + +static int is_branch_reg_cond(const char *cond) +{ + if ((cond[0] == 'n' || cond[0] == 'l') && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + if (cond[0] == 'z' && + cond[1] == '\0') + return 1; + + if ((cond[0] == 'g' || cond[0] == 'l') && + cond[1] == 'e' && + cond[2] == 'z' && + cond[3] == '\0') + return 1; + + if (cond[0] == 'g' && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + return 0; +} + +static int is_branch_float_cond(const char *cond) +{ + if (cond[0] == '\0') + return 1; + + if ((cond[0] == 'a' || cond[0] == 'e' || + cond[0] == 'z' || cond[0] == 'g' || + cond[0] == 'l' || cond[0] == 'n' || + cond[0] == 'o' || cond[0] == 'u') && + cond[1] == '\0') + return 1; + + if (((cond[0] == 'g' && cond[1] == 'e') || + (cond[0] == 'l' && (cond[1] == 'e' || + cond[1] == 'g')) || + (cond[0] == 'n' && (cond[1] == 'e' || + cond[1] == 'z')) || + (cond[0] == 'u' && (cond[1] == 'e' || + cond[1] == 'g' || + cond[1] == 'l'))) && + cond[2] == '\0') + return 1; + + if (cond[0] == 'u' && + (cond[1] == 'g' || cond[1] == 'l') && + cond[2] == 'e' && + cond[3] == '\0') + return 1; + + return 0; +} + +static struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name) +{ + struct ins_ops *ops 
= NULL; + + if (!strcmp(name, "call") || + !strcmp(name, "jmp") || + !strcmp(name, "jmpl")) { + ops = &call_ops; + } else if (!strcmp(name, "ret") || + !strcmp(name, "retl") || + !strcmp(name, "return")) { + ops = &ret_ops; + } else if (!strcmp(name, "mov")) { + ops = &mov_ops; + } else { + if (name[0] == 'c' && + (name[1] == 'w' || name[1] == 'x')) + name += 2; + + if (name[0] == 'b') { + const char *cond = name + 1; + + if (cond[0] == 'r') { + if (is_branch_reg_cond(cond + 1)) + ops = &jump_ops; + } else if (is_branch_cond(cond)) { + ops = &jump_ops; + } + } else if (name[0] == 'f' && name[1] == 'b') { + if (is_branch_float_cond(name + 2)) + ops = &jump_ops; + } + } + + if (ops) + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +static int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->initialized = true; + arch->associate_instruction_ops = sparc__associate_instruction_ops; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 0980dfe3396b..10cf889c6d75 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -592,6 +592,9 @@ static void record__init_features(struct record *rec) if (!rec->opts.full_auxtrace) perf_header__clear_feat(&session->header, HEADER_AUXTRACE); + if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) + perf_header__clear_feat(&session->header, HEADER_CLOCKID); + perf_header__clear_feat(&session->header, HEADER_STAT); } @@ -897,6 +900,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) record__init_features(rec); + if (rec->opts.use_clockid && rec->opts.clockid_res_ns) + session->header.env.clockid_res_ns = rec->opts.clockid_res_ns; + if (forks) { err = perf_evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe, @@ -1337,6 +1343,19 @@ static const struct clockid_map clockids[] = { CLOCKID_END, }; +static int 
get_clockid_res(clockid_t clk_id, u64 *res_ns) +{ + struct timespec res; + + *res_ns = 0; + if (!clock_getres(clk_id, &res)) + *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC; + else + pr_warning("WARNING: Failed to determine specified clock resolution.\n"); + + return 0; +} + static int parse_clockid(const struct option *opt, const char *str, int unset) { struct record_opts *opts = (struct record_opts *)opt->value; @@ -1360,7 +1379,7 @@ static int parse_clockid(const struct option *opt, const char *str, int unset) /* if its a number, we're done */ if (sscanf(str, "%d", &opts->clockid) == 1) - return 0; + return get_clockid_res(opts->clockid, &opts->clockid_res_ns); /* allow a "CLOCK_" prefix to the name */ if (!strncasecmp(str, "CLOCK_", 6)) @@ -1369,7 +1388,8 @@ static int parse_clockid(const struct option *opt, const char *str, int unset) for (cm = clockids; cm->name; cm++) { if (!strcasecmp(str, cm->name)) { opts->clockid = cm->clockid; - return 0; + return get_clockid_res(opts->clockid, + &opts->clockid_res_ns); } } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 4da5e32b9e03..b5bc85bd0bbe 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -44,6 +44,7 @@ #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> +#include <subcmd/pager.h> #include "sane_ctype.h" @@ -912,7 +913,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct perf_insn *x, u8 *inbuf, int len, - int insn, FILE *fp) + int insn, FILE *fp, int *total_cycles) { int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t#%s%s%s%s", ip, dump_insn(x, ip, inbuf, len, NULL), @@ -921,7 +922,8 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, en->flags.in_tx ? " INTX" : "", en->flags.abort ? 
" ABORT" : ""); if (en->flags.cycles) { - printed += fprintf(fp, " %d cycles", en->flags.cycles); + *total_cycles += en->flags.cycles; + printed += fprintf(fp, " %d cycles [%d]", en->flags.cycles, *total_cycles); if (insn) printed += fprintf(fp, " %.2f IPC", (float)insn / en->flags.cycles); } @@ -978,6 +980,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, u8 buffer[MAXBB]; unsigned off; struct symbol *lastsym = NULL; + int total_cycles = 0; if (!(br && br->nr)) return 0; @@ -998,7 +1001,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(br->entries[nr - 1].from, thread, x.cpumode, x.cpu, &lastsym, attr, fp); printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1], - &x, buffer, len, 0, fp); + &x, buffer, len, 0, fp, &total_cycles); } /* Print all blocks */ @@ -1026,7 +1029,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { - printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn, fp); + printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn, fp, + &total_cycles); break; } else { printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", ip, @@ -1104,6 +1108,35 @@ out: return printed; } +static const char *resolve_branch_sym(struct perf_sample *sample, + struct perf_evsel *evsel, + struct thread *thread, + struct addr_location *al, + u64 *ip) +{ + struct addr_location addr_al; + struct perf_event_attr *attr = &evsel->attr; + const char *name = NULL; + + if (sample->flags & (PERF_IP_FLAG_CALL | PERF_IP_FLAG_TRACE_BEGIN)) { + if (sample_addr_correlates_sym(attr)) { + thread__resolve(thread, &addr_al, sample); + if (addr_al.sym) + name = addr_al.sym->name; + else + *ip = sample->addr; + } else { + *ip = sample->addr; + } + } else if (sample->flags & (PERF_IP_FLAG_RETURN | PERF_IP_FLAG_TRACE_END)) { + if 
(al->sym) + name = al->sym->name; + else + *ip = sample->ip; + } + return name; +} + static int perf_sample__fprintf_callindent(struct perf_sample *sample, struct perf_evsel *evsel, struct thread *thread, @@ -1111,7 +1144,6 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, { struct perf_event_attr *attr = &evsel->attr; size_t depth = thread_stack__depth(thread); - struct addr_location addr_al; const char *name = NULL; static int spacing; int len = 0; @@ -1125,22 +1157,7 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, if (thread->ts && sample->flags & PERF_IP_FLAG_RETURN) depth += 1; - if (sample->flags & (PERF_IP_FLAG_CALL | PERF_IP_FLAG_TRACE_BEGIN)) { - if (sample_addr_correlates_sym(attr)) { - thread__resolve(thread, &addr_al, sample); - if (addr_al.sym) - name = addr_al.sym->name; - else - ip = sample->addr; - } else { - ip = sample->addr; - } - } else if (sample->flags & (PERF_IP_FLAG_RETURN | PERF_IP_FLAG_TRACE_END)) { - if (al->sym) - name = al->sym->name; - else - ip = sample->ip; - } + name = resolve_branch_sym(sample, evsel, thread, al, &ip); if (PRINT_FIELD(DSO) && !(PRINT_FIELD(IP) || PRINT_FIELD(ADDR))) { dlen += fprintf(fp, "("); @@ -1646,6 +1663,47 @@ static void perf_sample__fprint_metric(struct perf_script *script, } } +static bool show_event(struct perf_sample *sample, + struct perf_evsel *evsel, + struct thread *thread, + struct addr_location *al) +{ + int depth = thread_stack__depth(thread); + + if (!symbol_conf.graph_function) + return true; + + if (thread->filter) { + if (depth <= thread->filter_entry_depth) { + thread->filter = false; + return false; + } + return true; + } else { + const char *s = symbol_conf.graph_function; + u64 ip; + const char *name = resolve_branch_sym(sample, evsel, thread, al, + &ip); + unsigned nlen; + + if (!name) + return false; + nlen = strlen(name); + while (*s) { + unsigned len = strcspn(s, ","); + if (nlen == len && !strncmp(name, s, len)) { + thread->filter = 
true; + thread->filter_entry_depth = depth; + return true; + } + s += len; + if (*s == ',') + s++; + } + return false; + } +} + static void process_event(struct perf_script *script, struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al, @@ -1660,6 +1718,9 @@ static void process_event(struct perf_script *script, if (output[type].fields == 0) return; + if (!show_event(sample, evsel, thread, al)) + return; + ++es->samples; perf_sample__fprintf_start(sample, thread, evsel, @@ -1737,6 +1798,9 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(METRIC)) perf_sample__fprint_metric(script, thread, evsel, sample, fp); + + if (verbose) + fflush(fp); } static struct scripting_ops *scripting_ops; @@ -3100,6 +3164,44 @@ static int perf_script__process_auxtrace_info(struct perf_session *session, #define perf_script__process_auxtrace_info 0 #endif +static int parse_insn_trace(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + parse_output_fields(NULL, "+insn,-event,-period", 0); + itrace_parse_synth_opts(opt, "i0ns", 0); + nanosecs = true; + return 0; +} + +static int parse_xed(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + force_pager("xed -F insn: -A -64 | less"); + return 0; +} + +static int parse_call_trace(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + parse_output_fields(NULL, "-ip,-addr,-event,-period,+callindent", 0); + itrace_parse_synth_opts(opt, "cewp", 0); + nanosecs = true; + return 0; +} + +static int parse_callret_trace(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + parse_output_fields(NULL, "-ip,-addr,-event,-period,+callindent,+flags", 0); + itrace_parse_synth_opts(opt, "crewp", 0); + nanosecs = true; + return 0; +} + int cmd_script(int argc, const char **argv) { bool show_full_info 
= false; @@ -3109,7 +3211,10 @@ int cmd_script(int argc, const char **argv) char *rec_script_path = NULL; char *rep_script_path = NULL; struct perf_session *session; - struct itrace_synth_opts itrace_synth_opts = { .set = false, }; + struct itrace_synth_opts itrace_synth_opts = { + .set = false, + .default_no_sample = true, + }; char *script_path = NULL; const char **__argv; int i, j, err = 0; @@ -3184,6 +3289,16 @@ int cmd_script(int argc, const char **argv) "system-wide collection from all CPUs"), OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", "only consider these symbols"), + OPT_CALLBACK_OPTARG(0, "insn-trace", &itrace_synth_opts, NULL, NULL, + "Decode instructions from itrace", parse_insn_trace), + OPT_CALLBACK_OPTARG(0, "xed", NULL, NULL, NULL, + "Run xed disassembler on output", parse_xed), + OPT_CALLBACK_OPTARG(0, "call-trace", &itrace_synth_opts, NULL, NULL, + "Decode calls from from itrace", parse_call_trace), + OPT_CALLBACK_OPTARG(0, "call-ret-trace", &itrace_synth_opts, NULL, NULL, + "Decode calls and returns from itrace", parse_callret_trace), + OPT_STRING(0, "graph-function", &symbol_conf.graph_function, "symbol[,symbol...]", + "Only print symbols and callees with --call-trace/--call-ret-trace"), OPT_STRING(0, "stop-bt", &symbol_conf.bt_stop_list_str, "symbol[,symbol...]", "Stop display of callgraph at these symbols"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), @@ -3417,8 +3532,10 @@ int cmd_script(int argc, const char **argv) exit(-1); } - if (!script_name) + if (!script_name) { setup_pager(); + use_browser = 0; + } session = perf_session__new(&data, false, &script.tool); if (session == NULL) @@ -3439,7 +3556,8 @@ int cmd_script(int argc, const char **argv) script.session = session; script__setup_sample_type(&script); - if (output[PERF_TYPE_HARDWARE].fields & PERF_OUTPUT_CALLINDENT) + if ((output[PERF_TYPE_HARDWARE].fields & PERF_OUTPUT_CALLINDENT) || + symbol_conf.graph_function) 
itrace_synth_opts.thread_stack = true; session->itrace_synth_opts = &itrace_synth_opts; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b86aba1c8028..d1028d7755bb 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -409,6 +409,28 @@ static struct perf_evsel *perf_evsel__reset_weak_group(struct perf_evsel *evsel) return leader; } +static bool is_target_alive(struct target *_target, + struct thread_map *threads) +{ + struct stat st; + int i; + + if (!target__has_task(_target)) + return true; + + for (i = 0; i < threads->nr; i++) { + char path[PATH_MAX]; + + scnprintf(path, PATH_MAX, "%s/%d", procfs__mountpoint(), + threads->map[i].pid); + + if (!stat(path, &st)) + return true; + } + + return false; +} + static int __run_perf_stat(int argc, const char **argv, int run_idx) { int interval = stat_config.interval; @@ -579,6 +601,8 @@ try_again: enable_counters(); while (!done) { nanosleep(&ts, NULL); + if (!is_target_alive(&target, evsel_list->threads)) + break; if (timeout) break; if (interval) { diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d21d8751e749..b2838de13de0 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1134,11 +1134,6 @@ static int __cmd_top(struct perf_top *top) if (!target__none(&opts->target)) perf_evlist__enable(top->evlist); - /* Wait for a minimal set of events before starting the snapshot */ - perf_evlist__poll(top->evlist, 100); - - perf_top__mmap_read(top); - ret = -1; if (pthread_create(&thread, NULL, (use_browser > 0 ? 
display_thread_tui : display_thread), top)) { @@ -1156,6 +1151,11 @@ static int __cmd_top(struct perf_top *top) } } + /* Wait for a minimal set of events before starting the snapshot */ + perf_evlist__poll(top->evlist, 100); + + perf_top__mmap_read(top); + while (!done) { u64 hits = top->samples; @@ -1257,7 +1257,14 @@ int cmd_top(int argc, const char **argv) .uses_mmap = true, }, .proc_map_timeout = 500, - .overwrite = 1, + /* + * FIXME: This will lose PERF_RECORD_MMAP and other metadata + * when we pause, fix that and reenable. Probably using a + * separate evlist with a dummy event, i.e. a non-overwrite + * ring buffer just for metadata events, while PERF_RECORD_SAMPLE + * stays in overwrite mode. -acme + * */ + .overwrite = 0, }, .max_stack = sysctl__max_stack(), .annotation_opts = annotation__default_options, @@ -1372,6 +1379,8 @@ int cmd_top(int argc, const char **argv) "Show raw trace event output (do not use print fmt or plugins)"), OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy, "Show entries in a hierarchy"), + OPT_BOOLEAN(0, "overwrite", &top.record_opts.overwrite, + "Use a backward ring buffer, default: no"), OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"), OPT_UINTEGER(0, "num-thread-synthesize", &top.nr_threads_synthesize, "number of thread to run event synthesize"), diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 90289f31dd87..dc8a6c4986ce 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -89,6 +89,8 @@ struct trace { u64 base_time; FILE *output; unsigned long nr_events; + unsigned long nr_events_printed; + unsigned long max_events; struct strlist *ev_qualifier; struct { size_t nr; @@ -612,6 +614,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); + unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val); void *parm; const char 
*name; bool show_zero; @@ -723,6 +726,10 @@ static struct syscall_fmt { .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, }, + { .name = "mount", + .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ + .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, @@ -832,7 +839,8 @@ static struct syscall_fmt { .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, - { .name = "umount2", .alias = "umount", }, + { .name = "umount2", .alias = "umount", + .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, @@ -856,6 +864,18 @@ static struct syscall_fmt *syscall_fmt__find(const char *name) return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); } +static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) +{ + int i, nmemb = ARRAY_SIZE(syscall_fmts); + + for (i = 0; i < nmemb; ++i) { + if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0) + return &syscall_fmts[i]; + } + + return NULL; +} + /* * is_exit: is this "exit" or "exit_group"? * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. @@ -1485,6 +1505,19 @@ static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size, return scnprintf(bf, size, "arg%d: ", arg->idx); } +/* + * Check if the value is in fact zero, i.e. 
mask whatever needs masking, such + * as mount 'flags' argument that needs ignoring some magic flag, see comment + * in tools/perf/trace/beauty/mount_flags.c + */ +static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val) +{ + if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val) + return sc->arg_fmt[arg->idx].mask_val(arg, val); + + return val; +} + static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size, struct syscall_arg *arg, unsigned long val) { @@ -1533,6 +1566,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, continue; val = syscall_arg__val(&arg, arg.idx); + /* + * Some syscall args need some mask, most don't and + * return val untouched. + */ + val = syscall__mask_val(sc, &arg, val); /* * Suppress this argument if its value is zero and @@ -1664,6 +1702,8 @@ static int trace__printf_interrupted_entry(struct trace *trace) printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); ttrace->entry_pending = false; + ++trace->nr_events_printed; + return printed; } @@ -1810,12 +1850,14 @@ static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evse int max_stack = evsel->attr.sample_max_stack ? evsel->attr.sample_max_stack : trace->max_stack; + int err; - if (machine__resolve(trace->host, &al, sample) < 0 || - thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack)) + if (machine__resolve(trace->host, &al, sample) < 0) return -1; - return 0; + err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); + addr_location__put(&al); + return err; } static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) @@ -1940,6 +1982,13 @@ errno_print: { fputc('\n', trace->output); + /* + * We only consider an 'event' for the sake of --max-events a non-filtered + * sys_enter + sys_exit and other tracepoint events. 
+ */ + if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX) + interrupted = true; + if (callchain_ret > 0) trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) @@ -2072,14 +2121,25 @@ static void bpf_output__fprintf(struct trace *trace, { binary__fprintf(sample->raw_data, sample->raw_size, 8, bpf_output__printer, NULL, trace->output); + ++trace->nr_events_printed; } static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { - struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); + struct thread *thread; int callchain_ret = 0; + /* + * Check if we called perf_evsel__disable(evsel) due to, for instance, + * this event's max_events having been hit and this is an entry coming + * from the ring buffer that we should discard, since the max events + * have already been considered/printed. + */ + if (evsel->disabled) + return 0; + + thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); @@ -2127,6 +2187,12 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, event_format__fprintf(evsel->tp_format, sample->cpu, sample->raw_data, sample->raw_size, trace->output); + ++trace->nr_events_printed; + + if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) { + perf_evsel__disable(evsel); + perf_evsel__close(evsel); + } } } @@ -2137,8 +2203,8 @@ newline: trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); - thread__put(thread); out: + thread__put(thread); return 0; } @@ -2225,6 +2291,8 @@ static int trace__pgfault(struct trace *trace, trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) pr_err("Problem processing 
%s callchain, skipping...\n", perf_evsel__name(evsel)); + + ++trace->nr_events_printed; out: err = 0; out_put: @@ -2402,6 +2470,9 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st tracepoint_handler handler = evsel->handler; handler(trace, evsel, event, sample); } + + if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX) + interrupted = true; } static int trace__add_syscall_newtp(struct trace *trace) @@ -2706,7 +2777,7 @@ next_event: int timeout = done ? 100 : -1; if (!draining && perf_evlist__poll(evlist, timeout) > 0) { - if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0) + if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) draining = true; goto again; @@ -3138,6 +3209,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; + struct syscall_fmt *fmt; if (strace_groups_dir == NULL) return -1; @@ -3155,12 +3227,19 @@ static int trace__parse_events_option(const struct option *opt, const char *str, if (syscalltbl__id(trace->sctbl, s) >= 0 || syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { list = 1; + goto do_concat; + } + + fmt = syscall_fmt__find_by_alias(s); + if (fmt != NULL) { + list = 1; + s = fmt->name; } else { path__join(group_name, sizeof(group_name), strace_groups_dir, s); if (access(group_name, R_OK) == 0) list = 1; } - +do_concat: if (lists[list]) { sprintf(lists[list] + strlen(lists[list]), ",%s", s); } else { @@ -3249,6 +3328,7 @@ int cmd_trace(int argc, const char **argv) .trace_syscalls = false, .kernel_syscallchains = false, .max_stack = UINT_MAX, + .max_events = ULONG_MAX, }; const char *output_name = NULL; const struct option trace_options[] = { @@ -3301,6 +3381,8 @@ int cmd_trace(int argc, const char **argv) &record_parse_callchain_opt), OPT_BOOLEAN(0, 
"kernel-syscall-graph", &trace.kernel_syscallchains, "Show the kernel callchains on the syscall exit path"), + OPT_ULONG(0, "max-events", &trace.max_events, + "Set the maximum number of events to print, exit after that is reached. "), OPT_UINTEGER(0, "min-stack", &trace.min_stack, "Set the minimum stack depth when parsing the callchain, " "anything below the specified depth will be ignored."), diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index c72cc73a6b09..9531f7bd7d9b 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -5,6 +5,7 @@ HEADERS=' include/uapi/drm/drm.h include/uapi/drm/i915_drm.h include/uapi/linux/fcntl.h +include/uapi/linux/fs.h include/uapi/linux/kcmp.h include/uapi/linux/kvm.h include/uapi/linux/in.h diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 21bf7f5a3cf5..0ed4a34c74c4 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -81,6 +81,7 @@ struct record_opts { unsigned initial_delay; bool use_clockid; clockid_t clockid; + u64 clockid_res_ns; unsigned int proc_map_timeout; }; diff --git a/tools/perf/scripts/python/call-graph-from-sql.py b/tools/perf/scripts/python/call-graph-from-sql.py deleted file mode 100644 index b494a67a1c67..000000000000 --- a/tools/perf/scripts/python/call-graph-from-sql.py +++ /dev/null @@ -1,339 +0,0 @@ -#!/usr/bin/python2 -# call-graph-from-sql.py: create call-graph from sql database -# Copyright (c) 2014-2017, Intel Corporation. -# -# This program is free software; you can redistribute it and/or modify it -# under the terms and conditions of the GNU General Public License, -# version 2, as published by the Free Software Foundation. -# -# This program is distributed in the hope it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -# more details. 
- -# To use this script you will need to have exported data using either the -# export-to-sqlite.py or the export-to-postgresql.py script. Refer to those -# scripts for details. -# -# Following on from the example in the export scripts, a -# call-graph can be displayed for the pt_example database like this: -# -# python tools/perf/scripts/python/call-graph-from-sql.py pt_example -# -# Note that for PostgreSQL, this script supports connecting to remote databases -# by setting hostname, port, username, password, and dbname e.g. -# -# python tools/perf/scripts/python/call-graph-from-sql.py "hostname=myhost username=myuser password=mypassword dbname=pt_example" -# -# The result is a GUI window with a tree representing a context-sensitive -# call-graph. Expanding a couple of levels of the tree and adjusting column -# widths to suit will display something like: -# -# Call Graph: pt_example -# Call Path Object Count Time(ns) Time(%) Branch Count Branch Count(%) -# v- ls -# v- 2638:2638 -# v- _start ld-2.19.so 1 10074071 100.0 211135 100.0 -# |- unknown unknown 1 13198 0.1 1 0.0 -# >- _dl_start ld-2.19.so 1 1400980 13.9 19637 9.3 -# >- _d_linit_internal ld-2.19.so 1 448152 4.4 11094 5.3 -# v-__libc_start_main@plt ls 1 8211741 81.5 180397 85.4 -# >- _dl_fixup ld-2.19.so 1 7607 0.1 108 0.1 -# >- __cxa_atexit libc-2.19.so 1 11737 0.1 10 0.0 -# >- __libc_csu_init ls 1 10354 0.1 10 0.0 -# |- _setjmp libc-2.19.so 1 0 0.0 4 0.0 -# v- main ls 1 8182043 99.6 180254 99.9 -# -# Points to note: -# The top level is a command name (comm) -# The next level is a thread (pid:tid) -# Subsequent levels are functions -# 'Count' is the number of calls -# 'Time' is the elapsed time until the function returns -# Percentages are relative to the level above -# 'Branch Count' is the total number of branches for that function and all -# functions that it calls - -import sys -from PySide.QtCore import * -from PySide.QtGui import * -from PySide.QtSql import * -from decimal import * - -class 
TreeItem(): - - def __init__(self, db, row, parent_item): - self.db = db - self.row = row - self.parent_item = parent_item - self.query_done = False; - self.child_count = 0 - self.child_items = [] - self.data = ["", "", "", "", "", "", ""] - self.comm_id = 0 - self.thread_id = 0 - self.call_path_id = 1 - self.branch_count = 0 - self.time = 0 - if not parent_item: - self.setUpRoot() - - def setUpRoot(self): - self.query_done = True - query = QSqlQuery(self.db) - ret = query.exec_('SELECT id, comm FROM comms') - if not ret: - raise Exception("Query failed: " + query.lastError().text()) - while query.next(): - if not query.value(0): - continue - child_item = TreeItem(self.db, self.child_count, self) - self.child_items.append(child_item) - self.child_count += 1 - child_item.setUpLevel1(query.value(0), query.value(1)) - - def setUpLevel1(self, comm_id, comm): - self.query_done = True; - self.comm_id = comm_id - self.data[0] = comm - self.child_items = [] - self.child_count = 0 - query = QSqlQuery(self.db) - ret = query.exec_('SELECT thread_id, ( SELECT pid FROM threads WHERE id = thread_id ), ( SELECT tid FROM threads WHERE id = thread_id ) FROM comm_threads WHERE comm_id = ' + str(comm_id)) - if not ret: - raise Exception("Query failed: " + query.lastError().text()) - while query.next(): - child_item = TreeItem(self.db, self.child_count, self) - self.child_items.append(child_item) - self.child_count += 1 - child_item.setUpLevel2(comm_id, query.value(0), query.value(1), query.value(2)) - - def setUpLevel2(self, comm_id, thread_id, pid, tid): - self.comm_id = comm_id - self.thread_id = thread_id - self.data[0] = str(pid) + ":" + str(tid) - - def getChildItem(self, row): - return self.child_items[row] - - def getParentItem(self): - return self.parent_item - - def getRow(self): - return self.row - - def timePercent(self, b): - if not self.time: - return "0.0" - x = (b * Decimal(100)) / self.time - return str(x.quantize(Decimal('.1'), rounding=ROUND_HALF_UP)) - - def 
branchPercent(self, b): - if not self.branch_count: - return "0.0" - x = (b * Decimal(100)) / self.branch_count - return str(x.quantize(Decimal('.1'), rounding=ROUND_HALF_UP)) - - def addChild(self, call_path_id, name, dso, count, time, branch_count): - child_item = TreeItem(self.db, self.child_count, self) - child_item.comm_id = self.comm_id - child_item.thread_id = self.thread_id - child_item.call_path_id = call_path_id - child_item.branch_count = branch_count - child_item.time = time - child_item.data[0] = name - if dso == "[kernel.kallsyms]": - dso = "[kernel]" - child_item.data[1] = dso - child_item.data[2] = str(count) - child_item.data[3] = str(time) - child_item.data[4] = self.timePercent(time) - child_item.data[5] = str(branch_count) - child_item.data[6] = self.branchPercent(branch_count) - self.child_items.append(child_item) - self.child_count += 1 - - def selectCalls(self): - self.query_done = True; - query = QSqlQuery(self.db) - ret = query.exec_('SELECT id, call_path_id, branch_count, call_time, return_time, ' - '( SELECT name FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ), ' - '( SELECT short_name FROM dsos WHERE id = ( SELECT dso_id FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ) ), ' - '( SELECT ip FROM call_paths where id = call_path_id ) ' - 'FROM calls WHERE parent_call_path_id = ' + str(self.call_path_id) + ' AND comm_id = ' + str(self.comm_id) + ' AND thread_id = ' + str(self.thread_id) + - ' ORDER BY call_path_id') - if not ret: - raise Exception("Query failed: " + query.lastError().text()) - last_call_path_id = 0 - name = "" - dso = "" - count = 0 - branch_count = 0 - total_branch_count = 0 - time = 0 - total_time = 0 - while query.next(): - if query.value(1) == last_call_path_id: - count += 1 - branch_count += query.value(2) - time += query.value(4) - query.value(3) - else: - if count: - self.addChild(last_call_path_id, name, dso, count, time, branch_count) - 
last_call_path_id = query.value(1) - name = query.value(5) - dso = query.value(6) - count = 1 - total_branch_count += branch_count - total_time += time - branch_count = query.value(2) - time = query.value(4) - query.value(3) - if count: - self.addChild(last_call_path_id, name, dso, count, time, branch_count) - total_branch_count += branch_count - total_time += time - # Top level does not have time or branch count, so fix that here - if total_branch_count > self.branch_count: - self.branch_count = total_branch_count - if self.branch_count: - for child_item in self.child_items: - child_item.data[6] = self.branchPercent(child_item.branch_count) - if total_time > self.time: - self.time = total_time - if self.time: - for child_item in self.child_items: - child_item.data[4] = self.timePercent(child_item.time) - - def childCount(self): - if not self.query_done: - self.selectCalls() - return self.child_count - - def columnCount(self): - return 7 - - def columnHeader(self, column): - headers = ["Call Path", "Object", "Count ", "Time (ns) ", "Time (%) ", "Branch Count ", "Branch Count (%) "] - return headers[column] - - def getData(self, column): - return self.data[column] - -class TreeModel(QAbstractItemModel): - - def __init__(self, db, parent=None): - super(TreeModel, self).__init__(parent) - self.db = db - self.root = TreeItem(db, 0, None) - - def columnCount(self, parent): - return self.root.columnCount() - - def rowCount(self, parent): - if parent.isValid(): - parent_item = parent.internalPointer() - else: - parent_item = self.root - return parent_item.childCount() - - def headerData(self, section, orientation, role): - if role == Qt.TextAlignmentRole: - if section > 1: - return Qt.AlignRight - if role != Qt.DisplayRole: - return None - if orientation != Qt.Horizontal: - return None - return self.root.columnHeader(section) - - def parent(self, child): - child_item = child.internalPointer() - if child_item is self.root: - return QModelIndex() - parent_item = 
child_item.getParentItem() - return self.createIndex(parent_item.getRow(), 0, parent_item) - - def index(self, row, column, parent): - if parent.isValid(): - parent_item = parent.internalPointer() - else: - parent_item = self.root - child_item = parent_item.getChildItem(row) - return self.createIndex(row, column, child_item) - - def data(self, index, role): - if role == Qt.TextAlignmentRole: - if index.column() > 1: - return Qt.AlignRight - if role != Qt.DisplayRole: - return None - index_item = index.internalPointer() - return index_item.getData(index.column()) - -class MainWindow(QMainWindow): - - def __init__(self, db, dbname, parent=None): - super(MainWindow, self).__init__(parent) - - self.setObjectName("MainWindow") - self.setWindowTitle("Call Graph: " + dbname) - self.move(100, 100) - self.resize(800, 600) - style = self.style() - icon = style.standardIcon(QStyle.SP_MessageBoxInformation) - self.setWindowIcon(icon); - - self.model = TreeModel(db) - - self.view = QTreeView() - self.view.setModel(self.model) - - self.setCentralWidget(self.view) - -if __name__ == '__main__': - if (len(sys.argv) < 2): - print >> sys.stderr, "Usage is: call-graph-from-sql.py <database name>" - raise Exception("Too few arguments") - - dbname = sys.argv[1] - - is_sqlite3 = False - try: - f = open(dbname) - if f.read(15) == "SQLite format 3": - is_sqlite3 = True - f.close() - except: - pass - - if is_sqlite3: - db = QSqlDatabase.addDatabase('QSQLITE') - else: - db = QSqlDatabase.addDatabase('QPSQL') - opts = dbname.split() - for opt in opts: - if '=' in opt: - opt = opt.split('=') - if opt[0] == 'hostname': - db.setHostName(opt[1]) - elif opt[0] == 'port': - db.setPort(int(opt[1])) - elif opt[0] == 'username': - db.setUserName(opt[1]) - elif opt[0] == 'password': - db.setPassword(opt[1]) - elif opt[0] == 'dbname': - dbname = opt[1] - else: - dbname = opt - - db.setDatabaseName(dbname) - if not db.open(): - raise Exception("Failed to open database " + dbname + " error: " + 
db.lastError().text()) - - app = QApplication(sys.argv) - window = MainWindow(db, dbname) - window.show() - err = app.exec_() - db.close() - sys.exit(err) diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index e46f51b17513..0564dd7377f2 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -59,7 +59,7 @@ import datetime # pt_example=# \q # # An example of using the database is provided by the script -# call-graph-from-sql.py. Refer to that script for details. +# exported-sql-viewer.py. Refer to that script for details. # # Tables: # diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index e4bb82c8aba9..245caf2643ed 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -40,7 +40,7 @@ import datetime # sqlite> .quit # # An example of using the database is provided by the script -# call-graph-from-sql.py. Refer to that script for details. +# exported-sql-viewer.py. Refer to that script for details. # # The database structure is practically the same as created by the script # export-to-postgresql.py. Refer to that script for details. A notable diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py new file mode 100755 index 000000000000..24cb0bd56afa --- /dev/null +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -0,0 +1,2128 @@ +#!/usr/bin/python2 +# SPDX-License-Identifier: GPL-2.0 +# exported-sql-viewer.py: view data from sql database +# Copyright (c) 2014-2018, Intel Corporation. + +# To use this script you will need to have exported data using either the +# export-to-sqlite.py or the export-to-postgresql.py script. Refer to those +# scripts for details. 
+# +# Following on from the example in the export scripts, a +# call-graph can be displayed for the pt_example database like this: +# +# python tools/perf/scripts/python/exported-sql-viewer.py pt_example +# +# Note that for PostgreSQL, this script supports connecting to remote databases +# by setting hostname, port, username, password, and dbname e.g. +# +# python tools/perf/scripts/python/exported-sql-viewer.py "hostname=myhost username=myuser password=mypassword dbname=pt_example" +# +# The result is a GUI window with a tree representing a context-sensitive +# call-graph. Expanding a couple of levels of the tree and adjusting column +# widths to suit will display something like: +# +# Call Graph: pt_example +# Call Path Object Count Time(ns) Time(%) Branch Count Branch Count(%) +# v- ls +# v- 2638:2638 +# v- _start ld-2.19.so 1 10074071 100.0 211135 100.0 +# |- unknown unknown 1 13198 0.1 1 0.0 +# >- _dl_start ld-2.19.so 1 1400980 13.9 19637 9.3 +# >- _d_linit_internal ld-2.19.so 1 448152 4.4 11094 5.3 +# v-__libc_start_main@plt ls 1 8211741 81.5 180397 85.4 +# >- _dl_fixup ld-2.19.so 1 7607 0.1 108 0.1 +# >- __cxa_atexit libc-2.19.so 1 11737 0.1 10 0.0 +# >- __libc_csu_init ls 1 10354 0.1 10 0.0 +# |- _setjmp libc-2.19.so 1 0 0.0 4 0.0 +# v- main ls 1 8182043 99.6 180254 99.9 +# +# Points to note: +# The top level is a command name (comm) +# The next level is a thread (pid:tid) +# Subsequent levels are functions +# 'Count' is the number of calls +# 'Time' is the elapsed time until the function returns +# Percentages are relative to the level above +# 'Branch Count' is the total number of branches for that function and all +# functions that it calls + +# There is also a "All branches" report, which displays branches and +# possibly disassembly. However, presently, the only supported disassembler is +# Intel XED, and additionally the object code must be present in perf build ID +# cache. To use Intel XED, libxed.so must be present. 
To build and install +# libxed.so: +# git clone https://github.com/intelxed/mbuild.git mbuild +# git clone https://github.com/intelxed/xed +# cd xed +# ./mfile.py --share +# sudo ./mfile.py --prefix=/usr/local install +# sudo ldconfig +# +# Example report: +# +# Time CPU Command PID TID Branch Type In Tx Branch +# 8107675239590 2 ls 22011 22011 return from interrupt No ffffffff86a00a67 native_irq_return_iret ([kernel]) -> 7fab593ea260 _start (ld-2.19.so) +# 7fab593ea260 48 89 e7 mov %rsp, %rdi +# 8107675239899 2 ls 22011 22011 hardware interrupt No 7fab593ea260 _start (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) +# 8107675241900 2 ls 22011 22011 return from interrupt No ffffffff86a00a67 native_irq_return_iret ([kernel]) -> 7fab593ea260 _start (ld-2.19.so) +# 7fab593ea260 48 89 e7 mov %rsp, %rdi +# 7fab593ea263 e8 c8 06 00 00 callq 0x7fab593ea930 +# 8107675241900 2 ls 22011 22011 call No 7fab593ea263 _start+0x3 (ld-2.19.so) -> 7fab593ea930 _dl_start (ld-2.19.so) +# 7fab593ea930 55 pushq %rbp +# 7fab593ea931 48 89 e5 mov %rsp, %rbp +# 7fab593ea934 41 57 pushq %r15 +# 7fab593ea936 41 56 pushq %r14 +# 7fab593ea938 41 55 pushq %r13 +# 7fab593ea93a 41 54 pushq %r12 +# 7fab593ea93c 53 pushq %rbx +# 7fab593ea93d 48 89 fb mov %rdi, %rbx +# 7fab593ea940 48 83 ec 68 sub $0x68, %rsp +# 7fab593ea944 0f 31 rdtsc +# 7fab593ea946 48 c1 e2 20 shl $0x20, %rdx +# 7fab593ea94a 89 c0 mov %eax, %eax +# 7fab593ea94c 48 09 c2 or %rax, %rdx +# 7fab593ea94f 48 8b 05 1a 15 22 00 movq 0x22151a(%rip), %rax +# 8107675242232 2 ls 22011 22011 hardware interrupt No 7fab593ea94f _dl_start+0x1f (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) +# 8107675242900 2 ls 22011 22011 return from interrupt No ffffffff86a00a67 native_irq_return_iret ([kernel]) -> 7fab593ea94f _dl_start+0x1f (ld-2.19.so) +# 7fab593ea94f 48 8b 05 1a 15 22 00 movq 0x22151a(%rip), %rax +# 7fab593ea956 48 89 15 3b 13 22 00 movq %rdx, 0x22133b(%rip) +# 8107675243232 2 ls 22011 22011 hardware interrupt No 
7fab593ea956 _dl_start+0x26 (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) + +import sys +import weakref +import threading +import string +import cPickle +import re +import os +from PySide.QtCore import * +from PySide.QtGui import * +from PySide.QtSql import * +from decimal import * +from ctypes import * +from multiprocessing import Process, Array, Value, Event + +# Data formatting helpers + +def tohex(ip): + if ip < 0: + ip += 1 << 64 + return "%x" % ip + +def offstr(offset): + if offset: + return "+0x%x" % offset + return "" + +def dsoname(name): + if name == "[kernel.kallsyms]": + return "[kernel]" + return name + +# Percent to one decimal place + +def PercentToOneDP(n, d): + if not d: + return "0.0" + x = (n * Decimal(100)) / d + return str(x.quantize(Decimal(".1"), rounding=ROUND_HALF_UP)) + +# Helper for queries that must not fail + +def QueryExec(query, stmt): + ret = query.exec_(stmt) + if not ret: + raise Exception("Query failed: " + query.lastError().text()) + +# Background thread + +class Thread(QThread): + + done = Signal(object) + + def __init__(self, task, param=None, parent=None): + super(Thread, self).__init__(parent) + self.task = task + self.param = param + + def run(self): + while True: + if self.param is None: + done, result = self.task() + else: + done, result = self.task(self.param) + self.done.emit(result) + if done: + break + +# Tree data model + +class TreeModel(QAbstractItemModel): + + def __init__(self, root, parent=None): + super(TreeModel, self).__init__(parent) + self.root = root + self.last_row_read = 0 + + def Item(self, parent): + if parent.isValid(): + return parent.internalPointer() + else: + return self.root + + def rowCount(self, parent): + result = self.Item(parent).childCount() + if result < 0: + result = 0 + self.dataChanged.emit(parent, parent) + return result + + def hasChildren(self, parent): + return self.Item(parent).hasChildren() + + def headerData(self, section, orientation, role): + if role == 
Qt.TextAlignmentRole: + return self.columnAlignment(section) + if role != Qt.DisplayRole: + return None + if orientation != Qt.Horizontal: + return None + return self.columnHeader(section) + + def parent(self, child): + child_item = child.internalPointer() + if child_item is self.root: + return QModelIndex() + parent_item = child_item.getParentItem() + return self.createIndex(parent_item.getRow(), 0, parent_item) + + def index(self, row, column, parent): + child_item = self.Item(parent).getChildItem(row) + return self.createIndex(row, column, child_item) + + def DisplayData(self, item, index): + return item.getData(index.column()) + + def FetchIfNeeded(self, row): + if row > self.last_row_read: + self.last_row_read = row + if row + 10 >= self.root.child_count: + self.fetcher.Fetch(glb_chunk_sz) + + def columnAlignment(self, column): + return Qt.AlignLeft + + def columnFont(self, column): + return None + + def data(self, index, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(index.column()) + if role == Qt.FontRole: + return self.columnFont(index.column()) + if role != Qt.DisplayRole: + return None + item = index.internalPointer() + return self.DisplayData(item, index) + +# Table data model + +class TableModel(QAbstractTableModel): + + def __init__(self, parent=None): + super(TableModel, self).__init__(parent) + self.child_count = 0 + self.child_items = [] + self.last_row_read = 0 + + def Item(self, parent): + if parent.isValid(): + return parent.internalPointer() + else: + return self + + def rowCount(self, parent): + return self.child_count + + def headerData(self, section, orientation, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(section) + if role != Qt.DisplayRole: + return None + if orientation != Qt.Horizontal: + return None + return self.columnHeader(section) + + def index(self, row, column, parent): + return self.createIndex(row, column, self.child_items[row]) + + def DisplayData(self, item, index): + 
return item.getData(index.column()) + + def FetchIfNeeded(self, row): + if row > self.last_row_read: + self.last_row_read = row + if row + 10 >= self.child_count: + self.fetcher.Fetch(glb_chunk_sz) + + def columnAlignment(self, column): + return Qt.AlignLeft + + def columnFont(self, column): + return None + + def data(self, index, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(index.column()) + if role == Qt.FontRole: + return self.columnFont(index.column()) + if role != Qt.DisplayRole: + return None + item = index.internalPointer() + return self.DisplayData(item, index) + +# Model cache + +model_cache = weakref.WeakValueDictionary() +model_cache_lock = threading.Lock() + +def LookupCreateModel(model_name, create_fn): + model_cache_lock.acquire() + try: + model = model_cache[model_name] + except: + model = None + if model is None: + model = create_fn() + model_cache[model_name] = model + model_cache_lock.release() + return model + +# Find bar + +class FindBar(): + + def __init__(self, parent, finder, is_reg_expr=False): + self.finder = finder + self.context = [] + self.last_value = None + self.last_pattern = None + + label = QLabel("Find:") + label.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.textbox = QComboBox() + self.textbox.setEditable(True) + self.textbox.currentIndexChanged.connect(self.ValueChanged) + + self.progress = QProgressBar() + self.progress.setRange(0, 0) + self.progress.hide() + + if is_reg_expr: + self.pattern = QCheckBox("Regular Expression") + else: + self.pattern = QCheckBox("Pattern") + self.pattern.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.next_button = QToolButton() + self.next_button.setIcon(parent.style().standardIcon(QStyle.SP_ArrowDown)) + self.next_button.released.connect(lambda: self.NextPrev(1)) + + self.prev_button = QToolButton() + self.prev_button.setIcon(parent.style().standardIcon(QStyle.SP_ArrowUp)) + self.prev_button.released.connect(lambda: self.NextPrev(-1)) + + 
self.close_button = QToolButton() + self.close_button.setIcon(parent.style().standardIcon(QStyle.SP_DockWidgetCloseButton)) + self.close_button.released.connect(self.Deactivate) + + self.hbox = QHBoxLayout() + self.hbox.setContentsMargins(0, 0, 0, 0) + + self.hbox.addWidget(label) + self.hbox.addWidget(self.textbox) + self.hbox.addWidget(self.progress) + self.hbox.addWidget(self.pattern) + self.hbox.addWidget(self.next_button) + self.hbox.addWidget(self.prev_button) + self.hbox.addWidget(self.close_button) + + self.bar = QWidget() + self.bar.setLayout(self.hbox); + self.bar.hide() + + def Widget(self): + return self.bar + + def Activate(self): + self.bar.show() + self.textbox.setFocus() + + def Deactivate(self): + self.bar.hide() + + def Busy(self): + self.textbox.setEnabled(False) + self.pattern.hide() + self.next_button.hide() + self.prev_button.hide() + self.progress.show() + + def Idle(self): + self.textbox.setEnabled(True) + self.progress.hide() + self.pattern.show() + self.next_button.show() + self.prev_button.show() + + def Find(self, direction): + value = self.textbox.currentText() + pattern = self.pattern.isChecked() + self.last_value = value + self.last_pattern = pattern + self.finder.Find(value, direction, pattern, self.context) + + def ValueChanged(self): + value = self.textbox.currentText() + pattern = self.pattern.isChecked() + index = self.textbox.currentIndex() + data = self.textbox.itemData(index) + # Store the pattern in the combo box to keep it with the text value + if data == None: + self.textbox.setItemData(index, pattern) + else: + self.pattern.setChecked(data) + self.Find(0) + + def NextPrev(self, direction): + value = self.textbox.currentText() + pattern = self.pattern.isChecked() + if value != self.last_value: + index = self.textbox.findText(value) + # Allow for a button press before the value has been added to the combo box + if index < 0: + index = self.textbox.count() + self.textbox.addItem(value, pattern) + 
self.textbox.setCurrentIndex(index) + return + else: + self.textbox.setItemData(index, pattern) + elif pattern != self.last_pattern: + # Keep the pattern recorded in the combo box up to date + index = self.textbox.currentIndex() + self.textbox.setItemData(index, pattern) + self.Find(direction) + + def NotFound(self): + QMessageBox.information(self.bar, "Find", "'" + self.textbox.currentText() + "' not found") + +# Context-sensitive call graph data model item base + +class CallGraphLevelItemBase(object): + + def __init__(self, glb, row, parent_item): + self.glb = glb + self.row = row + self.parent_item = parent_item + self.query_done = False; + self.child_count = 0 + self.child_items = [] + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return self.parent_item + + def getRow(self): + return self.row + + def childCount(self): + if not self.query_done: + self.Select() + if not self.child_count: + return -1 + return self.child_count + + def hasChildren(self): + if not self.query_done: + return True + return self.child_count > 0 + + def getData(self, column): + return self.data[column] + +# Context-sensitive call graph data model level 2+ item base + +class CallGraphLevelTwoPlusItemBase(CallGraphLevelItemBase): + + def __init__(self, glb, row, comm_id, thread_id, call_path_id, time, branch_count, parent_item): + super(CallGraphLevelTwoPlusItemBase, self).__init__(glb, row, parent_item) + self.comm_id = comm_id + self.thread_id = thread_id + self.call_path_id = call_path_id + self.branch_count = branch_count + self.time = time + + def Select(self): + self.query_done = True; + query = QSqlQuery(self.glb.db) + QueryExec(query, "SELECT call_path_id, name, short_name, COUNT(calls.id), SUM(return_time - call_time), SUM(branch_count)" + " FROM calls" + " INNER JOIN call_paths ON calls.call_path_id = call_paths.id" + " INNER JOIN symbols ON call_paths.symbol_id = symbols.id" + " INNER JOIN dsos ON symbols.dso_id = dsos.id" + " WHERE 
parent_call_path_id = " + str(self.call_path_id) + + " AND comm_id = " + str(self.comm_id) + + " AND thread_id = " + str(self.thread_id) + + " GROUP BY call_path_id, name, short_name" + " ORDER BY call_path_id") + while query.next(): + child_item = CallGraphLevelThreeItem(self.glb, self.child_count, self.comm_id, self.thread_id, query.value(0), query.value(1), query.value(2), query.value(3), int(query.value(4)), int(query.value(5)), self) + self.child_items.append(child_item) + self.child_count += 1 + +# Context-sensitive call graph data model level three item + +class CallGraphLevelThreeItem(CallGraphLevelTwoPlusItemBase): + + def __init__(self, glb, row, comm_id, thread_id, call_path_id, name, dso, count, time, branch_count, parent_item): + super(CallGraphLevelThreeItem, self).__init__(glb, row, comm_id, thread_id, call_path_id, time, branch_count, parent_item) + dso = dsoname(dso) + self.data = [ name, dso, str(count), str(time), PercentToOneDP(time, parent_item.time), str(branch_count), PercentToOneDP(branch_count, parent_item.branch_count) ] + self.dbid = call_path_id + +# Context-sensitive call graph data model level two item + +class CallGraphLevelTwoItem(CallGraphLevelTwoPlusItemBase): + + def __init__(self, glb, row, comm_id, thread_id, pid, tid, parent_item): + super(CallGraphLevelTwoItem, self).__init__(glb, row, comm_id, thread_id, 1, 0, 0, parent_item) + self.data = [str(pid) + ":" + str(tid), "", "", "", "", "", ""] + self.dbid = thread_id + + def Select(self): + super(CallGraphLevelTwoItem, self).Select() + for child_item in self.child_items: + self.time += child_item.time + self.branch_count += child_item.branch_count + for child_item in self.child_items: + child_item.data[4] = PercentToOneDP(child_item.time, self.time) + child_item.data[6] = PercentToOneDP(child_item.branch_count, self.branch_count) + +# Context-sensitive call graph data model level one item + +class CallGraphLevelOneItem(CallGraphLevelItemBase): + + def __init__(self, glb, row, 
comm_id, comm, parent_item): + super(CallGraphLevelOneItem, self).__init__(glb, row, parent_item) + self.data = [comm, "", "", "", "", "", ""] + self.dbid = comm_id + + def Select(self): + self.query_done = True; + query = QSqlQuery(self.glb.db) + QueryExec(query, "SELECT thread_id, pid, tid" + " FROM comm_threads" + " INNER JOIN threads ON thread_id = threads.id" + " WHERE comm_id = " + str(self.dbid)) + while query.next(): + child_item = CallGraphLevelTwoItem(self.glb, self.child_count, self.dbid, query.value(0), query.value(1), query.value(2), self) + self.child_items.append(child_item) + self.child_count += 1 + +# Context-sensitive call graph data model root item + +class CallGraphRootItem(CallGraphLevelItemBase): + + def __init__(self, glb): + super(CallGraphRootItem, self).__init__(glb, 0, None) + self.dbid = 0 + self.query_done = True; + query = QSqlQuery(glb.db) + QueryExec(query, "SELECT id, comm FROM comms") + while query.next(): + if not query.value(0): + continue + child_item = CallGraphLevelOneItem(glb, self.child_count, query.value(0), query.value(1), self) + self.child_items.append(child_item) + self.child_count += 1 + +# Context-sensitive call graph data model + +class CallGraphModel(TreeModel): + + def __init__(self, glb, parent=None): + super(CallGraphModel, self).__init__(CallGraphRootItem(glb), parent) + self.glb = glb + + def columnCount(self, parent=None): + return 7 + + def columnHeader(self, column): + headers = ["Call Path", "Object", "Count ", "Time (ns) ", "Time (%) ", "Branch Count ", "Branch Count (%) "] + return headers[column] + + def columnAlignment(self, column): + alignment = [ Qt.AlignLeft, Qt.AlignLeft, Qt.AlignRight, Qt.AlignRight, Qt.AlignRight, Qt.AlignRight, Qt.AlignRight ] + return alignment[column] + + def FindSelect(self, value, pattern, query): + if pattern: + # postgresql and sqlite pattern patching differences: + # postgresql LIKE is case sensitive but sqlite LIKE is not + # postgresql LIKE allows % and _ to be escaped 
with \ but sqlite LIKE does not + # postgresql supports ILIKE which is case insensitive + # sqlite supports GLOB (text only) which uses * and ? and is case sensitive + if not self.glb.dbref.is_sqlite3: + # Escape % and _ + s = value.replace("%", "\%") + s = s.replace("_", "\_") + # Translate * and ? into SQL LIKE pattern characters % and _ + trans = string.maketrans("*?", "%_") + match = " LIKE '" + str(s).translate(trans) + "'" + else: + match = " GLOB '" + str(value) + "'" + else: + match = " = '" + str(value) + "'" + QueryExec(query, "SELECT call_path_id, comm_id, thread_id" + " FROM calls" + " INNER JOIN call_paths ON calls.call_path_id = call_paths.id" + " INNER JOIN symbols ON call_paths.symbol_id = symbols.id" + " WHERE symbols.name" + match + + " GROUP BY comm_id, thread_id, call_path_id" + " ORDER BY comm_id, thread_id, call_path_id") + + def FindPath(self, query): + # Turn the query result into a list of ids that the tree view can walk + # to open the tree at the right place. + ids = [] + parent_id = query.value(0) + while parent_id: + ids.insert(0, parent_id) + q2 = QSqlQuery(self.glb.db) + QueryExec(q2, "SELECT parent_id" + " FROM call_paths" + " WHERE id = " + str(parent_id)) + if not q2.next(): + break + parent_id = q2.value(0) + # The call path root is not used + if ids[0] == 1: + del ids[0] + ids.insert(0, query.value(2)) + ids.insert(0, query.value(1)) + return ids + + def Found(self, query, found): + if found: + return self.FindPath(query) + return [] + + def FindValue(self, value, pattern, query, last_value, last_pattern): + if last_value == value and pattern == last_pattern: + found = query.first() + else: + self.FindSelect(value, pattern, query) + found = query.next() + return self.Found(query, found) + + def FindNext(self, query): + found = query.next() + if not found: + found = query.first() + return self.Found(query, found) + + def FindPrev(self, query): + found = query.previous() + if not found: + found = query.last() + return 
self.Found(query, found) + + def FindThread(self, c): + if c.direction == 0 or c.value != c.last_value or c.pattern != c.last_pattern: + ids = self.FindValue(c.value, c.pattern, c.query, c.last_value, c.last_pattern) + elif c.direction > 0: + ids = self.FindNext(c.query) + else: + ids = self.FindPrev(c.query) + return (True, ids) + + def Find(self, value, direction, pattern, context, callback): + class Context(): + def __init__(self, *x): + self.value, self.direction, self.pattern, self.query, self.last_value, self.last_pattern = x + def Update(self, *x): + self.value, self.direction, self.pattern, self.last_value, self.last_pattern = x + (self.value, self.pattern) + if len(context): + context[0].Update(value, direction, pattern) + else: + context.append(Context(value, direction, pattern, QSqlQuery(self.glb.db), None, None)) + # Use a thread so the UI is not blocked during the SELECT + thread = Thread(self.FindThread, context[0]) + thread.done.connect(lambda ids, t=thread, c=callback: self.FindDone(t, c, ids), Qt.QueuedConnection) + thread.start() + + def FindDone(self, thread, callback, ids): + callback(ids) + +# Vertical widget layout + +class VBox(): + + def __init__(self, w1, w2, w3=None): + self.vbox = QWidget() + self.vbox.setLayout(QVBoxLayout()); + + self.vbox.layout().setContentsMargins(0, 0, 0, 0) + + self.vbox.layout().addWidget(w1) + self.vbox.layout().addWidget(w2) + if w3: + self.vbox.layout().addWidget(w3) + + def Widget(self): + return self.vbox + +# Context-sensitive call graph window + +class CallGraphWindow(QMdiSubWindow): + + def __init__(self, glb, parent=None): + super(CallGraphWindow, self).__init__(parent) + + self.model = LookupCreateModel("Context-Sensitive Call Graph", lambda x=glb: CallGraphModel(x)) + + self.view = QTreeView() + self.view.setModel(self.model) + + for c, w in ((0, 250), (1, 100), (2, 60), (3, 70), (4, 70), (5, 100)): + self.view.setColumnWidth(c, w) + + self.find_bar = FindBar(self, self) + + self.vbox = VBox(self.view, 
self.find_bar.Widget()) + + self.setWidget(self.vbox.Widget()) + + AddSubWindow(glb.mainwindow.mdi_area, self, "Context-Sensitive Call Graph") + + def DisplayFound(self, ids): + if not len(ids): + return False + parent = QModelIndex() + for dbid in ids: + found = False + n = self.model.rowCount(parent) + for row in xrange(n): + child = self.model.index(row, 0, parent) + if child.internalPointer().dbid == dbid: + found = True + self.view.setCurrentIndex(child) + parent = child + break + if not found: + break + return found + + def Find(self, value, direction, pattern, context): + self.view.setFocus() + self.find_bar.Busy() + self.model.Find(value, direction, pattern, context, self.FindDone) + + def FindDone(self, ids): + found = True + if not self.DisplayFound(ids): + found = False + self.find_bar.Idle() + if not found: + self.find_bar.NotFound() + +# Child data item finder + +class ChildDataItemFinder(): + + def __init__(self, root): + self.root = root + self.value, self.direction, self.pattern, self.last_value, self.last_pattern = (None,) * 5 + self.rows = [] + self.pos = 0 + + def FindSelect(self): + self.rows = [] + if self.pattern: + pattern = re.compile(self.value) + for child in self.root.child_items: + for column_data in child.data: + if re.search(pattern, str(column_data)) is not None: + self.rows.append(child.row) + break + else: + for child in self.root.child_items: + for column_data in child.data: + if self.value in str(column_data): + self.rows.append(child.row) + break + + def FindValue(self): + self.pos = 0 + if self.last_value != self.value or self.pattern != self.last_pattern: + self.FindSelect() + if not len(self.rows): + return -1 + return self.rows[self.pos] + + def FindThread(self): + if self.direction == 0 or self.value != self.last_value or self.pattern != self.last_pattern: + row = self.FindValue() + elif len(self.rows): + if self.direction > 0: + self.pos += 1 + if self.pos >= len(self.rows): + self.pos = 0 + else: + self.pos -= 1 + if 
self.pos < 0: + self.pos = len(self.rows) - 1 + row = self.rows[self.pos] + else: + row = -1 + return (True, row) + + def Find(self, value, direction, pattern, context, callback): + self.value, self.direction, self.pattern, self.last_value, self.last_pattern = (value, direction,pattern, self.value, self.pattern) + # Use a thread so the UI is not blocked + thread = Thread(self.FindThread) + thread.done.connect(lambda row, t=thread, c=callback: self.FindDone(t, c, row), Qt.QueuedConnection) + thread.start() + + def FindDone(self, thread, callback, row): + callback(row) + +# Number of database records to fetch in one go + +glb_chunk_sz = 10000 + +# size of pickled integer big enough for record size + +glb_nsz = 8 + +# Background process for SQL data fetcher + +class SQLFetcherProcess(): + + def __init__(self, dbref, sql, buffer, head, tail, fetch_count, fetching_done, process_target, wait_event, fetched_event, prep): + # Need a unique connection name + conn_name = "SQLFetcher" + str(os.getpid()) + self.db, dbname = dbref.Open(conn_name) + self.sql = sql + self.buffer = buffer + self.head = head + self.tail = tail + self.fetch_count = fetch_count + self.fetching_done = fetching_done + self.process_target = process_target + self.wait_event = wait_event + self.fetched_event = fetched_event + self.prep = prep + self.query = QSqlQuery(self.db) + self.query_limit = 0 if "$$last_id$$" in sql else 2 + self.last_id = -1 + self.fetched = 0 + self.more = True + self.local_head = self.head.value + self.local_tail = self.tail.value + + def Select(self): + if self.query_limit: + if self.query_limit == 1: + return + self.query_limit -= 1 + stmt = self.sql.replace("$$last_id$$", str(self.last_id)) + QueryExec(self.query, stmt) + + def Next(self): + if not self.query.next(): + self.Select() + if not self.query.next(): + return None + self.last_id = self.query.value(0) + return self.prep(self.query) + + def WaitForTarget(self): + while True: + self.wait_event.clear() + target = 
self.process_target.value + if target > self.fetched or target < 0: + break + self.wait_event.wait() + return target + + def HasSpace(self, sz): + if self.local_tail <= self.local_head: + space = len(self.buffer) - self.local_head + if space > sz: + return True + if space >= glb_nsz: + # Use 0 (or space < glb_nsz) to mean there is no more at the top of the buffer + nd = cPickle.dumps(0, cPickle.HIGHEST_PROTOCOL) + self.buffer[self.local_head : self.local_head + len(nd)] = nd + self.local_head = 0 + if self.local_tail - self.local_head > sz: + return True + return False + + def WaitForSpace(self, sz): + if self.HasSpace(sz): + return + while True: + self.wait_event.clear() + self.local_tail = self.tail.value + if self.HasSpace(sz): + return + self.wait_event.wait() + + def AddToBuffer(self, obj): + d = cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL) + n = len(d) + nd = cPickle.dumps(n, cPickle.HIGHEST_PROTOCOL) + sz = n + glb_nsz + self.WaitForSpace(sz) + pos = self.local_head + self.buffer[pos : pos + len(nd)] = nd + self.buffer[pos + glb_nsz : pos + sz] = d + self.local_head += sz + + def FetchBatch(self, batch_size): + fetched = 0 + while batch_size > fetched: + obj = self.Next() + if obj is None: + self.more = False + break + self.AddToBuffer(obj) + fetched += 1 + if fetched: + self.fetched += fetched + with self.fetch_count.get_lock(): + self.fetch_count.value += fetched + self.head.value = self.local_head + self.fetched_event.set() + + def Run(self): + while self.more: + target = self.WaitForTarget() + if target < 0: + break + batch_size = min(glb_chunk_sz, target - self.fetched) + self.FetchBatch(batch_size) + self.fetching_done.value = True + self.fetched_event.set() + +def SQLFetcherFn(*x): + process = SQLFetcherProcess(*x) + process.Run() + +# SQL data fetcher + +class SQLFetcher(QObject): + + done = Signal(object) + + def __init__(self, glb, sql, prep, process_data, parent=None): + super(SQLFetcher, self).__init__(parent) + self.process_data = process_data + 
self.more = True + self.target = 0 + self.last_target = 0 + self.fetched = 0 + self.buffer_size = 16 * 1024 * 1024 + self.buffer = Array(c_char, self.buffer_size, lock=False) + self.head = Value(c_longlong) + self.tail = Value(c_longlong) + self.local_tail = 0 + self.fetch_count = Value(c_longlong) + self.fetching_done = Value(c_bool) + self.last_count = 0 + self.process_target = Value(c_longlong) + self.wait_event = Event() + self.fetched_event = Event() + glb.AddInstanceToShutdownOnExit(self) + self.process = Process(target=SQLFetcherFn, args=(glb.dbref, sql, self.buffer, self.head, self.tail, self.fetch_count, self.fetching_done, self.process_target, self.wait_event, self.fetched_event, prep)) + self.process.start() + self.thread = Thread(self.Thread) + self.thread.done.connect(self.ProcessData, Qt.QueuedConnection) + self.thread.start() + + def Shutdown(self): + # Tell the thread and process to exit + self.process_target.value = -1 + self.wait_event.set() + self.more = False + self.fetching_done.value = True + self.fetched_event.set() + + def Thread(self): + if not self.more: + return True, 0 + while True: + self.fetched_event.clear() + fetch_count = self.fetch_count.value + if fetch_count != self.last_count: + break + if self.fetching_done.value: + self.more = False + return True, 0 + self.fetched_event.wait() + count = fetch_count - self.last_count + self.last_count = fetch_count + self.fetched += count + return False, count + + def Fetch(self, nr): + if not self.more: + # -1 inidcates there are no more + return -1 + result = self.fetched + extra = result + nr - self.target + if extra > 0: + self.target += extra + # process_target < 0 indicates shutting down + if self.process_target.value >= 0: + self.process_target.value = self.target + self.wait_event.set() + return result + + def RemoveFromBuffer(self): + pos = self.local_tail + if len(self.buffer) - pos < glb_nsz: + pos = 0 + n = cPickle.loads(self.buffer[pos : pos + glb_nsz]) + if n == 0: + pos = 0 + n = 
cPickle.loads(self.buffer[0 : glb_nsz]) + pos += glb_nsz + obj = cPickle.loads(self.buffer[pos : pos + n]) + self.local_tail = pos + n + return obj + + def ProcessData(self, count): + for i in xrange(count): + obj = self.RemoveFromBuffer() + self.process_data(obj) + self.tail.value = self.local_tail + self.wait_event.set() + self.done.emit(count) + +# Fetch more records bar + +class FetchMoreRecordsBar(): + + def __init__(self, model, parent): + self.model = model + + self.label = QLabel("Number of records (x " + "{:,}".format(glb_chunk_sz) + ") to fetch:") + self.label.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.fetch_count = QSpinBox() + self.fetch_count.setRange(1, 1000000) + self.fetch_count.setValue(10) + self.fetch_count.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.fetch = QPushButton("Go!") + self.fetch.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + self.fetch.released.connect(self.FetchMoreRecords) + + self.progress = QProgressBar() + self.progress.setRange(0, 100) + self.progress.hide() + + self.done_label = QLabel("All records fetched") + self.done_label.hide() + + self.spacer = QLabel("") + + self.close_button = QToolButton() + self.close_button.setIcon(parent.style().standardIcon(QStyle.SP_DockWidgetCloseButton)) + self.close_button.released.connect(self.Deactivate) + + self.hbox = QHBoxLayout() + self.hbox.setContentsMargins(0, 0, 0, 0) + + self.hbox.addWidget(self.label) + self.hbox.addWidget(self.fetch_count) + self.hbox.addWidget(self.fetch) + self.hbox.addWidget(self.spacer) + self.hbox.addWidget(self.progress) + self.hbox.addWidget(self.done_label) + self.hbox.addWidget(self.close_button) + + self.bar = QWidget() + self.bar.setLayout(self.hbox); + self.bar.show() + + self.in_progress = False + self.model.progress.connect(self.Progress) + + self.done = False + + if not model.HasMoreRecords(): + self.Done() + + def Widget(self): + return self.bar + + def Activate(self): + self.bar.show() + 
self.fetch.setFocus() + + def Deactivate(self): + self.bar.hide() + + def Enable(self, enable): + self.fetch.setEnabled(enable) + self.fetch_count.setEnabled(enable) + + def Busy(self): + self.Enable(False) + self.fetch.hide() + self.spacer.hide() + self.progress.show() + + def Idle(self): + self.in_progress = False + self.Enable(True) + self.progress.hide() + self.fetch.show() + self.spacer.show() + + def Target(self): + return self.fetch_count.value() * glb_chunk_sz + + def Done(self): + self.done = True + self.Idle() + self.label.hide() + self.fetch_count.hide() + self.fetch.hide() + self.spacer.hide() + self.done_label.show() + + def Progress(self, count): + if self.in_progress: + if count: + percent = ((count - self.start) * 100) / self.Target() + if percent >= 100: + self.Idle() + else: + self.progress.setValue(percent) + if not count: + # Count value of zero means no more records + self.Done() + + def FetchMoreRecords(self): + if self.done: + return + self.progress.setValue(0) + self.Busy() + self.in_progress = True + self.start = self.model.FetchMoreRecords(self.Target()) + +# Brance data model level two item + +class BranchLevelTwoItem(): + + def __init__(self, row, text, parent_item): + self.row = row + self.parent_item = parent_item + self.data = [""] * 8 + self.data[7] = text + self.level = 2 + + def getParentItem(self): + return self.parent_item + + def getRow(self): + return self.row + + def childCount(self): + return 0 + + def hasChildren(self): + return False + + def getData(self, column): + return self.data[column] + +# Brance data model level one item + +class BranchLevelOneItem(): + + def __init__(self, glb, row, data, parent_item): + self.glb = glb + self.row = row + self.parent_item = parent_item + self.child_count = 0 + self.child_items = [] + self.data = data[1:] + self.dbid = data[0] + self.level = 1 + self.query_done = False + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return 
self.parent_item + + def getRow(self): + return self.row + + def Select(self): + self.query_done = True + + if not self.glb.have_disassembler: + return + + query = QSqlQuery(self.glb.db) + + QueryExec(query, "SELECT cpu, to_dso_id, to_symbol_id, to_sym_offset, short_name, long_name, build_id, sym_start, to_ip" + " FROM samples" + " INNER JOIN dsos ON samples.to_dso_id = dsos.id" + " INNER JOIN symbols ON samples.to_symbol_id = symbols.id" + " WHERE samples.id = " + str(self.dbid)) + if not query.next(): + return + cpu = query.value(0) + dso = query.value(1) + sym = query.value(2) + if dso == 0 or sym == 0: + return + off = query.value(3) + short_name = query.value(4) + long_name = query.value(5) + build_id = query.value(6) + sym_start = query.value(7) + ip = query.value(8) + + QueryExec(query, "SELECT samples.dso_id, symbol_id, sym_offset, sym_start" + " FROM samples" + " INNER JOIN symbols ON samples.symbol_id = symbols.id" + " WHERE samples.id > " + str(self.dbid) + " AND cpu = " + str(cpu) + + " ORDER BY samples.id" + " LIMIT 1") + if not query.next(): + return + if query.value(0) != dso: + # Cannot disassemble from one dso to another + return + bsym = query.value(1) + boff = query.value(2) + bsym_start = query.value(3) + if bsym == 0: + return + tot = bsym_start + boff + 1 - sym_start - off + if tot <= 0 or tot > 16384: + return + + inst = self.glb.disassembler.Instruction() + f = self.glb.FileFromNamesAndBuildId(short_name, long_name, build_id) + if not f: + return + mode = 0 if Is64Bit(f) else 1 + self.glb.disassembler.SetMode(inst, mode) + + buf_sz = tot + 16 + buf = create_string_buffer(tot + 16) + f.seek(sym_start + off) + buf.value = f.read(buf_sz) + buf_ptr = addressof(buf) + i = 0 + while tot > 0: + cnt, text = self.glb.disassembler.DisassembleOne(inst, buf_ptr, buf_sz, ip) + if cnt: + byte_str = tohex(ip).rjust(16) + for k in xrange(cnt): + byte_str += " %02x" % ord(buf[i]) + i += 1 + while k < 15: + byte_str += " " + k += 1 + 
self.child_items.append(BranchLevelTwoItem(0, byte_str + " " + text, self)) + self.child_count += 1 + else: + return + buf_ptr += cnt + tot -= cnt + buf_sz -= cnt + ip += cnt + + def childCount(self): + if not self.query_done: + self.Select() + if not self.child_count: + return -1 + return self.child_count + + def hasChildren(self): + if not self.query_done: + return True + return self.child_count > 0 + + def getData(self, column): + return self.data[column] + +# Brance data model root item + +class BranchRootItem(): + + def __init__(self): + self.child_count = 0 + self.child_items = [] + self.level = 0 + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return None + + def getRow(self): + return 0 + + def childCount(self): + return self.child_count + + def hasChildren(self): + return self.child_count > 0 + + def getData(self, column): + return "" + +# Branch data preparation + +def BranchDataPrep(query): + data = [] + for i in xrange(0, 8): + data.append(query.value(i)) + data.append(tohex(query.value(8)).rjust(16) + " " + query.value(9) + offstr(query.value(10)) + + " (" + dsoname(query.value(11)) + ")" + " -> " + + tohex(query.value(12)) + " " + query.value(13) + offstr(query.value(14)) + + " (" + dsoname(query.value(15)) + ")") + return data + +# Branch data model + +class BranchModel(TreeModel): + + progress = Signal(object) + + def __init__(self, glb, event_id, where_clause, parent=None): + super(BranchModel, self).__init__(BranchRootItem(), parent) + self.glb = glb + self.event_id = event_id + self.more = True + self.populated = 0 + sql = ("SELECT samples.id, time, cpu, comm, pid, tid, branch_types.name," + " CASE WHEN in_tx = '0' THEN 'No' ELSE 'Yes' END," + " ip, symbols.name, sym_offset, dsos.short_name," + " to_ip, to_symbols.name, to_sym_offset, to_dsos.short_name" + " FROM samples" + " INNER JOIN comms ON comm_id = comms.id" + " INNER JOIN threads ON thread_id = threads.id" + " INNER JOIN branch_types ON 
branch_type = branch_types.id" + " INNER JOIN symbols ON symbol_id = symbols.id" + " INNER JOIN symbols to_symbols ON to_symbol_id = to_symbols.id" + " INNER JOIN dsos ON samples.dso_id = dsos.id" + " INNER JOIN dsos AS to_dsos ON samples.to_dso_id = to_dsos.id" + " WHERE samples.id > $$last_id$$" + where_clause + + " AND evsel_id = " + str(self.event_id) + + " ORDER BY samples.id" + " LIMIT " + str(glb_chunk_sz)) + self.fetcher = SQLFetcher(glb, sql, BranchDataPrep, self.AddSample) + self.fetcher.done.connect(self.Update) + self.fetcher.Fetch(glb_chunk_sz) + + def columnCount(self, parent=None): + return 8 + + def columnHeader(self, column): + return ("Time", "CPU", "Command", "PID", "TID", "Branch Type", "In Tx", "Branch")[column] + + def columnFont(self, column): + if column != 7: + return None + return QFont("Monospace") + + def DisplayData(self, item, index): + if item.level == 1: + self.FetchIfNeeded(item.row) + return item.getData(index.column()) + + def AddSample(self, data): + child = BranchLevelOneItem(self.glb, self.populated, data, self.root) + self.root.child_items.append(child) + self.populated += 1 + + def Update(self, fetched): + if not fetched: + self.more = False + self.progress.emit(0) + child_count = self.root.child_count + count = self.populated - child_count + if count > 0: + parent = QModelIndex() + self.beginInsertRows(parent, child_count, child_count + count - 1) + self.insertRows(child_count, count, parent) + self.root.child_count += count + self.endInsertRows() + self.progress.emit(self.root.child_count) + + def FetchMoreRecords(self, count): + current = self.root.child_count + if self.more: + self.fetcher.Fetch(count) + else: + self.progress.emit(0) + return current + + def HasMoreRecords(self): + return self.more + +# Branch window + +class BranchWindow(QMdiSubWindow): + + def __init__(self, glb, event_id, name, where_clause, parent=None): + super(BranchWindow, self).__init__(parent) + + model_name = "Branch Events " + str(event_id) + 
if len(where_clause): + model_name = where_clause + " " + model_name + + self.model = LookupCreateModel(model_name, lambda: BranchModel(glb, event_id, where_clause)) + + self.view = QTreeView() + self.view.setUniformRowHeights(True) + self.view.setModel(self.model) + + self.ResizeColumnsToContents() + + self.find_bar = FindBar(self, self, True) + + self.finder = ChildDataItemFinder(self.model.root) + + self.fetch_bar = FetchMoreRecordsBar(self.model, self) + + self.vbox = VBox(self.view, self.find_bar.Widget(), self.fetch_bar.Widget()) + + self.setWidget(self.vbox.Widget()) + + AddSubWindow(glb.mainwindow.mdi_area, self, name + " Branch Events") + + def ResizeColumnToContents(self, column, n): + # Using the view's resizeColumnToContents() here is extrememly slow + # so implement a crude alternative + mm = "MM" if column else "MMMM" + font = self.view.font() + metrics = QFontMetrics(font) + max = 0 + for row in xrange(n): + val = self.model.root.child_items[row].data[column] + len = metrics.width(str(val) + mm) + max = len if len > max else max + val = self.model.columnHeader(column) + len = metrics.width(str(val) + mm) + max = len if len > max else max + self.view.setColumnWidth(column, max) + + def ResizeColumnsToContents(self): + n = min(self.model.root.child_count, 100) + if n < 1: + # No data yet, so connect a signal to notify when there is + self.model.rowsInserted.connect(self.UpdateColumnWidths) + return + columns = self.model.columnCount() + for i in xrange(columns): + self.ResizeColumnToContents(i, n) + + def UpdateColumnWidths(self, *x): + # This only needs to be done once, so disconnect the signal now + self.model.rowsInserted.disconnect(self.UpdateColumnWidths) + self.ResizeColumnsToContents() + + def Find(self, value, direction, pattern, context): + self.view.setFocus() + self.find_bar.Busy() + self.finder.Find(value, direction, pattern, context, self.FindDone) + + def FindDone(self, row): + self.find_bar.Idle() + if row >= 0: + 
self.view.setCurrentIndex(self.model.index(row, 0, QModelIndex())) + else: + self.find_bar.NotFound() + +# Event list + +def GetEventList(db): + events = [] + query = QSqlQuery(db) + QueryExec(query, "SELECT name FROM selected_events WHERE id > 0 ORDER BY id") + while query.next(): + events.append(query.value(0)) + return events + +# SQL data preparation + +def SQLTableDataPrep(query, count): + data = [] + for i in xrange(count): + data.append(query.value(i)) + return data + +# SQL table data model item + +class SQLTableItem(): + + def __init__(self, row, data): + self.row = row + self.data = data + + def getData(self, column): + return self.data[column] + +# SQL table data model + +class SQLTableModel(TableModel): + + progress = Signal(object) + + def __init__(self, glb, sql, column_count, parent=None): + super(SQLTableModel, self).__init__(parent) + self.glb = glb + self.more = True + self.populated = 0 + self.fetcher = SQLFetcher(glb, sql, lambda x, y=column_count: SQLTableDataPrep(x, y), self.AddSample) + self.fetcher.done.connect(self.Update) + self.fetcher.Fetch(glb_chunk_sz) + + def DisplayData(self, item, index): + self.FetchIfNeeded(item.row) + return item.getData(index.column()) + + def AddSample(self, data): + child = SQLTableItem(self.populated, data) + self.child_items.append(child) + self.populated += 1 + + def Update(self, fetched): + if not fetched: + self.more = False + self.progress.emit(0) + child_count = self.child_count + count = self.populated - child_count + if count > 0: + parent = QModelIndex() + self.beginInsertRows(parent, child_count, child_count + count - 1) + self.insertRows(child_count, count, parent) + self.child_count += count + self.endInsertRows() + self.progress.emit(self.child_count) + + def FetchMoreRecords(self, count): + current = self.child_count + if self.more: + self.fetcher.Fetch(count) + else: + self.progress.emit(0) + return current + + def HasMoreRecords(self): + return self.more + +# SQL automatic table data model + 
+class SQLAutoTableModel(SQLTableModel): + + def __init__(self, glb, table_name, parent=None): + sql = "SELECT * FROM " + table_name + " WHERE id > $$last_id$$ ORDER BY id LIMIT " + str(glb_chunk_sz) + if table_name == "comm_threads_view": + # For now, comm_threads_view has no id column + sql = "SELECT * FROM " + table_name + " WHERE comm_id > $$last_id$$ ORDER BY comm_id LIMIT " + str(glb_chunk_sz) + self.column_headers = [] + query = QSqlQuery(glb.db) + if glb.dbref.is_sqlite3: + QueryExec(query, "PRAGMA table_info(" + table_name + ")") + while query.next(): + self.column_headers.append(query.value(1)) + if table_name == "sqlite_master": + sql = "SELECT * FROM " + table_name + else: + if table_name[:19] == "information_schema.": + sql = "SELECT * FROM " + table_name + select_table_name = table_name[19:] + schema = "information_schema" + else: + select_table_name = table_name + schema = "public" + QueryExec(query, "SELECT column_name FROM information_schema.columns WHERE table_schema = '" + schema + "' and table_name = '" + select_table_name + "'") + while query.next(): + self.column_headers.append(query.value(0)) + super(SQLAutoTableModel, self).__init__(glb, sql, len(self.column_headers), parent) + + def columnCount(self, parent=None): + return len(self.column_headers) + + def columnHeader(self, column): + return self.column_headers[column] + +# Base class for custom ResizeColumnsToContents + +class ResizeColumnsToContentsBase(QObject): + + def __init__(self, parent=None): + super(ResizeColumnsToContentsBase, self).__init__(parent) + + def ResizeColumnToContents(self, column, n): + # Using the view's resizeColumnToContents() here is extrememly slow + # so implement a crude alternative + font = self.view.font() + metrics = QFontMetrics(font) + max = 0 + for row in xrange(n): + val = self.data_model.child_items[row].data[column] + len = metrics.width(str(val) + "MM") + max = len if len > max else max + val = self.data_model.columnHeader(column) + len = 
metrics.width(str(val) + "MM") + max = len if len > max else max + self.view.setColumnWidth(column, max) + + def ResizeColumnsToContents(self): + n = min(self.data_model.child_count, 100) + if n < 1: + # No data yet, so connect a signal to notify when there is + self.data_model.rowsInserted.connect(self.UpdateColumnWidths) + return + columns = self.data_model.columnCount() + for i in xrange(columns): + self.ResizeColumnToContents(i, n) + + def UpdateColumnWidths(self, *x): + # This only needs to be done once, so disconnect the signal now + self.data_model.rowsInserted.disconnect(self.UpdateColumnWidths) + self.ResizeColumnsToContents() + +# Table window + +class TableWindow(QMdiSubWindow, ResizeColumnsToContentsBase): + + def __init__(self, glb, table_name, parent=None): + super(TableWindow, self).__init__(parent) + + self.data_model = LookupCreateModel(table_name + " Table", lambda: SQLAutoTableModel(glb, table_name)) + + self.model = QSortFilterProxyModel() + self.model.setSourceModel(self.data_model) + + self.view = QTableView() + self.view.setModel(self.model) + self.view.setEditTriggers(QAbstractItemView.NoEditTriggers) + self.view.verticalHeader().setVisible(False) + self.view.sortByColumn(-1, Qt.AscendingOrder) + self.view.setSortingEnabled(True) + + self.ResizeColumnsToContents() + + self.find_bar = FindBar(self, self, True) + + self.finder = ChildDataItemFinder(self.data_model) + + self.fetch_bar = FetchMoreRecordsBar(self.data_model, self) + + self.vbox = VBox(self.view, self.find_bar.Widget(), self.fetch_bar.Widget()) + + self.setWidget(self.vbox.Widget()) + + AddSubWindow(glb.mainwindow.mdi_area, self, table_name + " Table") + + def Find(self, value, direction, pattern, context): + self.view.setFocus() + self.find_bar.Busy() + self.finder.Find(value, direction, pattern, context, self.FindDone) + + def FindDone(self, row): + self.find_bar.Idle() + if row >= 0: + self.view.setCurrentIndex(self.model.index(row, 0, QModelIndex())) + else: + 
self.find_bar.NotFound() + +# Table list + +def GetTableList(glb): + tables = [] + query = QSqlQuery(glb.db) + if glb.dbref.is_sqlite3: + QueryExec(query, "SELECT name FROM sqlite_master WHERE type IN ( 'table' , 'view' ) ORDER BY name") + else: + QueryExec(query, "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' AND table_type IN ( 'BASE TABLE' , 'VIEW' ) ORDER BY table_name") + while query.next(): + tables.append(query.value(0)) + if glb.dbref.is_sqlite3: + tables.append("sqlite_master") + else: + tables.append("information_schema.tables") + tables.append("information_schema.views") + tables.append("information_schema.columns") + return tables + +# Action Definition + +def CreateAction(label, tip, callback, parent=None, shortcut=None): + action = QAction(label, parent) + if shortcut != None: + action.setShortcuts(shortcut) + action.setStatusTip(tip) + action.triggered.connect(callback) + return action + +# Typical application actions + +def CreateExitAction(app, parent=None): + return CreateAction("&Quit", "Exit the application", app.closeAllWindows, parent, QKeySequence.Quit) + +# Typical MDI actions + +def CreateCloseActiveWindowAction(mdi_area): + return CreateAction("Cl&ose", "Close the active window", mdi_area.closeActiveSubWindow, mdi_area) + +def CreateCloseAllWindowsAction(mdi_area): + return CreateAction("Close &All", "Close all the windows", mdi_area.closeAllSubWindows, mdi_area) + +def CreateTileWindowsAction(mdi_area): + return CreateAction("&Tile", "Tile the windows", mdi_area.tileSubWindows, mdi_area) + +def CreateCascadeWindowsAction(mdi_area): + return CreateAction("&Cascade", "Cascade the windows", mdi_area.cascadeSubWindows, mdi_area) + +def CreateNextWindowAction(mdi_area): + return CreateAction("Ne&xt", "Move the focus to the next window", mdi_area.activateNextSubWindow, mdi_area, QKeySequence.NextChild) + +def CreatePreviousWindowAction(mdi_area): + return CreateAction("Pre&vious", "Move the focus to the previous 
window", mdi_area.activatePreviousSubWindow, mdi_area, QKeySequence.PreviousChild) + +# Typical MDI window menu + +class WindowMenu(): + + def __init__(self, mdi_area, menu): + self.mdi_area = mdi_area + self.window_menu = menu.addMenu("&Windows") + self.close_active_window = CreateCloseActiveWindowAction(mdi_area) + self.close_all_windows = CreateCloseAllWindowsAction(mdi_area) + self.tile_windows = CreateTileWindowsAction(mdi_area) + self.cascade_windows = CreateCascadeWindowsAction(mdi_area) + self.next_window = CreateNextWindowAction(mdi_area) + self.previous_window = CreatePreviousWindowAction(mdi_area) + self.window_menu.aboutToShow.connect(self.Update) + + def Update(self): + self.window_menu.clear() + sub_window_count = len(self.mdi_area.subWindowList()) + have_sub_windows = sub_window_count != 0 + self.close_active_window.setEnabled(have_sub_windows) + self.close_all_windows.setEnabled(have_sub_windows) + self.tile_windows.setEnabled(have_sub_windows) + self.cascade_windows.setEnabled(have_sub_windows) + self.next_window.setEnabled(have_sub_windows) + self.previous_window.setEnabled(have_sub_windows) + self.window_menu.addAction(self.close_active_window) + self.window_menu.addAction(self.close_all_windows) + self.window_menu.addSeparator() + self.window_menu.addAction(self.tile_windows) + self.window_menu.addAction(self.cascade_windows) + self.window_menu.addSeparator() + self.window_menu.addAction(self.next_window) + self.window_menu.addAction(self.previous_window) + if sub_window_count == 0: + return + self.window_menu.addSeparator() + nr = 1 + for sub_window in self.mdi_area.subWindowList(): + label = str(nr) + " " + sub_window.name + if nr < 10: + label = "&" + label + action = self.window_menu.addAction(label) + action.setCheckable(True) + action.setChecked(sub_window == self.mdi_area.activeSubWindow()) + action.triggered.connect(lambda x=nr: self.setActiveSubWindow(x)) + self.window_menu.addAction(action) + nr += 1 + + def setActiveSubWindow(self, 
nr): + self.mdi_area.setActiveSubWindow(self.mdi_area.subWindowList()[nr - 1]) + +# Font resize + +def ResizeFont(widget, diff): + font = widget.font() + sz = font.pointSize() + font.setPointSize(sz + diff) + widget.setFont(font) + +def ShrinkFont(widget): + ResizeFont(widget, -1) + +def EnlargeFont(widget): + ResizeFont(widget, 1) + +# Unique name for sub-windows + +def NumberedWindowName(name, nr): + if nr > 1: + name += " <" + str(nr) + ">" + return name + +def UniqueSubWindowName(mdi_area, name): + nr = 1 + while True: + unique_name = NumberedWindowName(name, nr) + ok = True + for sub_window in mdi_area.subWindowList(): + if sub_window.name == unique_name: + ok = False + break + if ok: + return unique_name + nr += 1 + +# Add a sub-window + +def AddSubWindow(mdi_area, sub_window, name): + unique_name = UniqueSubWindowName(mdi_area, name) + sub_window.setMinimumSize(200, 100) + sub_window.resize(800, 600) + sub_window.setWindowTitle(unique_name) + sub_window.setAttribute(Qt.WA_DeleteOnClose) + sub_window.setWindowIcon(sub_window.style().standardIcon(QStyle.SP_FileIcon)) + sub_window.name = unique_name + mdi_area.addSubWindow(sub_window) + sub_window.show() + +# Main window + +class MainWindow(QMainWindow): + + def __init__(self, glb, parent=None): + super(MainWindow, self).__init__(parent) + + self.glb = glb + + self.setWindowTitle("Exported SQL Viewer: " + glb.dbname) + self.setWindowIcon(self.style().standardIcon(QStyle.SP_ComputerIcon)) + self.setMinimumSize(200, 100) + + self.mdi_area = QMdiArea() + self.mdi_area.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.mdi_area.setVerticalScrollBarPolicy(Qt.ScrollBarAsNeeded) + + self.setCentralWidget(self.mdi_area) + + menu = self.menuBar() + + file_menu = menu.addMenu("&File") + file_menu.addAction(CreateExitAction(glb.app, self)) + + edit_menu = menu.addMenu("&Edit") + edit_menu.addAction(CreateAction("&Find...", "Find items", self.Find, self, QKeySequence.Find)) + edit_menu.addAction(CreateAction("Fetch 
&more records...", "Fetch more records", self.FetchMoreRecords, self, [QKeySequence(Qt.Key_F8)])) + edit_menu.addAction(CreateAction("&Shrink Font", "Make text smaller", self.ShrinkFont, self, [QKeySequence("Ctrl+-")])) + edit_menu.addAction(CreateAction("&Enlarge Font", "Make text bigger", self.EnlargeFont, self, [QKeySequence("Ctrl++")])) + + reports_menu = menu.addMenu("&Reports") + reports_menu.addAction(CreateAction("Context-Sensitive Call &Graph", "Create a new window containing a context-sensitive call graph", self.NewCallGraph, self)) + + self.EventMenu(GetEventList(glb.db), reports_menu) + + self.TableMenu(GetTableList(glb), menu) + + self.window_menu = WindowMenu(self.mdi_area, menu) + + def Find(self): + win = self.mdi_area.activeSubWindow() + if win: + try: + win.find_bar.Activate() + except: + pass + + def FetchMoreRecords(self): + win = self.mdi_area.activeSubWindow() + if win: + try: + win.fetch_bar.Activate() + except: + pass + + def ShrinkFont(self): + win = self.mdi_area.activeSubWindow() + ShrinkFont(win.view) + + def EnlargeFont(self): + win = self.mdi_area.activeSubWindow() + EnlargeFont(win.view) + + def EventMenu(self, events, reports_menu): + branches_events = 0 + for event in events: + event = event.split(":")[0] + if event == "branches": + branches_events += 1 + dbid = 0 + for event in events: + dbid += 1 + event = event.split(":")[0] + if event == "branches": + label = "All branches" if branches_events == 1 else "All branches " + "(id=" + dbid + ")" + reports_menu.addAction(CreateAction(label, "Create a new window displaying branch events", lambda x=dbid: self.NewBranchView(x), self)) + + def TableMenu(self, tables, menu): + table_menu = menu.addMenu("&Tables") + for table in tables: + table_menu.addAction(CreateAction(table, "Create a new window containing a table view", lambda t=table: self.NewTableView(t), self)) + + def NewCallGraph(self): + CallGraphWindow(self.glb, self) + + def NewBranchView(self, event_id): + 
BranchWindow(self.glb, event_id, "", "", self) + + def NewTableView(self, table_name): + TableWindow(self.glb, table_name, self) + +# XED Disassembler + +class xed_state_t(Structure): + + _fields_ = [ + ("mode", c_int), + ("width", c_int) + ] + +class XEDInstruction(): + + def __init__(self, libxed): + # Current xed_decoded_inst_t structure is 192 bytes. Use 512 to allow for future expansion + xedd_t = c_byte * 512 + self.xedd = xedd_t() + self.xedp = addressof(self.xedd) + libxed.xed_decoded_inst_zero(self.xedp) + self.state = xed_state_t() + self.statep = addressof(self.state) + # Buffer for disassembled instruction text + self.buffer = create_string_buffer(256) + self.bufferp = addressof(self.buffer) + +class LibXED(): + + def __init__(self): + self.libxed = CDLL("libxed.so") + + self.xed_tables_init = self.libxed.xed_tables_init + self.xed_tables_init.restype = None + self.xed_tables_init.argtypes = [] + + self.xed_decoded_inst_zero = self.libxed.xed_decoded_inst_zero + self.xed_decoded_inst_zero.restype = None + self.xed_decoded_inst_zero.argtypes = [ c_void_p ] + + self.xed_operand_values_set_mode = self.libxed.xed_operand_values_set_mode + self.xed_operand_values_set_mode.restype = None + self.xed_operand_values_set_mode.argtypes = [ c_void_p, c_void_p ] + + self.xed_decoded_inst_zero_keep_mode = self.libxed.xed_decoded_inst_zero_keep_mode + self.xed_decoded_inst_zero_keep_mode.restype = None + self.xed_decoded_inst_zero_keep_mode.argtypes = [ c_void_p ] + + self.xed_decode = self.libxed.xed_decode + self.xed_decode.restype = c_int + self.xed_decode.argtypes = [ c_void_p, c_void_p, c_uint ] + + self.xed_format_context = self.libxed.xed_format_context + self.xed_format_context.restype = c_uint + self.xed_format_context.argtypes = [ c_int, c_void_p, c_void_p, c_int, c_ulonglong, c_void_p, c_void_p ] + + self.xed_tables_init() + + def Instruction(self): + return XEDInstruction(self) + + def SetMode(self, inst, mode): + if mode: + inst.state.mode = 4 # 32-bit + 
inst.state.width = 4 # 4 bytes + else: + inst.state.mode = 1 # 64-bit + inst.state.width = 8 # 8 bytes + self.xed_operand_values_set_mode(inst.xedp, inst.statep) + + def DisassembleOne(self, inst, bytes_ptr, bytes_cnt, ip): + self.xed_decoded_inst_zero_keep_mode(inst.xedp) + err = self.xed_decode(inst.xedp, bytes_ptr, bytes_cnt) + if err: + return 0, "" + # Use AT&T mode (2), alternative is Intel (3) + ok = self.xed_format_context(2, inst.xedp, inst.bufferp, sizeof(inst.buffer), ip, 0, 0) + if not ok: + return 0, "" + # Return instruction length and the disassembled instruction text + # For now, assume the length is in byte 166 + return inst.xedd[166], inst.buffer.value + +def TryOpen(file_name): + try: + return open(file_name, "rb") + except: + return None + +def Is64Bit(f): + result = sizeof(c_void_p) + # ELF support only + pos = f.tell() + f.seek(0) + header = f.read(7) + f.seek(pos) + magic = header[0:4] + eclass = ord(header[4]) + encoding = ord(header[5]) + version = ord(header[6]) + if magic == chr(127) + "ELF" and eclass > 0 and eclass < 3 and encoding > 0 and encoding < 3 and version == 1: + result = True if eclass == 2 else False + return result + +# Global data + +class Glb(): + + def __init__(self, dbref, db, dbname): + self.dbref = dbref + self.db = db + self.dbname = dbname + self.home_dir = os.path.expanduser("~") + self.buildid_dir = os.getenv("PERF_BUILDID_DIR") + if self.buildid_dir: + self.buildid_dir += "/.build-id/" + else: + self.buildid_dir = self.home_dir + "/.debug/.build-id/" + self.app = None + self.mainwindow = None + self.instances_to_shutdown_on_exit = weakref.WeakSet() + try: + self.disassembler = LibXED() + self.have_disassembler = True + except: + self.have_disassembler = False + + def FileFromBuildId(self, build_id): + file_name = self.buildid_dir + build_id[0:2] + "/" + build_id[2:] + "/elf" + return TryOpen(file_name) + + def FileFromNamesAndBuildId(self, short_name, long_name, build_id): + # Assume current machine i.e. 
no support for virtualization + if short_name[0:7] == "[kernel" and os.path.basename(long_name) == "kcore": + file_name = os.getenv("PERF_KCORE") + f = TryOpen(file_name) if file_name else None + if f: + return f + # For now, no special handling if long_name is /proc/kcore + f = TryOpen(long_name) + if f: + return f + f = self.FileFromBuildId(build_id) + if f: + return f + return None + + def AddInstanceToShutdownOnExit(self, instance): + self.instances_to_shutdown_on_exit.add(instance) + + # Shutdown any background processes or threads + def ShutdownInstances(self): + for x in self.instances_to_shutdown_on_exit: + try: + x.Shutdown() + except: + pass + +# Database reference + +class DBRef(): + + def __init__(self, is_sqlite3, dbname): + self.is_sqlite3 = is_sqlite3 + self.dbname = dbname + + def Open(self, connection_name): + dbname = self.dbname + if self.is_sqlite3: + db = QSqlDatabase.addDatabase("QSQLITE", connection_name) + else: + db = QSqlDatabase.addDatabase("QPSQL", connection_name) + opts = dbname.split() + for opt in opts: + if "=" in opt: + opt = opt.split("=") + if opt[0] == "hostname": + db.setHostName(opt[1]) + elif opt[0] == "port": + db.setPort(int(opt[1])) + elif opt[0] == "username": + db.setUserName(opt[1]) + elif opt[0] == "password": + db.setPassword(opt[1]) + elif opt[0] == "dbname": + dbname = opt[1] + else: + dbname = opt + + db.setDatabaseName(dbname) + if not db.open(): + raise Exception("Failed to open database " + dbname + " error: " + db.lastError().text()) + return db, dbname + +# Main + +def Main(): + if (len(sys.argv) < 2): + print >> sys.stderr, "Usage is: exported-sql-viewer.py <database name>" + raise Exception("Too few arguments") + + dbname = sys.argv[1] + + is_sqlite3 = False + try: + f = open(dbname) + if f.read(15) == "SQLite format 3": + is_sqlite3 = True + f.close() + except: + pass + + dbref = DBRef(is_sqlite3, dbname) + db, dbname = dbref.Open("main") + glb = Glb(dbref, db, dbname) + app = QApplication(sys.argv) + 
glb.app = app + mainwindow = MainWindow(glb) + glb.mainwindow = mainwindow + mainwindow.show() + err = app.exec_() + glb.ShutdownInstances() + db.close() + sys.exit(err) + +if __name__ == "__main__": + Main() diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index c3b0afd67760..304313073242 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -5,6 +5,7 @@ ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) libperf-y += ioctl.o endif libperf-y += kcmp.o +libperf-y += mount_flags.o libperf-y += pkey_alloc.o libperf-y += prctl.o libperf-y += sockaddr.o diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 2570152d3909..039c29039b2c 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -24,6 +24,7 @@ struct strarray { } size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val); +size_t strarray__scnprintf_flags(struct strarray *sa, char *bf, size_t size, unsigned long flags); struct trace; struct thread; @@ -122,6 +123,12 @@ size_t syscall_arg__scnprintf_kcmp_type(char *bf, size_t size, struct syscall_ar size_t syscall_arg__scnprintf_kcmp_idx(char *bf, size_t size, struct syscall_arg *arg); #define SCA_KCMP_IDX syscall_arg__scnprintf_kcmp_idx +unsigned long syscall_arg__mask_val_mount_flags(struct syscall_arg *arg, unsigned long flags); +#define SCAMV_MOUNT_FLAGS syscall_arg__mask_val_mount_flags + +size_t syscall_arg__scnprintf_mount_flags(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_MOUNT_FLAGS syscall_arg__scnprintf_mount_flags + size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg); #define SCA_PKEY_ALLOC_ACCESS_RIGHTS syscall_arg__scnprintf_pkey_alloc_access_rights diff --git a/tools/perf/trace/beauty/clone.c b/tools/perf/trace/beauty/clone.c index d64d049ab991..010406500c30 100644 --- a/tools/perf/trace/beauty/clone.c +++ 
b/tools/perf/trace/beauty/clone.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/cone.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/drm_ioctl.sh b/tools/perf/trace/beauty/drm_ioctl.sh index 9d3816815e60..9aa94fd523a9 100755 --- a/tools/perf/trace/beauty/drm_ioctl.sh +++ b/tools/perf/trace/beauty/drm_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/drm/ diff --git a/tools/perf/trace/beauty/eventfd.c b/tools/perf/trace/beauty/eventfd.c index 5d6a477a6400..db5b9b492113 100644 --- a/tools/perf/trace/beauty/eventfd.c +++ b/tools/perf/trace/beauty/eventfd.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #ifndef EFD_SEMAPHORE #define EFD_SEMAPHORE 1 #endif diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c index 9e8900c13cb1..e6de31674e24 100644 --- a/tools/perf/trace/beauty/fcntl.c +++ b/tools/perf/trace/beauty/fcntl.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/fcntl.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. 
(and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c index c4ff6ad30b06..cf02ae5f0ba6 100644 --- a/tools/perf/trace/beauty/flock.c +++ b/tools/perf/trace/beauty/flock.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include "trace/beauty/beauty.h" #include <linux/kernel.h> diff --git a/tools/perf/trace/beauty/futex_op.c b/tools/perf/trace/beauty/futex_op.c index 61850fbc85ff..1136bde56406 100644 --- a/tools/perf/trace/beauty/futex_op.c +++ b/tools/perf/trace/beauty/futex_op.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <linux/futex.h> #ifndef FUTEX_WAIT_BITSET diff --git a/tools/perf/trace/beauty/futex_val3.c b/tools/perf/trace/beauty/futex_val3.c index 26f6b3253511..138b7d588a70 100644 --- a/tools/perf/trace/beauty/futex_val3.c +++ b/tools/perf/trace/beauty/futex_val3.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <linux/futex.h> #ifndef FUTEX_BITSET_MATCH_ANY diff --git a/tools/perf/trace/beauty/ioctl.c b/tools/perf/trace/beauty/ioctl.c index 1be3b4cf0827..5d2a7fd8d407 100644 --- a/tools/perf/trace/beauty/ioctl.c +++ b/tools/perf/trace/beauty/ioctl.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/ioctl.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/kcmp.c b/tools/perf/trace/beauty/kcmp.c index f62040eb9d5c..b276a274f203 100644 --- a/tools/perf/trace/beauty/kcmp.c +++ b/tools/perf/trace/beauty/kcmp.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/kcmp.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. 
(and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/kcmp_type.sh b/tools/perf/trace/beauty/kcmp_type.sh index a3c304caa336..df8b17486d57 100755 --- a/tools/perf/trace/beauty/kcmp_type.sh +++ b/tools/perf/trace/beauty/kcmp_type.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/kvm_ioctl.sh b/tools/perf/trace/beauty/kvm_ioctl.sh index c4699fd46bb6..4ce54f5bf756 100755 --- a/tools/perf/trace/beauty/kvm_ioctl.sh +++ b/tools/perf/trace/beauty/kvm_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/madvise_behavior.sh b/tools/perf/trace/beauty/madvise_behavior.sh index 431639eb4d29..4527d290cdfc 100755 --- a/tools/perf/trace/beauty/madvise_behavior.sh +++ b/tools/perf/trace/beauty/madvise_behavior.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/asm-generic/ diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c index 9f68077b241b..c534bd96ef5c 100644 --- a/tools/perf/trace/beauty/mmap.c +++ b/tools/perf/trace/beauty/mmap.c @@ -1,5 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <uapi/linux/mman.h> +#include <linux/log2.h> static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, struct syscall_arg *arg) @@ -30,50 +31,23 @@ static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot +static size_t mmap__scnprintf_flags(unsigned long flags, char *bf, size_t size) +{ +#include "trace/beauty/generated/mmap_flags_array.c" + static DEFINE_STRARRAY(mmap_flags); + + return strarray__scnprintf_flags(&strarray__mmap_flags, bf, size, flags); +} + static size_t 
syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, struct syscall_arg *arg) { - int printed = 0, flags = arg->val; + unsigned long flags = arg->val; if (flags & MAP_ANONYMOUS) arg->mask |= (1 << 4) | (1 << 5); /* Mask 4th ('fd') and 5th ('offset') args, ignored */ -#define P_MMAP_FLAG(n) \ - if (flags & MAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MAP_##n; \ - } - - P_MMAP_FLAG(SHARED); - P_MMAP_FLAG(PRIVATE); -#ifdef MAP_32BIT - P_MMAP_FLAG(32BIT); -#endif - P_MMAP_FLAG(ANONYMOUS); - P_MMAP_FLAG(DENYWRITE); - P_MMAP_FLAG(EXECUTABLE); - P_MMAP_FLAG(FILE); - P_MMAP_FLAG(FIXED); -#ifdef MAP_FIXED_NOREPLACE - P_MMAP_FLAG(FIXED_NOREPLACE); -#endif - P_MMAP_FLAG(GROWSDOWN); - P_MMAP_FLAG(HUGETLB); - P_MMAP_FLAG(LOCKED); - P_MMAP_FLAG(NONBLOCK); - P_MMAP_FLAG(NORESERVE); - P_MMAP_FLAG(POPULATE); - P_MMAP_FLAG(STACK); - P_MMAP_FLAG(UNINITIALIZED); -#ifdef MAP_SYNC - P_MMAP_FLAG(SYNC); -#endif -#undef P_MMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); - - return printed; + return mmap__scnprintf_flags(flags, bf, size); } #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags diff --git a/tools/perf/trace/beauty/mmap_flags.sh b/tools/perf/trace/beauty/mmap_flags.sh new file mode 100755 index 000000000000..22c3fdca8975 --- /dev/null +++ b/tools/perf/trace/beauty/mmap_flags.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 2 ] ; then + [ $# -eq 1 ] && hostarch=$1 || hostarch=`uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/` + header_dir=tools/include/uapi/asm-generic + arch_header_dir=tools/arch/${hostarch}/include/uapi/asm +else + header_dir=$1 + arch_header_dir=$2 +fi + +arch_mman=${arch_header_dir}/mman.h + +# those in egrep -vw are flags, we want just the bits + +printf "static const char *mmap_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MAP_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +egrep -q $regex ${arch_mman} && \ +(egrep $regex ${arch_mman} | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n") +egrep -q '#[[:space:]]*include[[:space:]]+<uapi/asm-generic/mman.*' ${arch_mman} && +(egrep $regex ${header_dir}/mman-common.h | \ + egrep -vw 'MAP_(UNINITIALIZED|TYPE|SHARED_VALIDATE)' | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n") +egrep -q '#[[:space:]]*include[[:space:]]+<uapi/asm-generic/mman.h>.*' ${arch_mman} && +(egrep $regex ${header_dir}/mman.h | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n") +printf "};\n" diff --git a/tools/perf/trace/beauty/mode_t.c b/tools/perf/trace/beauty/mode_t.c index d929ad7dd97b..6879d36d3004 100644 --- a/tools/perf/trace/beauty/mode_t.c +++ b/tools/perf/trace/beauty/mode_t.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> diff --git a/tools/perf/trace/beauty/mount_flags.c 
b/tools/perf/trace/beauty/mount_flags.c new file mode 100644 index 000000000000..712935c6620a --- /dev/null +++ b/tools/perf/trace/beauty/mount_flags.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * trace/beauty/mount_flags.c + * + * Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> + */ + +#include "trace/beauty/beauty.h" +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <sys/mount.h> + +static size_t mount__scnprintf_flags(unsigned long flags, char *bf, size_t size) +{ +#include "trace/beauty/generated/mount_flags_array.c" + static DEFINE_STRARRAY(mount_flags); + + return strarray__scnprintf_flags(&strarray__mount_flags, bf, size, flags); +} + +unsigned long syscall_arg__mask_val_mount_flags(struct syscall_arg *arg __maybe_unused, unsigned long flags) +{ + // do_mount in fs/namespace.c: + /* + * Pre-0.97 versions of mount() didn't have a flags word. When the + * flags word was introduced its top half was required to have the + * magic value 0xC0ED, and this remained so until 2.4.0-test9. + * Therefore, if this magic number is present, it carries no + * information and must be discarded. 
+ */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + return flags; +} + +size_t syscall_arg__scnprintf_mount_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + unsigned long flags = arg->val; + + return mount__scnprintf_flags(flags, bf, size); +} diff --git a/tools/perf/trace/beauty/mount_flags.sh b/tools/perf/trace/beauty/mount_flags.sh new file mode 100755 index 000000000000..45547573a1db --- /dev/null +++ b/tools/perf/trace/beauty/mount_flags.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ + +printf "static const char *mount_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*' +egrep $regex ${header_dir}/fs.h | egrep -v '(MSK|VERBOSE|MGC_VAL)\>' | \ + sed -r "s/$regex/\2 \2 \1/g" | sort -n | \ + xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+\(1<<([[:digit:]]+)\)[[:space:]]*.*' +egrep $regex ${header_dir}/fs.h | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[%s + 1] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c index c064d6aae659..1b9d6306d274 100644 --- a/tools/perf/trace/beauty/msg_flags.c +++ b/tools/perf/trace/beauty/msg_flags.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <sys/types.h> #include <sys/socket.h> diff --git a/tools/perf/trace/beauty/open_flags.c b/tools/perf/trace/beauty/open_flags.c index 6aec6178a99d..cc673fec9184 100644 --- a/tools/perf/trace/beauty/open_flags.c +++ b/tools/perf/trace/beauty/open_flags.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> diff --git a/tools/perf/trace/beauty/perf_event_open.c 
b/tools/perf/trace/beauty/perf_event_open.c index 2bafd7c995ff..981185c1974b 100644 --- a/tools/perf/trace/beauty/perf_event_open.c +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #ifndef PERF_FLAG_FD_NO_GROUP # define PERF_FLAG_FD_NO_GROUP (1UL << 0) #endif diff --git a/tools/perf/trace/beauty/perf_ioctl.sh b/tools/perf/trace/beauty/perf_ioctl.sh index 6492c74df928..9aabd9743ef6 100755 --- a/tools/perf/trace/beauty/perf_ioctl.sh +++ b/tools/perf/trace/beauty/perf_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c index 0313df342830..1a6acc46807b 100644 --- a/tools/perf/trace/beauty/pid.c +++ b/tools/perf/trace/beauty/pid.c @@ -1,4 +1,5 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 + size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg) { int pid = arg->val; diff --git a/tools/perf/trace/beauty/pkey_alloc.c b/tools/perf/trace/beauty/pkey_alloc.c index 2ba784a3734a..1b8ed4cac815 100644 --- a/tools/perf/trace/beauty/pkey_alloc.c +++ b/tools/perf/trace/beauty/pkey_alloc.c @@ -1,40 +1,36 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/pkey_alloc.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. 
(and only v2, not any later version) */ #include "trace/beauty/beauty.h" #include <linux/kernel.h> #include <linux/log2.h> -static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, size_t size) +size_t strarray__scnprintf_flags(struct strarray *sa, char *bf, size_t size, unsigned long flags) { int i, printed = 0; -#include "trace/beauty/generated/pkey_alloc_access_rights_array.c" - static DEFINE_STRARRAY(pkey_alloc_access_rights); - - if (access_rights == 0) { - const char *s = strarray__pkey_alloc_access_rights.entries[0]; + if (flags == 0) { + const char *s = sa->entries[0]; if (s) return scnprintf(bf, size, "%s", s); return scnprintf(bf, size, "%d", 0); } - for (i = 1; i < strarray__pkey_alloc_access_rights.nr_entries; ++i) { - int bit = 1 << (i - 1); + for (i = 1; i < sa->nr_entries; ++i) { + unsigned long bit = 1UL << (i - 1); - if (!(access_rights & bit)) + if (!(flags & bit)) continue; if (printed != 0) printed += scnprintf(bf + printed, size - printed, "|"); - if (strarray__pkey_alloc_access_rights.entries[i] != NULL) - printed += scnprintf(bf + printed, size - printed, "%s", strarray__pkey_alloc_access_rights.entries[i]); + if (sa->entries[i] != NULL) + printed += scnprintf(bf + printed, size - printed, "%s", sa->entries[i]); else printed += scnprintf(bf + printed, size - printed, "0x%#", bit); } @@ -42,6 +38,14 @@ static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, s return printed; } +static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, size_t size) +{ +#include "trace/beauty/generated/pkey_alloc_access_rights_array.c" + static DEFINE_STRARRAY(pkey_alloc_access_rights); + + return strarray__scnprintf_flags(&strarray__pkey_alloc_access_rights, bf, size, access_rights); +} + size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg) { unsigned long cmd = arg->val; diff --git a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh 
b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh index e0a51aeb20b2..f8f1b560cf8a 100755 --- a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh +++ b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/asm-generic/ diff --git a/tools/perf/trace/beauty/prctl.c b/tools/perf/trace/beauty/prctl.c index 246130dad6c4..be7a5d395975 100644 --- a/tools/perf/trace/beauty/prctl.c +++ b/tools/perf/trace/beauty/prctl.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/prctl.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index f24722146ebe..d32f8f1124af 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/sched_policy.c b/tools/perf/trace/beauty/sched_policy.c index ba5096ae76b6..48f2b5c9aa3e 100644 --- a/tools/perf/trace/beauty/sched_policy.c +++ b/tools/perf/trace/beauty/sched_policy.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <sched.h> /* diff --git a/tools/perf/trace/beauty/seccomp.c b/tools/perf/trace/beauty/seccomp.c index b7097fd5fed9..e36156b19c70 100644 --- a/tools/perf/trace/beauty/seccomp.c +++ b/tools/perf/trace/beauty/seccomp.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #ifndef SECCOMP_SET_MODE_STRICT #define SECCOMP_SET_MODE_STRICT 0 #endif diff --git a/tools/perf/trace/beauty/signum.c b/tools/perf/trace/beauty/signum.c index bde18a53f090..587fec545b8a 100644 --- 
a/tools/perf/trace/beauty/signum.c +++ b/tools/perf/trace/beauty/signum.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <signal.h> static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg) diff --git a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh index eb511bb5fbd3..e0803b957593 100755 --- a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ diff --git a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh index 6818392968b2..7a464a7bf913 100755 --- a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ diff --git a/tools/perf/trace/beauty/sockaddr.c b/tools/perf/trace/beauty/sockaddr.c index 71a79f72d9d9..9410ad230f10 100644 --- a/tools/perf/trace/beauty/sockaddr.c +++ b/tools/perf/trace/beauty/sockaddr.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 // Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/socket.c b/tools/perf/trace/beauty/socket.c index 65227269384b..d971a2596417 100644 --- a/tools/perf/trace/beauty/socket.c +++ b/tools/perf/trace/beauty/socket.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/socket.c * diff --git a/tools/perf/trace/beauty/socket_ipproto.sh b/tools/perf/trace/beauty/socket_ipproto.sh index a3cc24633bec..de0f2f29017f 100755 --- a/tools/perf/trace/beauty/socket_ipproto.sh +++ b/tools/perf/trace/beauty/socket_ipproto.sh @@ -1,4 +1,5 
@@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/socket_type.c b/tools/perf/trace/beauty/socket_type.c index bca26aef4a77..a63a9a332aa0 100644 --- a/tools/perf/trace/beauty/socket_type.c +++ b/tools/perf/trace/beauty/socket_type.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <sys/types.h> #include <sys/socket.h> diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index 5643b692af4c..630f2760dd66 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/statx.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh index 0f6a5197d0be..439773daaf77 100755 --- a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh +++ b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/waitid_options.c b/tools/perf/trace/beauty/waitid_options.c index 8465281a093d..42ff58ad613b 100644 --- a/tools/perf/trace/beauty/waitid_options.c +++ b/tools/perf/trace/beauty/waitid_options.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include <sys/types.h> #include <sys/wait.h> diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 28cd6a17491b..6936daf89ddd 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -139,6 +139,7 @@ static int arch__associate_ins_ops(struct arch* arch, const char *name, struct i #include 
"arch/x86/annotate/instructions.c" #include "arch/powerpc/annotate/instructions.c" #include "arch/s390/annotate/instructions.c" +#include "arch/sparc/annotate/instructions.c" static struct arch architectures[] = { { @@ -170,6 +171,13 @@ static struct arch architectures[] = { .comment_char = '#', }, }, + { + .name = "sparc", + .init = sparc__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, }; static void ins__delete(struct ins_operands *ops) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index c4617bcfd521..72d5ba2479bf 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -962,16 +962,23 @@ s64 perf_event__process_auxtrace(struct perf_session *session, #define PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ 64 #define PERF_ITRACE_MAX_LAST_BRANCH_SZ 1024 -void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts) +void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts, + bool no_sample) { - synth_opts->instructions = true; synth_opts->branches = true; synth_opts->transactions = true; synth_opts->ptwrites = true; synth_opts->pwr_events = true; synth_opts->errors = true; - synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE; - synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; + if (no_sample) { + synth_opts->period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS; + synth_opts->period = 1; + synth_opts->calls = true; + } else { + synth_opts->instructions = true; + synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE; + synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; + } synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ; synth_opts->initial_skip = 0; @@ -999,7 +1006,7 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, } if (!str) { - itrace_synth_opts__set_default(synth_opts); + itrace_synth_opts__set_default(synth_opts, false); return 0; } diff --git a/tools/perf/util/auxtrace.h 
b/tools/perf/util/auxtrace.h index d88f6e9eb461..8e50f96d4b23 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -58,6 +58,7 @@ enum itrace_period_type { /** * struct itrace_synth_opts - AUX area tracing synthesis options. * @set: indicates whether or not options have been set + * @default_no_sample: Default to no sampling. * @inject: indicates the event (not just the sample) must be fully synthesized * because 'perf inject' will write it out * @instructions: whether to synthesize 'instructions' events @@ -82,6 +83,7 @@ enum itrace_period_type { */ struct itrace_synth_opts { bool set; + bool default_no_sample; bool inject; bool instructions; bool branches; @@ -528,7 +530,8 @@ int perf_event__process_auxtrace_error(struct perf_session *session, union perf_event *event); int itrace_parse_synth_opts(const struct option *opt, const char *str, int unset); -void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts); +void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts, + bool no_sample); size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp); void perf_session__auxtrace_error_inc(struct perf_session *session, diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 2ae640257fdb..73430b73570d 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -244,6 +244,27 @@ static void cs_etm__free(struct perf_session *session) zfree(&aux); } +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) +{ + struct machine *machine; + + machine = etmq->etm->machine; + + if (address >= etmq->etm->kernel_start) { + if (machine__is_host(machine)) + return PERF_RECORD_MISC_KERNEL; + else + return PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (machine__is_host(machine)) + return PERF_RECORD_MISC_USER; + else if (perf_guest) + return PERF_RECORD_MISC_GUEST_USER; + else + return PERF_RECORD_MISC_HYPERVISOR; + } +} + static u32 cs_etm__mem_access(struct cs_etm_queue 
*etmq, u64 address, size_t size, u8 *buffer) { @@ -258,10 +279,7 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u64 address, return -1; machine = etmq->etm->machine; - if (address >= etmq->etm->kernel_start) - cpumode = PERF_RECORD_MISC_KERNEL; - else - cpumode = PERF_RECORD_MISC_USER; + cpumode = cs_etm__cpu_mode(etmq, address); thread = etmq->thread; if (!thread) { @@ -653,7 +671,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, struct perf_sample sample = {.ip = 0,}; event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr); event->sample.header.size = sizeof(struct perf_event_header); sample.ip = addr; @@ -665,7 +683,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, sample.cpu = etmq->packet->cpu; sample.flags = 0; sample.insn_len = 1; - sample.cpumode = event->header.misc; + sample.cpumode = event->sample.header.misc; if (etm->synth_opts.last_branch) { cs_etm__copy_last_branch_rb(etmq); @@ -706,12 +724,15 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq) u64 nr; struct branch_entry entries; } dummy_bs; + u64 ip; + + ip = cs_etm__last_executed_instr(etmq->prev_packet); event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip); event->sample.header.size = sizeof(struct perf_event_header); - sample.ip = cs_etm__last_executed_instr(etmq->prev_packet); + sample.ip = ip; sample.pid = etmq->pid; sample.tid = etmq->tid; sample.addr = cs_etm__first_executed_instr(etmq->packet); @@ -720,7 +741,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq) sample.period = 1; sample.cpu = etmq->packet->cpu; sample.flags = 0; - sample.cpumode = PERF_RECORD_MISC_USER; + sample.cpumode = event->sample.header.misc; /* * perf report cannot handle events without a branch stack @@ -1432,7 
+1453,8 @@ int cs_etm__process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { etm->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(&etm->synth_opts); + itrace_synth_opts__set_default(&etm->synth_opts, + session->itrace_synth_opts->default_no_sample); etm->synth_opts.callchain = false; } diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 1f3ccc368530..d01b8355f4ca 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -63,6 +63,7 @@ struct perf_env { struct numa_node *numa_nodes; struct memory_node *memory_nodes; unsigned long long memory_bsize; + u64 clockid_res_ns; }; extern struct perf_env perf_env; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index bc646185f8d9..e9c108a6b1c3 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -308,6 +308,7 @@ static int perf_event__synthesize_fork(struct perf_tool *tool, event->fork.pid = tgid; event->fork.tid = pid; event->fork.header.type = PERF_RECORD_FORK; + event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC; event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index be440df29615..e88e6f9b1463 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -358,7 +358,7 @@ void perf_evlist__disable(struct perf_evlist *evlist) struct perf_evsel *pos; evlist__for_each_entry(evlist, pos) { - if (!perf_evsel__is_group_leader(pos) || !pos->fd) + if (pos->disabled || !perf_evsel__is_group_leader(pos) || !pos->fd) continue; perf_evsel__disable(pos); } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 29d7b97f66fb..6d187059a373 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -232,6 +232,7 @@ void perf_evsel__init(struct perf_evsel *evsel, evsel->leader = evsel; evsel->unit = ""; evsel->scale = 1.0; + evsel->max_events = ULONG_MAX; evsel->evlist 
= NULL; evsel->bpf_fd = -1; INIT_LIST_HEAD(&evsel->node); @@ -793,6 +794,9 @@ static void apply_config_terms(struct perf_evsel *evsel, case PERF_EVSEL__CONFIG_TERM_MAX_STACK: max_stack = term->val.max_stack; break; + case PERF_EVSEL__CONFIG_TERM_MAX_EVENTS: + evsel->max_events = term->val.max_events; + break; case PERF_EVSEL__CONFIG_TERM_INHERIT: /* * attr->inherit should has already been set by @@ -1203,16 +1207,27 @@ int perf_evsel__append_addr_filter(struct perf_evsel *evsel, const char *filter) int perf_evsel__enable(struct perf_evsel *evsel) { - return perf_evsel__run_ioctl(evsel, - PERF_EVENT_IOC_ENABLE, - 0); + int err = perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_ENABLE, 0); + + if (!err) + evsel->disabled = false; + + return err; } int perf_evsel__disable(struct perf_evsel *evsel) { - return perf_evsel__run_ioctl(evsel, - PERF_EVENT_IOC_DISABLE, - 0); + int err = perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_DISABLE, 0); + /* + * We mark it disabled here so that tools that disable a event can + * ignore events after they disable it. I.e. the ring buffer may have + * already a few more events queued up before the kernel got the stop + * request. 
+ */ + if (!err) + evsel->disabled = true; + + return err; } int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4107c39f4a54..3147ca76c6fc 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -46,6 +46,7 @@ enum term_type { PERF_EVSEL__CONFIG_TERM_STACK_USER, PERF_EVSEL__CONFIG_TERM_INHERIT, PERF_EVSEL__CONFIG_TERM_MAX_STACK, + PERF_EVSEL__CONFIG_TERM_MAX_EVENTS, PERF_EVSEL__CONFIG_TERM_OVERWRITE, PERF_EVSEL__CONFIG_TERM_DRV_CFG, PERF_EVSEL__CONFIG_TERM_BRANCH, @@ -65,6 +66,7 @@ struct perf_evsel_config_term { bool inherit; bool overwrite; char *branch; + unsigned long max_events; } val; bool weak; }; @@ -99,6 +101,8 @@ struct perf_evsel { struct perf_counts *prev_raw_counts; int idx; u32 ids; + unsigned long max_events; + unsigned long nr_events_printed; char *name; double scale; const char *unit; @@ -119,6 +123,7 @@ struct perf_evsel { bool snapshot; bool supported; bool needs_swap; + bool disabled; bool no_aux_samples; bool immediate; bool system_wide; diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index de322d51c7fe..b72440bf9a79 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -29,6 +29,12 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent #elif defined(__powerpc__) #define GEN_ELF_ARCH EM_PPC #define GEN_ELF_CLASS ELFCLASS32 +#elif defined(__sparc__) && defined(__arch64__) +#define GEN_ELF_ARCH EM_SPARCV9 +#define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__sparc__) +#define GEN_ELF_ARCH EM_SPARC +#define GEN_ELF_CLASS ELFCLASS32 #else #error "unsupported architecture" #endif diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 1ec1d9bc2d63..4fd45be95a43 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1034,6 +1034,13 @@ static int write_auxtrace(struct feat_fd *ff, return err; } +static int write_clockid(struct feat_fd *ff, + struct 
perf_evlist *evlist __maybe_unused) +{ + return do_write(ff, &ff->ph->env.clockid_res_ns, + sizeof(ff->ph->env.clockid_res_ns)); +} + static int cpu_cache_level__sort(const void *a, const void *b) { struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a; @@ -1508,6 +1515,12 @@ static void print_cpu_topology(struct feat_fd *ff, FILE *fp) fprintf(fp, "# Core ID and Socket ID information is not available\n"); } +static void print_clockid(struct feat_fd *ff, FILE *fp) +{ + fprintf(fp, "# clockid frequency: %"PRIu64" MHz\n", + ff->ph->env.clockid_res_ns * 1000); +} + static void free_event_desc(struct perf_evsel *events) { struct perf_evsel *evsel; @@ -2531,6 +2544,15 @@ out: return ret; } +static int process_clockid(struct feat_fd *ff, + void *data __maybe_unused) +{ + if (do_read_u64(ff, &ff->ph->env.clockid_res_ns)) + return -1; + + return 0; +} + struct feature_ops { int (*write)(struct feat_fd *ff, struct perf_evlist *evlist); void (*print)(struct feat_fd *ff, FILE *fp); @@ -2590,6 +2612,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPN(CACHE, cache, true), FEAT_OPR(SAMPLE_TIME, sample_time, false), FEAT_OPR(MEM_TOPOLOGY, mem_topology, true), + FEAT_OPR(CLOCKID, clockid, false) }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index e17903caa71d..0d553ddca0a3 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -38,6 +38,7 @@ enum { HEADER_CACHE, HEADER_SAMPLE_TIME, HEADER_MEM_TOPOLOGY, + HEADER_CLOCKID, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 7f0c83b6332b..7b27d77306c2 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -269,6 +269,13 @@ static int intel_bts_do_fix_overlap(struct auxtrace_queue *queue, return 0; } +static inline u8 intel_bts_cpumode(struct intel_bts *bts, uint64_t ip) +{ + return machine__kernel_ip(bts->machine, ip) ? 
+ PERF_RECORD_MISC_KERNEL : + PERF_RECORD_MISC_USER; +} + static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, struct branch *branch) { @@ -281,12 +288,8 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, bts->num_events++ <= bts->synth_opts.initial_skip) return 0; - event.sample.header.type = PERF_RECORD_SAMPLE; - event.sample.header.misc = PERF_RECORD_MISC_USER; - event.sample.header.size = sizeof(struct perf_event_header); - - sample.cpumode = PERF_RECORD_MISC_USER; sample.ip = le64_to_cpu(branch->from); + sample.cpumode = intel_bts_cpumode(bts, sample.ip); sample.pid = btsq->pid; sample.tid = btsq->tid; sample.addr = le64_to_cpu(branch->to); @@ -298,6 +301,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, sample.insn_len = btsq->intel_pt_insn.length; memcpy(sample.insn, btsq->intel_pt_insn.buf, INTEL_PT_INSN_BUF_SZ); + event.sample.header.type = PERF_RECORD_SAMPLE; + event.sample.header.misc = sample.cpumode; + event.sample.header.size = sizeof(struct perf_event_header); + if (bts->synth_opts.inject) { event.sample.header.size = bts->branches_event_size; ret = perf_event__synthesize_sample(&event, @@ -910,7 +917,8 @@ int intel_bts_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { bts->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(&bts->synth_opts); + itrace_synth_opts__set_default(&bts->synth_opts, + session->itrace_synth_opts->default_no_sample); if (session->itrace_synth_opts) bts->synth_opts.thread_stack = session->itrace_synth_opts->thread_stack; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 48c1d415c6b0..86cc9a64e982 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -407,6 +407,13 @@ intel_pt_cache_lookup(struct dso *dso, struct machine *machine, u64 offset) return auxtrace_cache__lookup(dso->auxtrace_cache, offset); } +static inline 
u8 intel_pt_cpumode(struct intel_pt *pt, uint64_t ip) +{ + return ip >= pt->kernel_start ? + PERF_RECORD_MISC_KERNEL : + PERF_RECORD_MISC_USER; +} + static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, uint64_t *insn_cnt_ptr, uint64_t *ip, uint64_t to_ip, uint64_t max_insn_cnt, @@ -429,10 +436,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, if (to_ip && *ip == to_ip) goto out_no_cache; - if (*ip >= ptq->pt->kernel_start) - cpumode = PERF_RECORD_MISC_KERNEL; - else - cpumode = PERF_RECORD_MISC_USER; + cpumode = intel_pt_cpumode(ptq->pt, *ip); thread = ptq->thread; if (!thread) { @@ -759,7 +763,8 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, if (pt->synth_opts.callchain) { size_t sz = sizeof(struct ip_callchain); - sz += pt->synth_opts.callchain_sz * sizeof(u64); + /* Add 1 to callchain_sz for callchain context */ + sz += (pt->synth_opts.callchain_sz + 1) * sizeof(u64); ptq->chain = zalloc(sz); if (!ptq->chain) goto out_free; @@ -1058,15 +1063,11 @@ static void intel_pt_prep_b_sample(struct intel_pt *pt, union perf_event *event, struct perf_sample *sample) { - event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = PERF_RECORD_MISC_USER; - event->sample.header.size = sizeof(struct perf_event_header); - if (!pt->timeless_decoding) sample->time = tsc_to_perf_time(ptq->timestamp, &pt->tc); - sample->cpumode = PERF_RECORD_MISC_USER; sample->ip = ptq->state->from_ip; + sample->cpumode = intel_pt_cpumode(pt, sample->ip); sample->pid = ptq->pid; sample->tid = ptq->tid; sample->addr = ptq->state->to_ip; @@ -1075,6 +1076,10 @@ static void intel_pt_prep_b_sample(struct intel_pt *pt, sample->flags = ptq->flags; sample->insn_len = ptq->insn_len; memcpy(sample->insn, ptq->insn, INTEL_PT_INSN_BUF_SZ); + + event->sample.header.type = PERF_RECORD_SAMPLE; + event->sample.header.misc = sample->cpumode; + event->sample.header.size = sizeof(struct perf_event_header); } static int 
intel_pt_inject_event(union perf_event *event, @@ -1160,7 +1165,8 @@ static void intel_pt_prep_sample(struct intel_pt *pt, if (pt->synth_opts.callchain) { thread_stack__sample(ptq->thread, ptq->chain, - pt->synth_opts.callchain_sz, sample->ip); + pt->synth_opts.callchain_sz + 1, + sample->ip, pt->kernel_start); sample->callchain = ptq->chain; } @@ -2559,7 +2565,8 @@ int intel_pt_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { pt->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(&pt->synth_opts); + itrace_synth_opts__set_default(&pt->synth_opts, + session->itrace_synth_opts->default_no_sample); if (use_browser != -1) { pt->synth_opts.branches = false; pt->synth_opts.callchain = true; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 111ae858cbcb..8f36ce813bc5 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1708,6 +1708,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event struct thread *parent = machine__findnew_thread(machine, event->fork.ppid, event->fork.ptid); + bool do_maps_clone = true; int err = 0; if (dump_trace) @@ -1736,9 +1737,25 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); + /* + * When synthesizing FORK events, we are trying to create thread + * objects for the already running tasks on the machine. + * + * Normally, for a kernel FORK event, we want to clone the parent's + * maps because that is what the kernel just did. + * + * But when synthesizing, this should not be done. If we do, we end up + * with overlapping maps as we process the sythesized MMAP2 events that + * get delivered shortly thereafter. + * + * Use the FORK event misc flags in an internal way to signal this + * situation, so we can elide the map clone when appropriate. 
+ */ + if (event->fork.header.misc & PERF_RECORD_MISC_FORK_EXEC) + do_maps_clone = false; if (thread == NULL || parent == NULL || - thread__fork(thread, parent, sample->time) < 0) { + thread__fork(thread, parent, sample->time, do_maps_clone) < 0) { dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); err = -1; } @@ -2140,6 +2157,27 @@ static int resolve_lbr_callchain_sample(struct thread *thread, return 0; } +static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread, + struct callchain_cursor *cursor, + struct symbol **parent, + struct addr_location *root_al, + u8 *cpumode, int ent) +{ + int err = 0; + + while (--ent >= 0) { + u64 ip = chain->ips[ent]; + + if (ip >= PERF_CONTEXT_MAX) { + err = add_callchain_ip(thread, cursor, parent, + root_al, cpumode, ip, + false, NULL, NULL, 0); + break; + } + } + return err; +} + static int thread__resolve_callchain_sample(struct thread *thread, struct callchain_cursor *cursor, struct perf_evsel *evsel, @@ -2246,6 +2284,12 @@ static int thread__resolve_callchain_sample(struct thread *thread, } check_calls: + if (callchain_param.order != ORDER_CALLEE) { + err = find_prev_cpumode(chain, thread, cursor, parent, root_al, + &cpumode, chain->nr - first_call); + if (err) + return (err < 0) ? err : 0; + } for (i = first_call, nr_entries = 0; i < chain_nr && nr_entries < max_stack; i++) { u64 ip; @@ -2260,9 +2304,15 @@ check_calls: continue; #endif ip = chain->ips[j]; - if (ip < PERF_CONTEXT_MAX) ++nr_entries; + else if (callchain_param.order != ORDER_CALLEE) { + err = find_prev_cpumode(chain, thread, cursor, parent, + root_al, &cpumode, j); + if (err) + return (err < 0) ? 
err : 0; + continue; + } err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f8cd3e7c9186..59be3466d64d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -926,6 +926,7 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_NOINHERIT] = "no-inherit", [PARSE_EVENTS__TERM_TYPE_INHERIT] = "inherit", [PARSE_EVENTS__TERM_TYPE_MAX_STACK] = "max-stack", + [PARSE_EVENTS__TERM_TYPE_MAX_EVENTS] = "nr", [PARSE_EVENTS__TERM_TYPE_OVERWRITE] = "overwrite", [PARSE_EVENTS__TERM_TYPE_NOOVERWRITE] = "no-overwrite", [PARSE_EVENTS__TERM_TYPE_DRV_CFG] = "driver-config", @@ -1037,6 +1038,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_MAX_STACK: CHECK_TYPE_VAL(NUM); break; + case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: + CHECK_TYPE_VAL(NUM); + break; default: err->str = strdup("unknown term"); err->idx = term->err_term; @@ -1084,6 +1088,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr, case PARSE_EVENTS__TERM_TYPE_INHERIT: case PARSE_EVENTS__TERM_TYPE_NOINHERIT: case PARSE_EVENTS__TERM_TYPE_MAX_STACK: + case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: case PARSE_EVENTS__TERM_TYPE_OVERWRITE: case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: return config_term_common(attr, term, err); @@ -1162,6 +1167,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_MAX_STACK: ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num); break; + case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: + ADD_CONFIG_TERM(MAX_EVENTS, max_events, term->val.num); + break; case PARSE_EVENTS__TERM_TYPE_OVERWRITE: ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 
1 : 0); break; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 4473dac27aee..5ed035cbcbb7 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -71,6 +71,7 @@ enum { PARSE_EVENTS__TERM_TYPE_NOINHERIT, PARSE_EVENTS__TERM_TYPE_INHERIT, PARSE_EVENTS__TERM_TYPE_MAX_STACK, + PARSE_EVENTS__TERM_TYPE_MAX_EVENTS, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE, PARSE_EVENTS__TERM_TYPE_OVERWRITE, PARSE_EVENTS__TERM_TYPE_DRV_CFG, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 5f761f3ed0f3..7805c71aaae2 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -269,6 +269,7 @@ time { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_TIME); } call-graph { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CALLGRAPH); } stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); } max-stack { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_STACK); } +nr { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_EVENTS); } inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); } no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); } overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_OVERWRITE); } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 0281d5e2cd67..66a84d5846c8 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -324,7 +324,17 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss) plt_entry_size = 16; break; - default: /* FIXME: s390/alpha/mips/parisc/poperpc/sh/sparc/xtensa need to be checked */ + case EM_SPARC: + plt_header_size = 48; + plt_entry_size = 12; + break; + + case EM_SPARCV9: + plt_header_size = 128; + plt_entry_size = 32; + break; + + default: /* FIXME: s390/alpha/mips/parisc/poperpc/sh/xtensa need to be checked */ plt_header_size = shdr_plt.sh_entsize; plt_entry_size = shdr_plt.sh_entsize; break; diff --git 
a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 20f49779116b..d026d215bdc6 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -123,7 +123,8 @@ struct symbol_conf { const char *vmlinux_name, *kallsyms_name, *source_prefix, - *field_sep; + *field_sep, + *graph_function; const char *default_guest_vmlinux_name, *default_guest_kallsyms, *default_guest_modules; diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index c091635bf7dc..61a4286a74dc 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -310,20 +310,46 @@ void thread_stack__free(struct thread *thread) } } +static inline u64 callchain_context(u64 ip, u64 kernel_start) +{ + return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL; +} + void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, - size_t sz, u64 ip) + size_t sz, u64 ip, u64 kernel_start) { - size_t i; + u64 context = callchain_context(ip, kernel_start); + u64 last_context; + size_t i, j; - if (!thread || !thread->ts) - chain->nr = 1; - else - chain->nr = min(sz, thread->ts->cnt + 1); + if (sz < 2) { + chain->nr = 0; + return; + } - chain->ips[0] = ip; + chain->ips[0] = context; + chain->ips[1] = ip; + + if (!thread || !thread->ts) { + chain->nr = 2; + return; + } + + last_context = context; + + for (i = 2, j = 1; i < sz && j <= thread->ts->cnt; i++, j++) { + ip = thread->ts->stack[thread->ts->cnt - j].ret_addr; + context = callchain_context(ip, kernel_start); + if (context != last_context) { + if (i >= sz - 1) + break; + chain->ips[i++] = context; + last_context = context; + } + chain->ips[i] = ip; + } - for (i = 1; i < chain->nr; i++) - chain->ips[i] = thread->ts->stack[thread->ts->cnt - i].ret_addr; + chain->nr = i; } struct call_return_processor * diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index b7e41c4ebfdd..f97c00a8c251 100644 --- a/tools/perf/util/thread-stack.h +++ 
b/tools/perf/util/thread-stack.h @@ -84,7 +84,7 @@ int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip, u64 to_ip, u16 insn_len, u64 trace_nr); void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr); void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, - size_t sz, u64 ip); + size_t sz, u64 ip, u64 kernel_start); int thread_stack__flush(struct thread *thread); void thread_stack__free(struct thread *thread); size_t thread_stack__depth(struct thread *thread); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 2048d393ece6..3d9ed7d0e281 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -330,7 +330,8 @@ static int thread__prepare_access(struct thread *thread) } static int thread__clone_map_groups(struct thread *thread, - struct thread *parent) + struct thread *parent, + bool do_maps_clone) { /* This is new thread, we share map groups for process. */ if (thread->pid_ == parent->pid_) @@ -341,15 +342,11 @@ static int thread__clone_map_groups(struct thread *thread, thread->pid_, thread->tid, parent->pid_, parent->tid); return 0; } - /* But this one is new process, copy maps. */ - if (map_groups__clone(thread, parent->mg) < 0) - return -ENOMEM; - - return 0; + return do_maps_clone ? 
map_groups__clone(thread, parent->mg) : 0; } -int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp) +int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone) { if (parent->comm_set) { const char *comm = thread__comm_str(parent); @@ -362,7 +359,7 @@ int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp) } thread->ppid = parent->tid; - return thread__clone_map_groups(thread, parent); + return thread__clone_map_groups(thread, parent, do_maps_clone); } void thread__find_cpumode_addr_location(struct thread *thread, u64 addr, diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 07606aa6998d..30e2b4c165fe 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -42,6 +42,8 @@ struct thread { void *addr_space; struct unwind_libunwind_ops *unwind_libunwind_ops; #endif + bool filter; + int filter_entry_depth; }; struct machine; @@ -87,7 +89,7 @@ struct comm *thread__comm(const struct thread *thread); struct comm *thread__exec_comm(const struct thread *thread); const char *thread__comm_str(const struct thread *thread); int thread__insert_map(struct thread *thread, struct map *map); -int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp); +int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone); size_t thread__fprintf(struct thread *thread, FILE *fp); struct thread *thread__main_thread(struct machine *machine, struct thread *thread); diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 6f318b15950e..5eff9bfc5758 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -45,13 +45,13 @@ static int __report_module(struct addr_location *al, u64 ip, Dwarf_Addr s; dwfl_module_info(mod, NULL, &s, NULL, NULL, NULL, NULL, NULL); - if (s != al->map->start) + if (s != al->map->start - al->map->pgoff) mod = 0; } if (!mod) mod = 
dwfl_report_elf(ui->dwfl, dso->short_name, - (dso->symsrc_filename ? dso->symsrc_filename : dso->long_name), -1, al->map->start, + (dso->symsrc_filename ? dso->symsrc_filename : dso->long_name), -1, al->map->start - al->map->pgoff, false); return mod && dwfl_addrmodule(ui->dwfl, ip) == mod ? 0 : -1; diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index ede4d3dae750..689f6c8ebcd8 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -1,12 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_PROGS := cache_shape - -all: $(TEST_PROGS) - -$(TEST_PROGS): ../harness.c ../utils.c +TEST_GEN_PROGS := cache_shape top_srcdir = ../../../../.. include ../../lib.mk -clean: - rm -f $(TEST_PROGS) *.o +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index bd5dfa509272..23f4caf48ffc 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -5,6 +5,9 @@ noarg: # The EBB handler is 64-bit code and everything links against it CFLAGS += -m64 +# Toolchains may build PIE by default which breaks the assembly +LDFLAGS += -no-pie + TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ cycles_with_freeze_test pmc56_overflow_test \ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \ diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 9b35ca8e8f13..8d3f006c98cc 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ +TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \ 
ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \ perf-hwbreak ptrace-syscall @@ -7,14 +7,9 @@ TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ top_srcdir = ../../../../.. include ../../lib.mk -all: $(TEST_PROGS) - CFLAGS += -m64 -I../../../../../usr/include -I../tm -mhtm -fno-pie -ptrace-pkey core-pkey: child.h -ptrace-pkey core-pkey: LDLIBS += -pthread - -$(TEST_PROGS): ../harness.c ../utils.c ../lib/reg.S ptrace.h +$(OUTPUT)/ptrace-pkey $(OUTPUT)/core-pkey: child.h +$(OUTPUT)/ptrace-pkey $(OUTPUT)/core-pkey: LDLIBS += -pthread -clean: - rm -f $(TEST_PROGS) *.o +$(TEST_GEN_PROGS): ../harness.c ../utils.c ../lib/reg.S ptrace.h diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c index 327fa943c7f3..dbdffa2e2c82 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c @@ -67,8 +67,8 @@ trans: "3: ;" : [res] "=r" (result), [texasr] "=r" (texasr) : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [gpr_4]"i"(GPR_4), - [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "r" (&a), - [flt_2] "r" (&b), [flt_4] "r" (&d) + [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a), + [flt_4] "b" (&d) : "memory", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index 44690f1bb26a..85861c46b445 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0+ TEST_GEN_PROGS := rfi_flush +top_srcdir = ../../../../.. 
CFLAGS += -I../../../../../usr/include diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c index 564ed45bbf73..0a7d0afb26b8 100644 --- a/tools/testing/selftests/powerpc/security/rfi_flush.c +++ b/tools/testing/selftests/powerpc/security/rfi_flush.c @@ -49,6 +49,7 @@ int rfi_flush_test(void) struct perf_event_read v; __u64 l1d_misses_total = 0; unsigned long iterations = 100000, zero_size = 24 * 1024; + unsigned long l1d_misses_expected; int rfi_flush_org, rfi_flush; SKIP_IF(geteuid() != 0); @@ -71,6 +72,12 @@ int rfi_flush_test(void) iter = repetitions; + /* + * We expect to see l1d miss for each cacheline access when rfi_flush + * is set. Allow a small variation on this. + */ + l1d_misses_expected = iterations * (zero_size / CACHELINE_SIZE - 2); + again: FAIL_IF(perf_event_reset(fd)); @@ -78,10 +85,9 @@ again: FAIL_IF(read(fd, &v, sizeof(v)) != sizeof(v)); - /* Expect at least zero_size/CACHELINE_SIZE misses per iteration */ - if (v.l1d_misses >= (iterations * zero_size / CACHELINE_SIZE) && rfi_flush) + if (rfi_flush && v.l1d_misses >= l1d_misses_expected) passes++; - else if (v.l1d_misses < iterations && !rfi_flush) + else if (!rfi_flush && v.l1d_misses < (l1d_misses_expected / 2)) passes++; l1d_misses_total += v.l1d_misses; @@ -92,13 +98,15 @@ again: if (passes < repetitions) { printf("FAIL (L1D misses with rfi_flush=%d: %llu %c %lu) [%d/%d failures]\n", rfi_flush, l1d_misses_total, rfi_flush ? '<' : '>', - rfi_flush ? (repetitions * iterations * zero_size / CACHELINE_SIZE) : iterations, + rfi_flush ? repetitions * l1d_misses_expected : + repetitions * l1d_misses_expected / 2, repetitions - passes, repetitions); rc = 1; } else printf("PASS (L1D misses with rfi_flush=%d: %llu %c %lu) [%d/%d pass]\n", rfi_flush, l1d_misses_total, rfi_flush ? '>' : '<', - rfi_flush ? (repetitions * iterations * zero_size / CACHELINE_SIZE) : iterations, + rfi_flush ? 
repetitions * l1d_misses_expected : + repetitions * l1d_misses_expected / 2, passes, repetitions); if (rfi_flush == rfi_flush_org) { diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 1fca25c6ace0..209a958dca12 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,15 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_PROGS := signal signal_tm - -all: $(TEST_PROGS) - -$(TEST_PROGS): ../harness.c ../utils.c signal.S +TEST_GEN_PROGS := signal signal_tm CFLAGS += -maltivec -signal_tm: CFLAGS += -mhtm +$(OUTPUT)/signal_tm: CFLAGS += -mhtm top_srcdir = ../../../../.. include ../../lib.mk -clean: - rm -f $(TEST_PROGS) *.o +$(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index fcd2dcb8972b..bdc081afedb0 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -8,6 +8,7 @@ EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. 
include ../../lib.mk +$(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S $(OUTPUT)/check-reversed.o: $(OUTPUT)/check.o diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c index 43c342845be0..ed62f4153d3e 100644 --- a/tools/testing/selftests/powerpc/utils.c +++ b/tools/testing/selftests/powerpc/utils.c @@ -25,7 +25,6 @@ #include "utils.h" static char auxv[4096]; -extern unsigned int dscr_insn[]; int read_auxv(char *buf, ssize_t buf_size) { @@ -247,7 +246,8 @@ static void sigill_handler(int signr, siginfo_t *info, void *unused) ucontext_t *ctx = (ucontext_t *)unused; unsigned long *pc = &UCONTEXT_NIA(ctx); - if (*pc == (unsigned long)&dscr_insn) { + /* mtspr 3,RS to check for move to DSCR below */ + if ((*((unsigned int *)*pc) & 0xfc1fffff) == 0x7c0303a6) { if (!warned++) printf("WARNING: Skipping over dscr setup. Consider running 'ppc64_cpu --dscr=1' manually.\n"); *pc += 4; @@ -271,5 +271,5 @@ void set_dscr(unsigned long val) init = 1; } - asm volatile("dscr_insn: mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR)); + asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR)); } |