diff options
1306 files changed, 17778 insertions, 13630 deletions
@@ -25,6 +25,8 @@ Aleksey Gorelov <aleksey_gorelov@phoenix.com> Alexander Lobakin <alobakin@pm.me> <alobakin@dlink.ru> Alexander Lobakin <alobakin@pm.me> <alobakin@marvell.com> Alexander Lobakin <alobakin@pm.me> <bloodyreaper@yandex.ru> +Alexander Mikhalitsyn <alexander@mihalicyn.com> <alexander.mikhalitsyn@virtuozzo.com> +Alexander Mikhalitsyn <alexander@mihalicyn.com> <aleksandr.mikhalitsyn@canonical.com> Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com> Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com> Alexei Starovoitov <ast@kernel.org> <ast@fb.com> @@ -130,6 +132,7 @@ Domen Puncer <domen@coderock.org> Douglas Gilbert <dougg@torque.net> Ed L. Cashin <ecashin@coraid.com> Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com> +Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com> Evgeniy Polyakov <johnpol@2ka.mipt.ru> Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com> Felipe W Damasio <felipewd@terra.com.br> @@ -214,6 +217,7 @@ Jisheng Zhang <jszhang@kernel.org> <jszhang@marvell.com> Jisheng Zhang <jszhang@kernel.org> <Jisheng.Zhang@synaptics.com> Johan Hovold <johan@kernel.org> <jhovold@gmail.com> Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com> +John Crispin <john@phrozen.org> <blogic@openwrt.org> John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> John Stultz <johnstul@us.ibm.com> Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org> @@ -1173,6 +1173,10 @@ D: Future Domain TMC-16x0 SCSI driver (author) D: APM driver (early port) D: DRM drivers (author of several) +N: Veaceslav Falico +E: vfalico@gmail.com +D: Co-maintainer and co-author of the network bonding driver. 
+ N: János Farkas E: chexum@shadow.banki.hu D: romfs, various (mostly networking) fixes @@ -2489,6 +2493,13 @@ D: XF86_Mach8 D: XF86_8514 D: cfdisk (curses based disk partitioning program) +N: Mat Martineau +E: mat@martineau.name +D: MPTCP subsystem co-maintainer 2020-2023 +D: Keyctl restricted keyring and Diffie-Hellman UAPI +D: Bluetooth L2CAP ERTM mode and AMP +S: USA + N: John S. Marvin E: jsm@fc.hp.com D: PA-RISC port @@ -4172,6 +4183,10 @@ S: B-1206 Jingmao Guojigongyu S: 16 Baliqiao Nanjie, Beijing 101100 S: People's Republic of China +N: Vlad Yasevich +E: vyasevich@gmail.com +D: SCTP protocol maintainer. + N: Aviad Yehezkel E: aviadye@nvidia.com D: Kernel TLS implementation and offload support. diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs index bb4681a01811..284224d1b56f 100644 --- a/Documentation/ABI/testing/sysfs-fs-erofs +++ b/Documentation/ABI/testing/sysfs-fs-erofs @@ -4,7 +4,8 @@ Contact: "Huang Jianan" <huangjianan@oppo.com> Description: Shows all enabled kernel features. Supported features: zero_padding, compr_cfgs, big_pcluster, chunked_file, - device_table, compr_head2, sb_chksum. + device_table, compr_head2, sb_chksum, ztailpacking, + dedupe, fragments. What: /sys/fs/erofs/<disk>/sync_decompress Date: November 2021 diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c8ae7c897f14..74cec76be9f2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a string which contains the number of bytes to - reclaim. + This file accepts a single key, the number of bytes to reclaim. + No nested keys are currently supported. Example:: echo "1G" > memory.reclaim + The interface can be later extended with nested keys to + configure the reclaim behavior. 
For example, specify the + type of memory to reclaim from (anon, file, ..). + Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. - This file also allows the user to specify the nodes to reclaim from, - via the 'nodes=' key, for example:: - - echo "1G nodes=0,1" > memory.reclaim - - The above instructs the kernel to reclaim memory from nodes 0,1. - memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst b/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst new file mode 100644 index 000000000000..875616d675fe --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst @@ -0,0 +1,91 @@ + +.. SPDX-License-Identifier: GPL-2.0 + +Cross-Thread Return Address Predictions +======================================= + +Certain AMD and Hygon processors are subject to a cross-thread return address +predictions vulnerability. When running in SMT mode and one sibling thread +transitions out of C0 state, the other sibling thread could use return target +predictions from the sibling thread that transitioned out of C0. + +The Spectre v2 mitigations protect the Linux kernel, as it fills the return +address prediction entries with safe targets when context switching to the idle +thread. However, KVM does allow a VMM to prevent exiting guest mode when +transitioning out of C0. This could result in a guest-controlled return target +being consumed by the sibling thread. 
+ +Affected processors +------------------- + +The following CPUs are vulnerable: + + - AMD Family 17h processors + - Hygon Family 18h processors + +Related CVEs +------------ + +The following CVE entry is related to this issue: + + ============== ======================================= + CVE-2022-27672 Cross-Thread Return Address Predictions + ============== ======================================= + +Problem +------- + +Affected SMT-capable processors support 1T and 2T modes of execution when SMT +is enabled. In 2T mode, both threads in a core are executing code. For the +processor core to enter 1T mode, it is required that one of the threads +requests to transition out of the C0 state. This can be communicated with the +HLT instruction or with an MWAIT instruction that requests non-C0. +When the thread re-enters the C0 state, the processor transitions back +to 2T mode, assuming the other thread is also still in C0 state. + +In affected processors, the return address predictor (RAP) is partitioned +depending on the SMT mode. For instance, in 2T mode each thread uses a private +16-entry RAP, but in 1T mode, the active thread uses a 32-entry RAP. Upon +transition between 1T/2T mode, the RAP contents are not modified but the RAP +pointers (which control the next return target to use for predictions) may +change. This behavior may result in return targets from one SMT thread being +used by RET predictions in the sibling thread following a 1T/2T switch. In +particular, a RET instruction executed immediately after a transition to 1T may +use a return target from the thread that just became idle. In theory, this +could lead to information disclosure if the return targets used do not come +from trustworthy code. + +Attack scenarios +---------------- + +An attack can be mounted on affected processors by performing a series of CALL +instructions with targeted return locations and then transitioning out of C0 +state. 
+ +Mitigation mechanism +-------------------- + +Before entering idle state, the kernel context switches to the idle thread. The +context switch fills the RAP entries (referred to as the RSB in Linux) with safe +targets by performing a sequence of CALL instructions. + +Prevent a guest VM from directly putting the processor into an idle state by +intercepting HLT and MWAIT instructions. + +Both mitigations are required to fully address this issue. + +Mitigation control on the kernel command line +--------------------------------------------- + +Use existing Spectre v2 mitigations that will fill the RSB on context switch. + +Mitigation control for KVM - module parameter +--------------------------------------------- + +By default, the KVM hypervisor mitigates this issue by intercepting guest +attempts to transition out of C0. A VMM can use the KVM_CAP_X86_DISABLE_EXITS +capability to override those interceptions, but since this is not common, the +mitigation that covers this path is not enabled by default. + +The mitigation for the KVM_CAP_X86_DISABLE_EXITS capability can be turned on +using the boolean module parameter mitigate_smt_rsb, e.g. ``kvm.mitigate_smt_rsb=1``. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 4df436e7c417..e0614760a99e 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -18,3 +18,4 @@ are configurable at compile, boot or run time. 
core-scheduling.rst l1d_flush.rst processor_mmio_stale_data.rst + cross-thread-rsb.rst diff --git a/Documentation/devicetree/bindings/.gitignore b/Documentation/devicetree/bindings/.gitignore index a77719968a7e..51ddb26d93f0 100644 --- a/Documentation/devicetree/bindings/.gitignore +++ b/Documentation/devicetree/bindings/.gitignore @@ -2,3 +2,8 @@ *.example.dts /processed-schema*.yaml /processed-schema*.json + +# +# We don't want to ignore the following even if they are dot-files +# +!.yamllint diff --git a/Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml b/Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml index c46378efc123..92e899905ef8 100644 --- a/Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml +++ b/Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml @@ -16,7 +16,7 @@ properties: compatible: items: - enum: - - renesas,i2c-r9a09g011 # RZ/V2M + - renesas,r9a09g011-i2c # RZ/V2M - const: renesas,rzv2m-i2c reg: @@ -66,7 +66,7 @@ examples: #include <dt-bindings/interrupt-controller/arm-gic.h> i2c0: i2c@a4030000 { - compatible = "renesas,i2c-r9a09g011", "renesas,rzv2m-i2c"; + compatible = "renesas,r9a09g011-i2c", "renesas,rzv2m-i2c"; reg = <0xa4030000 0x80>; interrupts = <GIC_SPI 232 IRQ_TYPE_EDGE_RISING>, <GIC_SPI 236 IRQ_TYPE_EDGE_RISING>; diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml index 9f7d3e11aacb..8449e14af9f3 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml @@ -108,7 +108,7 @@ properties: msi-controller: description: - Only present if the Message Based Interrupt functionnality is + Only present if the Message Based Interrupt functionality is being exposed by the HW, and the mbi-ranges property present. 
mbi-ranges: diff --git a/Documentation/devicetree/bindings/regulator/samsung,s2mps14.yaml b/Documentation/devicetree/bindings/regulator/samsung,s2mps14.yaml index 01f9d4e236e9..a7feb497eb89 100644 --- a/Documentation/devicetree/bindings/regulator/samsung,s2mps14.yaml +++ b/Documentation/devicetree/bindings/regulator/samsung,s2mps14.yaml @@ -19,8 +19,8 @@ description: | additional information and example. patternProperties: - # 25 LDOs - "^LDO([1-9]|[1][0-9]|2[0-5])$": + # 25 LDOs, without LDO10-12 + "^LDO([1-9]|1[3-9]|2[0-5])$": type: object $ref: regulator.yaml# unevaluatedProperties: false @@ -30,6 +30,23 @@ patternProperties: required: - regulator-name + "^LDO(1[0-2])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for single LDO regulator. + + properties: + samsung,ext-control-gpios: + maxItems: 1 + description: + LDO10, LDO11 and LDO12 can be configured to external control over + GPIO. + + required: + - regulator-name + # 5 bucks "^BUCK[1-5]$": type: object diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index c6720764e765..a2884e3113da 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -83,7 +83,7 @@ properties: insensitive, letters in the riscv,isa string must be all lowercase to simplify parsing. 
$ref: "/schemas/types.yaml#/definitions/string" - pattern: ^rv(?:64|32)imaf?d?q?c?b?v?k?h?(?:_[hsxz](?:[a-z])+)*$ + pattern: ^rv(?:64|32)imaf?d?q?c?b?k?j?p?v?h?(?:[hsxz](?:[a-z])+)?(?:_[hsxz](?:[a-z])+)*$ # RISC-V requires 'timebase-frequency' in /cpus, so disallow it here timebase-frequency: false diff --git a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml index 0a7aa29563c1..21c8ea08ff0a 100644 --- a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml @@ -40,6 +40,8 @@ properties: description: Indicates that the setting of RTC time is allowed by the host CPU. + wakeup-source: true + required: - compatible - reg diff --git a/Documentation/devicetree/bindings/sound/everest,es8326.yaml b/Documentation/devicetree/bindings/sound/everest,es8326.yaml index 07781408e788..07781408e788 100755..100644 --- a/Documentation/devicetree/bindings/sound/everest,es8326.yaml +++ b/Documentation/devicetree/bindings/sound/everest,es8326.yaml diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index ef183387da20..eccd327e6df5 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -1277,8 +1277,8 @@ the file contents themselves, as described below: For the read path (->read_folio()) of regular files, filesystems can read the ciphertext into the page cache and decrypt it in-place. The -page lock must be held until decryption has finished, to prevent the -page from becoming visible to userspace prematurely. +folio lock must be held until decryption has finished, to prevent the +folio from becoming visible to userspace prematurely. 
For the write path (->writepage()) of regular files, filesystems cannot encrypt data in-place in the page cache, since the cached diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index cb8e7573882a..ede672dedf11 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -118,10 +118,11 @@ as follows: - ``hash_algorithm`` must be the identifier for the hash algorithm to use for the Merkle tree, such as FS_VERITY_HASH_ALG_SHA256. See ``include/uapi/linux/fsverity.h`` for the list of possible values. -- ``block_size`` must be the Merkle tree block size. Currently, this - must be equal to the system page size, which is usually 4096 bytes. - Other sizes may be supported in the future. This value is not - necessarily the same as the filesystem block size. +- ``block_size`` is the Merkle tree block size, in bytes. In Linux + v6.3 and later, this can be any power of 2 between (inclusively) + 1024 and the minimum of the system page size and the filesystem + block size. In earlier versions, the page size was the only allowed + value. - ``salt_size`` is the size of the salt in bytes, or 0 if no salt is provided. The salt is a value that is prepended to every hashed block; it can be used to personalize the hashing for a particular @@ -161,6 +162,7 @@ FS_IOC_ENABLE_VERITY can fail with the following errors: - ``EBUSY``: this ioctl is already running on the file - ``EEXIST``: the file already has verity enabled - ``EFAULT``: the caller provided inaccessible memory +- ``EFBIG``: the file is too large to enable verity on - ``EINTR``: the operation was interrupted by a fatal signal - ``EINVAL``: unsupported version, hash algorithm, or block size; or reserved bits are set; or the file descriptor refers to neither a @@ -495,9 +497,11 @@ To create verity files on an ext4 filesystem, the filesystem must have been formatted with ``-O verity`` or had ``tune2fs -O verity`` run on it. 
"verity" is an RO_COMPAT filesystem feature, so once set, old kernels will only be able to mount the filesystem readonly, and old -versions of e2fsck will be unable to check the filesystem. Moreover, -currently ext4 only supports mounting a filesystem with the "verity" -feature when its block size is equal to PAGE_SIZE (often 4096 bytes). +versions of e2fsck will be unable to check the filesystem. + +Originally, an ext4 filesystem with the "verity" feature could only be +mounted when its block size was equal to the system page size +(typically 4096 bytes). In Linux v6.3, this limitation was removed. ext4 sets the EXT4_VERITY_FL on-disk inode flag on verity files. It can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared. @@ -518,9 +522,7 @@ support paging multi-gigabyte xattrs into memory, and to support encrypting xattrs. Note that the verity metadata *must* be encrypted when the file is, since it contains hashes of the plaintext data. -Currently, ext4 verity only supports the case where the Merkle tree -block size, filesystem block size, and page size are all the same. It -also only supports extent-based files. +ext4 only allows verity on extent-based files. f2fs ---- @@ -538,11 +540,10 @@ Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. See explanation for ext4 above. Moreover, f2fs supports at most 4096 bytes of xattr entries per inode -which wouldn't be enough for even a single Merkle tree block. +which usually wouldn't be enough for even a single Merkle tree block. -Currently, f2fs verity only supports a Merkle tree block size of 4096. -Also, f2fs doesn't support enabling verity on files that currently -have atomic or volatile writes pending. +f2fs doesn't support enabling verity on files that currently have +atomic or volatile writes pending. 
btrfs ----- @@ -567,51 +568,48 @@ Pagecache ~~~~~~~~~ For filesystems using Linux's pagecache, the ``->read_folio()`` and -``->readahead()`` methods must be modified to verify pages before they -are marked Uptodate. Merely hooking ``->read_iter()`` would be +``->readahead()`` methods must be modified to verify folios before +they are marked Uptodate. Merely hooking ``->read_iter()`` would be insufficient, since ``->read_iter()`` is not used for memory maps. -Therefore, fs/verity/ provides a function fsverity_verify_page() which -verifies a page that has been read into the pagecache of a verity -inode, but is still locked and not Uptodate, so it's not yet readable -by userspace. As needed to do the verification, -fsverity_verify_page() will call back into the filesystem to read -Merkle tree pages via fsverity_operations::read_merkle_tree_page(). +Therefore, fs/verity/ provides the function fsverity_verify_blocks() +which verifies data that has been read into the pagecache of a verity +inode. The containing folio must still be locked and not Uptodate, so +it's not yet readable by userspace. As needed to do the verification, +fsverity_verify_blocks() will call back into the filesystem to read +hash blocks via fsverity_operations::read_merkle_tree_page(). -fsverity_verify_page() returns false if verification failed; in this -case, the filesystem must not set the page Uptodate. Following this, +fsverity_verify_blocks() returns false if verification failed; in this +case, the filesystem must not set the folio Uptodate. Following this, as per the usual Linux pagecache behavior, attempts by userspace to -read() from the part of the file containing the page will fail with -EIO, and accesses to the page within a memory map will raise SIGBUS. - -fsverity_verify_page() currently only supports the case where the -Merkle tree block size is equal to PAGE_SIZE (often 4096 bytes). 
+read() from the part of the file containing the folio will fail with +EIO, and accesses to the folio within a memory map will raise SIGBUS. -In principle, fsverity_verify_page() verifies the entire path in the -Merkle tree from the data page to the root hash. However, for -efficiency the filesystem may cache the hash pages. Therefore, -fsverity_verify_page() only ascends the tree reading hash pages until -an already-verified hash page is seen, as indicated by the PageChecked -bit being set. It then verifies the path to that page. +In principle, verifying a data block requires verifying the entire +path in the Merkle tree from the data block to the root hash. +However, for efficiency the filesystem may cache the hash blocks. +Therefore, fsverity_verify_blocks() only ascends the tree reading hash +blocks until an already-verified hash block is seen. It then verifies +the path to that block. This optimization, which is also used by dm-verity, results in excellent sequential read performance. This is because usually (e.g. -127 in 128 times for 4K blocks and SHA-256) the hash page from the +127 in 128 times for 4K blocks and SHA-256) the hash block from the bottom level of the tree will already be cached and checked from -reading a previous data page. However, random reads perform worse. +reading a previous data block. However, random reads perform worse. Block device based filesystems ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Block device based filesystems (e.g. ext4 and f2fs) in Linux also use the pagecache, so the above subsection applies too. However, they -also usually read many pages from a file at once, grouped into a +also usually read many data blocks from a file at once, grouped into a structure called a "bio". To make it easier for these types of filesystems to support fs-verity, fs/verity/ also provides a function -fsverity_verify_bio() which verifies all pages in a bio. +fsverity_verify_bio() which verifies all data blocks in a bio. 
ext4 and f2fs also support encryption. If a verity file is also -encrypted, the pages must be decrypted before being verified. To +encrypted, the data must be decrypted before being verified. To support this, these filesystems allocate a "post-read context" for each bio and store it in ``->bi_private``:: @@ -626,14 +624,14 @@ each bio and store it in ``->bi_private``:: verity, or both is enabled. After the bio completes, for each needed postprocessing step the filesystem enqueues the bio_post_read_ctx on a workqueue, and then the workqueue work does the decryption or -verification. Finally, pages where no decryption or verity error -occurred are marked Uptodate, and the pages are unlocked. +verification. Finally, folios where no decryption or verity error +occurred are marked Uptodate, and the folios are unlocked. On many filesystems, files can contain holes. Normally, -``->readahead()`` simply zeroes holes and sets the corresponding pages -Uptodate; no bios are issued. To prevent this case from bypassing -fs-verity, these filesystems use fsverity_verify_page() to verify hole -pages. +``->readahead()`` simply zeroes hole blocks and considers the +corresponding data to be up-to-date; no bios are issued. To prevent +this case from bypassing fs-verity, filesystems use +fsverity_verify_blocks() to verify hole blocks. Filesystems also disable direct I/O on verity files, since otherwise direct I/O would bypass fs-verity. @@ -644,7 +642,7 @@ Userspace utility This document focuses on the kernel, but a userspace utility for fs-verity can be found at: - https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/fsverity-utils.git + https://git.kernel.org/pub/scm/fs/fsverity/fsverity-utils.git See the README.md file in the fsverity-utils source tree for details, including examples of setting up fs-verity protected files. @@ -793,9 +791,9 @@ weren't already directly answered in other parts of this document. 
:A: There are many reasons why this is not possible or would be very difficult, including the following: - - To prevent bypassing verification, pages must not be marked + - To prevent bypassing verification, folios must not be marked Uptodate until they've been verified. Currently, each - filesystem is responsible for marking pages Uptodate via + filesystem is responsible for marking folios Uptodate via ``->readahead()``. Therefore, currently it's not possible for the VFS to do the verification on its own. Changing this would require significant changes to the VFS and all filesystems. diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 36fa2a83d714..7de7a7272a5e 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -56,35 +56,35 @@ inode_operations prototypes:: - int (*create) (struct inode *,struct dentry *,umode_t, bool); + int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t, bool); struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,umode_t); + int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); + int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct inode *, struct dentry *, + int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); + int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); void (*truncate) (struct 
inode *); - int (*permission) (struct inode *, int, unsigned int); + int (*permission) (struct mnt_idmap *, struct inode *, int, unsigned int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); - int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); + int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); + int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, + int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); - int (*fileattr_set)(struct user_namespace *mnt_userns, + int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); - struct posix_acl * (*get_acl)(struct user_namespace *, struct dentry *, int); + struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); locking rules: all may block @@ -135,7 +135,7 @@ prototypes:: struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 2c15e7053113..c53f30251a66 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -421,31 +421,31 @@ As of kernel 2.6.22, the following members are defined: .. 
code-block:: c struct inode_operations { - int (*create) (struct user_namespace *, struct inode *,struct dentry *, umode_t, bool); + int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,const char *); - int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,umode_t); + int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); + int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct user_namespace *, struct inode *, struct dentry *, + int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); + int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); - int (*permission) (struct user_namespace *, struct inode *, int); + int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); - int (*setattr) (struct user_namespace *, struct dentry *, struct iattr *); - int (*getattr) (struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); + int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); + int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file 
*, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, struct file *, umode_t); - struct posix_acl * (*get_acl)(struct user_namespace *, struct dentry *, int); - int (*set_acl)(struct user_namespace *, struct dentry *, struct posix_acl *, int); - int (*fileattr_set)(struct user_namespace *mnt_userns, + int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); + struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); + int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); + int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); }; diff --git a/Documentation/networking/bridge.rst b/Documentation/networking/bridge.rst index 4aef9cddde2f..c859f3c1636e 100644 --- a/Documentation/networking/bridge.rst +++ b/Documentation/networking/bridge.rst @@ -8,7 +8,7 @@ In order to use the Ethernet bridging functionality, you'll need the userspace tools. Documentation for Linux bridging is on: - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge + https://wiki.linuxfoundation.org/networking/bridge The bridge-utilities are maintained at: git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/bridge-utils.git diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst index dc2e60ced927..b481b81f3be5 100644 --- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst +++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst @@ -819,7 +819,7 @@ NAPI ---- This driver supports NAPI (Rx polling mode). 
For more information on NAPI, see -https://www.linuxfoundation.org/collaborate/workgroups/networking/napi +https://wiki.linuxfoundation.org/networking/napi MACVLAN diff --git a/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst b/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst index eaa87dbe8848..d052ef40fe36 100644 --- a/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst +++ b/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst @@ -16,5 +16,5 @@ Contents Support ======= -If you got any problem, contact Wangxun support team via support@trustnetic.com +If you have any problems, contact the Wangxun support team via nic-support@net-swift.com and Cc: netdev. diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index 49db1d11d7c4..8b1045c3b59e 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -173,7 +173,9 @@ nf_conntrack_sctp_timeout_cookie_echoed - INTEGER (seconds) default 3 nf_conntrack_sctp_timeout_established - INTEGER (seconds) - default 432000 (5 days) + default 210 + + Default is set to (hb_interval * path_max_retrans + rto_max) nf_conntrack_sctp_timeout_shutdown_sent - INTEGER (seconds) default 0.3 @@ -190,12 +192,6 @@ nf_conntrack_sctp_timeout_heartbeat_sent - INTEGER (seconds) This timeout is used to setup conntrack entry on secondary paths. Default is set to hb_interval. -nf_conntrack_sctp_timeout_heartbeat_acked - INTEGER (seconds) - default 210 - - This timeout is used to setup conntrack entry on secondary paths. 
- Default is set to (hb_interval * path_max_retrans + rto_max) - nf_conntrack_udp_timeout - INTEGER (seconds) default 30 diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9807b05a1b57..0a67cb738013 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8070,9 +8070,13 @@ considering the state as complete. VMM needs to ensure that the dirty state is final and avoid missing dirty pages from another ioctl ordered after the bitmap collection. -NOTE: One example of using the backup bitmap is saving arm64 vgic/its -tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on -KVM device "kvm-arm-vgic-its" when dirty ring is enabled. +NOTE: Multiple examples of using the backup bitmap: (1) save vgic/its +tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} on +KVM device "kvm-arm-vgic-its". (2) restore vgic/its tables through +command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device +"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. (3) save +vgic3 pending table through KVM_DEV_ARM_VGIC_{GRP_CTRL, SAVE_PENDING_TABLES} +command on KVM device "kvm-arm-vgic-v3". 8.30 KVM_CAP_XEN_HVM -------------------- diff --git a/Documentation/x86/amd-memory-encryption.rst b/Documentation/x86/amd-memory-encryption.rst index a1940ebe7be5..934310ce7258 100644 --- a/Documentation/x86/amd-memory-encryption.rst +++ b/Documentation/x86/amd-memory-encryption.rst @@ -95,3 +95,39 @@ by supplying mem_encrypt=on on the kernel command line. However, if BIOS does not enable SME, then Linux will not be able to activate memory encryption, even if configured to do so by default or the mem_encrypt=on command line parameter is specified. + +Secure Nested Paging (SNP) +========================== + +SEV-SNP introduces new features (SEV_FEATURES[1:63]) which can be enabled +by the hypervisor for security enhancements. Some of these features need +guest side implementation to function correctly. 
The below table lists the +expected guest behavior with various possible scenarios of guest/hypervisor +SNP feature support. + ++-----------------+---------------+---------------+------------------+ +| Feature Enabled | Guest needs | Guest has | Guest boot | +| by the HV | implementation| implementation| behaviour | ++=================+===============+===============+==================+ +| No | No | No | Boot | +| | | | | ++-----------------+---------------+---------------+------------------+ +| No | Yes | No | Boot | +| | | | | ++-----------------+---------------+---------------+------------------+ +| No | Yes | Yes | Boot | +| | | | | ++-----------------+---------------+---------------+------------------+ +| Yes | No | No | Boot with | +| | | | feature enabled | ++-----------------+---------------+---------------+------------------+ +| Yes | Yes | No | Graceful boot | +| | | | failure | ++-----------------+---------------+---------------+------------------+ +| Yes | Yes | Yes | Boot with | +| | | | feature enabled | ++-----------------+---------------+---------------+------------------+ + +More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR + +[1] https://www.amd.com/system/files/TechDocs/40332.pdf diff --git a/MAINTAINERS b/MAINTAINERS index f781f936ae35..82938ca70466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1097,7 +1097,6 @@ S: Maintained F: drivers/dma/ptdma/ AMD SEATTLE DEVICE TREE SUPPORT -M: Brijesh Singh <brijeshkumar.singh@amd.com> M: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> M: Tom Lendacky <thomas.lendacky@amd.com> S: Supported @@ -2212,6 +2211,9 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git X: drivers/media/i2c/ +F: arch/arm64/boot/dts/freescale/ +X: arch/arm64/boot/dts/freescale/fsl-* +X: arch/arm64/boot/dts/freescale/qoriq-* N: imx N: mxs @@ -2450,11 +2452,14 @@ F: drivers/rtc/rtc-mt7622.c ARM/Mediatek SoC 
support M: Matthias Brugger <matthias.bgg@gmail.com> +R: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com> +L: linux-kernel@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) S: Maintained W: https://mtk.wiki.kernel.org/ -C: irc://chat.freenode.net/linux-mediatek +C: irc://irc.libera.chat/linux-mediatek +F: arch/arm/boot/dts/mt2* F: arch/arm/boot/dts/mt6* F: arch/arm/boot/dts/mt7* F: arch/arm/boot/dts/mt8* @@ -2462,7 +2467,7 @@ F: arch/arm/mach-mediatek/ F: arch/arm64/boot/dts/mediatek/ F: drivers/soc/mediatek/ N: mtk -N: mt[678] +N: mt[2678] K: mediatek ARM/Mediatek USB3 PHY DRIVER @@ -3766,7 +3771,6 @@ F: net/bluetooth/ BONDING DRIVER M: Jay Vosburgh <j.vosburgh@gmail.com> -M: Veaceslav Falico <vfalico@gmail.com> M: Andy Gospodarek <andy@greyhouse.net> L: netdev@vger.kernel.org S: Supported @@ -7615,7 +7619,6 @@ S: Maintained F: drivers/firmware/efi/test/ EFI VARIABLE FILESYSTEM -M: Matthew Garrett <matthew.garrett@nebula.com> M: Jeremy Kerr <jk@ozlabs.org> M: Ard Biesheuvel <ardb@kernel.org> L: linux-efi@vger.kernel.org @@ -7744,6 +7747,7 @@ R: Jeffle Xu <jefflexu@linux.alibaba.com> L: linux-erofs@lists.ozlabs.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git +F: Documentation/ABI/testing/sysfs-fs-erofs F: Documentation/filesystems/erofs.rst F: fs/erofs/ F: include/trace/events/erofs.h @@ -7894,7 +7898,11 @@ F: include/linux/extcon/ EXTRA BOOT CONFIG M: Masami Hiramatsu <mhiramat@kernel.org> +L: linux-kernel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/ S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/admin-guide/bootconfig.rst F: fs/proc/bootconfig.c F: include/linux/bootconfig.h @@ -8195,7 +8203,7 @@ F: drivers/fpga/microchip-spi.c FPU EMULATOR M: Bill Metzenthen 
<billm@melbpc.org.au> S: Maintained -W: http://floatingpoint.sourceforge.net/emulator/index.html +W: https://floatingpoint.billm.au/ F: arch/x86/math-emu/ FRAMEBUFFER CORE @@ -8467,16 +8475,16 @@ F: fs/fscache/ F: include/linux/fscache*.h FSCRYPT: FILE SYSTEM LEVEL ENCRYPTION SUPPORT +M: Eric Biggers <ebiggers@kernel.org> M: Theodore Y. Ts'o <tytso@mit.edu> M: Jaegeuk Kim <jaegeuk@kernel.org> -M: Eric Biggers <ebiggers@kernel.org> L: linux-fscrypt@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-fscrypt/list/ -T: git git://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git +T: git https://git.kernel.org/pub/scm/fs/fscrypt/linux.git F: Documentation/filesystems/fscrypt.rst F: fs/crypto/ -F: include/linux/fscrypt*.h +F: include/linux/fscrypt.h F: include/uapi/linux/fscrypt.h FSI SUBSYSTEM @@ -8519,10 +8527,10 @@ F: include/linux/fsnotify*.h FSVERITY: READ-ONLY FILE-BASED AUTHENTICITY PROTECTION M: Eric Biggers <ebiggers@kernel.org> M: Theodore Y. Ts'o <tytso@mit.edu> -L: linux-fscrypt@vger.kernel.org +L: fsverity@lists.linux.dev S: Supported -Q: https://patchwork.kernel.org/project/linux-fscrypt/list/ -T: git git://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git fsverity +Q: https://patchwork.kernel.org/project/fsverity/list/ +T: git https://git.kernel.org/pub/scm/fs/fsverity/linux.git F: Documentation/filesystems/fsverity.rst F: fs/verity/ F: include/linux/fsverity.h @@ -8570,6 +8578,7 @@ F: kernel/trace/fgraph.c F: arch/*/*/*/*ftrace* F: arch/*/*/*ftrace* F: include/*/ftrace.h +F: samples/ftrace FUNGIBLE ETHERNET DRIVERS M: Dimitris Michailidis <dmichail@fungible.com> @@ -9988,7 +9997,7 @@ S: Maintained T: git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping.git F: Documentation/filesystems/idmappings.rst F: tools/testing/selftests/mount_setattr/ -F: include/linux/mnt_idmapping.h +F: include/linux/mnt_idmapping.* IDT VersaClock 5 CLOCK DRIVER M: Luca Ceresoli <luca@lucaceresoli.net> @@ -14600,7 +14609,6 @@ F: 
tools/testing/selftests/net/ipsec.c NETWORKING [IPv4/IPv6] M: "David S. Miller" <davem@davemloft.net> -M: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> M: David Ahern <dsahern@kernel.org> L: netdev@vger.kernel.org S: Maintained @@ -14633,7 +14641,6 @@ F: net/netfilter/xt_SECMARK.c F: net/netlabel/ NETWORKING [MPTCP] -M: Mat Martineau <mathew.j.martineau@linux.intel.com> M: Matthieu Baerts <matthieu.baerts@tessares.net> L: netdev@vger.kernel.org L: mptcp@lists.linux.dev @@ -15658,7 +15665,7 @@ OPENRISC ARCHITECTURE M: Jonas Bonn <jonas@southpole.se> M: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi> M: Stafford Horne <shorne@gmail.com> -L: openrisc@lists.librecores.org +L: linux-openrisc@vger.kernel.org S: Maintained W: http://openrisc.io T: git https://github.com/openrisc/linux.git @@ -16114,7 +16121,7 @@ F: drivers/pci/controller/pci-v3-semi.c PCI ENDPOINT SUBSYSTEM M: Lorenzo Pieralisi <lpieralisi@kernel.org> -R: Krzysztof Wilczyński <kw@linux.com> +M: Krzysztof Wilczyński <kw@linux.com> R: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> R: Kishon Vijay Abraham I <kishon@kernel.org> L: linux-pci@vger.kernel.org @@ -16122,7 +16129,7 @@ S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/PCI/endpoint/* F: Documentation/misc-devices/pci-endpoint-test.rst F: drivers/misc/pci_endpoint_test.c @@ -16157,7 +16164,7 @@ S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/driver-api/pci/p2pdma.rst F: drivers/pci/p2pdma.c F: include/linux/pci-p2pdma.h @@ -16179,14 +16186,14 @@ F:
drivers/pci/controller/pci-xgene-msi.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi <lpieralisi@kernel.org> +M: Krzysztof Wilczyński <kw@linux.com> R: Rob Herring <robh@kernel.org> -R: Krzysztof Wilczyński <kw@linux.com> L: linux-pci@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/devicetree/bindings/pci/ F: drivers/pci/controller/ F: drivers/pci/pci-bridge-emul.c @@ -16199,7 +16206,7 @@ S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/PCI/ F: Documentation/devicetree/bindings/pci/ F: arch/x86/kernel/early-quirks.c @@ -17962,6 +17969,7 @@ M: Albert Ou <aou@eecs.berkeley.edu> L: linux-riscv@lists.infradead.org S: Supported Q: https://patchwork.kernel.org/project/linux-riscv/list/ +C: irc://irc.libera.chat/riscv P: Documentation/riscv/patch-acceptance.rst T: git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git F: arch/riscv/ @@ -18227,6 +18235,7 @@ L: rust-for-linux@vger.kernel.org S: Supported W: https://github.com/Rust-for-Linux/linux B: https://github.com/Rust-for-Linux/linux/issues +C: zulip://rust-for-linux.zulipchat.com T: git https://github.com/Rust-for-Linux/linux.git rust-next F: Documentation/rust/ F: rust/ @@ -18683,9 +18692,9 @@ F: drivers/target/ F: include/target/ SCTP PROTOCOL -M: Vlad Yasevich <vyasevich@gmail.com> M: Neil Horman <nhorman@tuxdriver.com> M: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> +M: Xin Long <lucien.xin@gmail.com> L: linux-sctp@vger.kernel.org S: Maintained W: http://lksctp.sourceforge.net @@ -20083,6
+20092,7 @@ F: drivers/watchdog/sunplus_wdt.c SUPERH M: Yoshinori Sato <ysato@users.sourceforge.jp> M: Rich Felker <dalias@libc.org> +M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> L: linux-sh@vger.kernel.org S: Maintained Q: http://patchwork.kernel.org/project/linux-sh/list/ @@ -20315,8 +20325,7 @@ S: Maintained F: drivers/platform/x86/system76_acpi.c SYSV FILESYSTEM -M: Christoph Hellwig <hch@infradead.org> -S: Maintained +S: Orphan F: Documentation/filesystems/sysv-fs.rst F: fs/sysv/ F: include/linux/sysv_fs.h @@ -21723,6 +21732,7 @@ F: include/uapi/linux/uvcvideo.h USB WEBCAM GADGET M: Laurent Pinchart <laurent.pinchart@ideasonboard.com> +M: Daniel Scally <dan.scally@ideasonboard.com> L: linux-usb@vger.kernel.org S: Maintained F: drivers/usb/gadget/function/*uvc* @@ -21810,11 +21820,9 @@ W: http://en.wikipedia.org/wiki/Util-linux T: git git://git.kernel.org/pub/scm/utils/util-linux/util-linux.git UUID HELPERS -M: Christoph Hellwig <hch@lst.de> R: Andy Shevchenko <andriy.shevchenko@linux.intel.com> L: linux-kernel@vger.kernel.org S: Maintained -T: git git://git.infradead.org/users/hch/uuid.git F: include/linux/uuid.h F: include/uapi/linux/uuid.h F: lib/test_uuid.c @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* @@ -1602,7 +1602,7 @@ endif # CONFIG_MODULES CLEAN_FILES += include/ksym vmlinux.symvers modules-only.symvers \ modules.builtin modules.builtin.modinfo modules.nsdeps \ compile_commands.json .thinlto-cache rust/test rust/doc \ - .vmlinux.objs .vmlinux.export.c + rust-project.json .vmlinux.objs .vmlinux.export.c # Directories & files removed with 'make mrproper' MRPROPER_FILES += include/config include/generated \ diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 4067f5169144..955b0362cdfb 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -132,7 +132,7 @@ AFLAGS_NOWARN :=$(call as-option,-Wa$(comma)-mno-warn-deprecated,-Wa$(comma)-W) 
ifeq ($(CONFIG_THUMB2_KERNEL),y) CFLAGS_ISA :=-Wa,-mimplicit-it=always $(AFLAGS_NOWARN) -AFLAGS_ISA :=$(CFLAGS_ISA) -Wa$(comma)-mthumb -D__thumb2__=2 +AFLAGS_ISA :=$(CFLAGS_ISA) -Wa$(comma)-mthumb CFLAGS_ISA +=-mthumb else CFLAGS_ISA :=$(call cc-option,-marm,) $(AFLAGS_NOWARN) diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts index d1971ddf06a5..7f755e5a4624 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts @@ -751,7 +751,7 @@ }; pca9849@75 { - compatible = "nxp,pca849"; + compatible = "nxp,pca9849"; reg = <0x75>; #address-cells = <1>; #size-cells = <0>; diff --git a/arch/arm/boot/dts/imx7d-smegw01.dts b/arch/arm/boot/dts/imx7d-smegw01.dts index 546268b8d0b1..c0f00f5db11e 100644 --- a/arch/arm/boot/dts/imx7d-smegw01.dts +++ b/arch/arm/boot/dts/imx7d-smegw01.dts @@ -198,6 +198,7 @@ &usbotg2 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usbotg2>; + over-current-active-low; dr_mode = "host"; status = "okay"; }; @@ -374,7 +375,7 @@ pinctrl_usbotg2: usbotg2grp { fsl,pins = < - MX7D_PAD_UART3_RTS_B__USB_OTG2_OC 0x04 + MX7D_PAD_UART3_RTS_B__USB_OTG2_OC 0x5c >; }; diff --git a/arch/arm/boot/dts/nuvoton-wpcm450.dtsi b/arch/arm/boot/dts/nuvoton-wpcm450.dtsi index b637241316bb..fd671c7a1e5d 100644 --- a/arch/arm/boot/dts/nuvoton-wpcm450.dtsi +++ b/arch/arm/boot/dts/nuvoton-wpcm450.dtsi @@ -480,6 +480,7 @@ reg = <0xc8000000 0x1000>, <0xc0000000 0x4000000>; reg-names = "control", "memory"; clocks = <&clk 0>; + nuvoton,shm = <&shm>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 487b0e03d4b4..2ca76b69add7 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -1181,6 +1181,7 @@ clock-names = "dp", "pclk"; phys = <&edp_phy>; phy-names = "dp"; + power-domains = <&power RK3288_PD_VIO>; resets = <&cru SRST_EDP>; reset-names = "dp"; rockchip,grf = <&grf>; diff --git 
a/arch/arm/boot/dts/stihxxx-b2120.dtsi b/arch/arm/boot/dts/stihxxx-b2120.dtsi index 920a0bad7494..8d9a2dfa76f1 100644 --- a/arch/arm/boot/dts/stihxxx-b2120.dtsi +++ b/arch/arm/boot/dts/stihxxx-b2120.dtsi @@ -178,7 +178,7 @@ tsin-num = <0>; serial-not-parallel; i2c-bus = <&ssc2>; - reset-gpios = <&pio15 4 GPIO_ACTIVE_HIGH>; + reset-gpios = <&pio15 4 GPIO_ACTIVE_LOW>; dvb-card = <STV0367_TDA18212_NIMA_1>; }; }; diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 971e74546fb1..13e62c7c25dc 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -53,7 +53,12 @@ $(obj)/%-core.S: $(src)/%-armv4.pl clean-files += poly1305-core.S sha256-core.S sha512-core.S +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +AFLAGS_sha256-core.o += $(aflags-thumb2-y) +AFLAGS_sha512-core.o += $(aflags-thumb2-y) + # massage the perlasm code a bit so we only get the NEON routine if we need it poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 -AFLAGS_poly1305-core.o += $(poly1305-aflags-y) +AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 68112c172025..006163195d67 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -73,6 +73,7 @@ #include <linux/syscalls.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/cred.h> #include <linux/fcntl.h> #include <linux/eventpoll.h> diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index c1494a4dee25..53f2d8774fdb 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -161,7 +161,7 @@ void __init paging_init(const struct machine_desc *mdesc) mpu_setup(); /* allocate the zero page. 
*/ - zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!zero_page) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index fa6999e24b07..e43f6d716b4b 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -6,6 +6,7 @@ * VM_EXEC */ #include <asm/asm-offsets.h> +#include <asm/pgtable.h> #include <asm/thread_info.h> #ifdef CONFIG_CPU_V7M diff --git a/arch/arm64/boot/dts/amlogic/meson-axg.dtsi b/arch/arm64/boot/dts/amlogic/meson-axg.dtsi index 1648e67afbb6..417523dc4cc0 100644 --- a/arch/arm64/boot/dts/amlogic/meson-axg.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-axg.dtsi @@ -1886,7 +1886,7 @@ sd_emmc_b: sd@5000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0x5000 0x0 0x800>; - interrupts = <GIC_SPI 217 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 217 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_B>, <&clkc CLKID_SD_EMMC_B_CLK0>, @@ -1898,7 +1898,7 @@ sd_emmc_c: mmc@7000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0x7000 0x0 0x800>; - interrupts = <GIC_SPI 218 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 218 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_C>, <&clkc CLKID_SD_EMMC_C_CLK0>, diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 9dbd50820b1c..7f55d97f6c28 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -2324,7 +2324,7 @@ sd_emmc_a: sd@ffe03000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0xffe03000 0x0 0x800>; - interrupts = <GIC_SPI 189 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_A>, <&clkc CLKID_SD_EMMC_A_CLK0>, @@ -2336,7 +2336,7 @@ sd_emmc_b: sd@ffe05000 { compatible = 
"amlogic,meson-axg-mmc"; reg = <0x0 0xffe05000 0x0 0x800>; - interrupts = <GIC_SPI 190 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_B>, <&clkc CLKID_SD_EMMC_B_CLK0>, @@ -2348,7 +2348,7 @@ sd_emmc_c: mmc@ffe07000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0xffe07000 0x0 0x800>; - interrupts = <GIC_SPI 191 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_C>, <&clkc CLKID_SD_EMMC_C_CLK0>, diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi index e3c12e0be99d..5eed15035b67 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi @@ -603,21 +603,21 @@ sd_emmc_a: mmc@70000 { compatible = "amlogic,meson-gx-mmc", "amlogic,meson-gxbb-mmc"; reg = <0x0 0x70000 0x0 0x800>; - interrupts = <GIC_SPI 216 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 216 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; }; sd_emmc_b: mmc@72000 { compatible = "amlogic,meson-gx-mmc", "amlogic,meson-gxbb-mmc"; reg = <0x0 0x72000 0x0 0x800>; - interrupts = <GIC_SPI 217 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 217 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; }; sd_emmc_c: mmc@74000 { compatible = "amlogic,meson-gx-mmc", "amlogic,meson-gxbb-mmc"; reg = <0x0 0x74000 0x0 0x800>; - interrupts = <GIC_SPI 218 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 218 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8dxl.dtsi b/arch/arm64/boot/dts/freescale/imx8dxl.dtsi index 0c64b9194621..214f21bd0cb4 100644 --- a/arch/arm64/boot/dts/freescale/imx8dxl.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8dxl.dtsi @@ -164,7 +164,7 @@ sc_pwrkey: keys { compatible = "fsl,imx8qxp-sc-key", "fsl,imx-sc-key"; - linux,keycode = <KEY_POWER>; + linux,keycodes = <KEY_POWER>; wakeup-source; }; diff --git 
a/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts b/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts index 752f409a30b1..9889319d4f04 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts @@ -88,6 +88,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_watchdog_gpio>; compatible = "linux,wdt-gpio"; + always-running; gpios = <&gpio1 8 GPIO_ACTIVE_HIGH>; hw_algo = "level"; /* Reset triggers in 2..3 seconds */ diff --git a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h index 83c8f715cd90..b1f11098d248 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h @@ -602,7 +602,7 @@ #define MX8MM_IOMUXC_UART1_RXD_GPIO5_IO22 0x234 0x49C 0x000 0x5 0x0 #define MX8MM_IOMUXC_UART1_RXD_TPSMP_HDATA24 0x234 0x49C 0x000 0x7 0x0 #define MX8MM_IOMUXC_UART1_TXD_UART1_DCE_TX 0x238 0x4A0 0x000 0x0 0x0 -#define MX8MM_IOMUXC_UART1_TXD_UART1_DTE_RX 0x238 0x4A0 0x4F4 0x0 0x0 +#define MX8MM_IOMUXC_UART1_TXD_UART1_DTE_RX 0x238 0x4A0 0x4F4 0x0 0x1 #define MX8MM_IOMUXC_UART1_TXD_ECSPI3_MOSI 0x238 0x4A0 0x000 0x1 0x0 #define MX8MM_IOMUXC_UART1_TXD_GPIO5_IO23 0x238 0x4A0 0x000 0x5 0x0 #define MX8MM_IOMUXC_UART1_TXD_TPSMP_HDATA25 0x238 0x4A0 0x000 0x7 0x0 diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso index 3ea73a6886ff..f6ad1a4b8b66 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso @@ -33,7 +33,6 @@ pinctrl-0 = <&pinctrl_uart2>; rts-gpios = <&gpio5 29 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio5 28 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso 
b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso index 2fa635e1c1a8..1f8ea20dfafc 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso @@ -33,7 +33,6 @@ pinctrl-0 = <&pinctrl_uart2>; rts-gpios = <&gpio5 29 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio5 28 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi index 244ef8d6cc68..7761d5671cb1 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi @@ -222,7 +222,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_bten>; cts-gpios = <&gpio5 8 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio5 9 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts index 6433c205f8dd..64b366e83fa1 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts @@ -733,7 +733,6 @@ dtr-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>; dsr-gpios = <&gpio1 1 GPIO_ACTIVE_LOW>; dcd-gpios = <&gpio1 11 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; @@ -749,7 +748,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; cts-gpios = <&gpio4 10 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio4 9 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; @@ -758,7 +756,6 @@ pinctrl-0 = <&pinctrl_uart4>, <&pinctrl_uart4_gpio>; cts-gpios = <&gpio5 11 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio5 12 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts index 32872b0b1aaf..e8bc1fccc47b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts 
+++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts @@ -664,7 +664,6 @@ pinctrl-0 = <&pinctrl_uart1>, <&pinctrl_uart1_gpio>; rts-gpios = <&gpio4 10 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio4 24 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; @@ -681,7 +680,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; rts-gpios = <&gpio2 1 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio2 0 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { @@ -699,7 +697,6 @@ dtr-gpios = <&gpio4 3 GPIO_ACTIVE_LOW>; dsr-gpios = <&gpio4 4 GPIO_ACTIVE_LOW>; dcd-gpios = <&gpio4 6 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts index 8ce562246a08..acc2ba8e00a8 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts @@ -581,7 +581,6 @@ dtr-gpios = <&gpio1 0 GPIO_ACTIVE_LOW>; dsr-gpios = <&gpio1 1 GPIO_ACTIVE_LOW>; dcd-gpios = <&gpio3 24 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 0d454e0e2f7c..702d87621bb4 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -98,6 +98,7 @@ off-on-delay = <500000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_eth>; + regulator-always-on; regulator-boot-on; regulator-max-microvolt = <3300000>; regulator-min-microvolt = <3300000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts index b9444e4a3d2d..7c12518dbc96 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts @@ -643,7 +643,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; rts-gpios = <&gpio2 1 GPIO_ACTIVE_LOW>; cts-gpios = 
<&gpio2 0 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts index ceeca4966fc5..8eb7d5ee38da 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts @@ -623,7 +623,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; cts-gpios = <&gpio3 21 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio3 22 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { diff --git a/arch/arm64/boot/dts/mediatek/mt8195.dtsi b/arch/arm64/boot/dts/mediatek/mt8195.dtsi index 5d31536f4c48..c10cfeb1214d 100644 --- a/arch/arm64/boot/dts/mediatek/mt8195.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8195.dtsi @@ -2146,7 +2146,7 @@ }; vdosys0: syscon@1c01a000 { - compatible = "mediatek,mt8195-mmsys", "syscon"; + compatible = "mediatek,mt8195-vdosys0", "mediatek,mt8195-mmsys", "syscon"; reg = <0 0x1c01a000 0 0x1000>; mboxes = <&gce0 0 CMDQ_THR_PRIO_4>; #clock-cells = <1>; @@ -2292,7 +2292,7 @@ }; vdosys1: syscon@1c100000 { - compatible = "mediatek,mt8195-mmsys", "syscon"; + compatible = "mediatek,mt8195-vdosys1", "syscon"; reg = <0 0x1c100000 0 0x1000>; #clock-cells = <1>; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts b/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts index aa22a0c22265..5d5d9574088c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts @@ -96,7 +96,6 @@ linux,default-trigger = "heartbeat"; gpios = <&rk805 1 GPIO_ACTIVE_LOW>; default-state = "on"; - mode = <0x23>; }; user_led: led-1 { @@ -104,7 +103,6 @@ linux,default-trigger = "mmc1"; gpios = <&rk805 0 GPIO_ACTIVE_LOW>; default-state = "off"; - mode = <0x05>; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi index 6e29e74f6fc6..783120e9cebe 100644 --- 
a/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi @@ -111,7 +111,7 @@ }; }; - dmc_opp_table: dmc_opp_table { + dmc_opp_table: opp-table-3 { compatible = "operating-points-v2"; opp00 { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts index 04403a76238b..a0795a2b1cb1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts @@ -104,6 +104,13 @@ }; }; +&cpu_alert0 { + temperature = <65000>; +}; +&cpu_alert1 { + temperature = <68000>; +}; + &cpu_l0 { cpu-supply = <&vdd_cpu_l>; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 4391aea25984..1881b4b71f91 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -589,7 +589,7 @@ clocks = <&cru HCLK_M_CRYPTO0>, <&cru HCLK_S_CRYPTO0>, <&cru SCLK_CRYPTO0>; clock-names = "hclk_master", "hclk_slave", "sclk"; resets = <&cru SRST_CRYPTO0>, <&cru SRST_CRYPTO0_S>, <&cru SRST_CRYPTO0_M>; - reset-names = "master", "lave", "crypto"; + reset-names = "master", "slave", "crypto-rst"; }; crypto1: crypto@ff8b8000 { @@ -599,7 +599,7 @@ clocks = <&cru HCLK_M_CRYPTO1>, <&cru HCLK_S_CRYPTO1>, <&cru SCLK_CRYPTO1>; clock-names = "hclk_master", "hclk_slave", "sclk"; resets = <&cru SRST_CRYPTO1>, <&cru SRST_CRYPTO1_S>, <&cru SRST_CRYPTO1_M>; - reset-names = "master", "slave", "crypto"; + reset-names = "master", "slave", "crypto-rst"; }; i2c1: i2c@ff110000 { @@ -2241,13 +2241,11 @@ pcfg_input_pull_up: pcfg-input-pull-up { input-enable; bias-pull-up; - drive-strength = <2>; }; pcfg_input_pull_down: pcfg-input-pull-down { input-enable; bias-pull-down; - drive-strength = <2>; }; clock { diff --git a/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts b/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts index 4c7f9abd594f..d956496d5221 100644 --- 
a/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts @@ -353,6 +353,17 @@ }; }; +&pmu_io_domains { + pmuio2-supply = <&vcc_3v3>; + vccio1-supply = <&vcc_3v3>; + vccio3-supply = <&vcc_3v3>; + vccio4-supply = <&vcca_1v8>; + vccio5-supply = <&vcc_3v3>; + vccio6-supply = <&vcca_1v8>; + vccio7-supply = <&vcc_3v3>; + status = "okay"; +}; + &pwm0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts b/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts index a1c5fdf7d68f..3c9d85257cc9 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts @@ -571,6 +571,8 @@ }; &i2s1_8ch { + pinctrl-names = "default"; + pinctrl-0 = <&i2s1m0_sclktx &i2s1m0_lrcktx &i2s1m0_sdi0 &i2s1m0_sdo0>; rockchip,trcm-sync-tx-only; status = "okay"; }; @@ -730,14 +732,13 @@ disable-wp; pinctrl-names = "default"; pinctrl-0 = <&sdmmc0_bus4 &sdmmc0_clk &sdmmc0_cmd &sdmmc0_det>; - sd-uhs-sdr104; + sd-uhs-sdr50; vmmc-supply = <&vcc3v3_sd>; vqmmc-supply = <&vccio_sd>; status = "okay"; }; &sdmmc2 { - supports-sdio; bus-width = <4>; disable-wp; cap-sd-highspeed; diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index 5706c3e24f0a..c27f1c7f072d 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -966,6 +966,7 @@ clock-names = "aclk_mst", "aclk_slv", "aclk_dbi", "pclk", "aux"; device_type = "pci"; + #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc 0>, <0 0 0 2 &pcie_intc 1>, diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 31d13a6001df..de4ff90785b2 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -48,8 +48,17 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); }) extern spinlock_t efi_rt_lock; +extern u64 *efi_rt_stack_top; efi_status_t 
__efi_rt_asm_wrapper(void *, const char *, ...); +/* + * efi_rt_stack_top[-1] contains the value the stack pointer had before + * switching to the EFI runtime stack. + */ +#define current_in_efi() \ + (!preemptible() && efi_rt_stack_top != NULL && \ + on_task_stack(current, READ_ONCE(efi_rt_stack_top[-1]), 1)) + #define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) /* diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 4e5354beafb0..66ec8caa6ac0 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -106,4 +106,19 @@ static inline struct stack_info stackinfo_get_sdei_critical(void) #define stackinfo_get_sdei_critical() stackinfo_get_unknown() #endif +#ifdef CONFIG_EFI +extern u64 *efi_rt_stack_top; + +static inline struct stack_info stackinfo_get_efi(void) +{ + unsigned long high = (u64)efi_rt_stack_top; + unsigned long low = high - THREAD_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} +#endif + #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S index d872d18101d8..e8ae803662cf 100644 --- a/arch/arm64/kernel/efi-rt-wrapper.S +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -46,7 +46,10 @@ SYM_FUNC_START(__efi_rt_asm_wrapper) mov x4, x6 blr x8 + mov x16, sp mov sp, x29 + str xzr, [x16, #8] // clear recorded task SP value + ldp x1, x2, [sp, #16] cmp x2, x18 ldp x29, x30, [sp], #112 @@ -71,6 +74,9 @@ SYM_FUNC_END(__efi_rt_asm_wrapper) SYM_CODE_START(__efi_rt_asm_recover) mov sp, x30 + ldr_l x16, efi_rt_stack_top // clear recorded task SP value + str xzr, [x16, #-8] + ldp x19, x20, [sp, #32] ldp x21, x22, [sp, #48] ldp x23, x24, [sp, #64] diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index fab05de2e12d..b273900f4566 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <asm/efi.h> +#include 
<asm/stacktrace.h> static bool region_is_misaligned(const efi_memory_desc_t *md) { @@ -154,7 +155,7 @@ asmlinkage efi_status_t __efi_rt_asm_recover(void); bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg) { /* Check whether the exception occurred while running the firmware */ - if (current_work() != &efi_rts_work.work || regs->pc >= TASK_SIZE_64) + if (!current_in_efi() || regs->pc >= TASK_SIZE_64) return false; pr_err(FW_BUG "Unable to handle %s in EFI runtime service\n", msg); diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index a5193f2146a6..dde06c0f97f3 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1023,12 +1023,6 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, return 0; } -static bool armv8pmu_filter(struct pmu *pmu, int cpu) -{ - struct arm_pmu *armpmu = to_arm_pmu(pmu); - return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus); -} - static void armv8pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; @@ -1069,6 +1063,14 @@ static int __armv8_pmuv3_map_event(struct perf_event *event, &armv8_pmuv3_perf_cache_map, ARMV8_PMU_EVTYPE_EVENT); + /* + * CHAIN events only work when paired with an adjacent counter, and it + * never makes sense for a user to open one in isolation, as they'll be + * rotated arbitrarily. 
+ */ + if (hw_event_id == ARMV8_PMUV3_PERFCTR_CHAIN) + return -EINVAL; + if (armv8pmu_event_is_64bit(event)) event->hw.flags |= ARMPMU_EVT_64BIT; @@ -1258,7 +1260,6 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, cpu_pmu->stop = armv8pmu_stop; cpu_pmu->reset = armv8pmu_reset; cpu_pmu->set_event_filter = armv8pmu_set_event_filter; - cpu_pmu->filter = armv8pmu_filter; cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 117e2c180f3c..83154303e682 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -5,6 +5,7 @@ * Copyright (C) 2012 ARM Ltd. */ #include <linux/kernel.h> +#include <linux/efi.h> #include <linux/export.h> #include <linux/ftrace.h> #include <linux/sched.h> @@ -12,6 +13,7 @@ #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> +#include <asm/efi.h> #include <asm/irq.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> @@ -186,6 +188,13 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) : stackinfo_get_unknown(); \ }) +#define STACKINFO_EFI \ + ({ \ + ((task == current) && current_in_efi()) \ + ? 
stackinfo_get_efi() \ + : stackinfo_get_unknown(); \ + }) + noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -200,6 +209,9 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, STACKINFO_SDEI(normal), STACKINFO_SDEI(critical), #endif +#ifdef CONFIG_EFI + STACKINFO_EFI, +#endif }; struct unwind_state state = { .stacks = stacks, diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 5626ddb540ce..cf4c495a4321 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1079,7 +1079,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) - mte_clear_page_tags(page); + mte_clear_page_tags(maddr); set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 94a666dd1443..2642e9ce2819 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2187,7 +2187,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); + return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); } /** @@ -2339,7 +2339,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); + return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); } /** @@ -2526,7 +2526,7 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); + return 
vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); } /* @@ -2607,7 +2607,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) */ val = 0; BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); + ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); return ret; } @@ -2743,7 +2743,6 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); - struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ @@ -2763,9 +2762,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: - dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); - dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); @@ -2792,7 +2789,7 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - return dist->save_its_tables_in_progress; + return dist->table_write_in_progress; } static int vgic_its_set_attr(struct kvm_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 2074521d4a8c..684bdfaad4a9 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -339,7 +339,7 @@ retry: if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } @@ -350,26 +350,23 @@ retry: * The deactivation of the doorbell interrupt will trigger the * unmapping of the associated vPE. 
*/ -static void unmap_all_vpes(struct vgic_dist *dist) +static void unmap_all_vpes(struct kvm *kvm) { - struct irq_desc *desc; + struct vgic_dist *dist = &kvm->arch.vgic; int i; - for (i = 0; i < dist->its_vm.nr_vpes; i++) { - desc = irq_to_desc(dist->its_vm.vpes[i]->irq); - irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); - } + for (i = 0; i < dist->its_vm.nr_vpes; i++) + free_irq(dist->its_vm.vpes[i]->irq, kvm_get_vcpu(kvm, i)); } -static void map_all_vpes(struct vgic_dist *dist) +static void map_all_vpes(struct kvm *kvm) { - struct irq_desc *desc; + struct vgic_dist *dist = &kvm->arch.vgic; int i; - for (i = 0; i < dist->its_vm.nr_vpes; i++) { - desc = irq_to_desc(dist->its_vm.vpes[i]->irq); - irq_domain_activate_irq(irq_desc_get_irq_data(desc), false); - } + for (i = 0; i < dist->its_vm.nr_vpes; i++) + WARN_ON(vgic_v4_request_vpe_irq(kvm_get_vcpu(kvm, i), + dist->its_vm.vpes[i]->irq)); } /** @@ -394,7 +391,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) * and enabling of the doorbells have already been done. 
*/ if (kvm_vgic_global_state.has_gicv4_1) { - unmap_all_vpes(dist); + unmap_all_vpes(kvm); vlpi_avail = true; } @@ -437,14 +434,14 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) else val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } out: if (vlpi_avail) - map_all_vpes(dist); + map_all_vpes(kvm); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index ad06ba6c9b00..a413718be92b 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -222,6 +222,11 @@ void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val) *val = !!(*ptr & mask); } +int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq) +{ + return request_irq(irq, vgic_v4_doorbell_handler, 0, "vcpu", vcpu); +} + /** * vgic_v4_init - Initialize the GICv4 data structures * @kvm: Pointer to the VM being initialized @@ -283,8 +288,7 @@ int vgic_v4_init(struct kvm *kvm) irq_flags &= ~IRQ_NOAUTOEN; irq_set_status_flags(irq, irq_flags); - ret = request_irq(irq, vgic_v4_doorbell_handler, - 0, "vcpu", vcpu); + ret = vgic_v4_request_vpe_irq(vcpu, irq); if (ret) { kvm_err("failed to allocate vcpu IRQ%d\n", irq); /* diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 0c8da72953f0..7f7f3c5ed85a 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -6,6 +6,7 @@ #define __KVM_ARM_VGIC_NEW_H__ #include <linux/irqchip/arm-gic-common.h> +#include <asm/kvm_mmu.h> #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b @@ -131,6 +132,19 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) return vgic_irq_get_lr_count(irq) > 1; } +static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret; + + dist->table_write_in_progress = true; + ret = kvm_write_guest_lock(kvm, gpa, data, len); + 
dist->table_write_in_progress = false; + + return ret; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC @@ -331,5 +345,6 @@ int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); +int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); #endif diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index f6a502e8f02c..6e948d015332 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u asmlinkage long ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) { + struct timespec64 rtn_tp; + s64 tick_ns; + /* * ia64's clock_gettime() syscall is implemented as a vdso call * fsys_clock_gettime(). 
Currently it handles only @@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user * switch (which_clock) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: - s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); - struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + rtn_tp = ns_to_timespec64(tick_ns); return put_timespec64(&rtn_tp, tp); } diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 4dfe1f49c5c8..6817892a2c58 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -1303,7 +1303,7 @@ static char iodc_dbuf[4096] __page_aligned_bss; */ int pdc_iodc_print(const unsigned char *str, unsigned count) { - unsigned int i; + unsigned int i, found = 0; unsigned long flags; count = min_t(unsigned int, count, sizeof(iodc_dbuf)); @@ -1315,6 +1315,7 @@ int pdc_iodc_print(const unsigned char *str, unsigned count) iodc_dbuf[i+0] = '\r'; iodc_dbuf[i+1] = '\n'; i += 2; + found = 1; goto print; default: iodc_dbuf[i] = str[i]; @@ -1330,7 +1331,7 @@ print: __pa(pdc_result), 0, __pa(iodc_dbuf), i, 0); spin_unlock_irqrestore(&pdc_lock, flags); - return i; + return i - found; } #if !defined(BOOTLOADER) diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 69c62933e952..ceb45f51d52e 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -126,6 +126,12 @@ long arch_ptrace(struct task_struct *child, long request, unsigned long tmp; long ret = -EIO; + unsigned long user_regs_struct_size = sizeof(struct user_regs_struct); +#ifdef CONFIG_64BIT + if (is_compat_task()) + user_regs_struct_size /= 2; +#endif + switch (request) { /* Read the word at location addr in the USER area. 
For ptraced @@ -166,7 +172,7 @@ long arch_ptrace(struct task_struct *child, long request, addr >= sizeof(struct pt_regs)) break; if (addr == PT_IAOQ0 || addr == PT_IAOQ1) { - data |= 3; /* ensure userspace privilege */ + data |= PRIV_USER; /* ensure userspace privilege */ } if ((addr >= PT_GR1 && addr <= PT_GR31) || addr == PT_IAOQ0 || addr == PT_IAOQ1 || @@ -181,14 +187,14 @@ long arch_ptrace(struct task_struct *child, long request, return copy_regset_to_user(child, task_user_regset_view(current), REGSET_GENERAL, - 0, sizeof(struct user_regs_struct), + 0, user_regs_struct_size, datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, task_user_regset_view(current), REGSET_GENERAL, - 0, sizeof(struct user_regs_struct), + 0, user_regs_struct_size, datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ @@ -285,7 +291,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (addr >= sizeof(struct pt_regs)) break; if (addr == PT_IAOQ0+4 || addr == PT_IAOQ1+4) { - data |= 3; /* ensure userspace privilege */ + data |= PRIV_USER; /* ensure userspace privilege */ } if (addr >= PT_FR0 && addr <= PT_FR31 + 4) { /* Special case, fp regs are 64 bits anyway */ @@ -302,6 +308,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } } break; + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPREGS: + return arch_ptrace(child, request, addr, data); default: ret = compat_ptrace_request(child, request, addr, data); @@ -484,7 +495,7 @@ static void set_reg(struct pt_regs *regs, int num, unsigned long val) case RI(iaoq[0]): case RI(iaoq[1]): /* set 2 lowest bits to ensure userspace privilege: */ - regs->iaoq[num - RI(iaoq[0])] = val | 3; + regs->iaoq[num - RI(iaoq[0])] = val | PRIV_USER; return; case RI(sar): regs->sar = val; return; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b8c4ac56bddc..7a5f8dbfbdd0 100644 --- 
a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -163,7 +163,6 @@ config PPC select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx - select ARCH_WANTS_NO_INSTR select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_TABLE_SORT diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index dd39313242b4..2bbc0fcce04a 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -97,6 +97,8 @@ static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) radix__tlb_flush(tlb); + else + hash__tlb_flush(tlb); } #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 77fa88c2aed0..eb6d094083fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -173,6 +173,15 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } +static inline notrace unsigned long irq_soft_mask_andc_return(unsigned long mask) +{ + unsigned long flags = irq_soft_mask_return(); + + irq_soft_mask_set(flags & ~mask); + + return flags; +} + static inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); @@ -192,7 +201,7 @@ static inline void arch_local_irq_enable(void) static inline unsigned long arch_local_irq_save(void) { - return irq_soft_mask_set_return(IRQS_DISABLED); + return irq_soft_mask_or_return(IRQS_DISABLED); } static inline bool arch_irqs_disabled_flags(unsigned long flags) @@ -331,10 +340,11 @@ bool power_pmu_wants_prompt_pmi(void); * is a different soft-masked interrupt pending that requires hard * masking. 
*/ -static inline bool should_hard_irq_enable(void) +static inline bool should_hard_irq_enable(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + WARN_ON(!(get_paca()->irq_happened & PACA_IRQ_HARD_DIS)); WARN_ON(mfmsr() & MSR_EE); } @@ -347,8 +357,17 @@ static inline bool should_hard_irq_enable(void) * * TODO: Add test for 64e */ - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !power_pmu_wants_prompt_pmi()) - return false; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) { + if (!power_pmu_wants_prompt_pmi()) + return false; + /* + * If PMIs are disabled then IRQs should be disabled as well, + * so we shouldn't see this condition, check for it just in + * case because we are about to enable PMIs. + */ + if (WARN_ON_ONCE(regs->softe & IRQS_PMI_DISABLED)) + return false; + } if (get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK) return false; @@ -358,18 +377,16 @@ static inline bool should_hard_irq_enable(void) /* * Do the hard enabling, only call this if should_hard_irq_enable is true. + * This allows PMI interrupts to profile irq handlers. */ static inline void do_hard_irq_enable(void) { - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); - WARN_ON(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK); - WARN_ON(mfmsr() & MSR_EE); - } /* - * This allows PMI interrupts (and watchdog soft-NMIs) through. - * There is no other reason to enable this way. + * Asynch interrupts come in with IRQS_ALL_DISABLED, + * PACA_IRQ_HARD_DIS, and MSR[EE]=0. 
*/ + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + irq_soft_mask_andc_return(IRQS_PMI_DISABLED); get_paca()->irq_happened &= ~PACA_IRQ_HARD_DIS; __hard_irq_enable(); } @@ -452,7 +469,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) return !(regs->msr & MSR_EE); } -static __always_inline bool should_hard_irq_enable(void) +static __always_inline bool should_hard_irq_enable(struct pt_regs *regs) { return false; } diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index f55c6fb34a3a..5712dd846263 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -27,7 +27,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception) ppc_msgsync(); - if (should_hard_irq_enable()) + if (should_hard_irq_enable(regs)) do_hard_irq_enable(); kvmppc_clear_host_ipi(smp_processor_id()); diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index d438ca74e96c..fdbee1093e2b 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -864,7 +864,7 @@ _GLOBAL(load_up_spe) * SPE unavailable trap from kernel - print a message, but let * the task use SPE in the kernel until it returns to user mode. 
*/ -KernelSPE: +SYM_FUNC_START_LOCAL(KernelSPE) lwz r3,_MSR(r1) oris r3,r3,MSR_SPE@h stw r3,_MSR(r1) /* enable use of SPE after return */ @@ -881,6 +881,7 @@ KernelSPE: #endif .align 4,0 +SYM_FUNC_END(KernelSPE) #endif /* CONFIG_SPE */ /* diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index fc6631a80527..0ec1581619db 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -50,16 +50,18 @@ static inline bool exit_must_hard_disable(void) */ static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable) { + bool must_hard_disable = (exit_must_hard_disable() || !restartable); + /* This must be done with RI=1 because tracing may touch vmaps */ trace_hardirqs_on(); - if (exit_must_hard_disable() || !restartable) + if (must_hard_disable) __hard_EE_RI_disable(); #ifdef CONFIG_PPC64 /* This pattern matches prep_irq_for_idle */ if (unlikely(lazy_irq_pending_nocheck())) { - if (exit_must_hard_disable() || !restartable) { + if (must_hard_disable) { local_paca->irq_happened |= PACA_IRQ_HARD_DIS; __hard_RI_enable(); } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c5b9ce887483..c9535f2760b5 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -238,7 +238,7 @@ static void __do_irq(struct pt_regs *regs, unsigned long oldsp) irq = static_call(ppc_get_irq)(); /* We can hard enable interrupts now to allow perf interrupts */ - if (should_hard_irq_enable()) + if (should_hard_irq_enable(regs)) do_hard_irq_enable(); /* And finally process it */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index d68de3618741..e26eb6618ae5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -515,7 +515,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) } /* Conditionally hard-enable interrupts. 
*/ - if (should_hard_irq_enable()) { + if (should_hard_irq_enable(regs)) { /* * Ensure a positive value is written to the decrementer, or * else some CPUs will continue to take decrementer exceptions. diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index af8854f9eae3..9be3e818a240 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -26,6 +26,7 @@ #include <asm/firmware.h> #include <asm/kexec_ranges.h> #include <asm/crashdump-ppc64.h> +#include <asm/mmzone.h> #include <asm/prom.h> struct umem_info { @@ -989,10 +990,13 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) * linux,drconf-usable-memory properties. Get an approximate on the * number of usable memory entries and use for FDT size estimation. */ - usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) + - (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); - - extra_size = (unsigned int)(usm_entries * sizeof(u64)); + if (drmem_lmb_size()) { + usm_entries = ((memory_hotplug_max() / drmem_lmb_size()) + + (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); + extra_size = (unsigned int)(usm_entries * sizeof(u64)); + } else { + extra_size = 0; + } /* * Get the number of CPU nodes in the current DT. 
This allows to diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 0dce93ccaadf..e89281d3ba28 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -912,16 +912,15 @@ static int kvmppc_handle_debug(struct kvm_vcpu *vcpu) static void kvmppc_fill_pt_regs(struct pt_regs *regs) { - ulong r1, ip, msr, lr; + ulong r1, msr, lr; asm("mr %0, 1" : "=r"(r1)); asm("mflr %0" : "=r"(lr)); asm("mfmsr %0" : "=r"(msr)); - asm("bl 1f; 1: mflr %0" : "=r"(ip)); memset(regs, 0, sizeof(*regs)); regs->gpr[1] = r1; - regs->nip = ip; + regs->nip = _THIS_IP_; regs->msr = msr; regs->link = lr; } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index cac727b01799..26245aaf12b8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -234,6 +234,14 @@ void radix__mark_rodata_ro(void) end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } } void radix__mark_initmem_nx(void) @@ -262,6 +270,22 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + if (addr < __pa_symbol(__srwx_boundary)) return __pa_symbol(__srwx_boundary); #endif diff --git 
a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 100e97daf76b..9d229ef7f86e 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -22,7 +22,7 @@ * Used to avoid races in counting the nest-pmu units during hotplug * register and unregister */ -static DEFINE_SPINLOCK(nest_init_lock); +static DEFINE_MUTEX(nest_init_lock); static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); static struct imc_pmu **per_nest_pmu_arr; static cpumask_t nest_imc_cpumask; @@ -1629,7 +1629,7 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) { if (pmu_ptr->domain == IMC_DOMAIN_NEST) { - spin_lock(&nest_init_lock); + mutex_lock(&nest_init_lock); if (nest_pmus == 1) { cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); @@ -1639,7 +1639,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) if (nest_pmus > 0) nest_pmus--; - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); } /* Free core_imc memory */ @@ -1796,11 +1796,11 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id * rest. To handle the cpuhotplug callback unregister, we track * the number of nest pmus in "nest_pmus". */ - spin_lock(&nest_init_lock); + mutex_lock(&nest_init_lock); if (nest_pmus == 0) { ret = init_nest_pmu_ref(); if (ret) { - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; goto err_free_mem; @@ -1808,7 +1808,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id /* Register for cpu hotplug notification. 
*/ ret = nest_pmu_cpumask_init(); if (ret) { - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; @@ -1816,7 +1816,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id } } nest_pmus++; - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); break; case IMC_DOMAIN_CORE: ret = core_imc_pmu_cpumask_init(); diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index dbcfe361831a..ea807aa0c31a 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -92,7 +92,7 @@ out: } static int -spufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +spufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -100,7 +100,7 @@ spufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size != inode->i_size)) return -EINVAL; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -237,7 +237,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, if (!inode) return -ENOSPC; - inode_init_owner(&init_user_ns, inode, dir, mode | S_IFDIR); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode | S_IFDIR); ctx = alloc_spu_context(SPUFS_I(dir)->i_gang); /* XXX gang */ SPUFS_I(inode)->i_ctx = ctx; if (!ctx) { @@ -468,7 +468,7 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; ret = 0; - inode_init_owner(&init_user_ns, inode, dir, mode | S_IFDIR); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode | S_IFDIR); gang = alloc_spu_gang(); SPUFS_I(inode)->i_ctx = NULL; SPUFS_I(inode)->i_gang = gang; diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index faf2c2177094..82153960ac00 100644 --- a/arch/riscv/Makefile 
+++ b/arch/riscv/Makefile @@ -80,6 +80,9 @@ ifeq ($(CONFIG_PERF_EVENTS),y) KBUILD_CFLAGS += -fno-omit-frame-pointer endif +# Avoid generating .eh_frame sections. +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax) KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 7226e2462584..2c0f4c887289 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -46,7 +46,7 @@ .macro ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ new_c_2, vendor_id_2, errata_id_2, enable_2 - ALTERNATIVE_CFG \old_c, \new_c_1, \vendor_id_1, \errata_id_1, \enable_1 + ALTERNATIVE_CFG "\old_c", "\new_c_1", \vendor_id_1, \errata_id_1, \enable_1 ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 .endm diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 86328e3acb02..64ad1937e714 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -70,7 +70,6 @@ static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX); */ enum riscv_isa_ext_key { RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */ - RISCV_ISA_EXT_KEY_ZIHINTPAUSE, RISCV_ISA_EXT_KEY_SVINVAL, RISCV_ISA_EXT_KEY_MAX, }; @@ -91,8 +90,6 @@ static __always_inline int riscv_isa_ext2key(int num) return RISCV_ISA_EXT_KEY_FPU; case RISCV_ISA_EXT_d: return RISCV_ISA_EXT_KEY_FPU; - case RISCV_ISA_EXT_ZIHINTPAUSE: - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE; case RISCV_ISA_EXT_SVINVAL: return RISCV_ISA_EXT_KEY_SVINVAL; default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 4eba9a98d0e3..3e01f4f3ab08 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -721,6 +721,10 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, 
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } + +#define pmdp_collapse_flush pmdp_collapse_flush +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h index fa70cfe507aa..14f5d27783b8 100644 --- a/arch/riscv/include/asm/vdso/processor.h +++ b/arch/riscv/include/asm/vdso/processor.h @@ -4,30 +4,26 @@ #ifndef __ASSEMBLY__ -#include <linux/jump_label.h> #include <asm/barrier.h> -#include <asm/hwcap.h> static inline void cpu_relax(void) { - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) { #ifdef __riscv_muldiv - int dummy; - /* In lieu of a halt instruction, induce a long-latency stall. */ - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); + int dummy; + /* In lieu of a halt instruction, induce a long-latency stall. */ + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); #endif - } else { - /* - * Reduce instruction retirement. - * This assumes the PC changes. - */ -#ifdef CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE - __asm__ __volatile__ ("pause"); + +#ifdef __riscv_zihintpause + /* + * Reduce instruction retirement. + * This assumes the PC changes. 
+ */ + __asm__ __volatile__ ("pause"); #else - /* Encoding of the pause instruction */ - __asm__ __volatile__ (".4byte 0x100000F"); + /* Encoding of the pause instruction */ + __asm__ __volatile__ (".4byte 0x100000F"); #endif - } barrier(); } diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index b865046e4dbb..4bf6c449d78b 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -326,7 +326,7 @@ clear_bss_done: call soc_early_init tail start_kernel -#if CONFIG_RISCV_BOOT_SPINWAIT +#ifdef CONFIG_RISCV_BOOT_SPINWAIT .Lsecondary_start: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index f21592d20306..2bedec37d092 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -48,15 +48,35 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) post_kprobe_handler(p, kcb, regs); } +static bool __kprobes arch_check_kprobe(struct kprobe *p) +{ + unsigned long tmp = (unsigned long)p->addr - p->offset; + unsigned long addr = (unsigned long)p->addr; + + while (tmp <= addr) { + if (tmp == addr) + return true; + + tmp += GET_INSN_LENGTH(*(u16 *)tmp); + } + + return false; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { - unsigned long probe_addr = (unsigned long)p->addr; + u16 *insn = (u16 *)p->addr; + + if ((unsigned long)insn & 0x1) + return -EILSEQ; - if (probe_addr & 0x1) + if (!arch_check_kprobe(p)) return -EILSEQ; /* copy instruction */ - p->opcode = *p->addr; + p->opcode = (kprobe_opcode_t)(*insn++); + if (GET_INSN_LENGTH(p->opcode) == 4) + p->opcode |= (kprobe_opcode_t)(*insn) << 16; /* decode instruction */ switch (riscv_probe_decode_insn(p->addr, &p->ainsn.api)) { diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index d73e96f6ed7c..a20568bd1f1a 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ 
b/arch/riscv/kernel/probes/simulate-insn.c @@ -71,11 +71,11 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg u32 rd_index = (opcode >> 7) & 0x1f; u32 rs1_index = (opcode >> 15) & 0x1f; - ret = rv_insn_reg_set_val(regs, rd_index, addr + 4); + ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); if (!ret) return ret; - ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); + ret = rv_insn_reg_set_val(regs, rd_index, addr + 4); if (!ret) return ret; diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 3373df413c88..ddb2afba6d25 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -39,7 +39,6 @@ static DECLARE_COMPLETION(cpu_running); void __init smp_prepare_boot_cpu(void) { - init_cpu_topology(); } void __init smp_prepare_cpus(unsigned int max_cpus) @@ -48,6 +47,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) int ret; unsigned int curr_cpuid; + init_cpu_topology(); + curr_cpuid = smp_processor_id(); store_cpu_topology(curr_cpuid); numa_store_cpu_info(curr_cpuid); diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 75c8dd64fc48..f9a5a7c90ff0 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -32,6 +32,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = (unsigned long)__builtin_frame_address(0); sp = current_stack_pointer; pc = (unsigned long)walk_stackframe; + level = -1; } else { /* task blocked in __switch_to */ fp = task->thread.s[0]; @@ -43,7 +44,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, unsigned long low, high; struct stackframe *frame; - if (unlikely(!__kernel_text_address(pc) || (level++ >= 1 && !fn(arg, pc)))) + if (unlikely(!__kernel_text_address(pc) || (level++ >= 0 && !fn(arg, pc)))) break; /* Validate frame pointer */ diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 3cc07ed45aeb..fcd6145fbead 100644 
--- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -90,8 +90,10 @@ void flush_icache_pte(pte_t pte) if (PageHuge(page)) page = compound_head(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + if (!test_bit(PG_dcache_clean, &page->flags)) { flush_icache_all(); + set_bit(PG_dcache_clean, &page->flags); + } } #endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index 6645ead1a7c1..fef4e7328e49 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -81,3 +81,23 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + /* + * When leaf PTE entries (regular pages) are collapsed into a leaf + * PMD entry (huge page), a valid non-leaf PTE is converted into a + * valid leaf PTE at the level 1 page table. Since the sfence.vma + * forms that specify an address only apply to leaf PTEs, we need a + * global flush here. collapse_huge_page() assumes these flushes are + * eager, so just do the fence here. 
+ */ + flush_tlb_mm(vma->vm_mm); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index 8dcd7af2911a..b519a1f045d8 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -80,6 +80,6 @@ void *decompress_kernel(void) void *output = (void *)decompress_offset; __decompress(_compressed_start, _compressed_end - _compressed_start, - NULL, NULL, output, 0, NULL, error); + NULL, NULL, output, vmlinux.image_size, NULL, error); return output; } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 3161b9ccd2a5..b6276a3521d7 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -4,6 +4,7 @@ * Written by Niibe Yutaka and Paul Mundt */ OUTPUT_ARCH(sh) +#define RUNTIME_DISCARD_EXIT #include <asm/thread_info.h> #include <asm/cache.h> #include <asm/vmlinux.lds.h> diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 9cf07322875a..73ed982d4100 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -14,13 +14,13 @@ endif ifdef CONFIG_CC_IS_GCC RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) -RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register) endif ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk RETPOLINE_VDSO_CFLAGS := -mretpoline endif +RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) ifdef CONFIG_RETHUNK RETHUNK_CFLAGS := -mfunction-return=thunk-extern diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index d4a314cc50d6..321a5011042d 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -180,6 +180,12 @@ void initialize_identity_maps(void *rmode) /* Load the new page-table. 
*/ write_cr3(top_level_pgt); + + /* + * Now that the required page table mappings are established and a + * GHCB can be used, check for SNP guest/HV feature compatibility. + */ + snp_check_features(); } static pte_t *split_large_pmd(struct x86_mapping_info *info, diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 62208ec04ca4..20118fb7c53b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -126,6 +126,7 @@ static inline void console_init(void) #ifdef CONFIG_AMD_MEM_ENCRYPT void sev_enable(struct boot_params *bp); +void snp_check_features(void); void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); void snp_set_page_private(unsigned long paddr); @@ -143,6 +144,7 @@ static inline void sev_enable(struct boot_params *bp) if (bp) bp->cc_blob_address = 0; } +static inline void snp_check_features(void) { } static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) { diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index c93930d5ccbd..d63ad8f99f83 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -208,6 +208,23 @@ void sev_es_shutdown_ghcb(void) error("Can't unmap GHCB page"); } +static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set, + unsigned int reason, u64 exit_info_2) +{ + u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + bool sev_es_check_ghcb_fault(unsigned long address) { /* Check whether the fault was on the GHCB page */ @@ -270,6 +287,59 @@ static void enforce_vmpl0(void) sev_es_terminate(SEV_TERM_SET_LINUX, 
GHCB_TERM_NOT_VMPL0); } +/* + * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need + * guest side implementation for proper functioning of the guest. If any + * of these features are enabled in the hypervisor but are lacking guest + * side implementation, the behavior of the guest will be undefined. The + * guest could fail in a non-obvious way, making it difficult to debug. + * + * As the behavior of reserved feature bits is unknown, to be on the + * safe side add them to the required features mask. + */ +#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \ + MSR_AMD64_SNP_REFLECT_VC | \ + MSR_AMD64_SNP_RESTRICTED_INJ | \ + MSR_AMD64_SNP_ALT_INJ | \ + MSR_AMD64_SNP_DEBUG_SWAP | \ + MSR_AMD64_SNP_VMPL_SSS | \ + MSR_AMD64_SNP_SECURE_TSC | \ + MSR_AMD64_SNP_VMGEXIT_PARAM | \ + MSR_AMD64_SNP_VMSA_REG_PROTECTION | \ + MSR_AMD64_SNP_RESERVED_BIT13 | \ + MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_RESERVED_MASK) + +/* + * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented + * by the guest kernel. As and when a new feature is implemented in the + * guest kernel, a corresponding bit should be added to the mask. + */ +#define SNP_FEATURES_PRESENT (0) + +void snp_check_features(void) +{ + u64 unsupported; + + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Terminate the boot if hypervisor has enabled any feature lacking + * guest side implementation. Pass on the unsupported features mask through + * EXIT_INFO_2 of the GHCB protocol so that those features can be reported + * as part of the guest boot failure.
+ */ + unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; + if (unsupported) { + if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb())) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN, + GHCB_SNP_UNSUPPORTED, unsupported); + } +} + void sev_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 85a63a41c471..d096b04bf80e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2974,17 +2974,19 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { - if (!x86_pmu_initialized()) { + /* This API doesn't currently support enumerating hybrid PMUs. */ + if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || + !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } - cap->version = x86_pmu.version; /* - * KVM doesn't support the hybrid PMU yet. - * Return the common value in global x86_pmu, - * which available for all cores. + * Note, hybrid CPU models get tracked as having hybrid PMUs even when + * all E-cores are disabled via BIOS. When E-cores are disabled, the + * base PMU holds the correct number of counters for P-cores. 
*/ + cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu.num_counters; cap->num_counters_fixed = x86_pmu.num_counters_fixed; cap->bit_width_gp = x86_pmu.cntval_bits; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dfd2c124cdf8..bafdc2be479a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6339,6 +6339,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_SAPPHIRERAPIDS_X: + case INTEL_FAM6_EMERALDRAPIDS_X: pmem = true; x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 3019fb1926e3..551741e79e03 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -677,6 +677,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 65064d9f7fa6..8eb74cf386db 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -14,6 +14,7 @@ #include <asm/mmu.h> #include <asm/mpspec.h> #include <asm/x86_init.h> +#include <asm/cpufeature.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -63,6 +64,13 @@ extern int (*acpi_suspend_lowlevel)(void); /* Physical address to resume after wakeup */ unsigned long acpi_get_wakeup_address(void); +static inline bool acpi_skip_set_wakeup_address(void) +{ + return cpu_feature_enabled(X86_FEATURE_XENPV); +} + +#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address + /* * Check if the CPU can handle C2 and deeper */ diff --git 
a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 61012476d66e..8f39c46197b8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -466,5 +466,6 @@ #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b049d950612f..ca97442e8d49 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -39,7 +39,20 @@ static __always_inline unsigned long native_get_debugreg(int regno) asm("mov %%db6, %0" :"=r" (val)); break; case 7: - asm("mov %%db7, %0" :"=r" (val)); + /* + * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them + * with other code. + * + * This is needed because a DR7 access can cause a #VC exception + * when running under SEV-ES. Taking a #VC exception is not a + * safe thing to do just anywhere in the entry code and + * re-ordering might place the access into an unsafe location. + * + * This happened in the NMI handler, where the DR7 read was + * re-ordered to happen before the call to sev_es_ist_enter(), + * causing stack recursion. + */ + asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); @@ -66,7 +79,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) asm("mov %0, %%db6" ::"r" (value)); break; case 7: - asm("mov %0, %%db7" ::"r" (value)); + /* + * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them + * with other code. 
+ * + * While it didn't happen with a DR7 write (see the DR7 read + * comment above which explains where it happened), add the + * __FORCE_ORDER here too to avoid similar problems in the + * future. + */ + asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 347707d459c6..cbaf174d8efd 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -123,6 +123,8 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_FAM6_LUNARLAKE_M 0xBD + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 37ff47552bcb..d3fe82c5d6b6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -566,6 +566,26 @@ #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) +/* SNP feature bits enabled by the hypervisor */ +#define MSR_AMD64_SNP_VTOM BIT_ULL(3) +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) +#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) +#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) + +/* SNP feature bits reserved for future use.
*/ +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) + #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f /* AMD Collaborative Processor Performance Control MSRs */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index f69c168391aa..80e1df482337 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -116,6 +116,12 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ + /* SW_EXITINFO1[3:0] */ \ + (((((u64)reason_set) & 0xf)) | \ + /* SW_EXITINFO1[11:4] */ \ + ((((u64)reason_code) & 0xff) << 4)) #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff /* Exit code reserved for hypervisor/software use */ diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 1f60a2b27936..fdbb5f07448f 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -330,7 +330,16 @@ static void __init bp_init_freq_invariance(void) static void disable_freq_invariance_workfn(struct work_struct *work) { + int cpu; + static_branch_disable(&arch_scale_freq_key); + + /* + * Set arch_freq_scale to a default value on all cpus + * This negates the effect of scaling + */ + for_each_possible_cpu(cpu) + per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9cfca3d7d0e2..f3cc7699e1e1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1256,6 +1256,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define MMIO_SBDS BIT(2) /* CPU is affected by RETbleed, speculating where you would not expect it */ #define RETBLEED BIT(3) +/* CPU is affected by SMT 
(cross-thread) return predictions */ +#define SMT_RSB BIT(4) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1287,8 +1289,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED), - VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), {} }; @@ -1406,6 +1408,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !(ia32_cap & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 3aa5304200c5..4d8aff05a509 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq) disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); + irq_set_status_flags(irq, IRQ_LEVEL); enable_irq(irq); lapic_assign_legacy_vector(irq, true); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index beb1bada1b0a..c683666876f1 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -65,8 +65,10 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); - for (i = 0; i < nr_legacy_irqs(); i++) + for (i = 0; i < nr_legacy_irqs(); i++) { irq_set_chip_and_handler(i, chip, handle_level_irq); + irq_set_status_flags(i, IRQ_LEVEL); + } } void __init init_IRQ(void) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b36f3c367cb2..695873c0f50b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -625,7 +625,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn) /* 1 byte conditional jump */ 
p->ainsn.emulate_op = kprobe_emulate_jcc; p->ainsn.jcc.type = opcode & 0xf; - p->ainsn.rel32 = *(char *)insn->immediate.bytes; + p->ainsn.rel32 = insn->immediate.value; break; case 0x0f: opcode = insn->opcode.bytes[1]; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cdb91009701d..ee67ba625094 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -165,15 +165,27 @@ static inline void kvm_init_pmu_capability(void) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * For Intel, only support guest architectural pmu - * on a host with architectural pmu. - */ - if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs. Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) enable_pmu = false; + if (enable_pmu) { + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. 
+ */ + if ((is_intel && !kvm_pmu_cap.version) || + !kvm_pmu_cap.num_counters_gp) + enable_pmu = false; + } + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fc9008dbed33..7eec0226d56a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3440,18 +3440,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable || !var->present) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + ar |= (var->unusable || !var->present) << 16; return ar; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index da4bbd043a7b..a2c299d47e69 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -191,6 +191,10 @@ module_param(enable_pmu, bool, 0444); bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); +/* Enable/disable SMT_RSB bug mitigation */ +bool __read_mostly mitigate_smt_rsb; +module_param(mitigate_smt_rsb, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. 
SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -4448,10 +4452,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE; - if(kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; + r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } break; case KVM_CAP_X86_SMM: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -5254,12 +5263,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, { unsigned long val; + memset(dbgregs, 0, sizeof(*dbgregs)); memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -6227,15 +6235,26 @@ split_irqchip_unlock: if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) break; - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; + +#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ + "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." 
+ + if (!mitigate_smt_rsb) { + if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && + (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + pr_warn_once(SMT_RSB_MSG); + + if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && + kvm_can_mwait_in_guest()) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; + } + r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -13456,6 +13475,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); static int __init kvm_x86_init(void) { kvm_mmu_x86_module_init(); + mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; } module_init(kvm_x86_init); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index fb4b1b5e0dea..46de9cf5c91d 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -387,8 +387,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, u8 mtrr_type, uniform; mtrr_type = mtrr_type_lookup(start, end, &uniform); - if (mtrr_type != MTRR_TYPE_WRBACK && - mtrr_type != MTRR_TYPE_INVALID) + if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; return _PAGE_CACHE_MODE_WB; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index b94f727251b6..8babce71915f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -392,6 +392,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) { for (i = 0; i < msidesc->nvec_used; i++) xen_destroy_irq(msidesc->irq + i); + msidesc->irq = 0; } } @@ -433,6 +434,7 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = { }; static struct msi_domain_info xen_pci_msi_domain_info = { + .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS, .ops = &xen_pci_msi_domain_ops, }; diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 7d9b15f0dbd5..0fbde0fc0628 100644 --- 
a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -769,8 +769,8 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, * request from the old cgroup. */ bfq_put_cooperator(sync_bfqq); - bfq_release_process_ref(bfqd, sync_bfqq); bic_set_bfqq(bic, NULL, true); + bfq_release_process_ref(bfqd, sync_bfqq); } } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ccf2204477a5..380e9bda2e57 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5425,9 +5425,11 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { - bfq_release_process_ref(bfqd, bfqq); + struct bfq_queue *old_bfqq = bfqq; + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); bic_set_bfqq(bic, bfqq, false); + bfq_release_process_ref(bfqd, old_bfqq); } bfqq = bic_to_bfqq(bic, true); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4c94a6560f62..9ac1efb053e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2001,6 +2001,10 @@ void blk_cgroup_bio_start(struct bio *bio) struct blkg_iostat_set *bis; unsigned long flags; + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(blkcg->css.cgroup)) + return; + cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); diff --git a/block/blk-map.c b/block/blk-map.c index 19940c978c73..f2135e6ee8f6 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -31,7 +31,8 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, return NULL; memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); bmd->iter = *data; - bmd->iter.iov = bmd->iov; + if (iter_is_iovec(data)) + bmd->iter.iov = bmd->iov; return bmd; } @@ -641,7 +642,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, copy = true; else if (iov_iter_is_bvec(iter)) map_bvec = true; - else if (!iter_is_iovec(iter)) + else if (!user_backed_iter(iter)) copy = true; else if 
(queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); @@ -682,9 +683,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { - struct iovec iov; struct iov_iter i; - int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); + int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i); if (unlikely(ret < 0)) return ret; diff --git a/block/blk-merge.c b/block/blk-merge.c index b7c193d67185..64bf7d9dd8e8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. */ -static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -336,6 +336,7 @@ split: bio_clear_polled(bio); return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } +EXPORT_SYMBOL_GPL(bio_split_rw); /** * __bio_split_to_limits - split a bio to fit the queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d463f7563bc..9c8dc70020bc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4069,8 +4069,9 @@ EXPORT_SYMBOL(blk_mq_init_queue); * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * - * This shuts down a request queue allocated by blk_mq_init_queue() and drops - * the initial reference. All future requests will failed with -ENODEV. + * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * requests will be failed with -ENODEV. The caller is responsible for dropping + * the reference from blk_mq_init_queue() by calling blk_put_queue(). 
* * Context: can sleep */ diff --git a/certs/Makefile b/certs/Makefile index 9486ed924731..799ad7b9e68a 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -23,8 +23,8 @@ $(obj)/blacklist_hash_list: $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) FORCE targets += blacklist_hash_list quiet_cmd_extract_certs = CERT $@ - cmd_extract_certs = $(obj)/extract-cert $(extract-cert-in) $@ -extract-cert-in = $(or $(filter-out $(obj)/extract-cert, $(real-prereqs)),"") + cmd_extract_certs = $(obj)/extract-cert "$(extract-cert-in)" $@ +extract-cert-in = $(filter-out $(obj)/extract-cert, $(real-prereqs)) $(obj)/system_certificates.o: $(obj)/x509_certificate_list diff --git a/certs/blacklist.c b/certs/blacklist.c index 41f10601cc72..675dd7a8f07a 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -183,16 +183,19 @@ static int mark_raw_hash_blacklisted(const char *hash) { key_ref_t key; - key = key_create_or_update(make_key_ref(blacklist_keyring, true), - "blacklist", - hash, - NULL, - 0, - BLACKLIST_KEY_PERM, - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_BUILT_IN); + key = key_create(make_key_ref(blacklist_keyring, true), + "blacklist", + hash, + NULL, + 0, + BLACKLIST_KEY_PERM, + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN); if (IS_ERR(key)) { - pr_err("Problem blacklisting hash (%ld)\n", PTR_ERR(key)); + if (PTR_ERR(key) == -EEXIST) + pr_warn("Duplicate blacklisted hash %s\n", hash); + else + pr_err("Problem blacklisting hash %s: %pe\n", hash, key); return PTR_ERR(key); } return 0; diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 3df3fe4ed95f..1ef3b46d6f6e 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -83,6 +83,6 @@ config FIPS_SIGNATURE_SELFTEST for FIPS. 
depends on KEYS depends on ASYMMETRIC_KEY_TYPE - depends on PKCS7_MESSAGE_PARSER + depends on PKCS7_MESSAGE_PARSER=X509_CERTIFICATE_PARSER endif # ASYMMETRIC_KEY_TYPE diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f6321c785714..4fa769c4bcdb 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -485,3 +485,4 @@ int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7, pkcs7->data_len = datalen; return 0; } +EXPORT_SYMBOL_GPL(pkcs7_supply_detached_data); diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 2f8352e88860..eca5671ad3f2 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -186,8 +186,28 @@ static int software_key_query(const struct kernel_pkey_params *params, len = crypto_akcipher_maxsize(tfm); info->key_size = len * 8; - info->max_data_size = len; - info->max_sig_size = len; + + if (strncmp(pkey->pkey_algo, "ecdsa", 5) == 0) { + /* + * ECDSA key sizes are much smaller than RSA, and thus could + * operate on (hashed) inputs that are larger than key size. + * For example SHA384-hashed input used with secp256r1 + * based keys. Set max_data_size to be at least as large as + * the largest supported hash size (SHA512) + */ + info->max_data_size = 64; + + /* + * Verify takes ECDSA-Sig (described in RFC 5480) as input, + * which is actually 2 'key_size'-bit integers encoded in + * ASN.1. Account for the ASN.1 encoding overhead here. 
+ */ + info->max_sig_size = 2 * (len + 3) + 2; + } else { + info->max_data_size = len; + info->max_sig_size = len; + } + info->max_enc_size = len; info->max_dec_size = len; info->supported_ops = (KEYCTL_SUPPORTS_ENCRYPT | diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index f1cc5ec6a3b6..4e48d6db05eb 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3297,8 +3297,8 @@ void acpi_nfit_shutdown(void *data) mutex_lock(&acpi_desc->init_mutex); set_bit(ARS_CANCEL, &acpi_desc->scrub_flags); - cancel_delayed_work_sync(&acpi_desc->dwork); mutex_unlock(&acpi_desc->init_mutex); + cancel_delayed_work_sync(&acpi_desc->dwork); /* * Bounce the nvdimm bus lock to make sure any in-flight diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 0b557c0d405e..4ca667251272 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -60,13 +60,17 @@ static struct notifier_block tts_notifier = { .priority = 0, }; +#ifndef acpi_skip_set_wakeup_address +#define acpi_skip_set_wakeup_address() false +#endif + static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP unsigned long acpi_wakeup_address; /* do we have a wakeup address for S2 and S3? 
*/ - if (acpi_state == ACPI_STATE_S3) { + if (acpi_state == ACPI_STATE_S3 && !acpi_skip_set_wakeup_address()) { acpi_wakeup_address = acpi_get_wakeup_address(); if (!acpi_wakeup_address) return -EFAULT; diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 65cec7bb6d96..a8c02608dde4 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -110,26 +110,6 @@ static bool nvidia_wmi_ec_supported(void) } #endif -static bool apple_gmux_backlight_present(void) -{ - struct acpi_device *adev; - struct device *dev; - - adev = acpi_dev_get_first_match_dev(GMUX_ACPI_HID, NULL, -1); - if (!adev) - return false; - - dev = acpi_get_first_physical_node(adev); - if (!dev) - return false; - - /* - * drivers/platform/x86/apple-gmux.c only supports old style - * Apple GMUX with an IO-resource. - */ - return pnp_get_resource(to_pnp_dev(dev), IORESOURCE_IO, 0) != NULL; -} - /* Force to use vendor driver when the ACPI device is known to be * buggy */ static int video_detect_force_vendor(const struct dmi_system_id *d) @@ -612,6 +592,14 @@ static const struct dmi_system_id video_detect_dmi_table[] = { }, { .callback = video_detect_force_native, + /* Asus U46E */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "U46E"), + }, + }, + { + .callback = video_detect_force_native, /* Asus UX303UB */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), @@ -620,6 +608,23 @@ static const struct dmi_system_id video_detect_dmi_table[] = { }, { .callback = video_detect_force_native, + /* HP EliteBook 8460p */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP EliteBook 8460p"), + }, + }, + { + .callback = video_detect_force_native, + /* HP Pavilion g6-1d80nr / B4U19UA */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion g6 Notebook PC"), + DMI_MATCH(DMI_PRODUCT_SKU, "B4U19UA"), + }, + }, + { + 
.callback = video_detect_force_native, /* Samsung N150P */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), @@ -766,6 +771,7 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native) { static DEFINE_MUTEX(init_mutex); static bool nvidia_wmi_ec_present; + static bool apple_gmux_present; static bool native_available; static bool init_done; static long video_caps; @@ -779,6 +785,7 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native) ACPI_UINT32_MAX, find_video, NULL, &video_caps, NULL); nvidia_wmi_ec_present = nvidia_wmi_ec_supported(); + apple_gmux_present = apple_gmux_detect(NULL, NULL); init_done = true; } if (native) @@ -800,7 +807,7 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native) if (nvidia_wmi_ec_present) return acpi_backlight_nvidia_wmi_ec; - if (apple_gmux_backlight_present()) + if (apple_gmux_present) return acpi_backlight_apple_gmux; /* Use ACPI video if available, except when native should be preferred. 
*/ diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 09b2ce7e4c34..348d63d1e3d3 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -352,7 +352,7 @@ static inline bool is_binderfs_control_device(const struct dentry *dentry) return info->control_dentry == dentry; } -static int binderfs_rename(struct user_namespace *mnt_userns, +static int binderfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -361,7 +361,7 @@ static int binderfs_rename(struct user_namespace *mnt_userns, is_binderfs_control_device(new_dentry)) return -EPERM; - return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir, + return simple_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 14a1c0d14916..3bb9bb483fe3 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -421,6 +421,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x34d3), board_ahci_low_power }, /* Ice Lake LP AHCI */ { PCI_VDEVICE(INTEL, 0x02d3), board_ahci_low_power }, /* Comet Lake PCH-U AHCI */ { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_low_power }, /* Comet Lake PCH RAID */ + { PCI_VDEVICE(INTEL, 0xa0d3), board_ahci_low_power }, /* Tiger Lake UP{3,4} AHCI */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 884ae73b11ea..c4c89d24f84c 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3109,7 +3109,7 @@ int sata_down_spd_limit(struct ata_link *link, u32 spd_limit) */ if (spd > 1) mask &= (1 << (spd - 1)) - 1; - else + else if (link->sata_spd) return -EINVAL; /* were we already at the bottom? 
*/ @@ -4045,6 +4045,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "Samsung SSD 870*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM | ATA_HORKAGE_NO_NCQ_ON_ATI }, + { "SAMSUNG*MZ7LH*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_ZERO_AFTER_TRIM | + ATA_HORKAGE_NO_NCQ_ON_ATI, }, { "FCCT*M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM }, diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c index 35608a0cf552..4cbcdc5da038 100644 --- a/drivers/ata/pata_octeon_cf.c +++ b/drivers/ata/pata_octeon_cf.c @@ -67,7 +67,7 @@ module_param(enable_dma, int, 0444); MODULE_PARM_DESC(enable_dma, "Enable use of DMA on interfaces that support it (0=no dma [default], 1=use dma)"); -/** +/* * Convert nanosecond based time to setting used in the * boot bus timing register, based on timing multiple */ @@ -114,7 +114,7 @@ static void octeon_cf_set_boot_reg_cfg(int cs, unsigned int multiplier) cvmx_write_csr(CVMX_MIO_BOOT_REG_CFGX(cs), reg_cfg.u64); } -/** +/* * Called after libata determines the needed PIO mode. This * function programs the Octeon bootbus regions to support the * timing requirements of the PIO mode. @@ -278,7 +278,7 @@ static void octeon_cf_set_dmamode(struct ata_port *ap, struct ata_device *dev) cvmx_write_csr(cf_port->dma_base + DMA_TIM, dma_tim.u64); } -/** +/* * Handle an 8 bit I/O request. * * @qc: Queued command @@ -317,7 +317,7 @@ static unsigned int octeon_cf_data_xfer8(struct ata_queued_cmd *qc, return buflen; } -/** +/* * Handle a 16 bit I/O request. * * @qc: Queued command @@ -372,7 +372,7 @@ static unsigned int octeon_cf_data_xfer16(struct ata_queued_cmd *qc, return buflen; } -/** +/* * Read the taskfile for 16bit non-True IDE only. */ static void octeon_cf_tf_read16(struct ata_port *ap, struct ata_taskfile *tf) @@ -453,7 +453,7 @@ static int octeon_cf_softreset16(struct ata_link *link, unsigned int *classes, return 0; } -/** +/* * Load the taskfile for 16bit non-True IDE only. 
The device_addr is * not loaded, we do this as part of octeon_cf_exec_command16. */ @@ -525,7 +525,7 @@ static void octeon_cf_dma_setup(struct ata_queued_cmd *qc) ap->ops->sff_exec_command(ap, &qc->tf); } -/** +/* * Start a DMA transfer that was already setup * * @qc: Information about the DMA @@ -580,7 +580,7 @@ static void octeon_cf_dma_start(struct ata_queued_cmd *qc) cvmx_write_csr(cf_port->dma_base + DMA_CFG, mio_boot_dma_cfg.u64); } -/** +/* * * LOCKING: * spin_lock_irqsave(host lock) diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index e4bffeabf344..03e8a95f1f35 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -173,7 +173,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode); + err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); if (!err) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; @@ -223,7 +223,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry, mode, + err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; @@ -233,7 +233,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); - notify_change(&init_user_ns, dentry, &newattrs, NULL); + notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ @@ -254,7 +254,7 @@ static int dev_rmdir(const char *name) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { if (d_inode(dentry)->i_private == &thread) - err = vfs_rmdir(&init_user_ns, d_inode(parent.dentry), + err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry); else err = 
-EPERM; @@ -341,9 +341,9 @@ static int handle_remove(const char *nodename, struct device *dev) newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); - notify_change(&init_user_ns, dentry, &newattrs, NULL); + notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); - err = vfs_unlink(&init_user_ns, d_inode(parent.dentry), + err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 17b677b5d3b2..6368b56eacf1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -137,7 +137,7 @@ struct ublk_device { char *__queues; - unsigned short queue_size; + unsigned int queue_size; struct ublksrv_ctrl_dev_info dev_info; struct blk_mq_tag_set tag_set; @@ -2092,13 +2092,12 @@ static void __exit ublk_exit(void) struct ublk_device *ub; int id; - class_destroy(ublk_chr_class); - - misc_deregister(&ublk_misc); - idr_for_each_entry(&ublk_index_idr, ub, id) ublk_remove(ub); + class_destroy(ublk_chr_class); + misc_deregister(&ublk_misc); + idr_destroy(&ublk_index_idr); unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); } diff --git a/drivers/bus/sunxi-rsb.c b/drivers/bus/sunxi-rsb.c index 3aa91aed3bf7..226e87b85116 100644 --- a/drivers/bus/sunxi-rsb.c +++ b/drivers/bus/sunxi-rsb.c @@ -857,7 +857,13 @@ static int __init sunxi_rsb_init(void) return ret; } - return platform_driver_register(&sunxi_rsb_driver); + ret = platform_driver_register(&sunxi_rsb_driver); + if (ret) { + bus_unregister(&sunxi_rsb_bus); + return ret; + } + + return 0; } module_init(sunxi_rsb_init); diff --git a/drivers/char/tpm/eventlog/acpi.c b/drivers/char/tpm/eventlog/acpi.c index 0913d3eb8d51..40360e599bc3 100644 --- a/drivers/char/tpm/eventlog/acpi.c +++ b/drivers/char/tpm/eventlog/acpi.c @@ -14,6 +14,7 @@ * Access to the event log extended by the TCG BIOS of PC platform */ +#include <linux/device.h> #include 
<linux/seq_file.h> #include <linux/fs.h> #include <linux/security.h> @@ -135,7 +136,7 @@ int tpm_read_log_acpi(struct tpm_chip *chip) } /* malloc EventLog space */ - log->bios_event_log = kmalloc(len, GFP_KERNEL); + log->bios_event_log = devm_kmalloc(&chip->dev, len, GFP_KERNEL); if (!log->bios_event_log) return -ENOMEM; @@ -160,7 +161,7 @@ int tpm_read_log_acpi(struct tpm_chip *chip) return format; err: - kfree(log->bios_event_log); + devm_kfree(&chip->dev, log->bios_event_log); log->bios_event_log = NULL; return ret; } diff --git a/drivers/char/tpm/eventlog/efi.c b/drivers/char/tpm/eventlog/efi.c index e6cb9d525e30..4e9d7c2bf32e 100644 --- a/drivers/char/tpm/eventlog/efi.c +++ b/drivers/char/tpm/eventlog/efi.c @@ -6,6 +6,7 @@ * Thiebaud Weksteen <tweek@google.com> */ +#include <linux/device.h> #include <linux/efi.h> #include <linux/tpm_eventlog.h> @@ -55,7 +56,7 @@ int tpm_read_log_efi(struct tpm_chip *chip) } /* malloc EventLog space */ - log->bios_event_log = kmemdup(log_tbl->log, log_size, GFP_KERNEL); + log->bios_event_log = devm_kmemdup(&chip->dev, log_tbl->log, log_size, GFP_KERNEL); if (!log->bios_event_log) { ret = -ENOMEM; goto out; @@ -76,7 +77,7 @@ int tpm_read_log_efi(struct tpm_chip *chip) MEMREMAP_WB); if (!final_tbl) { pr_err("Could not map UEFI TPM final log\n"); - kfree(log->bios_event_log); + devm_kfree(&chip->dev, log->bios_event_log); ret = -ENOMEM; goto out; } @@ -91,11 +92,11 @@ int tpm_read_log_efi(struct tpm_chip *chip) * Allocate memory for the 'combined log' where we will append the * 'final events log' to. 
*/ - tmp = krealloc(log->bios_event_log, - log_size + final_events_log_size, - GFP_KERNEL); + tmp = devm_krealloc(&chip->dev, log->bios_event_log, + log_size + final_events_log_size, + GFP_KERNEL); if (!tmp) { - kfree(log->bios_event_log); + devm_kfree(&chip->dev, log->bios_event_log); ret = -ENOMEM; goto out; } diff --git a/drivers/char/tpm/eventlog/of.c b/drivers/char/tpm/eventlog/of.c index a9ce66d09a75..930fe43d5daf 100644 --- a/drivers/char/tpm/eventlog/of.c +++ b/drivers/char/tpm/eventlog/of.c @@ -10,13 +10,44 @@ * Read the event log created by the firmware on PPC64 */ +#include <linux/device.h> #include <linux/slab.h> +#include <linux/io.h> +#include <linux/ioport.h> #include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_reserved_mem.h> #include <linux/tpm_eventlog.h> #include "../tpm.h" #include "common.h" +static int tpm_read_log_memory_region(struct tpm_chip *chip) +{ + struct device_node *node; + struct resource res; + int rc; + + node = of_parse_phandle(chip->dev.parent->of_node, "memory-region", 0); + if (!node) + return -ENODEV; + + rc = of_address_to_resource(node, 0, &res); + of_node_put(node); + if (rc) + return rc; + + chip->log.bios_event_log = devm_memremap(&chip->dev, res.start, resource_size(&res), + MEMREMAP_WB); + if (IS_ERR(chip->log.bios_event_log)) + return -ENOMEM; + + chip->log.bios_event_log_end = chip->log.bios_event_log + resource_size(&res); + + return chip->flags & TPM_CHIP_FLAG_TPM2 ? 
EFI_TCG2_EVENT_LOG_FORMAT_TCG_2 : + EFI_TCG2_EVENT_LOG_FORMAT_TCG_1_2; +} + int tpm_read_log_of(struct tpm_chip *chip) { struct device_node *np; @@ -38,7 +69,7 @@ int tpm_read_log_of(struct tpm_chip *chip) sizep = of_get_property(np, "linux,sml-size", NULL); basep = of_get_property(np, "linux,sml-base", NULL); if (sizep == NULL && basep == NULL) - return -ENODEV; + return tpm_read_log_memory_region(chip); if (sizep == NULL || basep == NULL) return -EIO; @@ -65,7 +96,7 @@ int tpm_read_log_of(struct tpm_chip *chip) return -EIO; } - log->bios_event_log = kmemdup(__va(base), size, GFP_KERNEL); + log->bios_event_log = devm_kmemdup(&chip->dev, __va(base), size, GFP_KERNEL); if (!log->bios_event_log) return -ENOMEM; diff --git a/drivers/char/tpm/st33zp24/i2c.c b/drivers/char/tpm/st33zp24/i2c.c index 8156bb2af78c..c4d0b744e3cc 100644 --- a/drivers/char/tpm/st33zp24/i2c.c +++ b/drivers/char/tpm/st33zp24/i2c.c @@ -101,8 +101,7 @@ static const struct st33zp24_phy_ops i2c_phy_ops = { * @return: 0 in case of success. * -1 in other case. 
*/ -static int st33zp24_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int st33zp24_i2c_probe(struct i2c_client *client) { struct st33zp24_i2c_phy *phy; @@ -161,7 +160,7 @@ static struct i2c_driver st33zp24_i2c_driver = { .of_match_table = of_match_ptr(of_st33zp24_i2c_match), .acpi_match_table = ACPI_PTR(st33zp24_i2c_acpi_match), }, - .probe = st33zp24_i2c_probe, + .probe_new = st33zp24_i2c_probe, .remove = st33zp24_i2c_remove, .id_table = st33zp24_i2c_id }; diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index 741d8f3e8fb3..b99f55f2d4fd 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -267,7 +267,6 @@ static void tpm_dev_release(struct device *dev) idr_remove(&dev_nums_idr, chip->dev_num); mutex_unlock(&idr_lock); - kfree(chip->log.bios_event_log); kfree(chip->work_space.context_buf); kfree(chip->work_space.session_buf); kfree(chip->allocated_banks); diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 65d03867e114..93545be190a5 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -777,10 +777,12 @@ out: int tpm2_find_cc(struct tpm_chip *chip, u32 cc) { + u32 cc_mask; int i; + cc_mask = 1 << TPM2_CC_ATTR_VENDOR | GENMASK(15, 0); for (i = 0; i < chip->nr_commands; i++) - if (cc == (chip->cc_attrs_tbl[i] & GENMASK(15, 0))) + if (cc == (chip->cc_attrs_tbl[i] & cc_mask)) return i; return -1; diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index 7e9da671a0e8..d43a0d7b97a8 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -98,6 +98,8 @@ struct crb_priv { u8 __iomem *rsp; u32 cmd_size; u32 smc_func_id; + u32 __iomem *pluton_start_addr; + u32 __iomem *pluton_reply_addr; }; struct tpm2_crb_smc { @@ -108,6 +110,11 @@ struct tpm2_crb_smc { u32 smc_func_id; }; +struct tpm2_crb_pluton { + u64 start_addr; + u64 reply_addr; +}; + static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, 
unsigned long timeout) { @@ -127,6 +134,25 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, return ((ioread32(reg) & mask) == value); } +static int crb_try_pluton_doorbell(struct crb_priv *priv, bool wait_for_complete) +{ + if (priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) + return 0; + + if (!crb_wait_for_reg_32(priv->pluton_reply_addr, ~0, 1, TPM2_TIMEOUT_C)) + return -ETIME; + + iowrite32(1, priv->pluton_start_addr); + if (wait_for_complete == false) + return 0; + + if (!crb_wait_for_reg_32(priv->pluton_start_addr, + 0xffffffff, 0, 200)) + return -ETIME; + + return 0; +} + /** * __crb_go_idle - request tpm crb device to go the idle state * @@ -145,6 +171,8 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, */ static int __crb_go_idle(struct device *dev, struct crb_priv *priv) { + int rc; + if ((priv->sm == ACPI_TPM2_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC)) @@ -152,6 +180,10 @@ static int __crb_go_idle(struct device *dev, struct crb_priv *priv) iowrite32(CRB_CTRL_REQ_GO_IDLE, &priv->regs_t->ctrl_req); + rc = crb_try_pluton_doorbell(priv, true); + if (rc) + return rc; + if (!crb_wait_for_reg_32(&priv->regs_t->ctrl_req, CRB_CTRL_REQ_GO_IDLE/* mask */, 0, /* value */ @@ -188,12 +220,19 @@ static int crb_go_idle(struct tpm_chip *chip) */ static int __crb_cmd_ready(struct device *dev, struct crb_priv *priv) { + int rc; + if ((priv->sm == ACPI_TPM2_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC)) return 0; iowrite32(CRB_CTRL_REQ_CMD_READY, &priv->regs_t->ctrl_req); + + rc = crb_try_pluton_doorbell(priv, true); + if (rc) + return rc; + if (!crb_wait_for_reg_32(&priv->regs_t->ctrl_req, CRB_CTRL_REQ_CMD_READY /* mask */, 0, /* value */ @@ -371,6 +410,10 @@ static int crb_send(struct tpm_chip *chip, u8 *buf, size_t len) return -E2BIG; } + /* Seems to be 
necessary for every command */ + if (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) + __crb_cmd_ready(&chip->dev, priv); + memcpy_toio(priv->cmd, buf, len); /* Make sure that cmd is populated before issuing start. */ @@ -394,7 +437,10 @@ static int crb_send(struct tpm_chip *chip, u8 *buf, size_t len) rc = tpm_crb_smc_start(&chip->dev, priv->smc_func_id); } - return rc; + if (rc) + return rc; + + return crb_try_pluton_doorbell(priv, false); } static void crb_cancel(struct tpm_chip *chip) @@ -524,15 +570,18 @@ static int crb_map_io(struct acpi_device *device, struct crb_priv *priv, return ret; acpi_dev_free_resource_list(&acpi_resource_list); - if (resource_type(iores_array) != IORESOURCE_MEM) { - dev_err(dev, FW_BUG "TPM2 ACPI table does not define a memory resource\n"); - return -EINVAL; - } else if (resource_type(iores_array + TPM_CRB_MAX_RESOURCES) == - IORESOURCE_MEM) { - dev_warn(dev, "TPM2 ACPI table defines too many memory resources\n"); - memset(iores_array + TPM_CRB_MAX_RESOURCES, - 0, sizeof(*iores_array)); - iores_array[TPM_CRB_MAX_RESOURCES].flags = 0; + /* Pluton doesn't appear to define ACPI memory regions */ + if (priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { + if (resource_type(iores_array) != IORESOURCE_MEM) { + dev_err(dev, FW_BUG "TPM2 ACPI table does not define a memory resource\n"); + return -EINVAL; + } else if (resource_type(iores_array + TPM_CRB_MAX_RESOURCES) == + IORESOURCE_MEM) { + dev_warn(dev, "TPM2 ACPI table defines too many memory resources\n"); + memset(iores_array + TPM_CRB_MAX_RESOURCES, + 0, sizeof(*iores_array)); + iores_array[TPM_CRB_MAX_RESOURCES].flags = 0; + } } iores = NULL; @@ -656,6 +705,22 @@ out_relinquish_locality: return ret; } +static int crb_map_pluton(struct device *dev, struct crb_priv *priv, + struct acpi_table_tpm2 *buf, struct tpm2_crb_pluton *crb_pluton) +{ + priv->pluton_start_addr = crb_map_res(dev, NULL, NULL, + crb_pluton->start_addr, 4); + if (IS_ERR(priv->pluton_start_addr)) + return 
PTR_ERR(priv->pluton_start_addr); + + priv->pluton_reply_addr = crb_map_res(dev, NULL, NULL, + crb_pluton->reply_addr, 4); + if (IS_ERR(priv->pluton_reply_addr)) + return PTR_ERR(priv->pluton_reply_addr); + + return 0; +} + static int crb_acpi_add(struct acpi_device *device) { struct acpi_table_tpm2 *buf; @@ -663,6 +728,7 @@ static int crb_acpi_add(struct acpi_device *device) struct tpm_chip *chip; struct device *dev = &device->dev; struct tpm2_crb_smc *crb_smc; + struct tpm2_crb_pluton *crb_pluton; acpi_status status; u32 sm; int rc; @@ -700,6 +766,20 @@ static int crb_acpi_add(struct acpi_device *device) priv->smc_func_id = crb_smc->smc_func_id; } + if (sm == ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { + if (buf->header.length < (sizeof(*buf) + sizeof(*crb_pluton))) { + dev_err(dev, + FW_BUG "TPM2 ACPI table has wrong size %u for start method type %d\n", + buf->header.length, + ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON); + return -EINVAL; + } + crb_pluton = ACPI_ADD_PTR(struct tpm2_crb_pluton, buf, sizeof(*buf)); + rc = crb_map_pluton(dev, priv, buf, crb_pluton); + if (rc) + return rc; + } + priv->sm = sm; priv->hid = acpi_device_hid(device); diff --git a/drivers/char/tpm/tpm_i2c_atmel.c b/drivers/char/tpm/tpm_i2c_atmel.c index 4be3677c1463..8f77154e0550 100644 --- a/drivers/char/tpm/tpm_i2c_atmel.c +++ b/drivers/char/tpm/tpm_i2c_atmel.c @@ -146,8 +146,7 @@ static const struct tpm_class_ops i2c_atmel = { .req_canceled = i2c_atmel_req_canceled, }; -static int i2c_atmel_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int i2c_atmel_probe(struct i2c_client *client) { struct tpm_chip *chip; struct device *dev = &client->dev; @@ -204,7 +203,7 @@ static SIMPLE_DEV_PM_OPS(i2c_atmel_pm_ops, tpm_pm_suspend, tpm_pm_resume); static struct i2c_driver i2c_atmel_driver = { .id_table = i2c_atmel_id, - .probe = i2c_atmel_probe, + .probe_new = i2c_atmel_probe, .remove = i2c_atmel_remove, .driver = { .name = I2C_DRIVER_NAME, diff --git 
a/drivers/char/tpm/tpm_i2c_infineon.c b/drivers/char/tpm/tpm_i2c_infineon.c index fd3c3661e646..7cdaff52a96d 100644 --- a/drivers/char/tpm/tpm_i2c_infineon.c +++ b/drivers/char/tpm/tpm_i2c_infineon.c @@ -681,8 +681,7 @@ MODULE_DEVICE_TABLE(of, tpm_tis_i2c_of_match); static SIMPLE_DEV_PM_OPS(tpm_tis_i2c_ops, tpm_pm_suspend, tpm_pm_resume); -static int tpm_tis_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int tpm_tis_i2c_probe(struct i2c_client *client) { int rc; struct device *dev = &(client->dev); @@ -717,7 +716,7 @@ static void tpm_tis_i2c_remove(struct i2c_client *client) static struct i2c_driver tpm_tis_i2c_driver = { .id_table = tpm_tis_i2c_table, - .probe = tpm_tis_i2c_probe, + .probe_new = tpm_tis_i2c_probe, .remove = tpm_tis_i2c_remove, .driver = { .name = "tpm_i2c_infineon", diff --git a/drivers/char/tpm/tpm_i2c_nuvoton.c b/drivers/char/tpm/tpm_i2c_nuvoton.c index 95c37350cc8e..a026e98add50 100644 --- a/drivers/char/tpm/tpm_i2c_nuvoton.c +++ b/drivers/char/tpm/tpm_i2c_nuvoton.c @@ -522,9 +522,9 @@ static int get_vid(struct i2c_client *client, u32 *res) return 0; } -static int i2c_nuvoton_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int i2c_nuvoton_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); int rc; struct tpm_chip *chip; struct device *dev = &client->dev; @@ -650,7 +650,7 @@ static SIMPLE_DEV_PM_OPS(i2c_nuvoton_pm_ops, tpm_pm_suspend, tpm_pm_resume); static struct i2c_driver i2c_nuvoton_driver = { .id_table = i2c_nuvoton_id, - .probe = i2c_nuvoton_probe, + .probe_new = i2c_nuvoton_probe, .remove = i2c_nuvoton_remove, .driver = { .name = "tpm_i2c_nuvoton", diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c index f3a7251c8e38..c8c34adc14c0 100644 --- a/drivers/char/tpm/tpm_tis_i2c.c +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -312,8 +312,7 @@ static const struct tpm_tis_phy_ops tpm_i2c_phy_ops = { .verify_crc = 
tpm_tis_i2c_verify_crc, }; -static int tpm_tis_i2c_probe(struct i2c_client *dev, - const struct i2c_device_id *id) +static int tpm_tis_i2c_probe(struct i2c_client *dev) { struct tpm_tis_i2c_phy *phy; const u8 crc_enable = 1; @@ -380,7 +379,7 @@ static struct i2c_driver tpm_tis_i2c_driver = { .pm = &tpm_tis_pm, .of_match_table = of_match_ptr(of_tis_i2c_match), }, - .probe = tpm_tis_i2c_probe, + .probe_new = tpm_tis_i2c_probe, .remove = tpm_tis_i2c_remove, .id_table = tpm_tis_i2c_id, }; diff --git a/drivers/clk/ingenic/jz4760-cgu.c b/drivers/clk/ingenic/jz4760-cgu.c index ecd395ac8a28..e407f00bd594 100644 --- a/drivers/clk/ingenic/jz4760-cgu.c +++ b/drivers/clk/ingenic/jz4760-cgu.c @@ -58,7 +58,7 @@ jz4760_cgu_calc_m_n_od(const struct ingenic_cgu_pll_info *pll_info, unsigned long rate, unsigned long parent_rate, unsigned int *pm, unsigned int *pn, unsigned int *pod) { - unsigned int m, n, od, m_max = (1 << pll_info->m_bits) - 2; + unsigned int m, n, od, m_max = (1 << pll_info->m_bits) - 1; /* The frequency after the N divider must be between 1 and 50 MHz. */ n = parent_rate / (1 * MHZ); @@ -66,19 +66,17 @@ jz4760_cgu_calc_m_n_od(const struct ingenic_cgu_pll_info *pll_info, /* The N divider must be >= 2. 
*/ n = clamp_val(n, 2, 1 << pll_info->n_bits); - for (;; n >>= 1) { - od = (unsigned int)-1; + rate /= MHZ; + parent_rate /= MHZ; - do { - m = (rate / MHZ) * (1 << ++od) * n / (parent_rate / MHZ); - } while ((m > m_max || m & 1) && (od < 4)); - - if (od < 4 && m >= 4 && m <= m_max) - break; + for (m = m_max; m >= m_max && n >= 2; n--) { + m = rate * n / parent_rate; + od = m & 1; + m <<= od; } *pm = m; - *pn = n; + *pn = n + 1; *pod = 1 << od; } diff --git a/drivers/clk/microchip/clk-mpfs-ccc.c b/drivers/clk/microchip/clk-mpfs-ccc.c index 32aae880a14f..0ddc73e07be4 100644 --- a/drivers/clk/microchip/clk-mpfs-ccc.c +++ b/drivers/clk/microchip/clk-mpfs-ccc.c @@ -164,12 +164,11 @@ static int mpfs_ccc_register_outputs(struct device *dev, struct mpfs_ccc_out_hw_ for (unsigned int i = 0; i < num_clks; i++) { struct mpfs_ccc_out_hw_clock *out_hw = &out_hws[i]; - char *name = devm_kzalloc(dev, 23, GFP_KERNEL); + char *name = devm_kasprintf(dev, GFP_KERNEL, "%s_out%u", parent->name, i); if (!name) return -ENOMEM; - snprintf(name, 23, "%s_out%u", parent->name, i); out_hw->divider.hw.init = CLK_HW_INIT_HW(name, &parent->hw, &clk_divider_ops, 0); out_hw->divider.reg = data->pll_base[i / MPFS_CCC_OUTPUTS_PER_PLL] + out_hw->reg_offset; @@ -201,14 +200,13 @@ static int mpfs_ccc_register_plls(struct device *dev, struct mpfs_ccc_pll_hw_clo for (unsigned int i = 0; i < num_clks; i++) { struct mpfs_ccc_pll_hw_clock *pll_hw = &pll_hws[i]; - char *name = devm_kzalloc(dev, 18, GFP_KERNEL); - if (!name) + pll_hw->name = devm_kasprintf(dev, GFP_KERNEL, "ccc%s_pll%u", + strchrnul(dev->of_node->full_name, '@'), i); + if (!pll_hw->name) return -ENOMEM; pll_hw->base = data->pll_base[i]; - snprintf(name, 18, "ccc%s_pll%u", strchrnul(dev->of_node->full_name, '@'), i); - pll_hw->name = (const char *)name; pll_hw->hw.init = CLK_HW_INIT_PARENTS_DATA_FIXED_SIZE(pll_hw->name, pll_hw->parents, &mpfs_ccc_pll_ops, 0); diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c 
index 9505a812d6a1..d3f55ca06ed3 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -143,40 +143,42 @@ static unsigned long qcom_lmh_get_throttle_freq(struct qcom_cpufreq_data *data) return lval * xo_rate; } -/* Get the current frequency of the CPU (after throttling) */ -static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) +/* Get the frequency requested by the cpufreq core for the CPU */ +static unsigned int qcom_cpufreq_get_freq(unsigned int cpu) { struct qcom_cpufreq_data *data; + const struct qcom_cpufreq_soc_data *soc_data; struct cpufreq_policy *policy; + unsigned int index; policy = cpufreq_cpu_get_raw(cpu); if (!policy) return 0; data = policy->driver_data; + soc_data = qcom_cpufreq.soc_data; - return qcom_lmh_get_throttle_freq(data) / HZ_PER_KHZ; + index = readl_relaxed(data->base + soc_data->reg_perf_state); + index = min(index, LUT_MAX_ENTRIES - 1); + + return policy->freq_table[index].frequency; } -/* Get the frequency requested by the cpufreq core for the CPU */ -static unsigned int qcom_cpufreq_get_freq(unsigned int cpu) +static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) { struct qcom_cpufreq_data *data; - const struct qcom_cpufreq_soc_data *soc_data; struct cpufreq_policy *policy; - unsigned int index; policy = cpufreq_cpu_get_raw(cpu); if (!policy) return 0; data = policy->driver_data; - soc_data = qcom_cpufreq.soc_data; - index = readl_relaxed(data->base + soc_data->reg_perf_state); - index = min(index, LUT_MAX_ENTRIES - 1); + if (data->throttle_irq >= 0) + return qcom_lmh_get_throttle_freq(data) / HZ_PER_KHZ; - return policy->freq_table[index].frequency; + return qcom_cpufreq_get_freq(cpu); } static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy, @@ -704,6 +706,8 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) return -ENOMEM; qcom_cpufreq.soc_data = of_device_get_match_data(dev); + if (!qcom_cpufreq.soc_data) + return -ENODEV; clk_data = 
devm_kzalloc(dev, struct_size(clk_data, hws, num_domains), GFP_KERNEL); if (!clk_data) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index ad0849af42d7..13cde44c6086 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -736,4 +736,3 @@ module_exit(cxl_acpi_exit); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_IMPORT_NS(ACPI); -MODULE_SOFTDEP("pre: cxl_pmem"); diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index f3d2169b6731..c2e4b1093788 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -227,34 +227,16 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge *cxl_nvb, return cxl_nvd; } -static void cxl_nvd_unregister(void *_cxl_nvd) +static void cxlmd_release_nvdimm(void *_cxlmd) { - struct cxl_nvdimm *cxl_nvd = _cxl_nvd; - struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; + struct cxl_memdev *cxlmd = _cxlmd; + struct cxl_nvdimm *cxl_nvd = cxlmd->cxl_nvd; struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb; - /* - * Either the bridge is in ->remove() context under the device_lock(), - * or cxlmd_release_nvdimm() is cancelling the bridge's release action - * for @cxl_nvd and doing it itself (while manually holding the bridge - * lock). - */ - device_lock_assert(&cxl_nvb->dev); cxl_nvd->cxlmd = NULL; cxlmd->cxl_nvd = NULL; + cxlmd->cxl_nvb = NULL; device_unregister(&cxl_nvd->dev); -} - -static void cxlmd_release_nvdimm(void *_cxlmd) -{ - struct cxl_memdev *cxlmd = _cxlmd; - struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb; - - device_lock(&cxl_nvb->dev); - if (cxlmd->cxl_nvd) - devm_release_action(&cxl_nvb->dev, cxl_nvd_unregister, - cxlmd->cxl_nvd); - device_unlock(&cxl_nvb->dev); put_device(&cxl_nvb->dev); } @@ -293,22 +275,6 @@ int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd) dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev)); - /* - * The two actions below arrange for @cxl_nvd to be deleted when either - * the top-level PMEM bridge goes down, or the endpoint device goes - * through ->remove(). 
- */ - device_lock(&cxl_nvb->dev); - if (cxl_nvb->dev.driver) - rc = devm_add_action_or_reset(&cxl_nvb->dev, cxl_nvd_unregister, - cxl_nvd); - else - rc = -ENXIO; - device_unlock(&cxl_nvb->dev); - - if (rc) - goto err_alloc; - /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */ return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 02f28da519e3..940f805b1534 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -131,7 +131,7 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_port *iter = cxled_to_port(cxled); struct cxl_ep *ep; - int rc; + int rc = 0; while (!is_cxl_root(to_cxl_port(iter->dev.parent))) iter = to_cxl_port(iter->dev.parent); @@ -143,7 +143,8 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) cxl_rr = cxl_rr_load(iter, cxlr); cxld = cxl_rr->decoder; - rc = cxld->reset(cxld); + if (cxld->reset) + rc = cxld->reset(cxld); if (rc) return rc; } @@ -186,7 +187,8 @@ static int cxl_region_decode_commit(struct cxl_region *cxlr) iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { cxl_rr = cxl_rr_load(iter, cxlr); cxld = cxl_rr->decoder; - cxld->reset(cxld); + if (cxld->reset) + cxld->reset(cxld); } cxled->cxld.reset(&cxled->cxld); @@ -991,10 +993,10 @@ static int cxl_port_setup_targets(struct cxl_port *port, int i, distance; /* - * Passthrough ports impose no distance requirements between + * Passthrough decoders impose no distance requirements between * peers */ - if (port->nr_dports == 1) + if (cxl_rr->nr_targets == 1) distance = 0; else distance = p->nr_targets / cxl_rr->nr_targets; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 33083a522fd1..258004f34281 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -554,8 +554,11 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) /* If multiple errors, 
log header points to first error from ctrl reg */ if (hweight32(status) > 1) { - addr = cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; - fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(addr))); + void __iomem *rcc_addr = + cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; + + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + readl(rcc_addr))); } else { fe = status; } diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index eedefebc4283..08bbbac9a6d0 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -225,11 +225,35 @@ static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc, return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len); } +static int detach_nvdimm(struct device *dev, void *data) +{ + struct cxl_nvdimm *cxl_nvd; + bool release = false; + + if (!is_cxl_nvdimm(dev)) + return 0; + + device_lock(dev); + if (!dev->driver) + goto out; + + cxl_nvd = to_cxl_nvdimm(dev); + if (cxl_nvd->cxlmd && cxl_nvd->cxlmd->cxl_nvb == data) + release = true; +out: + device_unlock(dev); + if (release) + device_release_driver(dev); + return 0; +} + static void unregister_nvdimm_bus(void *_cxl_nvb) { struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb; struct nvdimm_bus *nvdimm_bus = cxl_nvb->nvdimm_bus; + bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, detach_nvdimm); + cxl_nvb->nvdimm_bus = NULL; nvdimm_bus_unregister(nvdimm_bus); } diff --git a/drivers/dax/super.c b/drivers/dax/super.c index da4438f3188c..c4c4728a36e4 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(put_dax); /** * dax_holder() - obtain the holder of a dax device * @dax_dev: a dax_device instance - + * * Return: the holder's data which represents the holder if registered, * otherwize NULL. 
*/ diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 406b4e26f538..0de0482cd36e 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -167,7 +167,7 @@ struct dma_fence *dma_fence_allocate_private_stub(void) 0, 0); set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, - &dma_fence_stub.flags); + &fence->flags); dma_fence_signal(fence); diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 878deb4880cd..0689e1510721 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -34,6 +34,9 @@ static DEFINE_MUTEX(device_ctls_mutex); static LIST_HEAD(edac_device_list); +/* Default workqueue processing interval on this instance, in msecs */ +#define DEFAULT_POLL_INTERVAL 1000 + #ifdef CONFIG_EDAC_DEBUG static void edac_device_dump_device(struct edac_device_ctl_info *edac_dev) { @@ -336,7 +339,7 @@ static void edac_device_workq_function(struct work_struct *work_req) * whole one second to save timers firing all over the period * between integral seconds */ - if (edac_dev->poll_msec == 1000) + if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL) edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay)); else edac_queue_work(&edac_dev->work, edac_dev->delay); @@ -366,7 +369,7 @@ static void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, * timers firing on sub-second basis, while they are happy * to fire together on the 1 second exactly */ - if (edac_dev->poll_msec == 1000) + if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL) edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay)); else edac_queue_work(&edac_dev->work, edac_dev->delay); @@ -400,7 +403,7 @@ void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, edac_dev->delay = msecs_to_jiffies(msec); /* See comment in edac_device_workq_setup() above */ - if (edac_dev->poll_msec == 1000) + if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL) edac_mod_work(&edac_dev->work, 
round_jiffies_relative(edac_dev->delay)); else edac_mod_work(&edac_dev->work, edac_dev->delay); @@ -442,11 +445,7 @@ int edac_device_add_device(struct edac_device_ctl_info *edac_dev) /* This instance is NOW RUNNING */ edac_dev->op_state = OP_RUNNING_POLL; - /* - * enable workq processing on this instance, - * default = 1000 msec - */ - edac_device_workq_setup(edac_dev, 1000); + edac_device_workq_setup(edac_dev, edac_dev->poll_msec ?: DEFAULT_POLL_INTERVAL); } else { edac_dev->op_state = OP_RUNNING_INTERRUPT; } diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c index 97a27e42dd61..c45519f59dc1 100644 --- a/drivers/edac/qcom_edac.c +++ b/drivers/edac/qcom_edac.c @@ -252,7 +252,7 @@ clear: static int dump_syn_reg(struct edac_device_ctl_info *edev_ctl, int err_type, u32 bank) { - struct llcc_drv_data *drv = edev_ctl->pvt_info; + struct llcc_drv_data *drv = edev_ctl->dev->platform_data; int ret; ret = dump_syn_reg_values(drv, bank, err_type); @@ -289,7 +289,7 @@ static irqreturn_t llcc_ecc_irq_handler(int irq, void *edev_ctl) { struct edac_device_ctl_info *edac_dev_ctl = edev_ctl; - struct llcc_drv_data *drv = edac_dev_ctl->pvt_info; + struct llcc_drv_data *drv = edac_dev_ctl->dev->platform_data; irqreturn_t irq_rc = IRQ_NONE; u32 drp_error, trp_error, i; int ret; @@ -358,7 +358,6 @@ static int qcom_llcc_edac_probe(struct platform_device *pdev) edev_ctl->dev_name = dev_name(dev); edev_ctl->ctl_name = "llcc"; edev_ctl->panic_on_ue = LLCC_ERP_PANIC_ON_UE; - edev_ctl->pvt_info = llcc_driv_data; rc = edac_device_add_device(edev_ctl); if (rc) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 9c89f7d53e99..958aa4662ccb 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -819,8 +819,10 @@ static int ioctl_send_response(struct client *client, union ioctl_arg *arg) r = container_of(resource, struct inbound_transaction_resource, resource); - if (is_fcp_request(r->request)) + if (is_fcp_request(r->request)) { + 
kfree(r->data); goto out; + } if (a->length != fw_get_response_length(r->request)) { ret = -EINVAL; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index a2b0cbc8741c..1e0b016fdc2b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -1007,6 +1007,8 @@ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size) /* first try to find a slot in an existing linked list entry */ for (prsv = efi_memreserve_root->next; prsv; ) { rsv = memremap(prsv, sizeof(*rsv), MEMREMAP_WB); + if (!rsv) + return -ENOMEM; index = atomic_fetch_add_unless(&rsv->count, 1, rsv->size); if (index < rsv->size) { rsv->entry[index].base = addr; diff --git a/drivers/firmware/efi/libstub/arm64.c b/drivers/firmware/efi/libstub/arm64.c index ff2d18c42ee7..4501652e11ab 100644 --- a/drivers/firmware/efi/libstub/arm64.c +++ b/drivers/firmware/efi/libstub/arm64.c @@ -19,10 +19,13 @@ static bool system_needs_vamap(void) const u8 *type1_family = efi_get_smbios_string(1, family); /* - * Ampere Altra machines crash in SetTime() if SetVirtualAddressMap() - * has not been called prior. + * Ampere eMAG, Altra, and Altra Max machines crash in SetTime() if + * SetVirtualAddressMap() has not been called prior. 
*/ - if (!type1_family || strcmp(type1_family, "Altra")) + if (!type1_family || ( + strcmp(type1_family, "eMAG") && + strcmp(type1_family, "Altra") && + strcmp(type1_family, "Altra Max"))) return false; efi_warn("Working around broken SetVirtualAddressMap()\n"); diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c index 0a9aba5f9cef..f178b2984dfb 100644 --- a/drivers/firmware/efi/memattr.c +++ b/drivers/firmware/efi/memattr.c @@ -33,7 +33,7 @@ int __init efi_memattr_init(void) return -ENOMEM; } - if (tbl->version > 1) { + if (tbl->version > 2) { pr_warn("Unexpected EFI Memory Attributes table version %d\n", tbl->version); goto unmap; diff --git a/drivers/fpga/intel-m10-bmc-sec-update.c b/drivers/fpga/intel-m10-bmc-sec-update.c index 79d48852825e..03f1bd81c434 100644 --- a/drivers/fpga/intel-m10-bmc-sec-update.c +++ b/drivers/fpga/intel-m10-bmc-sec-update.c @@ -574,20 +574,27 @@ static int m10bmc_sec_probe(struct platform_device *pdev) len = scnprintf(buf, SEC_UPDATE_LEN_MAX, "secure-update%d", sec->fw_name_id); sec->fw_name = kmemdup_nul(buf, len, GFP_KERNEL); - if (!sec->fw_name) - return -ENOMEM; + if (!sec->fw_name) { + ret = -ENOMEM; + goto fw_name_fail; + } fwl = firmware_upload_register(THIS_MODULE, sec->dev, sec->fw_name, &m10bmc_ops, sec); if (IS_ERR(fwl)) { dev_err(sec->dev, "Firmware Upload driver failed to start\n"); - kfree(sec->fw_name); - xa_erase(&fw_upload_xa, sec->fw_name_id); - return PTR_ERR(fwl); + ret = PTR_ERR(fwl); + goto fw_uploader_fail; } sec->fwl = fwl; return 0; + +fw_uploader_fail: + kfree(sec->fw_name); +fw_name_fail: + xa_erase(&fw_upload_xa, sec->fw_name_id); + return ret; } static int m10bmc_sec_remove(struct platform_device *pdev) diff --git a/drivers/fpga/stratix10-soc.c b/drivers/fpga/stratix10-soc.c index 357cea58ec98..f7f01982a512 100644 --- a/drivers/fpga/stratix10-soc.c +++ b/drivers/fpga/stratix10-soc.c @@ -213,9 +213,9 @@ static int s10_ops_write_init(struct fpga_manager *mgr, /* Allocate buffers 
from the service layer's pool. */ for (i = 0; i < NUM_SVC_BUFS; i++) { kbuf = stratix10_svc_allocate_memory(priv->chan, SVC_BUF_SIZE); - if (!kbuf) { + if (IS_ERR(kbuf)) { s10_free_buffers(mgr); - ret = -ENOMEM; + ret = PTR_ERR(kbuf); goto init_done; } diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index ec7cfd4f52b1..e9917a45b005 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1531,6 +1531,7 @@ config GPIO_MLXBF2 tristate "Mellanox BlueField 2 SoC GPIO" depends on (MELLANOX_PLATFORM && ARM64 && ACPI) || (64BIT && COMPILE_TEST) select GPIO_GENERIC + select GPIOLIB_IRQCHIP help Say Y here if you want GPIO support on Mellanox BlueField 2 SoC. diff --git a/drivers/gpio/gpio-ep93xx.c b/drivers/gpio/gpio-ep93xx.c index 2e1779709113..6cedf46efec6 100644 --- a/drivers/gpio/gpio-ep93xx.c +++ b/drivers/gpio/gpio-ep93xx.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/gpio/driver.h> #include <linux/bitops.h> +#include <linux/seq_file.h> #define EP93XX_GPIO_F_INT_STATUS 0x5c #define EP93XX_GPIO_A_INT_STATUS 0xa0 @@ -40,7 +41,6 @@ #define EP93XX_GPIO_F_IRQ_BASE 80 struct ep93xx_gpio_irq_chip { - struct irq_chip ic; u8 irq_offset; u8 int_unmasked; u8 int_enabled; @@ -148,7 +148,7 @@ static void ep93xx_gpio_f_irq_handler(struct irq_desc *desc) */ struct irq_chip *irqchip = irq_desc_get_chip(desc); unsigned int irq = irq_desc_get_irq(desc); - int port_f_idx = ((irq + 1) & 7) ^ 4; /* {19..22,47..50} -> {0..7} */ + int port_f_idx = (irq & 7) ^ 4; /* {20..23,48..51} -> {0..7} */ int gpio_irq = EP93XX_GPIO_F_IRQ_BASE + port_f_idx; chained_irq_enter(irqchip, desc); @@ -185,6 +185,7 @@ static void ep93xx_gpio_irq_mask_ack(struct irq_data *d) ep93xx_gpio_update_int_params(epg, eic); writeb(port_mask, epg->base + eic->irq_offset + EP93XX_INT_EOI_OFFSET); + gpiochip_disable_irq(gc, irqd_to_hwirq(d)); } static void ep93xx_gpio_irq_mask(struct irq_data *d) @@ -195,6 +196,7 @@ static void ep93xx_gpio_irq_mask(struct irq_data *d) eic->int_unmasked &= 
~BIT(d->irq & 7); ep93xx_gpio_update_int_params(epg, eic); + gpiochip_disable_irq(gc, irqd_to_hwirq(d)); } static void ep93xx_gpio_irq_unmask(struct irq_data *d) @@ -203,6 +205,7 @@ static void ep93xx_gpio_irq_unmask(struct irq_data *d) struct ep93xx_gpio_irq_chip *eic = to_ep93xx_gpio_irq_chip(gc); struct ep93xx_gpio *epg = gpiochip_get_data(gc); + gpiochip_enable_irq(gc, irqd_to_hwirq(d)); eic->int_unmasked |= BIT(d->irq & 7); ep93xx_gpio_update_int_params(epg, eic); } @@ -320,15 +323,25 @@ static int ep93xx_gpio_set_config(struct gpio_chip *gc, unsigned offset, return 0; } -static void ep93xx_init_irq_chip(struct device *dev, struct irq_chip *ic) +static void ep93xx_irq_print_chip(struct irq_data *data, struct seq_file *p) { - ic->irq_ack = ep93xx_gpio_irq_ack; - ic->irq_mask_ack = ep93xx_gpio_irq_mask_ack; - ic->irq_mask = ep93xx_gpio_irq_mask; - ic->irq_unmask = ep93xx_gpio_irq_unmask; - ic->irq_set_type = ep93xx_gpio_irq_type; + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + + seq_printf(p, dev_name(gc->parent)); } +static const struct irq_chip gpio_eic_irq_chip = { + .name = "ep93xx-gpio-eic", + .irq_ack = ep93xx_gpio_irq_ack, + .irq_mask = ep93xx_gpio_irq_mask, + .irq_unmask = ep93xx_gpio_irq_unmask, + .irq_mask_ack = ep93xx_gpio_irq_mask_ack, + .irq_set_type = ep93xx_gpio_irq_type, + .irq_print_chip = ep93xx_irq_print_chip, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static int ep93xx_gpio_add_bank(struct ep93xx_gpio_chip *egc, struct platform_device *pdev, struct ep93xx_gpio *epg, @@ -350,8 +363,6 @@ static int ep93xx_gpio_add_bank(struct ep93xx_gpio_chip *egc, girq = &gc->irq; if (bank->has_irq || bank->has_hierarchical_irq) { - struct irq_chip *ic; - gc->set_config = ep93xx_gpio_set_config; egc->eic = devm_kcalloc(dev, 1, sizeof(*egc->eic), @@ -359,12 +370,7 @@ static int ep93xx_gpio_add_bank(struct ep93xx_gpio_chip *egc, if (!egc->eic) return -ENOMEM; egc->eic->irq_offset = bank->irq; - ic = &egc->eic->ic; - 
ic->name = devm_kasprintf(dev, GFP_KERNEL, "gpio-irq-%s", bank->label); - if (!ic->name) - return -ENOMEM; - ep93xx_init_irq_chip(dev, ic); - girq->chip = ic; + gpio_irq_chip_set_chip(girq, &gpio_eic_irq_chip); } if (bank->has_irq) { diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 6f673b2f2a1b..9d0cec4b82a3 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -249,10 +249,11 @@ static void mxc_flip_edge(struct mxc_gpio_port *port, u32 gpio) } else { pr_err("mxc: invalid configuration for GPIO %d: %x\n", gpio, edge); - return; + goto unlock; } writel(val | (edge << (bit << 1)), reg); +unlock: raw_spin_unlock_irqrestore(&port->gc.bgpio_lock, flags); } diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 60514bc5454f..9e3893b19e4f 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -736,7 +736,7 @@ static void gpio_sim_remove_hogs(struct gpio_sim_device *dev) gpiod_remove_hogs(dev->hogs); - for (hog = dev->hogs; !hog->chip_label; hog++) { + for (hog = dev->hogs; hog->chip_label; hog++) { kfree(hog->chip_label); kfree(hog->line_name); } diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index 9db42f6a2043..9033db00c360 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -30,7 +30,6 @@ struct fsl_gpio_soc_data { struct vf610_gpio_port { struct gpio_chip gc; - struct irq_chip ic; void __iomem *base; void __iomem *gpio_base; const struct fsl_gpio_soc_data *sdata; @@ -207,20 +206,24 @@ static int vf610_gpio_irq_set_type(struct irq_data *d, u32 type) static void vf610_gpio_irq_mask(struct irq_data *d) { - struct vf610_gpio_port *port = - gpiochip_get_data(irq_data_get_irq_chip_data(d)); - void __iomem *pcr_base = port->base + PORT_PCR(d->hwirq); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct vf610_gpio_port *port = gpiochip_get_data(gc); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + void __iomem *pcr_base = port->base + PORT_PCR(gpio_num); 
vf610_gpio_writel(0, pcr_base); + gpiochip_disable_irq(gc, gpio_num); } static void vf610_gpio_irq_unmask(struct irq_data *d) { - struct vf610_gpio_port *port = - gpiochip_get_data(irq_data_get_irq_chip_data(d)); - void __iomem *pcr_base = port->base + PORT_PCR(d->hwirq); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct vf610_gpio_port *port = gpiochip_get_data(gc); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + void __iomem *pcr_base = port->base + PORT_PCR(gpio_num); - vf610_gpio_writel(port->irqc[d->hwirq] << PORT_PCR_IRQC_OFFSET, + gpiochip_enable_irq(gc, gpio_num); + vf610_gpio_writel(port->irqc[gpio_num] << PORT_PCR_IRQC_OFFSET, pcr_base); } @@ -237,6 +240,17 @@ static int vf610_gpio_irq_set_wake(struct irq_data *d, u32 enable) return 0; } +static const struct irq_chip vf610_irqchip = { + .name = "gpio-vf610", + .irq_ack = vf610_gpio_irq_ack, + .irq_mask = vf610_gpio_irq_mask, + .irq_unmask = vf610_gpio_irq_unmask, + .irq_set_type = vf610_gpio_irq_set_type, + .irq_set_wake = vf610_gpio_irq_set_wake, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static void vf610_gpio_disable_clk(void *data) { clk_disable_unprepare(data); @@ -249,7 +263,6 @@ static int vf610_gpio_probe(struct platform_device *pdev) struct vf610_gpio_port *port; struct gpio_chip *gc; struct gpio_irq_chip *girq; - struct irq_chip *ic; int i; int ret; @@ -315,14 +328,6 @@ static int vf610_gpio_probe(struct platform_device *pdev) gc->direction_output = vf610_gpio_direction_output; gc->set = vf610_gpio_set; - ic = &port->ic; - ic->name = "gpio-vf610"; - ic->irq_ack = vf610_gpio_irq_ack; - ic->irq_mask = vf610_gpio_irq_mask; - ic->irq_unmask = vf610_gpio_irq_unmask; - ic->irq_set_type = vf610_gpio_irq_set_type; - ic->irq_set_wake = vf610_gpio_irq_set_wake; - /* Mask all GPIO interrupts */ for (i = 0; i < gc->ngpio; i++) vf610_gpio_writel(0, port->base + PORT_PCR(i)); @@ -331,7 +336,7 @@ static int vf610_gpio_probe(struct platform_device *pdev) 
vf610_gpio_writel(~0, port->base + PORT_ISFR); girq = &gc->irq; - girq->chip = ic; + gpio_irq_chip_set_chip(girq, &vf610_irqchip); girq->parent_handler = vf610_gpio_irq_handler; girq->num_parents = 1; girq->parents = devm_kcalloc(&pdev->dev, 1, diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 9ef0f5641b52..34ff048e70d0 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1104,7 +1104,8 @@ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, in dev_dbg(&adev->dev, "IRQ %d already in use\n", irq); } - if (wake_capable) + /* avoid suspend issues with GPIOs when systems are using S3 */ + if (wake_capable && acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) *wake_capable = info.wake_capable; return irq; @@ -1636,6 +1637,18 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "ELAN0415:00@9", }, }, + { + /* + * Spurious wakeups from TP_ATTN# pin + * Found in BIOS 1.7.7 + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "NH5xAx"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "SYNA1202:00@16", + }, + }, {} /* Terminating entry */ }; diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h index 9475f99a9694..5a08693b8fb1 100644 --- a/drivers/gpio/gpiolib-acpi.h +++ b/drivers/gpio/gpiolib-acpi.h @@ -14,7 +14,6 @@ #include <linux/gpio/consumer.h> -struct acpi_device; struct device; struct fwnode_handle; diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 315cbdf61979..9abfb482b615 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -53,7 +53,8 @@ config DRM_DEBUG_MM config DRM_USE_DYNAMIC_DEBUG bool "use dynamic debug to implement drm.debug" - default y + default n + depends on BROKEN depends on DRM depends on DYNAMIC_DEBUG || DYNAMIC_DEBUG_CORE depends on JUMP_LABEL diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e3e2e6e3b485..d148a1bd85e6 100644 
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -243,6 +243,7 @@ extern int amdgpu_num_kcq; #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024) extern int amdgpu_vcnfw_log; +extern int amdgpu_sg_display; #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 7b5ce00f0602..7af3041ccd0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1220,10 +1220,13 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) * next job actually sees the results from the previous one * before we start executing on the same scheduler ring. */ - if (!s_fence || s_fence->sched != sched) + if (!s_fence || s_fence->sched != sched) { + dma_fence_put(fence); continue; + } r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence); + dma_fence_put(fence); if (r) return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2f28a8c02f64..fbf2f24169eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4268,6 +4268,9 @@ exit: } adev->in_suspend = false; + if (adev->enable_mes) + amdgpu_mes_self_test(adev); + if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) DRM_WARN("smart shift update failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index cd4caaa29528..3fe277bc233f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -186,6 +186,7 @@ int amdgpu_num_kcq = -1; int amdgpu_smartshift_bias; int amdgpu_use_xgmi_p2p = 1; int amdgpu_vcnfw_log; +int amdgpu_sg_display = -1; /* auto */ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -932,6 +933,16 @@ MODULE_PARM_DESC(vcnfw_log, "Enable vcnfw log(0 = disable (default value), 1 = e 
module_param_named(vcnfw_log, amdgpu_vcnfw_log, int, 0444); /** + * DOC: sg_display (int) + * Disable S/G (scatter/gather) display (i.e., display from system memory). + * This option is only relevant on APUs. Set this option to 0 to disable + * S/G display if you experience flickering or other issues under memory + * pressure and report the issue. + */ +MODULE_PARM_DESC(sg_display, "S/G Display (-1 = auto (default), 0 = disable)"); +module_param_named(sg_display, amdgpu_sg_display, int, 0444); + +/** * DOC: smu_pptable_id (int) * Used to override pptable id. id = 0 use VBIOS pptable. * id > 0 use the soft pptable with specicfied id. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 00444203220d..faff4a3f96e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -618,7 +618,13 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev) if (!ring || !ring->fence_drv.initialized) continue; - if (!ring->no_scheduler) + /* + * Notice we check for sched.ops since there's some + * override on the meaning of sched.ready by amdgpu. + * The natural check would be sched.ready, which is + * set as drm_sched_init() finishes... + */ + if (ring->sched.ops) drm_sched_fini(&ring->sched); for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index f752c7ae7f60..3989e755a5b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -295,7 +295,7 @@ struct amdgpu_ring { #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib))) #define amdgpu_ring_patch_cs_in_place(r, p, job, ib) ((r)->funcs->patch_cs_in_place((p), (job), (ib))) #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r)) -#define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t)) +#define amdgpu_ring_test_ib(r, t) ((r)->funcs->test_ib ? 
(r)->funcs->test_ib((r), (t)) : 0) #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r)) #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r)) #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index b5f3bba851db..01e42bdd8e4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -974,7 +974,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, trace_amdgpu_vm_update_ptes(params, frag_start, upd_end, min(nptes, 32u), dst, incr, upd_flags, - vm->task_info.pid, + vm->task_info.tgid, vm->immediate.fence_context); amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt), cursor.level, pe_start, dst, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index b9b57a66e113..66eb102cd88f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -790,8 +790,8 @@ static void gfx_v11_0_read_wave_data(struct amdgpu_device *adev, uint32_t simd, * zero here */ WARN_ON(simd != 0); - /* type 2 wave data */ - dst[(*no_fields)++] = 2; + /* type 3 wave data */ + dst[(*no_fields)++] = 3; dst[(*no_fields)++] = wave_read_ind(adev, wave, ixSQ_WAVE_STATUS); dst[(*no_fields)++] = wave_read_ind(adev, wave, ixSQ_WAVE_PC_LO); dst[(*no_fields)++] = wave_read_ind(adev, wave, ixSQ_WAVE_PC_HI); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index f202b45c413c..5dde6f82a1ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -6877,7 +6877,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch, .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush, .test_ring = gfx_v9_0_ring_test_ring, - .test_ib = gfx_v9_0_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, .pad_ib = amdgpu_ring_generic_pad_ib, 
.emit_switch_buffer = gfx_v9_ring_emit_sb, diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c index 95548c512f4f..077c53c6cc08 100644 --- a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c @@ -35,6 +35,7 @@ MODULE_FIRMWARE("amdgpu/gc_11_0_0_imu.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_1_imu.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_2_imu.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_3_imu.bin"); +MODULE_FIRMWARE("amdgpu/gc_11_0_4_imu.bin"); static int imu_v11_0_init_microcode(struct amdgpu_device *adev) { diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 970b066b37bb..1c4787000a5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -40,6 +40,8 @@ MODULE_FIRMWARE("amdgpu/gc_11_0_2_mes.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_2_mes1.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_3_mes.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_3_mes1.bin"); +MODULE_FIRMWARE("amdgpu/gc_11_0_4_mes.bin"); +MODULE_FIRMWARE("amdgpu/gc_11_0_4_mes1.bin"); static int mes_v11_0_hw_fini(void *handle); static int mes_v11_0_kiq_hw_init(struct amdgpu_device *adev); @@ -196,7 +198,6 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes, mes_add_queue_pkt.trap_handler_addr = input->tba_addr; mes_add_queue_pkt.tma_addr = input->tma_addr; mes_add_queue_pkt.is_kfd_process = input->is_kfd_process; - mes_add_queue_pkt.trap_en = 1; /* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */ mes_add_queue_pkt.is_aql_queue = input->is_aql_queue; @@ -1343,7 +1344,7 @@ static int mes_v11_0_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && + if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend && (adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))) 
amdgpu_mes_self_test(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index 15eb3658d70e..09fdcd20cb91 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -337,7 +337,13 @@ const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg = { static void nbio_v4_3_init_registers(struct amdgpu_device *adev) { - return; + if (adev->ip_versions[NBIO_HWIP][0] == IP_VERSION(4, 3, 0)) { + uint32_t data; + + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF2_STRAP2); + data &= ~RCC_DEV0_EPF2_STRAP2__STRAP_NO_SOFT_RESET_DEV0_F2_MASK; + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF2_STRAP2, data); + } } static u32 nbio_v4_3_get_rom_offset(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 5562670b7b52..7050238c4c48 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -640,7 +640,10 @@ static int soc21_common_early_init(void *handle) AMD_CG_SUPPORT_GFX_CGCG | AMD_CG_SUPPORT_GFX_CGLS | AMD_CG_SUPPORT_REPEATER_FGCG | - AMD_CG_SUPPORT_GFX_MGCG; + AMD_CG_SUPPORT_GFX_MGCG | + AMD_CG_SUPPORT_HDP_SD | + AMD_CG_SUPPORT_ATHUB_MGCG | + AMD_CG_SUPPORT_ATHUB_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_JPEG; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4d42033a703f..9c7b69d377bd 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1184,24 +1184,38 @@ static void mmhub_read_system_context(struct amdgpu_device *adev, struct dc_phy_ memset(pa_config, 0, sizeof(*pa_config)); - logical_addr_low = min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18; - pt_base = amdgpu_gmc_pd_addr(adev->gart.bo); - - if (adev->apu_flags & AMD_APU_IS_RAVEN2) - /* - * Raven2 has a HW issue that it is unable to use the vram which - * is out of 
MC_VM_SYSTEM_APERTURE_HIGH_ADDR. So here is the - * workaround that increase system aperture high address (add 1) - * to get rid of the VM fault and hardware hang. - */ - logical_addr_high = max((adev->gmc.fb_end >> 18) + 0x1, adev->gmc.agp_end >> 18); - else - logical_addr_high = max(adev->gmc.fb_end, adev->gmc.agp_end) >> 18; - agp_base = 0; agp_bot = adev->gmc.agp_start >> 24; agp_top = adev->gmc.agp_end >> 24; + /* AGP aperture is disabled */ + if (agp_bot == agp_top) { + logical_addr_low = adev->gmc.vram_start >> 18; + if (adev->apu_flags & AMD_APU_IS_RAVEN2) + /* + * Raven2 has a HW issue that it is unable to use the vram which + * is out of MC_VM_SYSTEM_APERTURE_HIGH_ADDR. So here is the + * workaround that increase system aperture high address (add 1) + * to get rid of the VM fault and hardware hang. + */ + logical_addr_high = (adev->gmc.fb_end >> 18) + 0x1; + else + logical_addr_high = adev->gmc.vram_end >> 18; + } else { + logical_addr_low = min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18; + if (adev->apu_flags & AMD_APU_IS_RAVEN2) + /* + * Raven2 has a HW issue that it is unable to use the vram which + * is out of MC_VM_SYSTEM_APERTURE_HIGH_ADDR. So here is the + * workaround that increase system aperture high address (add 1) + * to get rid of the VM fault and hardware hang. 
+ */ + logical_addr_high = max((adev->gmc.fb_end >> 18) + 0x1, adev->gmc.agp_end >> 18); + else + logical_addr_high = max(adev->gmc.fb_end, adev->gmc.agp_end) >> 18; + } + + pt_base = amdgpu_gmc_pd_addr(adev->gart.bo); page_table_start.high_part = (u32)(adev->gmc.gart_start >> 44) & 0xF; page_table_start.low_part = (u32)(adev->gmc.gart_start >> 12); @@ -1503,6 +1517,8 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) case IP_VERSION(3, 0, 1): case IP_VERSION(3, 1, 2): case IP_VERSION(3, 1, 3): + case IP_VERSION(3, 1, 4): + case IP_VERSION(3, 1, 5): case IP_VERSION(3, 1, 6): init_data.flags.gpu_vm_support = true; break; @@ -1511,6 +1527,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) } break; } + if (init_data.flags.gpu_vm_support && + (amdgpu_sg_display == 0)) + init_data.flags.gpu_vm_support = false; if (init_data.flags.gpu_vm_support) adev->mode_info.gpu_vm_support = true; @@ -4501,6 +4520,17 @@ DEVICE_ATTR_WO(s3_debug); static int dm_early_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + struct amdgpu_mode_info *mode_info = &adev->mode_info; + struct atom_context *ctx = mode_info->atom_context; + int index = GetIndexIntoMasterTable(DATA, Object_Header); + u16 data_offset; + + /* if there is no object header, skip DM */ + if (!amdgpu_atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) { + adev->harvest_ip_mask |= AMD_HARVEST_IP_DMU_MASK; + dev_info(adev->dev, "No object header, skipping DM\n"); + return -ENOENT; + } switch (adev->asic_type) { #if defined(CONFIG_DRM_AMD_DC_SI) @@ -8881,6 +8911,13 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, if (!dm_old_crtc_state->stream) goto skip_modeset; + /* Unset freesync video if it was active before */ + if (dm_old_crtc_state->freesync_config.state == VRR_STATE_ACTIVE_FIXED) { + dm_new_crtc_state->freesync_config.state = VRR_STATE_INACTIVE; + dm_new_crtc_state->freesync_config.fixed_refresh_in_uhz = 0; + } + + /* Now check if we 
should set freesync video mode */ if (amdgpu_freesync_vid_mode && dm_new_crtc_state->stream && is_timing_unchanged_for_freesync(new_crtc_state, old_crtc_state)) { @@ -9497,6 +9534,8 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, bool lock_and_validation_needed = false; struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state; #if defined(CONFIG_DRM_AMD_DC_DCN) + struct drm_dp_mst_topology_mgr *mgr; + struct drm_dp_mst_topology_state *mst_state; struct dsc_mst_fairness_vars vars[MAX_PIPES]; #endif @@ -9619,7 +9658,11 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, * `dcn10_can_pipe_disable_cursor`). By now, all modified planes are in * atomic state, so call drm helper to normalize zpos. */ - drm_atomic_normalize_zpos(dev, state); + ret = drm_atomic_normalize_zpos(dev, state); + if (ret) { + drm_dbg(dev, "drm_atomic_normalize_zpos() failed\n"); + goto fail; + } /* Remove exiting planes if they are modified */ for_each_oldnew_plane_in_state_reverse(state, plane, old_plane_state, new_plane_state, i) { @@ -9745,6 +9788,28 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, lock_and_validation_needed = true; } +#if defined(CONFIG_DRM_AMD_DC_DCN) + /* set the slot info for each mst_state based on the link encoding format */ + for_each_new_mst_mgr_in_state(state, mgr, mst_state, i) { + struct amdgpu_dm_connector *aconnector; + struct drm_connector *connector; + struct drm_connector_list_iter iter; + u8 link_coding_cap; + + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { + if (connector->index == mst_state->mgr->conn_base_id) { + aconnector = to_amdgpu_dm_connector(connector); + link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(aconnector->dc_link); + drm_dp_mst_update_slots(mst_state, link_coding_cap); + + break; + } + } + drm_connector_list_iter_end(&iter); + } +#endif + /** * Streams and planes are reset when there are changes that affect * bandwidth. 
Anything that affects bandwidth needs to go through diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index 6994c9a1ed85..5cff56bb8f56 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -120,23 +120,50 @@ enum dc_edid_status dm_helpers_parse_edid_caps( } static void -fill_dc_mst_payload_table_from_drm(struct drm_dp_mst_topology_state *mst_state, - struct amdgpu_dm_connector *aconnector, +fill_dc_mst_payload_table_from_drm(struct dc_link *link, + bool enable, + struct drm_dp_mst_atomic_payload *target_payload, struct dc_dp_mst_stream_allocation_table *table) { struct dc_dp_mst_stream_allocation_table new_table = { 0 }; struct dc_dp_mst_stream_allocation *sa; - struct drm_dp_mst_atomic_payload *payload; + struct link_mst_stream_allocation_table copy_of_link_table = + link->mst_stream_alloc_table; + + int i; + int current_hw_table_stream_cnt = copy_of_link_table.stream_count; + struct link_mst_stream_allocation *dc_alloc; + + /* TODO: refactor to set link->mst_stream_alloc_table directly if possible.*/ + if (enable) { + dc_alloc = + &copy_of_link_table.stream_allocations[current_hw_table_stream_cnt]; + dc_alloc->vcp_id = target_payload->vcpi; + dc_alloc->slot_count = target_payload->time_slots; + } else { + for (i = 0; i < copy_of_link_table.stream_count; i++) { + dc_alloc = + &copy_of_link_table.stream_allocations[i]; + + if (dc_alloc->vcp_id == target_payload->vcpi) { + dc_alloc->vcp_id = 0; + dc_alloc->slot_count = 0; + break; + } + } + ASSERT(i != copy_of_link_table.stream_count); + } /* Fill payload info*/ - list_for_each_entry(payload, &mst_state->payloads, next) { - if (payload->delete) - continue; - - sa = &new_table.stream_allocations[new_table.stream_count]; - sa->slot_count = payload->time_slots; - sa->vcp_id = payload->vcpi; - new_table.stream_count++; + for (i = 0; i < MAX_CONTROLLER_NUM; i++) { +
dc_alloc = + &copy_of_link_table.stream_allocations[i]; + if (dc_alloc->vcp_id > 0 && dc_alloc->slot_count > 0) { + sa = &new_table.stream_allocations[new_table.stream_count]; + sa->slot_count = dc_alloc->slot_count; + sa->vcp_id = dc_alloc->vcp_id; + new_table.stream_count++; + } } /* Overwrite the old table */ @@ -185,7 +212,7 @@ bool dm_helpers_dp_mst_write_payload_allocation_table( * AUX message. The sequence is slot 1-63 allocated sequence for each * stream. AMD ASIC stream slot allocation should follow the same * sequence. copy DRM MST allocation to dc */ - fill_dc_mst_payload_table_from_drm(mst_state, aconnector, proposed_table); + fill_dc_mst_payload_table_from_drm(stream->link, enable, payload, proposed_table); return true; } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index d7a044e79730..abdbd4352f6f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -903,11 +903,6 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, if (IS_ERR(mst_state)) return PTR_ERR(mst_state); - mst_state->pbn_div = dm_mst_get_pbn_divider(dc_link); -#if defined(CONFIG_DRM_AMD_DC_DCN) - drm_dp_mst_update_slots(mst_state, dc_link_dp_mst_decide_link_encoding_format(dc_link)); -#endif - /* Set up params */ for (i = 0; i < dc_state->stream_count; i++) { struct dc_dsc_policy dsc_policy = {0}; diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 342e906ae26e..c88f044666fe 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -3995,10 +3995,13 @@ static enum dc_status deallocate_mst_payload(struct pipe_ctx *pipe_ctx) struct fixed31_32 avg_time_slots_per_mtp = dc_fixpt_from_int(0); int i; bool mst_mode = (link->type == dc_connection_mst_branch); + /* adjust for drm
changes*/ + bool update_drm_mst_state = true; const struct link_hwss *link_hwss = get_link_hwss(link, &pipe_ctx->link_res); const struct dc_link_settings empty_link_settings = {0}; DC_LOGGER_INIT(link->ctx->logger); + /* deallocate_mst_payload is called before disable link. When mode or * disable/enable monitor, new stream is created which is not in link * stream[] yet. For this, payload is not allocated yet, so de-alloc @@ -4014,7 +4017,7 @@ static enum dc_status deallocate_mst_payload(struct pipe_ctx *pipe_ctx) &empty_link_settings, avg_time_slots_per_mtp); - if (mst_mode) { + if (mst_mode || update_drm_mst_state) { /* when link is in mst mode, reply on mst manager to remove * payload */ @@ -4077,11 +4080,18 @@ static enum dc_status deallocate_mst_payload(struct pipe_ctx *pipe_ctx) stream->ctx, stream); + if (!update_drm_mst_state) + dm_helpers_dp_mst_send_payload_allocation( + stream->ctx, + stream, + false); + } + + if (update_drm_mst_state) dm_helpers_dp_mst_send_payload_allocation( stream->ctx, stream, false); - } return DC_OK; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index fe2023f18b7d..8f894c1d1d1e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -3626,7 +3626,7 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) (int)hubp->curs_attr.width || pos_cpy.x <= (int)hubp->curs_attr.width + pipe_ctx->plane_state->src_rect.x) { - pos_cpy.x = temp_x + viewport_width; + pos_cpy.x = 2 * viewport_width - temp_x; } } } else { diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c index f9ea1e86707f..79850a68f62a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c @@ -874,8 +874,9 @@ static const struct dc_plane_cap plane_cap = { }, // 
6:1 downscaling ratio: 1000/6 = 166.666 + // 4:1 downscaling ratio for ARGB888 to prevent underflow during P010 playback: 1000/4 = 250 .max_downscale_factor = { - .argb8888 = 167, + .argb8888 = 250, .nv12 = 167, .fp16 = 167 }, @@ -1763,7 +1764,7 @@ static bool dcn314_resource_construct( pool->base.underlay_pipe_index = NO_UNDERLAY_PIPE; pool->base.pipe_count = pool->base.res_cap->num_timing_generator; pool->base.mpcc_count = pool->base.res_cap->num_timing_generator; - dc->caps.max_downscale_ratio = 600; + dc->caps.max_downscale_ratio = 400; dc->caps.i2c_speed_in_khz = 100; dc->caps.i2c_speed_in_khz_hdcp = 100; dc->caps.max_cursor_size = 256; diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c index dc4649458567..a4e9fd5307c6 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c @@ -94,7 +94,7 @@ static const struct hw_sequencer_funcs dcn32_funcs = { .get_vupdate_offset_from_vsync = dcn10_get_vupdate_offset_from_vsync, .calc_vupdate_position = dcn10_calc_vupdate_position, .apply_idle_power_optimizations = dcn32_apply_idle_power_optimizations, - .does_plane_fit_in_mall = dcn30_does_plane_fit_in_mall, + .does_plane_fit_in_mall = NULL, .set_backlight_level = dcn21_set_backlight_level, .set_abm_immediate_disable = dcn21_set_abm_immediate_disable, .hardware_release = dcn30_hardware_release, diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c b/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c index 950669f2c10d..cb7c0c878423 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c @@ -3183,7 +3183,7 @@ static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerforman } else { v->MIN_DST_Y_NEXT_START[k] = v->VTotal[k] - v->VFrontPorch[k] + v->VTotal[k] - v->VActive[k] - v->VStartup[k]; } - 
v->MIN_DST_Y_NEXT_START[k] += dml_floor(4.0 * v->TSetup[k] / (double)v->HTotal[k] / v->PixelClock[k], 1.0) / 4.0; + v->MIN_DST_Y_NEXT_START[k] += dml_floor(4.0 * v->TSetup[k] / ((double)v->HTotal[k] / v->PixelClock[k]), 1.0) / 4.0; if (((v->VUpdateOffsetPix[k] + v->VUpdateWidthPix[k] + v->VReadyOffsetPix[k]) / v->HTotal[k]) <= (isInterlaceTiming ? dml_floor((v->VTotal[k] - v->VActive[k] - v->VFrontPorch[k] - v->VStartup[k]) / 2.0, 1.0) : diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index 4a122925c3ae..92c18bfb98b3 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -532,6 +532,9 @@ enum dmub_status dmub_srv_hw_init(struct dmub_srv *dmub, if (dmub->hw_funcs.reset) dmub->hw_funcs.reset(dmub); + /* reset the cache of the last wptr as well now that hw is reset */ + dmub->inbox1_last_wptr = 0; + cw0.offset.quad_part = inst_fb->gpu_addr; cw0.region.base = DMUB_CW0_BASE; cw0.region.top = cw0.region.base + inst_fb->size - 1; @@ -649,6 +652,15 @@ enum dmub_status dmub_srv_hw_reset(struct dmub_srv *dmub) if (dmub->hw_funcs.reset) dmub->hw_funcs.reset(dmub); + /* mailboxes have been reset in hw, so reset the sw state as well */ + dmub->inbox1_last_wptr = 0; + dmub->inbox1_rb.wrpt = 0; + dmub->inbox1_rb.rptr = 0; + dmub->outbox0_rb.wrpt = 0; + dmub->outbox0_rb.rptr = 0; + dmub->outbox1_rb.wrpt = 0; + dmub->outbox1_rb.rptr = 0; + dmub->hw_init = false; return DMUB_STATUS_OK; diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 236657eece47..2f3e239e623d 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -1991,6 +1991,8 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ case IP_VERSION(9, 4, 2): case IP_VERSION(10, 3, 0): case IP_VERSION(11, 0, 0): + case IP_VERSION(11, 0, 1): + case IP_VERSION(11, 0, 2): *states = ATTR_STATE_SUPPORTED; 
break; default: @@ -2007,14 +2009,16 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ gc_ver == IP_VERSION(10, 3, 0) || gc_ver == IP_VERSION(10, 1, 2) || gc_ver == IP_VERSION(11, 0, 0) || - gc_ver == IP_VERSION(11, 0, 2))) + gc_ver == IP_VERSION(11, 0, 2) || + gc_ver == IP_VERSION(11, 0, 3))) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pp_dpm_dclk)) { if (!(gc_ver == IP_VERSION(10, 3, 1) || gc_ver == IP_VERSION(10, 3, 0) || gc_ver == IP_VERSION(10, 1, 2) || gc_ver == IP_VERSION(11, 0, 0) || - gc_ver == IP_VERSION(11, 0, 2))) + gc_ver == IP_VERSION(11, 0, 2) || + gc_ver == IP_VERSION(11, 0, 3))) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pp_power_profile_mode)) { if (amdgpu_dpm_get_power_profile_mode(adev, NULL) == -EOPNOTSUPP) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index ca3beb5d8f27..6ab155023592 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1500,6 +1500,20 @@ static int smu_disable_dpms(struct smu_context *smu) } /* + * For SMU 13.0.4/11, PMFW will handle the features disablement properly + * for gpu reset case. Driver involvement is unnecessary. + */ + if (amdgpu_in_reset(adev)) { + switch (adev->ip_versions[MP1_HWIP][0]) { + case IP_VERSION(13, 0, 4): + case IP_VERSION(13, 0, 11): + return 0; + default: + break; + } + } + + /* * For gpu reset, runpm and hibernation through BACO, * BACO feature has to be kept enabled. 
*/ diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h index d6b964cf73bd..4bc7aee4d44f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h @@ -123,7 +123,8 @@ (1 << FEATURE_DS_FCLK_BIT) | \ (1 << FEATURE_DS_LCLK_BIT) | \ (1 << FEATURE_DS_DCFCLK_BIT) | \ - (1 << FEATURE_DS_UCLK_BIT)) + (1 << FEATURE_DS_UCLK_BIT) | \ + (1ULL << FEATURE_DS_VCN_BIT)) //For use with feature control messages typedef enum { @@ -522,9 +523,9 @@ typedef enum { TEMP_HOTSPOT_M, TEMP_MEM, TEMP_VR_GFX, - TEMP_VR_SOC, TEMP_VR_MEM0, TEMP_VR_MEM1, + TEMP_VR_SOC, TEMP_VR_U, TEMP_LIQUID0, TEMP_LIQUID1, diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h index d6b13933a98f..48a3a3952ceb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h @@ -113,20 +113,21 @@ #define NUM_FEATURES 64 #define ALLOWED_FEATURE_CTRL_DEFAULT 0xFFFFFFFFFFFFFFFFULL -#define ALLOWED_FEATURE_CTRL_SCPM (1 << FEATURE_DPM_GFXCLK_BIT) | \ - (1 << FEATURE_DPM_GFX_POWER_OPTIMIZER_BIT) | \ - (1 << FEATURE_DPM_UCLK_BIT) | \ - (1 << FEATURE_DPM_FCLK_BIT) | \ - (1 << FEATURE_DPM_SOCCLK_BIT) | \ - (1 << FEATURE_DPM_MP0CLK_BIT) | \ - (1 << FEATURE_DPM_LINK_BIT) | \ - (1 << FEATURE_DPM_DCN_BIT) | \ - (1 << FEATURE_DS_GFXCLK_BIT) | \ - (1 << FEATURE_DS_SOCCLK_BIT) | \ - (1 << FEATURE_DS_FCLK_BIT) | \ - (1 << FEATURE_DS_LCLK_BIT) | \ - (1 << FEATURE_DS_DCFCLK_BIT) | \ - (1 << FEATURE_DS_UCLK_BIT) +#define ALLOWED_FEATURE_CTRL_SCPM ((1 << FEATURE_DPM_GFXCLK_BIT) | \ + (1 << FEATURE_DPM_GFX_POWER_OPTIMIZER_BIT) | \ + (1 << FEATURE_DPM_UCLK_BIT) | \ + (1 << FEATURE_DPM_FCLK_BIT) | \ + (1 << FEATURE_DPM_SOCCLK_BIT) | \ + (1 << FEATURE_DPM_MP0CLK_BIT) | \ + 
(1 << FEATURE_DPM_LINK_BIT) | \ + (1 << FEATURE_DPM_DCN_BIT) | \ + (1 << FEATURE_DS_GFXCLK_BIT) | \ + (1 << FEATURE_DS_SOCCLK_BIT) | \ + (1 << FEATURE_DS_FCLK_BIT) | \ + (1 << FEATURE_DS_LCLK_BIT) | \ + (1 << FEATURE_DS_DCFCLK_BIT) | \ + (1 << FEATURE_DS_UCLK_BIT) | \ + (1ULL << FEATURE_DS_VCN_BIT)) //For use with feature control messages typedef enum { diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index e8c6febb8b64..992163e66f7b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -28,11 +28,11 @@ #define SMU13_DRIVER_IF_VERSION_INV 0xFFFFFFFF #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 #define SMU13_DRIVER_IF_VERSION_ALDE 0x08 -#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_0 0x34 +#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_0 0x37 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_4 0x07 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_5 0x04 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_10 0x32 -#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_7 0x35 +#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_7 0x37 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_10 0x1D #define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 4c20d17e7416..508e392547d7 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -145,6 +145,7 @@ static struct cmn2asic_msg_mapping smu_v13_0_0_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(SetBadMemoryPagesRetiredFlagsPerChannel, PPSMC_MSG_SetBadMemoryPagesRetiredFlagsPerChannel, 0), MSG_MAP(AllowGpo, PPSMC_MSG_SetGpoAllow, 0), + MSG_MAP(AllowIHHostInterrupt, PPSMC_MSG_AllowIHHostInterrupt, 0), }; static struct cmn2asic_mapping smu_v13_0_0_clk_map[SMU_CLK_COUNT] = { @@ -406,6 +407,9 @@ static int smu_v13_0_0_setup_pptable(struct smu_context *smu) struct amdgpu_device *adev = 
smu->adev; int ret = 0; + if (amdgpu_sriov_vf(smu->adev)) + return 0; + ret = smu_v13_0_0_get_pptable_from_pmfw(smu, &smu_table->power_play_table, &smu_table->power_play_table_size); @@ -1256,6 +1260,9 @@ static int smu_v13_0_0_get_thermal_temperature_range(struct smu_context *smu, table_context->power_play_table; PPTable_t *pptable = smu->smu_table.driver_pptable; + if (amdgpu_sriov_vf(smu->adev)) + return 0; + if (!range) return -EINVAL; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index e87db7e02e8a..9e1967d8049e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -124,6 +124,7 @@ static struct cmn2asic_msg_mapping smu_v13_0_7_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(DFCstateControl, PPSMC_MSG_SetExternalClientDfCstateAllow, 0), MSG_MAP(ArmD3, PPSMC_MSG_ArmD3, 0), MSG_MAP(AllowGpo, PPSMC_MSG_SetGpoAllow, 0), + MSG_MAP(GetPptLimit, PPSMC_MSG_GetPptLimit, 0), }; static struct cmn2asic_mapping smu_v13_0_7_clk_map[SMU_CLK_COUNT] = { diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index c7443317c747..66a4a41c3fe9 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ b/drivers/gpu/drm/ast/ast_mode.c @@ -714,7 +714,7 @@ static int ast_primary_plane_init(struct ast_private *ast) struct ast_plane *ast_primary_plane = &ast->primary_plane; struct drm_plane *primary_plane = &ast_primary_plane->base; void __iomem *vaddr = ast->vram; - u64 offset = ast->vram_base; + u64 offset = 0; /* with shmem, the primary plane is always at offset 0 */ unsigned long cursor_size = roundup(AST_HWC_SIZE + AST_HWC_SIGNATURE_SIZE, PAGE_SIZE); unsigned long size = ast->vram_fb_available - cursor_size; int ret; @@ -972,7 +972,7 @@ static int ast_cursor_plane_init(struct ast_private *ast) return -ENOMEM; vaddr = ast->vram + ast->vram_fb_available - size; - offset = ast->vram_base + ast->vram_fb_available - size; + offset = 
ast->vram_fb_available - size; ret = ast_plane_init(dev, ast_cursor_plane, vaddr, offset, size, 0x01, &ast_cursor_plane_funcs, diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c index a2f0860b20bb..d751820c6da6 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c @@ -193,6 +193,7 @@ static int snd_dw_hdmi_probe(struct platform_device *pdev) struct hdmi_codec_pdata pdata; struct platform_device *platform; + memset(&pdata, 0, sizeof(pdata)); pdata.ops = &dw_hdmi_i2s_ops; pdata.i2s = 1; pdata.max_i2s_channels = 8; diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c index 51a46689cda7..4ca37261584a 100644 --- a/drivers/gpu/drm/display/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c @@ -3372,6 +3372,9 @@ void drm_dp_remove_payload(struct drm_dp_mst_topology_mgr *mgr, mgr->payload_count--; mgr->next_start_slot -= payload->time_slots; + + if (payload->delete) + drm_dp_mst_put_port_malloc(payload->port); } EXPORT_SYMBOL(drm_dp_remove_payload); @@ -4327,7 +4330,6 @@ int drm_dp_atomic_release_time_slots(struct drm_atomic_state *state, drm_dbg_atomic(mgr->dev, "[MST PORT:%p] TU %d -> 0\n", port, payload->time_slots); if (!payload->delete) { - drm_dp_mst_put_port_malloc(port); payload->pbn = 0; payload->delete = true; topology_state->payload_mask &= ~BIT(payload->vcpi - 1); diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index fd67efe37c63..056ab9d5f313 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -233,21 +233,17 @@ void drm_client_dev_restore(struct drm_device *dev) static void drm_client_buffer_delete(struct drm_client_buffer *buffer) { - struct drm_device *dev = buffer->client->dev; - if (buffer->gem) { drm_gem_vunmap_unlocked(buffer->gem, &buffer->map); drm_gem_object_put(buffer->gem); } - if 
(buffer->handle) - drm_mode_destroy_dumb(dev, buffer->handle, buffer->client->file); - kfree(buffer); } static struct drm_client_buffer * -drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format) +drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, + u32 format, u32 *handle) { const struct drm_format_info *info = drm_format_info(format); struct drm_mode_create_dumb dumb_args = { }; @@ -269,16 +265,15 @@ drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u if (ret) goto err_delete; - buffer->handle = dumb_args.handle; - buffer->pitch = dumb_args.pitch; - obj = drm_gem_object_lookup(client->file, dumb_args.handle); if (!obj) { ret = -ENOENT; goto err_delete; } + buffer->pitch = dumb_args.pitch; buffer->gem = obj; + *handle = dumb_args.handle; return buffer; @@ -365,7 +360,8 @@ static void drm_client_buffer_rmfb(struct drm_client_buffer *buffer) } static int drm_client_buffer_addfb(struct drm_client_buffer *buffer, - u32 width, u32 height, u32 format) + u32 width, u32 height, u32 format, + u32 handle) { struct drm_client_dev *client = buffer->client; struct drm_mode_fb_cmd fb_req = { }; @@ -377,7 +373,7 @@ static int drm_client_buffer_addfb(struct drm_client_buffer *buffer, fb_req.depth = info->depth; fb_req.width = width; fb_req.height = height; - fb_req.handle = buffer->handle; + fb_req.handle = handle; fb_req.pitch = buffer->pitch; ret = drm_mode_addfb(client->dev, &fb_req, client->file); @@ -414,13 +410,24 @@ struct drm_client_buffer * drm_client_framebuffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format) { struct drm_client_buffer *buffer; + u32 handle; int ret; - buffer = drm_client_buffer_create(client, width, height, format); + buffer = drm_client_buffer_create(client, width, height, format, + &handle); if (IS_ERR(buffer)) return buffer; - ret = drm_client_buffer_addfb(buffer, width, height, format); + ret = drm_client_buffer_addfb(buffer, width, 
height, format, handle); + + /* + * The handle is only needed for creating the framebuffer, destroy it + * again to solve a circular dependency should anybody export the GEM + * object as DMA-buf. The framebuffer and our buffer structure are still + * holding references to the GEM object to prevent its destruction. + */ + drm_mode_destroy_dumb(client->dev, handle, client->file); + if (ret) { drm_client_buffer_delete(buffer); return ERR_PTR(ret); diff --git a/drivers/gpu/drm/drm_fbdev_generic.c b/drivers/gpu/drm/drm_fbdev_generic.c index ab8695669279..593aa3283792 100644 --- a/drivers/gpu/drm/drm_fbdev_generic.c +++ b/drivers/gpu/drm/drm_fbdev_generic.c @@ -171,11 +171,6 @@ static const struct fb_ops drm_fbdev_fb_ops = { .fb_imageblit = drm_fbdev_fb_imageblit, }; -static struct fb_deferred_io drm_fbdev_defio = { - .delay = HZ / 20, - .deferred_io = drm_fb_helper_deferred_io, -}; - /* * This function uses the client API to create a framebuffer backed by a dumb buffer. */ @@ -222,8 +217,14 @@ static int drm_fbdev_fb_probe(struct drm_fb_helper *fb_helper, return -ENOMEM; fbi->flags |= FBINFO_VIRTFB | FBINFO_READS_FAST; - fbi->fbdefio = &drm_fbdev_defio; - fb_deferred_io_init(fbi); + /* Set a default deferred I/O handler */ + fb_helper->fbdefio.delay = HZ / 20; + fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; + + fbi->fbdefio = &fb_helper->fbdefio; + ret = fb_deferred_io_init(fbi); + if (ret) + return ret; } else { /* buffer is mapped for HW framebuffer */ ret = drm_client_buffer_vmap(fb_helper->buffer, &map); diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index 7de37f8c68fd..83229a031af0 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -240,27 +240,8 @@ void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr, } EXPORT_SYMBOL(drm_vma_offset_remove); -/** - * drm_vma_node_allow - Add open-file to list of allowed users - * @node: Node to modify - * @tag: Tag of file to remove - 
* - * Add @tag to the list of allowed open-files for this node. If @tag is - * already on this list, the ref-count is incremented. - * - * The list of allowed-users is preserved across drm_vma_offset_add() and - * drm_vma_offset_remove() calls. You may even call it if the node is currently - * not added to any offset-manager. - * - * You must remove all open-files the same number of times as you added them - * before destroying the node. Otherwise, you will leak memory. - * - * This is locked against concurrent access internally. - * - * RETURNS: - * 0 on success, negative error code on internal failure (out-of-mem) - */ -int drm_vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag) +static int vma_node_allow(struct drm_vma_offset_node *node, + struct drm_file *tag, bool ref_counted) { struct rb_node **iter; struct rb_node *parent = NULL; @@ -282,7 +263,8 @@ int drm_vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag) entry = rb_entry(*iter, struct drm_vma_offset_file, vm_rb); if (tag == entry->vm_tag) { - entry->vm_count++; + if (ref_counted) + entry->vm_count++; goto unlock; } else if (tag > entry->vm_tag) { iter = &(*iter)->rb_right; @@ -307,9 +289,59 @@ unlock: kfree(new); return ret; } + +/** + * drm_vma_node_allow - Add open-file to list of allowed users + * @node: Node to modify + * @tag: Tag of file to remove + * + * Add @tag to the list of allowed open-files for this node. If @tag is + * already on this list, the ref-count is incremented. + * + * The list of allowed-users is preserved across drm_vma_offset_add() and + * drm_vma_offset_remove() calls. You may even call it if the node is currently + * not added to any offset-manager. + * + * You must remove all open-files the same number of times as you added them + * before destroying the node. Otherwise, you will leak memory. + * + * This is locked against concurrent access internally. 
+ * + * RETURNS: + * 0 on success, negative error code on internal failure (out-of-mem) + */ +int drm_vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag) +{ + return vma_node_allow(node, tag, true); +} EXPORT_SYMBOL(drm_vma_node_allow); /** + * drm_vma_node_allow_once - Add open-file to list of allowed users + * @node: Node to modify + * @tag: Tag of file to remove + * + * Add @tag to the list of allowed open-files for this node. + * + * The list of allowed-users is preserved across drm_vma_offset_add() and + * drm_vma_offset_remove() calls. You may even call it if the node is currently + * not added to any offset-manager. + * + * This is not ref-counted unlike drm_vma_node_allow() hence drm_vma_node_revoke() + * should only be called once after this. + * + * This is locked against concurrent access internally. + * + * RETURNS: + * 0 on success, negative error code on internal failure (out-of-mem) + */ +int drm_vma_node_allow_once(struct drm_vma_offset_node *node, struct drm_file *tag) +{ + return vma_node_allow(node, tag, false); +} +EXPORT_SYMBOL(drm_vma_node_allow_once); + +/** * drm_vma_node_revoke - Remove open-file from list of allowed users * @node: Node to modify * @tag: Tag of file to remove diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index 572a4e3769f3..a491e6c38875 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -2466,6 +2466,22 @@ static enum port dvo_port_to_port(struct drm_i915_private *i915, dvo_port); } +static enum port +dsi_dvo_port_to_port(struct drm_i915_private *i915, u8 dvo_port) +{ + switch (dvo_port) { + case DVO_PORT_MIPIA: + return PORT_A; + case DVO_PORT_MIPIC: + if (DISPLAY_VER(i915) >= 11) + return PORT_B; + else + return PORT_C; + default: + return PORT_NONE; + } +} + static int parse_bdb_230_dp_max_link_rate(const int vbt_max_link_rate) { switch (vbt_max_link_rate) { @@ -3414,19 +3430,16 @@ bool 
intel_bios_is_dsi_present(struct drm_i915_private *i915, dvo_port = child->dvo_port; - if (dvo_port == DVO_PORT_MIPIA || - (dvo_port == DVO_PORT_MIPIB && DISPLAY_VER(i915) >= 11) || - (dvo_port == DVO_PORT_MIPIC && DISPLAY_VER(i915) < 11)) { - if (port) - *port = dvo_port - DVO_PORT_MIPIA; - return true; - } else if (dvo_port == DVO_PORT_MIPIB || - dvo_port == DVO_PORT_MIPIC || - dvo_port == DVO_PORT_MIPID) { + if (dsi_dvo_port_to_port(i915, dvo_port) == PORT_NONE) { drm_dbg_kms(&i915->drm, "VBT has unsupported DSI port %c\n", port_name(dvo_port - DVO_PORT_MIPIA)); + continue; } + + if (port) + *port = dsi_dvo_port_to_port(i915, dvo_port); + return true; } return false; @@ -3511,7 +3524,7 @@ bool intel_bios_get_dsc_params(struct intel_encoder *encoder, if (!(child->device_type & DEVICE_TYPE_MIPI_OUTPUT)) continue; - if (child->dvo_port - DVO_PORT_MIPIA == encoder->port) { + if (dsi_dvo_port_to_port(i915, child->dvo_port) == encoder->port) { if (!devdata->dsc) return false; diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index b74e36d76013..407a477939e5 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -1319,7 +1319,7 @@ static const struct intel_cdclk_vals adlp_cdclk_table[] = { { .refclk = 24000, .cdclk = 192000, .divider = 2, .ratio = 16 }, { .refclk = 24000, .cdclk = 312000, .divider = 2, .ratio = 26 }, { .refclk = 24000, .cdclk = 552000, .divider = 2, .ratio = 46 }, - { .refclk = 24400, .cdclk = 648000, .divider = 2, .ratio = 54 }, + { .refclk = 24000, .cdclk = 648000, .divider = 2, .ratio = 54 }, { .refclk = 38400, .cdclk = 179200, .divider = 3, .ratio = 14 }, { .refclk = 38400, .cdclk = 192000, .divider = 2, .ratio = 10 }, diff --git a/drivers/gpu/drm/i915/display/intel_fbdev.c b/drivers/gpu/drm/i915/display/intel_fbdev.c index 5575d7abdc09..f76c06b7f1d4 100644 --- a/drivers/gpu/drm/i915/display/intel_fbdev.c +++ 
b/drivers/gpu/drm/i915/display/intel_fbdev.c @@ -328,8 +328,20 @@ out_unlock: return ret; } +static int intelfb_dirty(struct drm_fb_helper *helper, struct drm_clip_rect *clip) +{ + if (!(clip->x1 < clip->x2 && clip->y1 < clip->y2)) + return 0; + + if (helper->fb->funcs->dirty) + return helper->fb->funcs->dirty(helper->fb, NULL, 0, 0, clip, 1); + + return 0; +} + static const struct drm_fb_helper_funcs intel_fb_helper_funcs = { .fb_probe = intelfb_create, + .fb_dirty = intelfb_dirty, }; static void intel_fbdev_destroy(struct intel_fbdev *ifbdev) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index e0766d1be966..11554645e6ee 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1587,7 +1587,8 @@ skl_crtc_allocate_plane_ddb(struct intel_atomic_state *state, skl_check_wm_level(&wm->wm[level], ddb); if (icl_need_wm1_wa(i915, plane_id) && - level == 1 && wm->wm[0].enable) { + level == 1 && !wm->wm[level].enable && + wm->wm[0].enable) { wm->wm[level].blocks = wm->wm[0].blocks; wm->wm[level].lines = wm->wm[0].lines; wm->wm[level].ignore_lines = wm->wm[0].ignore_lines; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 6250de9b9196..e4b78ab4773b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1861,11 +1861,19 @@ static int get_ppgtt(struct drm_i915_file_private *file_priv, vm = ctx->vm; GEM_BUG_ON(!vm); + /* + * Get a reference for the allocated handle. Once the handle is + * visible in the vm_xa table, userspace could try to close it + * from under our feet, so we need to hold the extra reference + * first. 
+ */ + i915_vm_get(vm); + err = xa_alloc(&file_priv->vm_xa, &id, vm, xa_limit_32b, GFP_KERNEL); - if (err) + if (err) { + i915_vm_put(vm); return err; - - i915_vm_get(vm); + } GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */ args->value = id; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index f266b68cf012..0f2e056c02dd 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -3483,6 +3483,13 @@ err_request: eb.composite_fence : &eb.requests[0]->fence); + if (unlikely(eb.gem_context->syncobj)) { + drm_syncobj_replace_fence(eb.gem_context->syncobj, + eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); + } + if (out_fence) { if (err == 0) { fd_install(out_fence_fd, out_fence->file); @@ -3494,13 +3501,6 @@ err_request: } } - if (unlikely(eb.gem_context->syncobj)) { - drm_syncobj_replace_fence(eb.gem_context->syncobj, - eb.composite_fence ? 
- eb.composite_fence : - &eb.requests[0]->fence); - } - if (!out_fence && eb.composite_fence) dma_fence_put(eb.composite_fence); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 0ad44f3868de..c7c252d4d366 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -697,7 +697,7 @@ insert: GEM_BUG_ON(lookup_mmo(obj, mmap_type) != mmo); out: if (file) - drm_vma_node_allow(&mmo->vma_node, file); + drm_vma_node_allow_once(&mmo->vma_node, file); return mmo; err: diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 9c759df700ca..937728840428 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -579,7 +579,7 @@ static int shmem_object_init(struct intel_memory_region *mem, mapping_set_gfp_mask(mapping, mask); GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM)); - i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, 0); + i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, flags); obj->mem_flags |= I915_BO_FLAG_STRUCT_PAGE; obj->write_domain = I915_GEM_DOMAIN_CPU; obj->read_domains = I915_GEM_DOMAIN_CPU; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c index fd42b89b7162..bc21b1c2350a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c @@ -305,10 +305,6 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, spin_unlock(&obj->vma.lock); obj->tiling_and_stride = tiling | stride; - i915_gem_object_unlock(obj); - - /* Force the fence to be reacquired for GTT access */ - i915_gem_object_release_mmap_gtt(obj); /* Try to preallocate memory required to save swizzling on put-pages */ if (i915_gem_object_needs_bit17_swizzle(obj)) { @@ -321,6 +317,11 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, obj->bit_17 = NULL; } + 
i915_gem_object_unlock(obj); + + /* Force the fence to be reacquired for GTT access */ + i915_gem_object_release_mmap_gtt(obj); + return 0; } diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e94365b08f1e..2aa63ec521b8 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -528,7 +528,7 @@ retry: return rq; } -struct i915_request *intel_context_find_active_request(struct intel_context *ce) +struct i915_request *intel_context_get_active_request(struct intel_context *ce) { struct intel_context *parent = intel_context_to_parent(ce); struct i915_request *rq, *active = NULL; @@ -552,6 +552,8 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce) active = rq; } + if (active) + active = i915_request_get_rcu(active); spin_unlock_irqrestore(&parent->guc_state.lock, flags); return active; diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index fb62b7b8cbcd..0a8d553da3f4 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -268,8 +268,7 @@ int intel_context_prepare_remote_request(struct intel_context *ce, struct i915_request *intel_context_create_request(struct intel_context *ce); -struct i915_request * -intel_context_find_active_request(struct intel_context *ce); +struct i915_request *intel_context_get_active_request(struct intel_context *ce); static inline bool intel_context_is_barrier(const struct intel_context *ce) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index cbc8b857d5f7..7a4504ea35c3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -248,8 +248,8 @@ void intel_engine_dump_active_requests(struct list_head *requests, ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now); -struct i915_request * 
-intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine); +void intel_engine_get_hung_entity(struct intel_engine_cs *engine, + struct intel_context **ce, struct i915_request **rq); u32 intel_engine_context_size(struct intel_gt *gt, u8 class); struct intel_context * diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index c33e0d72d670..d37931e16fd9 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -2094,17 +2094,6 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq) } } -static unsigned long list_count(struct list_head *list) -{ - struct list_head *pos; - unsigned long count = 0; - - list_for_each(pos, list) - count++; - - return count; -} - static unsigned long read_ul(void *p, size_t x) { return *(unsigned long *)(p + x); @@ -2196,11 +2185,11 @@ void intel_engine_dump_active_requests(struct list_head *requests, } } -static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m) +static void engine_dump_active_requests(struct intel_engine_cs *engine, + struct drm_printer *m) { + struct intel_context *hung_ce = NULL; struct i915_request *hung_rq = NULL; - struct intel_context *ce; - bool guc; /* * No need for an engine->irq_seqno_barrier() before the seqno reads. @@ -2209,27 +2198,22 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d * But the intention here is just to report an instantaneous snapshot * so that's fine. 
*/ - lockdep_assert_held(&engine->sched_engine->lock); + intel_engine_get_hung_entity(engine, &hung_ce, &hung_rq); drm_printf(m, "\tRequests:\n"); - guc = intel_uc_uses_guc_submission(&engine->gt->uc); - if (guc) { - ce = intel_engine_get_hung_context(engine); - if (ce) - hung_rq = intel_context_find_active_request(ce); - } else { - hung_rq = intel_engine_execlist_find_hung_request(engine); - } - if (hung_rq) engine_dump_request(hung_rq, m, "\t\thung"); + else if (hung_ce) + drm_printf(m, "\t\tGot hung ce but no hung rq!\n"); - if (guc) + if (intel_uc_uses_guc_submission(&engine->gt->uc)) intel_guc_dump_active_requests(engine, hung_rq, m); else - intel_engine_dump_active_requests(&engine->sched_engine->requests, - hung_rq, m); + intel_execlists_dump_active_requests(engine, hung_rq, m); + + if (hung_rq) + i915_request_put(hung_rq); } void intel_engine_dump(struct intel_engine_cs *engine, @@ -2239,7 +2223,6 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct i915_gpu_error * const error = &engine->i915->gpu_error; struct i915_request *rq; intel_wakeref_t wakeref; - unsigned long flags; ktime_t dummy; if (header) { @@ -2276,13 +2259,8 @@ void intel_engine_dump(struct intel_engine_cs *engine, i915_reset_count(error)); print_properties(engine, m); - spin_lock_irqsave(&engine->sched_engine->lock, flags); engine_dump_active_requests(engine, m); - drm_printf(m, "\tOn hold?: %lu\n", - list_count(&engine->sched_engine->hold)); - spin_unlock_irqrestore(&engine->sched_engine->lock, flags); - drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base); wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm); if (wakeref) { @@ -2328,8 +2306,7 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings, return siblings[0]->cops->create_virtual(siblings, count, flags); } -struct i915_request * -intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) +static struct i915_request *engine_execlist_find_hung_request(struct intel_engine_cs *engine) 
{ struct i915_request *request, *active = NULL; @@ -2381,6 +2358,33 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) return active; } +void intel_engine_get_hung_entity(struct intel_engine_cs *engine, + struct intel_context **ce, struct i915_request **rq) +{ + unsigned long flags; + + *ce = intel_engine_get_hung_context(engine); + if (*ce) { + intel_engine_clear_hung_context(engine); + + *rq = intel_context_get_active_request(*ce); + return; + } + + /* + * Getting here with GuC enabled means it is a forced error capture + * with no actual hang. So, no need to attempt the execlist search. + */ + if (intel_uc_uses_guc_submission(&engine->gt->uc)) + return; + + spin_lock_irqsave(&engine->sched_engine->lock, flags); + *rq = engine_execlist_find_hung_request(engine); + if (*rq) + *rq = i915_request_get_rcu(*rq); + spin_unlock_irqrestore(&engine->sched_engine->lock, flags); +} + void xehp_enable_ccs_engines(struct intel_engine_cs *engine) { /* diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 2daffa7c7dfd..21cb5b69d82e 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -4148,6 +4148,33 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, spin_unlock_irqrestore(&sched_engine->lock, flags); } +static unsigned long list_count(struct list_head *list) +{ + struct list_head *pos; + unsigned long count = 0; + + list_for_each(pos, list) + count++; + + return count; +} + +void intel_execlists_dump_active_requests(struct intel_engine_cs *engine, + struct i915_request *hung_rq, + struct drm_printer *m) +{ + unsigned long flags; + + spin_lock_irqsave(&engine->sched_engine->lock, flags); + + intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m); + + drm_printf(m, "\tOn hold?: %lu\n", + list_count(&engine->sched_engine->hold)); + + 
spin_unlock_irqrestore(&engine->sched_engine->lock, flags); +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_execlists.c" #endif diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h index a1aa92c983a5..d2c7d45ea062 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h @@ -32,6 +32,10 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, int indent), unsigned int max); +void intel_execlists_dump_active_requests(struct intel_engine_cs *engine, + struct i915_request *hung_rq, + struct drm_printer *m); + bool intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine); diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 7771a19008c6..bbeeb6dde7ae 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -288,39 +288,6 @@ static const u8 dg2_xcs_offsets[] = { END }; -static const u8 mtl_xcs_offsets[] = { - NOP(1), - LRI(13, POSTED), - REG16(0x244), - REG(0x034), - REG(0x030), - REG(0x038), - REG(0x03c), - REG(0x168), - REG(0x140), - REG(0x110), - REG(0x1c0), - REG(0x1c4), - REG(0x1c8), - REG(0x180), - REG16(0x2b4), - NOP(4), - - NOP(1), - LRI(9, POSTED), - REG16(0x3a8), - REG16(0x28c), - REG16(0x288), - REG16(0x284), - REG16(0x280), - REG16(0x27c), - REG16(0x278), - REG16(0x274), - REG16(0x270), - - END -}; - static const u8 gen8_rcs_offsets[] = { NOP(1), LRI(14, POSTED), @@ -739,9 +706,7 @@ static const u8 *reg_offsets(const struct intel_engine_cs *engine) else return gen8_rcs_offsets; } else { - if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70)) - return mtl_xcs_offsets; - else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) return dg2_xcs_offsets; else if (GRAPHICS_VER(engine->i915) >= 12) return gen12_xcs_offsets; diff --git 
a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 949c19339015..a0740308555d 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -1355,6 +1355,13 @@ icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) GAMT_CHKN_BIT_REG, GAMT_CHKN_DISABLE_L3_COH_PIPE); + /* + * Wa_1408615072:icl,ehl (vsunit) + * Wa_1407596294:icl,ehl (hsunit) + */ + wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, + VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); + /* Wa_1407352427:icl,ehl */ wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, PSDUNIT_CLKGATE_DIS); @@ -2540,13 +2547,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN11_ENABLE_32_PLANE_MODE); /* - * Wa_1408615072:icl,ehl (vsunit) - * Wa_1407596294:icl,ehl (hsunit) - */ - wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, - VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); - - /* * Wa_1408767742:icl[a2..forever],ehl[all] * Wa_1605460711:icl[a0..c0] */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0a42f1807f52..c10977cb06b9 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1702,7 +1702,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st goto next_context; guilty = false; - rq = intel_context_find_active_request(ce); + rq = intel_context_get_active_request(ce); if (!rq) { head = ce->ring->tail; goto out_replay; @@ -1715,6 +1715,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st head = intel_ring_wrap(ce->ring, rq->head); __i915_request_reset(rq, guilty); + i915_request_put(rq); out_replay: guc_reset_state(ce, head, guilty); next_context: @@ -4817,6 +4818,8 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) xa_lock_irqsave(&guc->context_lookup, flags); 
xa_for_each(&guc->context_lookup, index, ce) { + bool found; + if (!kref_get_unless_zero(&ce->ref)) continue; @@ -4833,10 +4836,18 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) goto next; } + found = false; + spin_lock(&ce->guc_state.lock); list_for_each_entry(rq, &ce->guc_state.requests, sched.link) { if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE) continue; + found = true; + break; + } + spin_unlock(&ce->guc_state.lock); + + if (found) { intel_engine_set_hung_context(engine, ce); /* Can only cope with one hang at a time... */ @@ -4844,6 +4855,7 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) xa_lock(&guc->context_lookup); goto done; } + next: intel_context_put(ce); xa_lock(&guc->context_lookup); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9d5d5a397b64..b20bd6365615 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1596,43 +1596,20 @@ capture_engine(struct intel_engine_cs *engine, { struct intel_engine_capture_vma *capture = NULL; struct intel_engine_coredump *ee; - struct intel_context *ce; + struct intel_context *ce = NULL; struct i915_request *rq = NULL; - unsigned long flags; ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags); if (!ee) return NULL; - ce = intel_engine_get_hung_context(engine); - if (ce) { - intel_engine_clear_hung_context(engine); - rq = intel_context_find_active_request(ce); - if (!rq || !i915_request_started(rq)) - goto no_request_capture; - } else { - /* - * Getting here with GuC enabled means it is a forced error capture - * with no actual hang. So, no need to attempt the execlist search. 
- */ - if (!intel_uc_uses_guc_submission(&engine->gt->uc)) { - spin_lock_irqsave(&engine->sched_engine->lock, flags); - rq = intel_engine_execlist_find_hung_request(engine); - spin_unlock_irqrestore(&engine->sched_engine->lock, - flags); - } - } - if (rq) - rq = i915_request_get_rcu(rq); - - if (!rq) + intel_engine_get_hung_entity(engine, &ce, &rq); + if (!rq || !i915_request_started(rq)) goto no_request_capture; capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); - if (!capture) { - i915_request_put(rq); + if (!capture) goto no_request_capture; - } if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) intel_guc_capture_get_matching_node(engine->gt, ee, ce); @@ -1642,6 +1619,8 @@ capture_engine(struct intel_engine_cs *engine, return ee; no_request_capture: + if (rq) + i915_request_put(rq); kfree(ee); return NULL; } diff --git a/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c b/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c index 310fb83c527e..2990dd4d4a0d 100644 --- a/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c +++ b/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c @@ -28,8 +28,7 @@ struct intel_engine_cs *intel_selftest_find_any_engine(struct intel_gt *gt) int intel_selftest_modify_policy(struct intel_engine_cs *engine, struct intel_selftest_saved_policy *saved, - u32 modify_type) - + enum selftest_scheduler_modify modify_type) { int err; diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h index 40768373cdd9..c5a4f49ee206 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h @@ -97,6 +97,7 @@ int gp100_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct n int gp102_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int gp10b_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int gv100_fb_new(struct nvkm_device *, 
enum nvkm_subdev_type, int inst, struct nvkm_fb **); +int tu102_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int ga100_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int ga102_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); diff --git a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c index fcf2a002f6cb..91fb494d4009 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c @@ -151,6 +151,9 @@ nvkm_firmware_mem_page(struct nvkm_memory *memory) static enum nvkm_memory_target nvkm_firmware_mem_target(struct nvkm_memory *memory) { + if (nvkm_firmware_mem(memory)->device->func->tegra) + return NVKM_MEM_TARGET_NCOH; + return NVKM_MEM_TARGET_HOST; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 364fea320cb3..1c81e5b34d29 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2405,7 +2405,7 @@ nv162_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2440,7 +2440,7 @@ nv164_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2475,7 +2475,7 @@ nv166_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, 
tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2510,7 +2510,7 @@ nv167_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2545,7 +2545,7 @@ nv168_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c b/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c index 393ade9f7e6c..b7da3ab44c27 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c @@ -48,6 +48,16 @@ gm200_flcn_pio_dmem_rd(struct nvkm_falcon *falcon, u8 port, const u8 *img, int l img += 4; len -= 4; } + + /* Sigh. Tegra PMU FW's init message... 
*/ + if (len) { + u32 data = nvkm_falcon_rd32(falcon, 0x1c4 + (port * 8)); + + while (len--) { + *(u8 *)img++ = data & 0xff; + data >>= 8; + } + } } static void @@ -64,6 +74,8 @@ gm200_flcn_pio_dmem_wr(struct nvkm_falcon *falcon, u8 port, const u8 *img, int l img += 4; len -= 4; } + + WARN_ON(len); } static void @@ -74,7 +86,7 @@ gm200_flcn_pio_dmem_wr_init(struct nvkm_falcon *falcon, u8 port, bool sec, u32 d const struct nvkm_falcon_func_pio gm200_flcn_dmem_pio = { - .min = 4, + .min = 1, .max = 0x100, .wr_init = gm200_flcn_pio_dmem_wr_init, .wr = gm200_flcn_pio_dmem_wr, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c index 634f64f88fc8..81a1ad2c88a7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c @@ -65,10 +65,33 @@ tu102_devinit_pll_set(struct nvkm_devinit *init, u32 type, u32 freq) return ret; } +static int +tu102_devinit_wait(struct nvkm_device *device) +{ + unsigned timeout = 50 + 2000; + + do { + if (nvkm_rd32(device, 0x118128) & 0x00000001) { + if ((nvkm_rd32(device, 0x118234) & 0x000000ff) == 0xff) + return 0; + } + + usleep_range(1000, 2000); + } while (timeout--); + + return -ETIMEDOUT; +} + int tu102_devinit_post(struct nvkm_devinit *base, bool post) { struct nv50_devinit *init = nv50_devinit(base); + int ret; + + ret = tu102_devinit_wait(init->base.subdev.device); + if (ret) + return ret; + gm200_devinit_preos(init, post); return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild index 5d0bab8ecb43..6ba5120a2ebe 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild @@ -32,6 +32,7 @@ nvkm-y += nvkm/subdev/fb/gp100.o nvkm-y += nvkm/subdev/fb/gp102.o nvkm-y += nvkm/subdev/fb/gp10b.o nvkm-y += nvkm/subdev/fb/gv100.o +nvkm-y += nvkm/subdev/fb/tu102.o nvkm-y += nvkm/subdev/fb/ga100.o nvkm-y += 
nvkm/subdev/fb/ga102.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c index 8b7c8ea5e8a5..5a21b0ae4595 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c @@ -40,12 +40,6 @@ ga102_fb_vpr_scrub(struct nvkm_fb *fb) return ret; } -static bool -ga102_fb_vpr_scrub_required(struct nvkm_fb *fb) -{ - return (nvkm_rd32(fb->subdev.device, 0x1fa80c) & 0x00000010) != 0; -} - static const struct nvkm_fb_func ga102_fb = { .dtor = gf100_fb_dtor, @@ -56,7 +50,7 @@ ga102_fb = { .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, .ram_new = ga102_ram_new, .default_bigpage = 16, - .vpr.scrub_required = ga102_fb_vpr_scrub_required, + .vpr.scrub_required = tu102_fb_vpr_scrub_required, .vpr.scrub = ga102_fb_vpr_scrub, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c index 1f0126437c1a..0e3c0a8f5d71 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c @@ -49,8 +49,3 @@ gv100_fb_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, s } MODULE_FIRMWARE("nvidia/gv100/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu102/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu104/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu106/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu116/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu117/nvdec/scrubber.bin"); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h index ac03eac0f261..f517751f94ac 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h @@ -89,4 +89,6 @@ bool gp102_fb_vpr_scrub_required(struct nvkm_fb *); int gp102_fb_vpr_scrub(struct nvkm_fb *); int gv100_fb_init_page(struct nvkm_fb *); + +bool tu102_fb_vpr_scrub_required(struct nvkm_fb *); #endif diff --git 
a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/tu102.c new file mode 100644 index 000000000000..be82af0364ee --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/tu102.c @@ -0,0 +1,55 @@ +/* + * Copyright 2018 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "gf100.h" +#include "ram.h" + +bool +tu102_fb_vpr_scrub_required(struct nvkm_fb *fb) +{ + return (nvkm_rd32(fb->subdev.device, 0x1fa80c) & 0x00000010) != 0; +} + +static const struct nvkm_fb_func +tu102_fb = { + .dtor = gf100_fb_dtor, + .oneinit = gf100_fb_oneinit, + .init = gm200_fb_init, + .init_page = gv100_fb_init_page, + .init_unkn = gp100_fb_init_unkn, + .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, + .vpr.scrub_required = tu102_fb_vpr_scrub_required, + .vpr.scrub = gp102_fb_vpr_scrub, + .ram_new = gp100_ram_new, + .default_bigpage = 16, +}; + +int +tu102_fb_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, struct nvkm_fb **pfb) +{ + return gp102_fb_new_(&tu102_fb, device, type, inst, pfb); +} + +MODULE_FIRMWARE("nvidia/tu102/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu104/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu106/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu116/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu117/nvdec/scrubber.bin"); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c index a72403777329..2ed04da3621d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c @@ -225,7 +225,7 @@ gm20b_pmu_init(struct nvkm_pmu *pmu) pmu->initmsg_received = false; - nvkm_falcon_load_dmem(falcon, &args, addr_args, sizeof(args), 0); + nvkm_falcon_pio_wr(falcon, (u8 *)&args, 0, 0, DMEM, addr_args, sizeof(args), 0, false); nvkm_falcon_start(falcon); return 0; } diff --git a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c index 857a2f0420d7..c924f1124ebc 100644 --- a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c +++ b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c @@ -1193,14 +1193,11 @@ static int boe_panel_enter_sleep_mode(struct boe_panel *boe) return 0; } -static int boe_panel_unprepare(struct drm_panel *panel) +static int 
boe_panel_disable(struct drm_panel *panel) { struct boe_panel *boe = to_boe_panel(panel); int ret; - if (!boe->prepared) - return 0; - ret = boe_panel_enter_sleep_mode(boe); if (ret < 0) { dev_err(panel->dev, "failed to set panel off: %d\n", ret); @@ -1209,6 +1206,16 @@ static int boe_panel_unprepare(struct drm_panel *panel) msleep(150); + return 0; +} + +static int boe_panel_unprepare(struct drm_panel *panel) +{ + struct boe_panel *boe = to_boe_panel(panel); + + if (!boe->prepared) + return 0; + if (boe->desc->discharge_on_disable) { regulator_disable(boe->avee); regulator_disable(boe->avdd); @@ -1528,6 +1535,7 @@ static enum drm_panel_orientation boe_panel_get_orientation(struct drm_panel *pa } static const struct drm_panel_funcs boe_panel_funcs = { + .disable = boe_panel_disable, .unprepare = boe_panel_unprepare, .prepare = boe_panel_prepare, .enable = boe_panel_enable, diff --git a/drivers/gpu/drm/solomon/ssd130x.c b/drivers/gpu/drm/solomon/ssd130x.c index 53464afc2b9a..91f69e62430b 100644 --- a/drivers/gpu/drm/solomon/ssd130x.c +++ b/drivers/gpu/drm/solomon/ssd130x.c @@ -656,18 +656,8 @@ static const struct drm_crtc_helper_funcs ssd130x_crtc_helper_funcs = { .atomic_check = drm_crtc_helper_atomic_check, }; -static void ssd130x_crtc_reset(struct drm_crtc *crtc) -{ - struct drm_device *drm = crtc->dev; - struct ssd130x_device *ssd130x = drm_to_ssd130x(drm); - - ssd130x_init(ssd130x); - - drm_atomic_helper_crtc_reset(crtc); -} - static const struct drm_crtc_funcs ssd130x_crtc_funcs = { - .reset = ssd130x_crtc_reset, + .reset = drm_atomic_helper_crtc_reset, .destroy = drm_crtc_cleanup, .set_config = drm_atomic_helper_set_config, .page_flip = drm_atomic_helper_page_flip, @@ -686,6 +676,12 @@ static void ssd130x_encoder_helper_atomic_enable(struct drm_encoder *encoder, if (ret) return; + ret = ssd130x_init(ssd130x); + if (ret) { + ssd130x_power_off(ssd130x); + return; + } + ssd130x_write_cmd(ssd130x, 1, SSD130X_DISPLAY_ON); backlight_enable(ssd130x->bl_dev); diff 
--git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index 0108613e79d5..7258975331ca 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -711,7 +711,7 @@ static int vc4_crtc_atomic_check(struct drm_crtc *crtc, struct vc4_encoder *vc4_encoder = to_vc4_encoder(encoder); if (vc4_encoder->type == VC4_ENCODER_TYPE_HDMI0) { - vc4_state->hvs_load = max(mode->clock * mode->hdisplay / mode->htotal + 1000, + vc4_state->hvs_load = max(mode->clock * mode->hdisplay / mode->htotal + 8000, mode->clock * 9 / 10) * 1000; } else { vc4_state->hvs_load = mode->clock * 1000; diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 12a00d644b61..7546103f1499 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -97,6 +97,10 @@ #define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1_SHIFT 8 #define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1_MASK VC4_MASK(15, 8) +#define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_MASK VC4_MASK(7, 0) +#define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_SET_AVMUTE BIT(0) +#define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_CLEAR_AVMUTE BIT(4) + # define VC4_HD_M_SW_RST BIT(2) # define VC4_HD_M_ENABLE BIT(0) @@ -1306,7 +1310,6 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, VC4_HDMI_VERTB_VBP)); unsigned long flags; unsigned char gcp; - bool gcp_en; u32 reg; int idx; @@ -1341,16 +1344,13 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, switch (vc4_state->output_bpc) { case 12: gcp = 6; - gcp_en = true; break; case 10: gcp = 5; - gcp_en = true; break; case 8: default: - gcp = 4; - gcp_en = false; + gcp = 0; break; } @@ -1359,8 +1359,7 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, * doesn't signal in GCP. 
*/ if (vc4_state->output_format == VC4_HDMI_OUTPUT_YUV422) { - gcp = 4; - gcp_en = false; + gcp = 0; } reg = HDMI_READ(HDMI_DEEP_COLOR_CONFIG_1); @@ -1373,11 +1372,12 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, reg = HDMI_READ(HDMI_GCP_WORD_1); reg &= ~VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1_MASK; reg |= VC4_SET_FIELD(gcp, VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1); + reg &= ~VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_MASK; + reg |= VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_CLEAR_AVMUTE; HDMI_WRITE(HDMI_GCP_WORD_1, reg); reg = HDMI_READ(HDMI_GCP_CONFIG); - reg &= ~VC5_HDMI_GCP_CONFIG_GCP_ENABLE; - reg |= gcp_en ? VC5_HDMI_GCP_CONFIG_GCP_ENABLE : 0; + reg |= VC5_HDMI_GCP_CONFIG_GCP_ENABLE; HDMI_WRITE(HDMI_GCP_CONFIG, reg); reg = HDMI_READ(HDMI_MISC_CONTROL); @@ -3018,7 +3018,8 @@ static int vc4_hdmi_cec_init(struct vc4_hdmi *vc4_hdmi) } vc4_hdmi->cec_adap = cec_allocate_adapter(&vc4_hdmi_cec_adap_ops, - vc4_hdmi, "vc4", + vc4_hdmi, + vc4_hdmi->variant->card_name, CEC_CAP_DEFAULTS | CEC_CAP_CONNECTOR_INFO, 1); ret = PTR_ERR_OR_ZERO(vc4_hdmi->cec_adap); diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c index 8b92a45a3c89..bd5acc4a8687 100644 --- a/drivers/gpu/drm/vc4/vc4_plane.c +++ b/drivers/gpu/drm/vc4/vc4_plane.c @@ -340,7 +340,7 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state) { struct vc4_plane_state *vc4_state = to_vc4_plane_state(state); struct drm_framebuffer *fb = state->fb; - struct drm_gem_dma_object *bo = drm_fb_dma_get_gem_obj(fb, 0); + struct drm_gem_dma_object *bo; int num_planes = fb->format->num_planes; struct drm_crtc_state *crtc_state; u32 h_subsample = fb->format->hsub; @@ -359,8 +359,10 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state) if (ret) return ret; - for (i = 0; i < num_planes; i++) + for (i = 0; i < num_planes; i++) { + bo = drm_fb_dma_get_gem_obj(fb, i); vc4_state->offsets[i] = bo->dma_addr + fb->offsets[i]; + } /* * We don't 
support subpixel source positioning for scaling, diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c index 9f4a90493aea..da45215a933d 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c @@ -126,7 +126,6 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data, void __user *user_bo_handles = NULL; struct virtio_gpu_object_array *buflist = NULL; struct sync_file *sync_file; - int in_fence_fd = exbuf->fence_fd; int out_fence_fd = -1; void *buf; uint64_t fence_ctx; @@ -152,13 +151,11 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data, ring_idx = exbuf->ring_idx; } - exbuf->fence_fd = -1; - virtio_gpu_create_context(dev, file); if (exbuf->flags & VIRTGPU_EXECBUF_FENCE_FD_IN) { struct dma_fence *in_fence; - in_fence = sync_file_get_fence(in_fence_fd); + in_fence = sync_file_get_fence(exbuf->fence_fd); if (!in_fence) return -EINVAL; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index aa1cd5126a32..4dcf2eb7aa80 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -462,6 +462,9 @@ int vmw_bo_create(struct vmw_private *vmw, return -ENOMEM; } + /* + * vmw_bo_init will delete the *p_bo object if it fails + */ ret = vmw_bo_init(vmw, *p_bo, size, placement, interruptible, pin, bo_free); @@ -470,7 +473,6 @@ int vmw_bo_create(struct vmw_private *vmw, return ret; out_error: - kfree(*p_bo); *p_bo = NULL; return ret; } @@ -596,6 +598,7 @@ static int vmw_user_bo_synccpu_release(struct drm_file *filp, ttm_bo_put(&vmw_bo->base); } + drm_gem_object_put(&vmw_bo->base.base); return ret; } @@ -636,6 +639,7 @@ int vmw_user_bo_synccpu_ioctl(struct drm_device *dev, void *data, ret = vmw_user_bo_synccpu_grab(vbo, arg->flags); vmw_bo_unreference(&vbo); + drm_gem_object_put(&vbo->base.base); if (unlikely(ret != 0)) { if (ret == -ERESTARTSYS || ret == -EBUSY) return -EBUSY; @@ -693,7 +697,7 
@@ int vmw_bo_unref_ioctl(struct drm_device *dev, void *data, * struct vmw_buffer_object should be placed. * Return: Zero on success, Negative error code on error. * - * The vmw buffer object pointer will be refcounted. + * The vmw buffer object pointer will be refcounted (both ttm and gem) */ int vmw_user_bo_lookup(struct drm_file *filp, uint32_t handle, @@ -710,7 +714,6 @@ int vmw_user_bo_lookup(struct drm_file *filp, *out = gem_to_vmw_bo(gobj); ttm_bo_get(&(*out)->base); - drm_gem_object_put(gobj); return 0; } @@ -791,7 +794,8 @@ int vmw_dumb_create(struct drm_file *file_priv, ret = vmw_gem_object_create_with_handle(dev_priv, file_priv, args->size, &args->handle, &vbo); - + /* drop reference from allocate - handle holds it now */ + drm_gem_object_put(&vbo->base.base); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index a44d53e33cdb..c0686283ffd1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -1160,6 +1160,7 @@ static int vmw_translate_mob_ptr(struct vmw_private *dev_priv, } ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo, true, false); ttm_bo_put(&vmw_bo->base); + drm_gem_object_put(&vmw_bo->base.base); if (unlikely(ret != 0)) return ret; @@ -1214,6 +1215,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv, } ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo, false, false); ttm_bo_put(&vmw_bo->base); + drm_gem_object_put(&vmw_bo->base.base); if (unlikely(ret != 0)) return ret; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c index ce609e7d758f..4d2c28e39f4e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c @@ -146,14 +146,12 @@ int vmw_gem_object_create_with_handle(struct vmw_private *dev_priv, &vmw_sys_placement : &vmw_vram_sys_placement, true, false, &vmw_gem_destroy, p_vbo); - - (*p_vbo)->base.base.funcs = &vmw_gem_object_funcs; if (ret != 0) 
goto out_no_bo; + (*p_vbo)->base.base.funcs = &vmw_gem_object_funcs; + ret = drm_gem_handle_create(filp, &(*p_vbo)->base.base, handle); - /* drop reference from allocate - handle holds it now */ - drm_gem_object_put(&(*p_vbo)->base.base); out_no_bo: return ret; } @@ -180,6 +178,8 @@ int vmw_gem_object_create_ioctl(struct drm_device *dev, void *data, rep->map_handle = drm_vma_node_offset_addr(&vbo->base.base.vma_node); rep->cur_gmr_id = handle; rep->cur_gmr_offset = 0; + /* drop reference from allocate - handle holds it now */ + drm_gem_object_put(&vbo->base.base); out_no_bo: return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 257f090071f1..445d619e1fdc 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1815,8 +1815,10 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, err_out: /* vmw_user_lookup_handle takes one ref so does new_fb */ - if (bo) + if (bo) { vmw_bo_unreference(&bo); + drm_gem_object_put(&bo->base.base); + } if (surface) vmw_surface_unreference(&surface); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_msg_arm64.h b/drivers/gpu/drm/vmwgfx/vmwgfx_msg_arm64.h index 4f40167ad61f..4f40167ad61f 100755..100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_msg_arm64.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_msg_arm64.h diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c index e9f5c89b4ca6..b5b311f2a91a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c @@ -458,6 +458,7 @@ int vmw_overlay_ioctl(struct drm_device *dev, void *data, ret = vmw_overlay_update_stream(dev_priv, buf, arg, true); vmw_bo_unreference(&buf); + drm_gem_object_put(&buf->base.base); out_unlock: mutex_unlock(&overlay->mutex); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c index 108a496b5d18..51e83dfa1cac 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c 
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c @@ -807,6 +807,7 @@ static int vmw_shader_define(struct drm_device *dev, struct drm_file *file_priv, num_output_sig, tfile, shader_handle); out_bad_arg: vmw_bo_unreference(&buffer); + drm_gem_object_put(&buffer->base.base); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index 3bc63ae768f3..dcfb003841b3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -683,7 +683,7 @@ static void vmw_user_surface_base_release(struct ttm_base_object **p_base) container_of(base, struct vmw_user_surface, prime.base); struct vmw_resource *res = &user_srf->srf.res; - if (base->shareable && res && res->backup) + if (res && res->backup) drm_gem_object_put(&res->backup->base.base); *p_base = NULL; @@ -864,7 +864,11 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, goto out_unlock; } vmw_bo_reference(res->backup); - drm_gem_object_get(&res->backup->base.base); + /* + * We don't expose the handle to the userspace and surface + * already holds a gem reference + */ + drm_gem_handle_delete(file_priv, backup_handle); } tmp = vmw_resource_reference(&srf->res); @@ -1568,8 +1572,6 @@ vmw_gb_surface_define_internal(struct drm_device *dev, drm_vma_node_offset_addr(&res->backup->base.base.vma_node); rep->buffer_size = res->backup->base.base.size; rep->buffer_handle = backup_handle; - if (user_srf->prime.base.shareable) - drm_gem_object_get(&res->backup->base.base); } else { rep->buffer_map_handle = 0; rep->buffer_size = 0; diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c index 1fb0f7105fb2..c751d12f5df8 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c @@ -227,6 +227,7 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) cl_data->num_hid_devices = amd_mp2_get_sensor_num(privdata, &cl_data->sensor_idx[0]); if 
(cl_data->num_hid_devices == 0) return -ENODEV; + cl_data->is_any_sensor_enabled = false; INIT_DELAYED_WORK(&cl_data->work, amd_sfh_work); INIT_DELAYED_WORK(&cl_data->work_buffer, amd_sfh_work_buffer); @@ -287,6 +288,7 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) status = amd_sfh_wait_for_response (privdata, cl_data->sensor_idx[i], SENSOR_ENABLED); if (status == SENSOR_ENABLED) { + cl_data->is_any_sensor_enabled = true; cl_data->sensor_sts[i] = SENSOR_ENABLED; rc = amdtp_hid_probe(cl_data->cur_hid_dev, cl_data); if (rc) { @@ -301,19 +303,26 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) cl_data->sensor_sts[i]); goto cleanup; } + } else { + cl_data->sensor_sts[i] = SENSOR_DISABLED; + dev_dbg(dev, "sid 0x%x (%s) status 0x%x\n", + cl_data->sensor_idx[i], + get_sensor_name(cl_data->sensor_idx[i]), + cl_data->sensor_sts[i]); } dev_dbg(dev, "sid 0x%x (%s) status 0x%x\n", cl_data->sensor_idx[i], get_sensor_name(cl_data->sensor_idx[i]), cl_data->sensor_sts[i]); } - if (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0) { + if (!cl_data->is_any_sensor_enabled || + (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0)) { amd_sfh_hid_client_deinit(privdata); for (i = 0; i < cl_data->num_hid_devices; i++) { devm_kfree(dev, cl_data->feature_report[i]); devm_kfree(dev, in_data->input_report[i]); devm_kfree(dev, cl_data->report_descr[i]); } - dev_warn(dev, "Failed to discover, sensors not enabled\n"); + dev_warn(dev, "Failed to discover, sensors not enabled is %d\n", cl_data->is_any_sensor_enabled); return -EOPNOTSUPP; } schedule_delayed_work(&cl_data->work_buffer, msecs_to_jiffies(AMD_SFH_IDLE_LOOP)); diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_hid.h b/drivers/hid/amd-sfh-hid/amd_sfh_hid.h index 3754fb423e3a..528036892c9d 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_hid.h +++ b/drivers/hid/amd-sfh-hid/amd_sfh_hid.h @@ -32,6 +32,7 @@ struct amd_input_data { struct amdtp_cl_data { u8 init_done; u32 cur_hid_dev; + 
bool is_any_sensor_enabled; u32 hid_dev_count; u32 num_hid_devices; struct device_info *hid_devices; diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 3e1803592bd4..5c72aef3d3dd 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1202,6 +1202,7 @@ int hid_open_report(struct hid_device *device) __u8 *end; __u8 *next; int ret; + int i; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_parser_main, @@ -1252,6 +1253,8 @@ int hid_open_report(struct hid_device *device) goto err; } device->collection_size = HID_DEFAULT_NUM_COLLECTIONS; + for (i = 0; i < HID_DEFAULT_NUM_COLLECTIONS; i++) + device->collection[i].parent_idx = -1; ret = -EINVAL; while ((next = fetch_item(start, end, &item)) != NULL) { diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c index e59e9911fc37..4fa45ee77503 100644 --- a/drivers/hid/hid-elecom.c +++ b/drivers/hid/hid-elecom.c @@ -12,6 +12,7 @@ * Copyright (c) 2017 Alex Manoussakis <amanou@gnu.org> * Copyright (c) 2017 Tomasz Kramkowski <tk@the-tk.com> * Copyright (c) 2020 YOSHIOKA Takuma <lo48576@hard-wi.red> + * Copyright (c) 2022 Takahiro Fujii <fujii@xaxxi.net> */ /* @@ -89,7 +90,7 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, case USB_DEVICE_ID_ELECOM_M_DT1URBK: case USB_DEVICE_ID_ELECOM_M_DT1DRBK: case USB_DEVICE_ID_ELECOM_M_HT1URBK: - case USB_DEVICE_ID_ELECOM_M_HT1DRBK: + case USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D: /* * Report descriptor format: * 12: button bit count @@ -99,6 +100,16 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, */ mouse_button_fixup(hdev, rdesc, *rsize, 12, 30, 14, 20, 8); break; + case USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C: + /* + * Report descriptor format: + * 22: button bit count + * 30: padding bit count + * 24: button report size + * 16: button usage maximum + */ + mouse_button_fixup(hdev, rdesc, *rsize, 22, 30, 24, 16, 8); + break; } return rdesc; } @@ -112,7 +123,8 @@ static const 
struct hid_device_id elecom_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C) }, { } }; MODULE_DEVICE_TABLE(hid, elecom_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0f8c11842a3a..9e36b4cd905e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -413,6 +413,8 @@ #define I2C_DEVICE_ID_HP_ENVY_X360_15T_DR100 0x29CF #define I2C_DEVICE_ID_HP_ENVY_X360_EU0009NV 0x2CF9 #define I2C_DEVICE_ID_HP_SPECTRE_X360_15 0x2817 +#define I2C_DEVICE_ID_HP_SPECTRE_X360_13_AW0020NG 0x29DF +#define I2C_DEVICE_ID_ASUS_TP420IA_TOUCHSCREEN 0x2BC8 #define USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN 0x2544 #define USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN 0x2706 #define I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN 0x261A @@ -428,7 +430,8 @@ #define USB_DEVICE_ID_ELECOM_M_DT1URBK 0x00fe #define USB_DEVICE_ID_ELECOM_M_DT1DRBK 0x00ff #define USB_DEVICE_ID_ELECOM_M_HT1URBK 0x010c -#define USB_DEVICE_ID_ELECOM_M_HT1DRBK 0x010d +#define USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D 0x010d +#define USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C 0x011c #define USB_VENDOR_ID_DREAM_CHEEKY 0x1d34 #define USB_DEVICE_ID_DREAM_CHEEKY_WN 0x0004 diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 9b59e436df0a..77c8c49852b5 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -370,6 +370,8 @@ static const struct hid_device_id hid_battery_quirks[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DINOVO_EDGE_KBD), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_ASUS_TP420IA_TOUCHSCREEN), + 
HID_BATTERY_QUIRK_IGNORE }, { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN), @@ -384,6 +386,8 @@ static const struct hid_device_id hid_battery_quirks[] = { HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_15), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_13_AW0020NG), + HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_SURFACE_GO2_TOUCHSCREEN), diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index abf2c95e4d0b..9c1ee8e91e0c 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3978,7 +3978,8 @@ static void hidpp_connect_event(struct hidpp_device *hidpp) } hidpp_initialize_battery(hidpp); - hidpp_initialize_hires_scroll(hidpp); + if (!hid_is_usb(hidpp->hid_dev)) + hidpp_initialize_hires_scroll(hidpp); /* forward current battery state */ if (hidpp->capabilities & HIDPP_CAPABILITY_HIDPP10_BATTERY) { diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index be3ad02573de..5bc91f68b374 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -393,7 +393,8 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C) }, #endif #if IS_ENABLED(CONFIG_HID_ELO) { 
HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0009) }, diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index cbe43e2567a7..64ac5bdee3a6 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1963,7 +1963,7 @@ static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) { - debugfs_remove(debugfs_lookup("hv-balloon", NULL)); + debugfs_lookup_and_remove("hv-balloon", NULL); } #else diff --git a/drivers/i2c/busses/i2c-axxia.c b/drivers/i2c/busses/i2c-axxia.c index bdf3b50de8ad..c1c74ce08407 100644 --- a/drivers/i2c/busses/i2c-axxia.c +++ b/drivers/i2c/busses/i2c-axxia.c @@ -118,7 +118,7 @@ #define SDA_HOLD_TIME 0x90 /** - * axxia_i2c_dev - I2C device context + * struct axxia_i2c_dev - I2C device context * @base: pointer to register struct * @msg: pointer to current message * @msg_r: pointer to current read message (sequence transfer) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index a3240ece55b2..581e02cc979a 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -351,7 +351,8 @@ u32 i2c_dw_scl_hcnt(u32 ic_clk, u32 tSYMBOL, u32 tf, int cond, int offset) * * If your hardware is free from tHD;STA issue, try this one. */ - return DIV_ROUND_CLOSEST(ic_clk * tSYMBOL, MICRO) - 8 + offset; + return DIV_ROUND_CLOSEST_ULL((u64)ic_clk * tSYMBOL, MICRO) - + 8 + offset; else /* * Conditional expression: @@ -367,7 +368,8 @@ u32 i2c_dw_scl_hcnt(u32 ic_clk, u32 tSYMBOL, u32 tf, int cond, int offset) * The reason why we need to take into account "tf" here, * is the same as described in i2c_dw_scl_lcnt(). 
*/ - return DIV_ROUND_CLOSEST(ic_clk * (tSYMBOL + tf), MICRO) - 3 + offset; + return DIV_ROUND_CLOSEST_ULL((u64)ic_clk * (tSYMBOL + tf), MICRO) - + 3 + offset; } u32 i2c_dw_scl_lcnt(u32 ic_clk, u32 tLOW, u32 tf, int offset) @@ -383,7 +385,8 @@ u32 i2c_dw_scl_lcnt(u32 ic_clk, u32 tLOW, u32 tf, int offset) * account the fall time of SCL signal (tf). Default tf value * should be 0.3 us, for safety. */ - return DIV_ROUND_CLOSEST(ic_clk * (tLOW + tf), MICRO) - 1 + offset; + return DIV_ROUND_CLOSEST_ULL((u64)ic_clk * (tLOW + tf), MICRO) - + 1 + offset; } int i2c_dw_set_sda_hold(struct dw_i2c_dev *dev) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index e499f96506c5..782fe1ef3ca1 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -396,6 +396,8 @@ static const struct pci_device_id i2_designware_pci_ids[] = { { PCI_VDEVICE(ATI, 0x73a4), navi_amd }, { PCI_VDEVICE(ATI, 0x73e4), navi_amd }, { PCI_VDEVICE(ATI, 0x73c4), navi_amd }, + { PCI_VDEVICE(ATI, 0x7444), navi_amd }, + { PCI_VDEVICE(ATI, 0x7464), navi_amd }, { 0,} }; MODULE_DEVICE_TABLE(pci, i2_designware_pci_ids); diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index ba043b547393..74182db03a88 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -351,13 +351,11 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) if (dev->flags & ACCESS_NO_IRQ_SUSPEND) { dev_pm_set_driver_flags(&pdev->dev, - DPM_FLAG_SMART_PREPARE | - DPM_FLAG_MAY_SKIP_RESUME); + DPM_FLAG_SMART_PREPARE); } else { dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE | - DPM_FLAG_SMART_SUSPEND | - DPM_FLAG_MAY_SKIP_RESUME); + DPM_FLAG_SMART_SUSPEND); } device_enable_async_suspend(&pdev->dev); @@ -419,21 +417,8 @@ static int dw_i2c_plat_prepare(struct device *dev) */ return !has_acpi_companion(dev); } - -static void 
dw_i2c_plat_complete(struct device *dev) -{ - /* - * The device can only be in runtime suspend at this point if it has not - * been resumed throughout the ending system suspend/resume cycle, so if - * the platform firmware might mess up with it, request the runtime PM - * framework to resume it. - */ - if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) - pm_request_resume(dev); -} #else #define dw_i2c_plat_prepare NULL -#define dw_i2c_plat_complete NULL #endif #ifdef CONFIG_PM @@ -483,7 +468,6 @@ static int __maybe_unused dw_i2c_plat_resume(struct device *dev) static const struct dev_pm_ops dw_i2c_dev_pm_ops = { .prepare = dw_i2c_plat_prepare, - .complete = dw_i2c_plat_complete, SET_LATE_SYSTEM_SLEEP_PM_OPS(dw_i2c_plat_suspend, dw_i2c_plat_resume) SET_RUNTIME_PM_OPS(dw_i2c_plat_runtime_suspend, dw_i2c_plat_runtime_resume, NULL) }; diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index 5af5cffc444e..d113bed79545 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -826,8 +826,8 @@ static int mxs_i2c_probe(struct platform_device *pdev) /* Setup the DMA */ i2c->dmach = dma_request_chan(dev, "rx-tx"); if (IS_ERR(i2c->dmach)) { - dev_err(dev, "Failed to request dma\n"); - return PTR_ERR(i2c->dmach); + return dev_err_probe(dev, PTR_ERR(i2c->dmach), + "Failed to request dma\n"); } platform_set_drvdata(pdev, i2c); diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c index d1658ed76562..b31cf4f18f85 100644 --- a/drivers/i2c/busses/i2c-rk3x.c +++ b/drivers/i2c/busses/i2c-rk3x.c @@ -80,7 +80,7 @@ enum { #define DEFAULT_SCL_RATE (100 * 1000) /* Hz */ /** - * struct i2c_spec_values: + * struct i2c_spec_values - I2C specification values for various modes * @min_hold_start_ns: min hold time (repeated) START condition * @min_low_ns: min LOW period of the SCL clock * @min_high_ns: min HIGH period of the SCL cloc @@ -136,7 +136,7 @@ static const struct i2c_spec_values fast_mode_plus_spec = { }; /** - * 
struct rk3x_i2c_calced_timings: + * struct rk3x_i2c_calced_timings - calculated V1 timings * @div_low: Divider output for low * @div_high: Divider output for high * @tuning: Used to adjust setup/hold data time, @@ -159,7 +159,7 @@ enum rk3x_i2c_state { }; /** - * struct rk3x_i2c_soc_data: + * struct rk3x_i2c_soc_data - SOC-specific data * @grf_offset: offset inside the grf regmap for setting the i2c type * @calc_timings: Callback function for i2c timing information calculated */ @@ -239,7 +239,8 @@ static inline void rk3x_i2c_clean_ipd(struct rk3x_i2c *i2c) } /** - * Generate a START condition, which triggers a REG_INT_START interrupt. + * rk3x_i2c_start - Generate a START condition, which triggers a REG_INT_START interrupt. + * @i2c: target controller data */ static void rk3x_i2c_start(struct rk3x_i2c *i2c) { @@ -258,8 +259,8 @@ static void rk3x_i2c_start(struct rk3x_i2c *i2c) } /** - * Generate a STOP condition, which triggers a REG_INT_STOP interrupt. - * + * rk3x_i2c_stop - Generate a STOP condition, which triggers a REG_INT_STOP interrupt. 
+ * @i2c: target controller data * @error: Error code to return in rk3x_i2c_xfer */ static void rk3x_i2c_stop(struct rk3x_i2c *i2c, int error) @@ -298,7 +299,8 @@ static void rk3x_i2c_stop(struct rk3x_i2c *i2c, int error) } /** - * Setup a read according to i2c->msg + * rk3x_i2c_prepare_read - Setup a read according to i2c->msg + * @i2c: target controller data */ static void rk3x_i2c_prepare_read(struct rk3x_i2c *i2c) { @@ -329,7 +331,8 @@ static void rk3x_i2c_prepare_read(struct rk3x_i2c *i2c) } /** - * Fill the transmit buffer with data from i2c->msg + * rk3x_i2c_fill_transmit_buf - Fill the transmit buffer with data from i2c->msg + * @i2c: target controller data */ static void rk3x_i2c_fill_transmit_buf(struct rk3x_i2c *i2c) { @@ -532,11 +535,10 @@ out: } /** - * Get timing values of I2C specification - * + * rk3x_i2c_get_spec - Get timing values of I2C specification * @speed: Desired SCL frequency * - * Returns: Matched i2c spec values. + * Return: Matched i2c_spec_values. */ static const struct i2c_spec_values *rk3x_i2c_get_spec(unsigned int speed) { @@ -549,13 +551,12 @@ static const struct i2c_spec_values *rk3x_i2c_get_spec(unsigned int speed) } /** - * Calculate divider values for desired SCL frequency - * + * rk3x_i2c_v0_calc_timings - Calculate divider values for desired SCL frequency * @clk_rate: I2C input clock rate * @t: Known I2C timing information * @t_calc: Caculated rk3x private timings that would be written into regs * - * Returns: 0 on success, -EINVAL if the goal SCL rate is too slow. In that case + * Return: %0 on success, -%EINVAL if the goal SCL rate is too slow. In that case * a best-effort divider value is returned in divs. If the target rate is * too high, we silently use the highest possible rate. 
*/ @@ -710,13 +711,12 @@ static int rk3x_i2c_v0_calc_timings(unsigned long clk_rate, } /** - * Calculate timing values for desired SCL frequency - * + * rk3x_i2c_v1_calc_timings - Calculate timing values for desired SCL frequency * @clk_rate: I2C input clock rate * @t: Known I2C timing information * @t_calc: Caculated rk3x private timings that would be written into regs * - * Returns: 0 on success, -EINVAL if the goal SCL rate is too slow. In that case + * Return: %0 on success, -%EINVAL if the goal SCL rate is too slow. In that case * a best-effort divider value is returned in divs. If the target rate is * too high, we silently use the highest possible rate. * The following formulas are v1's method to calculate timings. @@ -960,14 +960,14 @@ static int rk3x_i2c_clk_notifier_cb(struct notifier_block *nb, unsigned long } /** - * Setup I2C registers for an I2C operation specified by msgs, num. - * - * Must be called with i2c->lock held. - * + * rk3x_i2c_setup - Setup I2C registers for an I2C operation specified by msgs, num. + * @i2c: target controller data * @msgs: I2C msgs to process * @num: Number of msgs * - * returns: Number of I2C msgs processed or negative in case of error + * Must be called with i2c->lock held. 
+ * + * Return: Number of I2C msgs processed or negative in case of error */ static int rk3x_i2c_setup(struct rk3x_i2c *i2c, struct i2c_msg *msgs, int num) { diff --git a/drivers/iio/accel/hid-sensor-accel-3d.c b/drivers/iio/accel/hid-sensor-accel-3d.c index a2def6f9380a..5eac7ea19993 100644 --- a/drivers/iio/accel/hid-sensor-accel-3d.c +++ b/drivers/iio/accel/hid-sensor-accel-3d.c @@ -280,6 +280,7 @@ static int accel_3d_capture_sample(struct hid_sensor_hub_device *hsdev, hid_sensor_convert_timestamp( &accel_state->common_attributes, *(int64_t *)raw_data); + ret = 0; break; default: break; diff --git a/drivers/iio/adc/berlin2-adc.c b/drivers/iio/adc/berlin2-adc.c index 3d2e8b4db61a..a4e7c7eff5ac 100644 --- a/drivers/iio/adc/berlin2-adc.c +++ b/drivers/iio/adc/berlin2-adc.c @@ -298,8 +298,10 @@ static int berlin2_adc_probe(struct platform_device *pdev) int ret; indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*priv)); - if (!indio_dev) + if (!indio_dev) { + of_node_put(parent_np); return -ENOMEM; + } priv = iio_priv(indio_dev); diff --git a/drivers/iio/adc/imx8qxp-adc.c b/drivers/iio/adc/imx8qxp-adc.c index 36777b827165..f5a0fc9e64c5 100644 --- a/drivers/iio/adc/imx8qxp-adc.c +++ b/drivers/iio/adc/imx8qxp-adc.c @@ -86,6 +86,8 @@ #define IMX8QXP_ADC_TIMEOUT msecs_to_jiffies(100) +#define IMX8QXP_ADC_MAX_FIFO_SIZE 16 + struct imx8qxp_adc { struct device *dev; void __iomem *regs; @@ -95,6 +97,7 @@ struct imx8qxp_adc { /* Serialise ADC channel reads */ struct mutex lock; struct completion completion; + u32 fifo[IMX8QXP_ADC_MAX_FIFO_SIZE]; }; #define IMX8QXP_ADC_CHAN(_idx) { \ @@ -238,8 +241,7 @@ static int imx8qxp_adc_read_raw(struct iio_dev *indio_dev, return ret; } - *val = FIELD_GET(IMX8QXP_ADC_RESFIFO_VAL_MASK, - readl(adc->regs + IMX8QXP_ADR_ADC_RESFIFO)); + *val = adc->fifo[0]; mutex_unlock(&adc->lock); return IIO_VAL_INT; @@ -265,10 +267,15 @@ static irqreturn_t imx8qxp_adc_isr(int irq, void *dev_id) { struct imx8qxp_adc *adc = dev_id; u32 fifo_count; + int 
i; fifo_count = FIELD_GET(IMX8QXP_ADC_FCTRL_FCOUNT_MASK, readl(adc->regs + IMX8QXP_ADR_ADC_FCTRL)); + for (i = 0; i < fifo_count; i++) + adc->fifo[i] = FIELD_GET(IMX8QXP_ADC_RESFIFO_VAL_MASK, + readl_relaxed(adc->regs + IMX8QXP_ADR_ADC_RESFIFO)); + if (fifo_count) complete(&adc->completion); diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c index 6d21ea84fa82..a428bdb567d5 100644 --- a/drivers/iio/adc/stm32-dfsdm-adc.c +++ b/drivers/iio/adc/stm32-dfsdm-adc.c @@ -1520,6 +1520,7 @@ static const struct of_device_id stm32_dfsdm_adc_match[] = { }, {} }; +MODULE_DEVICE_TABLE(of, stm32_dfsdm_adc_match); static int stm32_dfsdm_adc_probe(struct platform_device *pdev) { diff --git a/drivers/iio/adc/twl6030-gpadc.c b/drivers/iio/adc/twl6030-gpadc.c index f53e8558b560..32873fb5f367 100644 --- a/drivers/iio/adc/twl6030-gpadc.c +++ b/drivers/iio/adc/twl6030-gpadc.c @@ -57,6 +57,18 @@ #define TWL6030_GPADCS BIT(1) #define TWL6030_GPADCR BIT(0) +#define USB_VBUS_CTRL_SET 0x04 +#define USB_ID_CTRL_SET 0x06 + +#define TWL6030_MISC1 0xE4 +#define VBUS_MEAS 0x01 +#define ID_MEAS 0x01 + +#define VAC_MEAS 0x04 +#define VBAT_MEAS 0x02 +#define BB_MEAS 0x01 + + /** * struct twl6030_chnl_calib - channel calibration * @gain: slope coefficient for ideal curve @@ -927,6 +939,26 @@ static int twl6030_gpadc_probe(struct platform_device *pdev) return ret; } + ret = twl_i2c_write_u8(TWL_MODULE_USB, VBUS_MEAS, USB_VBUS_CTRL_SET); + if (ret < 0) { + dev_err(dev, "failed to wire up inputs\n"); + return ret; + } + + ret = twl_i2c_write_u8(TWL_MODULE_USB, ID_MEAS, USB_ID_CTRL_SET); + if (ret < 0) { + dev_err(dev, "failed to wire up inputs\n"); + return ret; + } + + ret = twl_i2c_write_u8(TWL6030_MODULE_ID0, + VBAT_MEAS | BB_MEAS | VAC_MEAS, + TWL6030_MISC1); + if (ret < 0) { + dev_err(dev, "failed to wire up inputs\n"); + return ret; + } + indio_dev->name = DRIVER_NAME; indio_dev->info = &twl6030_gpadc_iio_info; indio_dev->modes = INDIO_DIRECT_MODE; diff --git 
a/drivers/iio/adc/xilinx-ams.c b/drivers/iio/adc/xilinx-ams.c index 5b4bdf3a26bb..a507d2e17079 100644 --- a/drivers/iio/adc/xilinx-ams.c +++ b/drivers/iio/adc/xilinx-ams.c @@ -1329,7 +1329,7 @@ static int ams_parse_firmware(struct iio_dev *indio_dev) dev_channels = devm_krealloc(dev, ams_channels, dev_size, GFP_KERNEL); if (!dev_channels) - ret = -ENOMEM; + return -ENOMEM; indio_dev->channels = dev_channels; indio_dev->num_channels = num_channels; diff --git a/drivers/iio/gyro/hid-sensor-gyro-3d.c b/drivers/iio/gyro/hid-sensor-gyro-3d.c index 8f0ad022c7f1..698c50da1f10 100644 --- a/drivers/iio/gyro/hid-sensor-gyro-3d.c +++ b/drivers/iio/gyro/hid-sensor-gyro-3d.c @@ -231,6 +231,7 @@ static int gyro_3d_capture_sample(struct hid_sensor_hub_device *hsdev, gyro_state->timestamp = hid_sensor_convert_timestamp(&gyro_state->common_attributes, *(s64 *)raw_data); + ret = 0; break; default: break; diff --git a/drivers/iio/imu/fxos8700_core.c b/drivers/iio/imu/fxos8700_core.c index 423cfe526f2a..6d189c4b9ff9 100644 --- a/drivers/iio/imu/fxos8700_core.c +++ b/drivers/iio/imu/fxos8700_core.c @@ -10,6 +10,7 @@ #include <linux/regmap.h> #include <linux/acpi.h> #include <linux/bitops.h> +#include <linux/bitfield.h> #include <linux/iio/iio.h> #include <linux/iio/sysfs.h> @@ -144,9 +145,8 @@ #define FXOS8700_NVM_DATA_BNK0 0xa7 /* Bit definitions for FXOS8700_CTRL_REG1 */ -#define FXOS8700_CTRL_ODR_MSK 0x38 #define FXOS8700_CTRL_ODR_MAX 0x00 -#define FXOS8700_CTRL_ODR_MIN GENMASK(4, 3) +#define FXOS8700_CTRL_ODR_MSK GENMASK(5, 3) /* Bit definitions for FXOS8700_M_CTRL_REG1 */ #define FXOS8700_HMS_MASK GENMASK(1, 0) @@ -320,7 +320,7 @@ static enum fxos8700_sensor fxos8700_to_sensor(enum iio_chan_type iio_type) switch (iio_type) { case IIO_ACCEL: return FXOS8700_ACCEL; - case IIO_ANGL_VEL: + case IIO_MAGN: return FXOS8700_MAGN; default: return -EINVAL; @@ -345,15 +345,35 @@ static int fxos8700_set_active_mode(struct fxos8700_data *data, static int fxos8700_set_scale(struct fxos8700_data 
*data, enum fxos8700_sensor t, int uscale) { - int i; + int i, ret, val; + bool active_mode; static const int scale_num = ARRAY_SIZE(fxos8700_accel_scale); struct device *dev = regmap_get_device(data->regmap); if (t == FXOS8700_MAGN) { - dev_err(dev, "Magnetometer scale is locked at 1200uT\n"); + dev_err(dev, "Magnetometer scale is locked at 0.001Gs\n"); return -EINVAL; } + /* + * When device is in active mode, it failed to set an ACCEL + * full-scale range(2g/4g/8g) in FXOS8700_XYZ_DATA_CFG. + * This is not align with the datasheet, but it is a fxos8700 + * chip behavier. Set the device in standby mode before setting + * an ACCEL full-scale range. + */ + ret = regmap_read(data->regmap, FXOS8700_CTRL_REG1, &val); + if (ret) + return ret; + + active_mode = val & FXOS8700_ACTIVE; + if (active_mode) { + ret = regmap_write(data->regmap, FXOS8700_CTRL_REG1, + val & ~FXOS8700_ACTIVE); + if (ret) + return ret; + } + for (i = 0; i < scale_num; i++) if (fxos8700_accel_scale[i].uscale == uscale) break; @@ -361,8 +381,12 @@ static int fxos8700_set_scale(struct fxos8700_data *data, if (i == scale_num) return -EINVAL; - return regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, + ret = regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, fxos8700_accel_scale[i].bits); + if (ret) + return ret; + return regmap_write(data->regmap, FXOS8700_CTRL_REG1, + active_mode); } static int fxos8700_get_scale(struct fxos8700_data *data, @@ -372,7 +396,7 @@ static int fxos8700_get_scale(struct fxos8700_data *data, static const int scale_num = ARRAY_SIZE(fxos8700_accel_scale); if (t == FXOS8700_MAGN) { - *uscale = 1200; /* Magnetometer is locked at 1200uT */ + *uscale = 1000; /* Magnetometer is locked at 0.001Gs */ return 0; } @@ -394,22 +418,61 @@ static int fxos8700_get_data(struct fxos8700_data *data, int chan_type, int axis, int *val) { u8 base, reg; + s16 tmp; int ret; - enum fxos8700_sensor type = fxos8700_to_sensor(chan_type); - base = type ? 
FXOS8700_OUT_X_MSB : FXOS8700_M_OUT_X_MSB; + /* + * Different register base addresses varies with channel types. + * This bug hasn't been noticed before because using an enum is + * really hard to read. Use an a switch statement to take over that. + */ + switch (chan_type) { + case IIO_ACCEL: + base = FXOS8700_OUT_X_MSB; + break; + case IIO_MAGN: + base = FXOS8700_M_OUT_X_MSB; + break; + default: + return -EINVAL; + } /* Block read 6 bytes of device output registers to avoid data loss */ ret = regmap_bulk_read(data->regmap, base, data->buf, - FXOS8700_DATA_BUF_SIZE); + sizeof(data->buf)); if (ret) return ret; /* Convert axis to buffer index */ reg = axis - IIO_MOD_X; + /* + * Convert to native endianness. The accel data and magn data + * are signed, so a forced type conversion is needed. + */ + tmp = be16_to_cpu(data->buf[reg]); + + /* + * ACCEL output data registers contain the X-axis, Y-axis, and Z-axis + * 14-bit left-justified sample data and MAGN output data registers + * contain the X-axis, Y-axis, and Z-axis 16-bit sample data. Apply + * a signed 2 bits right shift to the readback raw data from ACCEL + * output data register and keep that from MAGN sensor as the origin. + * Value should be extended to 32 bit. 
+ */ + switch (chan_type) { + case IIO_ACCEL: + tmp = tmp >> 2; + break; + case IIO_MAGN: + /* Nothing to do */ + break; + default: + return -EINVAL; + } + /* Convert to native endianness */ - *val = sign_extend32(be16_to_cpu(data->buf[reg]), 15); + *val = sign_extend32(tmp, 15); return 0; } @@ -445,10 +508,9 @@ static int fxos8700_set_odr(struct fxos8700_data *data, enum fxos8700_sensor t, if (i >= odr_num) return -EINVAL; - return regmap_update_bits(data->regmap, - FXOS8700_CTRL_REG1, - FXOS8700_CTRL_ODR_MSK + FXOS8700_ACTIVE, - fxos8700_odr[i].bits << 3 | active_mode); + val &= ~FXOS8700_CTRL_ODR_MSK; + val |= FIELD_PREP(FXOS8700_CTRL_ODR_MSK, fxos8700_odr[i].bits) | FXOS8700_ACTIVE; + return regmap_write(data->regmap, FXOS8700_CTRL_REG1, val); } static int fxos8700_get_odr(struct fxos8700_data *data, enum fxos8700_sensor t, @@ -461,7 +523,7 @@ static int fxos8700_get_odr(struct fxos8700_data *data, enum fxos8700_sensor t, if (ret) return ret; - val &= FXOS8700_CTRL_ODR_MSK; + val = FIELD_GET(FXOS8700_CTRL_ODR_MSK, val); for (i = 0; i < odr_num; i++) if (val == fxos8700_odr[i].bits) @@ -526,7 +588,7 @@ static IIO_CONST_ATTR(in_accel_sampling_frequency_available, static IIO_CONST_ATTR(in_magn_sampling_frequency_available, "1.5625 6.25 12.5 50 100 200 400 800"); static IIO_CONST_ATTR(in_accel_scale_available, "0.000244 0.000488 0.000976"); -static IIO_CONST_ATTR(in_magn_scale_available, "0.000001200"); +static IIO_CONST_ATTR(in_magn_scale_available, "0.001000"); static struct attribute *fxos8700_attrs[] = { &iio_const_attr_in_accel_sampling_frequency_available.dev_attr.attr, @@ -592,14 +654,19 @@ static int fxos8700_chip_init(struct fxos8700_data *data, bool use_spi) if (ret) return ret; - /* Max ODR (800Hz individual or 400Hz hybrid), active mode */ - ret = regmap_write(data->regmap, FXOS8700_CTRL_REG1, - FXOS8700_CTRL_ODR_MAX | FXOS8700_ACTIVE); + /* + * Set max full-scale range (+/-8G) for ACCEL sensor in chip + * initialization then activate the device. 
+ */ + ret = regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, MODE_8G); if (ret) return ret; - /* Set for max full-scale range (+/-8G) */ - return regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, MODE_8G); + /* Max ODR (800Hz individual or 400Hz hybrid), active mode */ + return regmap_update_bits(data->regmap, FXOS8700_CTRL_REG1, + FXOS8700_CTRL_ODR_MSK | FXOS8700_ACTIVE, + FIELD_PREP(FXOS8700_CTRL_ODR_MSK, FXOS8700_CTRL_ODR_MAX) | + FXOS8700_ACTIVE); } static void fxos8700_chip_uninit(void *data) diff --git a/drivers/iio/imu/st_lsm6dsx/Kconfig b/drivers/iio/imu/st_lsm6dsx/Kconfig index f6660847fb58..8c16cdacf2f2 100644 --- a/drivers/iio/imu/st_lsm6dsx/Kconfig +++ b/drivers/iio/imu/st_lsm6dsx/Kconfig @@ -4,6 +4,7 @@ config IIO_ST_LSM6DSX tristate "ST_LSM6DSx driver for STM 6-axis IMU MEMS sensors" depends on (I2C || SPI || I3C) select IIO_BUFFER + select IIO_TRIGGERED_BUFFER select IIO_KFIFO_BUF select IIO_ST_LSM6DSX_I2C if (I2C) select IIO_ST_LSM6DSX_SPI if (SPI_MASTER) diff --git a/drivers/iio/light/cm32181.c b/drivers/iio/light/cm32181.c index 001055d09750..b1674a5bfa36 100644 --- a/drivers/iio/light/cm32181.c +++ b/drivers/iio/light/cm32181.c @@ -440,6 +440,8 @@ static int cm32181_probe(struct i2c_client *client) if (!indio_dev) return -ENOMEM; + i2c_set_clientdata(client, indio_dev); + /* * Some ACPI systems list 2 I2C resources for the CM3218 sensor, the * SMBus Alert Response Address (ARA, 0x0c) and the actual I2C address. 
@@ -460,8 +462,6 @@ static int cm32181_probe(struct i2c_client *client) return PTR_ERR(client); } - i2c_set_clientdata(client, indio_dev); - cm32181 = iio_priv(indio_dev); cm32181->client = client; cm32181->dev = dev; @@ -490,7 +490,8 @@ static int cm32181_probe(struct i2c_client *client) static int cm32181_suspend(struct device *dev) { - struct i2c_client *client = to_i2c_client(dev); + struct cm32181_chip *cm32181 = iio_priv(dev_get_drvdata(dev)); + struct i2c_client *client = cm32181->client; return i2c_smbus_write_word_data(client, CM32181_REG_ADDR_CMD, CM32181_CMD_ALS_DISABLE); @@ -498,8 +499,8 @@ static int cm32181_suspend(struct device *dev) static int cm32181_resume(struct device *dev) { - struct i2c_client *client = to_i2c_client(dev); struct cm32181_chip *cm32181 = iio_priv(dev_get_drvdata(dev)); + struct i2c_client *client = cm32181->client; return i2c_smbus_write_word_data(client, CM32181_REG_ADDR_CMD, cm32181->conf_regs[CM32181_REG_ADDR_CMD]); diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 43b26bc12288..39357dc2d229 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -26,8 +26,8 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) if (umem_dmabuf->sgt) goto wait_fence; - sgt = dma_buf_map_attachment_unlocked(umem_dmabuf->attach, - DMA_BIDIRECTIONAL); + sgt = dma_buf_map_attachment(umem_dmabuf->attach, + DMA_BIDIRECTIONAL); if (IS_ERR(sgt)) return PTR_ERR(sgt); @@ -103,8 +103,8 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) umem_dmabuf->last_sg_trim = 0; } - dma_buf_unmap_attachment_unlocked(umem_dmabuf->attach, umem_dmabuf->sgt, - DMA_BIDIRECTIONAL); + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, + DMA_BIDIRECTIONAL); umem_dmabuf->sgt = NULL; } diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc16..7c5d487ec916 100644 --- 
a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1318,12 +1318,15 @@ static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt))) - return -EFAULT; + ret = -EFAULT; addr = arg + offsetof(struct hfi1_tid_info, length); - if (copy_to_user((void __user *)addr, &tinfo.length, + if (!ret && copy_to_user((void __user *)addr, &tinfo.length, sizeof(tinfo.length))) ret = -EFAULT; + + if (ret) + hfi1_user_exp_rcv_invalid(fd, &tinfo); } return ret; diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index b02f2f0809c8..350884d5f089 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -160,16 +160,11 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd, static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) { int pinned; - unsigned int npages; + unsigned int npages = tidbuf->npages; unsigned long vaddr = tidbuf->vaddr; struct page **pages = NULL; struct hfi1_devdata *dd = fd->uctxt->dd; - /* Get the number of pages the user buffer spans */ - npages = num_user_pages(vaddr, tidbuf->length); - if (!npages) - return -EINVAL; - if (npages > fd->uctxt->expected_count) { dd_dev_err(dd, "Expected buffer too big\n"); return -EINVAL; @@ -196,7 +191,6 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) return pinned; } tidbuf->pages = pages; - tidbuf->npages = npages; fd->tid_n_pinned += pinned; return pinned; } @@ -274,6 +268,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, mutex_init(&tidbuf->cover_mutex); tidbuf->vaddr = tinfo->vaddr; tidbuf->length = tinfo->length; + tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length); tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), GFP_KERNEL); if (!tidbuf->psets) { diff 
--git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 7b086fe63a24..195aa9ea18b6 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1722,6 +1722,9 @@ static int irdma_add_mqh_4(struct irdma_device *iwdev, continue; idev = in_dev_get(ip_dev); + if (!idev) + continue; + in_dev_for_each_ifa_rtnl(ifa, idev) { ibdev_dbg(&iwdev->ibdev, "CM: Allocating child CM Listener forIP=%pI4, vlan_id=%d, MAC=%pM\n", diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index ea15ec77e321..54b61930a7fd 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -289,7 +289,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, /* IB ports start with 1, MANA Ethernet ports start with 0 */ port = ucmd.port; - if (ucmd.port > mc->num_ports) + if (port < 1 || port > mc->num_ports) return -EINVAL; if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index c301b3be9f30..a2857accc427 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -276,8 +276,8 @@ iter_chunk: size = pa_end - pa_start + PAGE_SIZE; usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", va_start, &pa_start, size, flags); - err = iommu_map(pd->domain, va_start, pa_start, - size, flags); + err = iommu_map_atomic(pd->domain, va_start, + pa_start, size, flags); if (err) { usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", va_start, &pa_start, size, err); @@ -293,8 +293,8 @@ iter_chunk: size = pa - pa_start + PAGE_SIZE; usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", va_start, &pa_start, size, flags); - err = iommu_map(pd->domain, va_start, pa_start, - size, flags); + err = iommu_map_atomic(pd->domain, va_start, + pa_start, size, flags); if (err) { usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", 
va_start, &pa_start, size, err); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ac25fc80fb33..f10d4bcf87d2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2200,6 +2200,14 @@ int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; rn->hca = hca; + + rc = netif_set_real_num_tx_queues(dev, 1); + if (rc) + goto out; + + rc = netif_set_real_num_rx_queues(dev, 1); + if (rc) + goto out; } priv->rn_ops = dev->netdev_ops; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index c76ba29da1e2..5adba0f754b6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -312,9 +312,8 @@ void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path) if (srv_path->kobj.state_in_sysfs) { sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group); - kobject_del(&srv_path->kobj); kobject_put(&srv_path->kobj); + rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } - rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index b0f776448a1c..fa021af8506e 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -192,7 +192,6 @@ static const char * const smbus_pnp_ids[] = { "SYN3221", /* HP 15-ay000 */ "SYN323d", /* HP Spectre X360 13-w013dx */ "SYN3257", /* HP Envy 13-ad105ng */ - "SYN3286", /* HP Laptop 15-da3001TU */ NULL }; diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 46f8a694291e..efc61736099b 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1240,6 +1240,13 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { }, { .matches = { + 
DMI_MATCH(DMI_BOARD_NAME, "PCX0DX"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, + { + .matches = { DMI_MATCH(DMI_BOARD_NAME, "X170SM"), }, .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h index 97413586195b..f96034e0ba4f 100644 --- a/drivers/md/bcache/bcache_ondisk.h +++ b/drivers/md/bcache/bcache_ondisk.h @@ -106,7 +106,8 @@ static inline unsigned long bkey_bytes(const struct bkey *k) return bkey_u64s(k) * sizeof(__u64); } -#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) +#define bkey_copy(_dest, _src) unsafe_memcpy(_dest, _src, bkey_bytes(_src), \ + /* bkey is always padded */) static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index e5da469a4235..c182c21de2e8 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -149,7 +149,8 @@ add: bytes, GFP_KERNEL); if (!i) return -ENOMEM; - memcpy(&i->j, j, bytes); + unsafe_memcpy(&i->j, j, bytes, + /* "bytes" was calculated by set_bytes() above */); /* Add to the location after 'where' points to */ list_add(&i->list, where); ret = 1; diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index fc3758a5bc1c..53e495223ea0 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -2149,8 +2149,6 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type) if (ret) return ret; - q->streaming = 1; - /* * Tell driver to start streaming provided sufficient buffers * are available. 
@@ -2161,12 +2159,13 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type) goto unprepare; } + q->streaming = 1; + dprintk(q, 3, "successful\n"); return 0; unprepare: call_void_qop(q, unprepare_streaming, q); - q->streaming = 0; return ret; } EXPORT_SYMBOL_GPL(vb2_core_streamon); diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index 3d3b6dc24ca6..002ea6588edf 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -150,8 +150,8 @@ static int user_to_new(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl) * then return an error. */ if (strlen(ctrl->p_new.p_char) == ctrl->maximum && last) - ctrl->is_new = 1; return -ERANGE; + ctrl->is_new = 1; } return ret; default: diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index babf21a0adeb..f191a2a76f3b 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -294,6 +294,12 @@ static void sdio_release_func(struct device *dev) if (!(func->card->quirks & MMC_QUIRK_NONSTD_SDIO)) sdio_free_func_cis(func); + /* + * We have now removed the link to the tuples in the + * card structure, so remove the reference. + */ + put_device(&func->card->dev); + kfree(func->info); kfree(func->tmpbuf); kfree(func); @@ -324,6 +330,12 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card) device_initialize(&func->dev); + /* + * We may link to tuples in the card structure, + * we need make sure we have a reference to it. 
+ */ + get_device(&func->card->dev); + func->dev.parent = &card->dev; func->dev.bus = &sdio_bus_type; func->dev.release = sdio_release_func; @@ -377,10 +389,9 @@ int sdio_add_func(struct sdio_func *func) */ void sdio_remove_func(struct sdio_func *func) { - if (!sdio_func_present(func)) - return; + if (sdio_func_present(func)) + device_del(&func->dev); - device_del(&func->dev); of_node_put(func->dev.of_node); put_device(&func->dev); } diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index a705ba6eff5b..afaa6cab1adc 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -404,12 +404,6 @@ int sdio_read_func_cis(struct sdio_func *func) return ret; /* - * Since we've linked to tuples in the card structure, - * we must make sure we have a reference to it. - */ - get_device(&func->card->dev); - - /* * Vendor/device id is optional for function CIS, so * copy it from the card structure as needed. */ @@ -434,11 +428,5 @@ void sdio_free_func_cis(struct sdio_func *func) } func->tuples = NULL; - - /* - * We have now removed the link to the tuples in the - * card structure, so remove the reference. - */ - put_device(&func->card->dev); } diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index dc2db9c185ea..eda1e2ddcaca 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -1053,6 +1053,16 @@ static int jz4740_mmc_probe(struct platform_device* pdev) mmc->ops = &jz4740_mmc_ops; if (!mmc->f_max) mmc->f_max = JZ_MMC_CLK_RATE; + + /* + * There seems to be a problem with this driver on the JZ4760 and + * JZ4760B SoCs. There, when using the maximum rate supported (50 MHz), + * the communication fails with many SD cards. + * Until this bug is sorted out, limit the maximum rate to 24 MHz. 
+ */ + if (host->version == JZ_MMC_JZ4760 && mmc->f_max > JZ_MMC_CLK_RATE) + mmc->f_max = JZ_MMC_CLK_RATE; + mmc->f_min = mmc->f_max / 128; mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 6e5ea0213b47..5c94ad4661ce 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -435,7 +435,8 @@ static int meson_mmc_clk_init(struct meson_host *host) clk_reg |= FIELD_PREP(CLK_CORE_PHASE_MASK, CLK_PHASE_180); clk_reg |= FIELD_PREP(CLK_TX_PHASE_MASK, CLK_PHASE_0); clk_reg |= FIELD_PREP(CLK_RX_PHASE_MASK, CLK_PHASE_0); - clk_reg |= CLK_IRQ_SDIO_SLEEP(host); + if (host->mmc->caps & MMC_CAP_SDIO_IRQ) + clk_reg |= CLK_IRQ_SDIO_SLEEP(host); writel(clk_reg, host->regs + SD_EMMC_CLOCK); /* get the mux parents */ @@ -948,16 +949,18 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id) { struct meson_host *host = dev_id; struct mmc_command *cmd; - u32 status, raw_status; + u32 status, raw_status, irq_mask = IRQ_EN_MASK; irqreturn_t ret = IRQ_NONE; + if (host->mmc->caps & MMC_CAP_SDIO_IRQ) + irq_mask |= IRQ_SDIO; raw_status = readl(host->regs + SD_EMMC_STATUS); - status = raw_status & (IRQ_EN_MASK | IRQ_SDIO); + status = raw_status & irq_mask; if (!status) { dev_dbg(host->dev, - "Unexpected IRQ! irq_en 0x%08lx - status 0x%08x\n", - IRQ_EN_MASK | IRQ_SDIO, raw_status); + "Unexpected IRQ! 
irq_en 0x%08x - status 0x%08x\n", + irq_mask, raw_status); return IRQ_NONE; } @@ -1204,6 +1207,11 @@ static int meson_mmc_probe(struct platform_device *pdev) goto free_host; } + mmc->caps |= MMC_CAP_CMD23; + + if (mmc->caps & MMC_CAP_SDIO_IRQ) + mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; + host->data = (struct meson_mmc_data *) of_device_get_match_data(&pdev->dev); if (!host->data) { @@ -1277,11 +1285,6 @@ static int meson_mmc_probe(struct platform_device *pdev) spin_lock_init(&host->lock); - mmc->caps |= MMC_CAP_CMD23; - - if (mmc->caps & MMC_CAP_SDIO_IRQ) - mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; - if (host->dram_access_quirk) { /* Limit segments to 1 due to low available sram memory */ mmc->max_segs = 1; diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 106dd204b1a7..cc333ad67cac 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1437,7 +1437,7 @@ static int mmc_spi_probe(struct spi_device *spi) status = mmc_add_host(mmc); if (status != 0) - goto fail_add_host; + goto fail_glue_init; /* * Index 0 is card detect @@ -1445,7 +1445,7 @@ static int mmc_spi_probe(struct spi_device *spi) */ status = mmc_gpiod_request_cd(mmc, NULL, 0, false, 1000); if (status == -EPROBE_DEFER) - goto fail_add_host; + goto fail_gpiod_request; if (!status) { /* * The platform has a CD GPIO signal that may support @@ -1460,7 +1460,7 @@ static int mmc_spi_probe(struct spi_device *spi) /* Index 1 is write protect/read only */ status = mmc_gpiod_request_ro(mmc, NULL, 1, 0); if (status == -EPROBE_DEFER) - goto fail_add_host; + goto fail_gpiod_request; if (!status) has_ro = true; @@ -1474,7 +1474,7 @@ static int mmc_spi_probe(struct spi_device *spi) ? 
", cd polling" : ""); return 0; -fail_add_host: +fail_gpiod_request: mmc_remove_host(mmc); fail_glue_init: mmc_spi_dma_free(host); diff --git a/drivers/net/bonding/bond_debugfs.c b/drivers/net/bonding/bond_debugfs.c index 4f9b4a18c74c..594094526648 100644 --- a/drivers/net/bonding/bond_debugfs.c +++ b/drivers/net/bonding/bond_debugfs.c @@ -76,7 +76,7 @@ void bond_debug_reregister(struct bonding *bond) d = debugfs_rename(bonding_debug_root, bond->debug_dir, bonding_debug_root, bond->dev->name); - if (d) { + if (!IS_ERR(d)) { bond->debug_dir = d; } else { netdev_warn(bond->dev, "failed to reregister, so just unregister old one\n"); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c index 3585f02575df..57eeb066a945 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c @@ -48,6 +48,7 @@ mcp251xfd_ring_set_ringparam(struct net_device *ndev, priv->rx_obj_num = layout.cur_rx; priv->rx_obj_num_coalesce_irq = layout.rx_coalesce; priv->tx->obj_num = layout.cur_tx; + priv->tx_obj_num_coalesce_irq = layout.tx_coalesce; return 0; } diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index c26755f662c1..f6f3b43dfb06 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -35,12 +35,13 @@ config NET_DSA_LANTIQ_GSWIP the xrx200 / VR9 SoC. config NET_DSA_MT7530 - tristate "MediaTek MT753x and MT7621 Ethernet switch support" + tristate "MediaTek MT7530 and MT7531 Ethernet switch support" select NET_DSA_TAG_MTK select MEDIATEK_GE_PHY help - This enables support for the MediaTek MT7530, MT7531, and MT7621 - Ethernet switch chips. + This enables support for the MediaTek MT7530 and MT7531 Ethernet + switch chips. Multi-chip module MT7530 in MT7621AT, MT7621DAT, + MT7621ST and MT7623AI SoCs is supported. 
config NET_DSA_MV88E6060 tristate "Marvell 88E6060 ethernet switch chip support" diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c index c1a633ca1e6d..e315f669ec06 100644 --- a/drivers/net/dsa/microchip/ksz9477_i2c.c +++ b/drivers/net/dsa/microchip/ksz9477_i2c.c @@ -104,7 +104,7 @@ static const struct of_device_id ksz9477_dt_ids[] = { }, { .compatible = "microchip,ksz8563", - .data = &ksz_switch_chips[KSZ9893] + .data = &ksz_switch_chips[KSZ8563] }, { .compatible = "microchip,ksz9567", diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 908fa89444c9..338f238f2043 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1309,14 +1309,26 @@ mt7530_port_set_vlan_aware(struct dsa_switch *ds, int port) if (!priv->ports[port].pvid) mt7530_rmw(priv, MT7530_PVC_P(port), ACC_FRM_MASK, MT7530_VLAN_ACC_TAGGED); - } - /* Set the port as a user port which is to be able to recognize VID - * from incoming packets before fetching entry within the VLAN table. - */ - mt7530_rmw(priv, MT7530_PVC_P(port), VLAN_ATTR_MASK | PVC_EG_TAG_MASK, - VLAN_ATTR(MT7530_VLAN_USER) | - PVC_EG_TAG(MT7530_VLAN_EG_DISABLED)); + /* Set the port as a user port which is to be able to recognize + * VID from incoming packets before fetching entry within the + * VLAN table. + */ + mt7530_rmw(priv, MT7530_PVC_P(port), + VLAN_ATTR_MASK | PVC_EG_TAG_MASK, + VLAN_ATTR(MT7530_VLAN_USER) | + PVC_EG_TAG(MT7530_VLAN_EG_DISABLED)); + } else { + /* Also set CPU ports to the "user" VLAN port attribute, to + * allow VLAN classification, but keep the EG_TAG attribute as + * "consistent" (i.o.w. don't change its value) for packets + * received by the switch from the CPU, so that tagged packets + * are forwarded to user ports as tagged, and untagged as + * untagged. 
+ */ + mt7530_rmw(priv, MT7530_PVC_P(port), VLAN_ATTR_MASK, + VLAN_ATTR(MT7530_VLAN_USER)); + } } static void diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c index 0805f249fff2..c26b8597945b 100644 --- a/drivers/net/ethernet/adi/adin1110.c +++ b/drivers/net/ethernet/adi/adin1110.c @@ -356,7 +356,7 @@ static int adin1110_read_fifo(struct adin1110_port_priv *port_priv) if ((port_priv->flags & IFF_ALLMULTI && rxb->pkt_type == PACKET_MULTICAST) || (port_priv->flags & IFF_BROADCAST && rxb->pkt_type == PACKET_BROADCAST)) - rxb->offload_fwd_mark = 1; + rxb->offload_fwd_mark = port_priv->priv->forwarding; netif_rx(rxb); diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 02bd3cf9a260..6e4f36aaf5db 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -240,12 +240,12 @@ static int bgmac_probe(struct bcma_device *core) bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL1; bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_PHY; - if (ci->pkg == BCMA_PKG_ID_BCM47188 || - ci->pkg == BCMA_PKG_ID_BCM47186) { + if ((ci->id == BCMA_CHIP_ID_BCM5357 && ci->pkg == BCMA_PKG_ID_BCM47186) || + (ci->id == BCMA_CHIP_ID_BCM53572 && ci->pkg == BCMA_PKG_ID_BCM47188)) { bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_RGMII; bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED; } - if (ci->pkg == BCMA_PKG_ID_BCM5358) + if (ci->id == BCMA_CHIP_ID_BCM5357 && ci->pkg == BCMA_PKG_ID_BCM5358) bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_EPHYRMII; break; case BCMA_CHIP_ID_BCM53573: diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 240a7e8a7652..6c32f5c427b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9274,10 +9274,14 @@ int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) netdev_err(bp->dev, "ring reservation/IRQ 
init failure rc: %d\n", rc); return rc; } - if (tcs && (bp->tx_nr_rings_per_tc * tcs != bp->tx_nr_rings)) { + if (tcs && (bp->tx_nr_rings_per_tc * tcs != + bp->tx_nr_rings - bp->tx_nr_rings_xdp)) { netdev_err(bp->dev, "tx ring reservation failure\n"); netdev_reset_tc(bp->dev); - bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + if (bp->tx_nr_rings_xdp) + bp->tx_nr_rings_per_tc = bp->tx_nr_rings_xdp; + else + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; return -ENOMEM; } return 0; diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 59debdc344a5..58747292521d 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -11166,7 +11166,7 @@ static void tg3_reset_task(struct work_struct *work) rtnl_lock(); tg3_full_lock(tp, 0); - if (!netif_running(tp->dev)) { + if (tp->pcierr_recovery || !netif_running(tp->dev)) { tg3_flag_clear(tp, RESET_TASK_PENDING); tg3_full_unlock(tp); rtnl_unlock(); @@ -18101,6 +18101,9 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, netdev_info(netdev, "PCI I/O error detected\n"); + /* Want to make sure that the reset task doesn't run */ + tg3_reset_task_cancel(tp); + rtnl_lock(); /* Could be second call or maybe we don't have netdev yet */ @@ -18117,9 +18120,6 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, tg3_timer_stop(tp); - /* Want to make sure that the reset task doesn't run */ - tg3_reset_task_cancel(tp); - netif_device_detach(netdev); /* Clean up software state, even if MMIO is blocked */ diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 72e42820713d..6cda31520c42 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4627,25 +4627,26 @@ static int init_reset_optional(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "failed to init SGMII PHY\n"); - } - ret = 
zynqmp_pm_is_function_supported(PM_IOCTL, IOCTL_SET_GEM_CONFIG); - if (!ret) { - u32 pm_info[2]; + ret = zynqmp_pm_is_function_supported(PM_IOCTL, IOCTL_SET_GEM_CONFIG); + if (!ret) { + u32 pm_info[2]; + + ret = of_property_read_u32_array(pdev->dev.of_node, "power-domains", + pm_info, ARRAY_SIZE(pm_info)); + if (ret) { + dev_err(&pdev->dev, "Failed to read power management information\n"); + goto err_out_phy_exit; + } + ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_FIXED, 0); + if (ret) + goto err_out_phy_exit; - ret = of_property_read_u32_array(pdev->dev.of_node, "power-domains", - pm_info, ARRAY_SIZE(pm_info)); - if (ret) { - dev_err(&pdev->dev, "Failed to read power management information\n"); - goto err_out_phy_exit; + ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_SGMII_MODE, 1); + if (ret) + goto err_out_phy_exit; } - ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_FIXED, 0); - if (ret) - goto err_out_phy_exit; - ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_SGMII_MODE, 1); - if (ret) - goto err_out_phy_exit; } /* Fully reset controller at hardware level if mapped in device tree */ diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index bf0190e1d2ea..00e2108f2ca4 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -450,7 +450,7 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb, /* ring full, shall not happen because queue is stopped if full * below */ - netif_stop_queue(tx->adapter->netdev); + netif_stop_subqueue(tx->adapter->netdev, tx->queue_index); spin_unlock_irqrestore(&tx->lock, flags); @@ -493,7 +493,7 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb, if (tsnep_tx_desc_available(tx) < (MAX_SKB_FRAGS + 1)) { /* ring can get full with next frame */ - netif_stop_queue(tx->adapter->netdev); + netif_stop_subqueue(tx->adapter->netdev, tx->queue_index); } spin_unlock_irqrestore(&tx->lock, flags); 
@@ -503,11 +503,14 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb, static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget) { + struct tsnep_tx_entry *entry; + struct netdev_queue *nq; unsigned long flags; int budget = 128; - struct tsnep_tx_entry *entry; - int count; int length; + int count; + + nq = netdev_get_tx_queue(tx->adapter->netdev, tx->queue_index); spin_lock_irqsave(&tx->lock, flags); @@ -564,8 +567,8 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget) } while (likely(budget)); if ((tsnep_tx_desc_available(tx) >= ((MAX_SKB_FRAGS + 1) * 2)) && - netif_queue_stopped(tx->adapter->netdev)) { - netif_wake_queue(tx->adapter->netdev); + netif_tx_queue_stopped(nq)) { + netif_tx_wake_queue(nq); } spin_unlock_irqrestore(&tx->lock, flags); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 3f8032947d86..027fff9f7db0 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2410,6 +2410,9 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) cleaned = qman_p_poll_dqrr(np->p, budget); + if (np->xdp_act & XDP_REDIRECT) + xdp_do_flush(); + if (cleaned < budget) { napi_complete_done(napi, cleaned); qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); @@ -2417,9 +2420,6 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); } - if (np->xdp_act & XDP_REDIRECT) - xdp_do_flush(); - return cleaned; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0c35abb7d065..2e79d18fc3c7 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1993,10 +1993,15 @@ static int dpaa2_eth_poll(struct napi_struct *napi, int budget) if (rx_cleaned >= budget || txconf_cleaned >= DPAA2_ETH_TXCONF_PER_NAPI) { work_done = budget; + if (ch->xdp.res & 
XDP_REDIRECT) + xdp_do_flush(); goto out; } } while (store_cleaned); + if (ch->xdp.res & XDP_REDIRECT) + xdp_do_flush(); + /* Update NET DIM with the values for this CDAN */ dpaa2_io_update_net_dim(ch->dpio, ch->stats.frames_per_cdan, ch->stats.bytes_per_cdan); @@ -2032,9 +2037,7 @@ out: txc_fq->dq_bytes = 0; } - if (ch->xdp.res & XDP_REDIRECT) - xdp_do_flush_map(); - else if (rx_cleaned && ch->xdp.res & XDP_TX) + if (rx_cleaned && ch->xdp.res & XDP_TX) dpaa2_eth_xdp_tx_flush(priv, ch, &priv->fq[flowid]); return work_done; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 644f3c963730..2341597408d1 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3191,7 +3191,7 @@ static void fec_enet_free_buffers(struct net_device *ndev) for (q = 0; q < fep->num_rx_queues; q++) { rxq = fep->rx_queue[q]; for (i = 0; i < rxq->bd.ring_size; i++) - page_pool_release_page(rxq->page_pool, rxq->rx_skb_info[i].page); + page_pool_put_full_page(rxq->page_pool, rxq->rx_skb_info[i].page, false); for (i = 0; i < XDP_STATS_TOTAL; i++) rxq->stats[i] = 0; diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c b/drivers/net/ethernet/freescale/fman/fman_memac.c index 9349f841bd06..587ad81a2dc3 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.c +++ b/drivers/net/ethernet/freescale/fman/fman_memac.c @@ -1055,6 +1055,9 @@ static struct phylink_pcs *memac_pcs_create(struct device_node *mac_node, return ERR_PTR(-EPROBE_DEFER); pcs = lynx_pcs_create(mdiodev); + if (!pcs) + mdio_device_free(mdiodev); + return pcs; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 53d0083e35da..52eec0a50492 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2921,7 +2921,7 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu) struct i40e_pf *pf = vsi->back; if 
(i40e_enabled_xdp_vsi(vsi)) { - int frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + int frame_size = new_mtu + I40E_PACKET_HDR_PAD; if (frame_size > i40e_max_xdp_frame_size(vsi)) return -EINVAL; @@ -13167,6 +13167,8 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev, } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; nla_for_each_nested(attr, br_spec, rem) { __u16 mode; diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 0d1bab4ac1b0..2a9f1eeeb701 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -249,6 +249,7 @@ struct iavf_cloud_filter { /* board specific private data structure */ struct iavf_adapter { + struct workqueue_struct *wq; struct work_struct reset_task; struct work_struct adminq_task; struct delayed_work client_task; @@ -459,7 +460,6 @@ struct iavf_device { /* needed by iavf_ethtool.c */ extern char iavf_driver_name[]; -extern struct workqueue_struct *iavf_wq; static inline const char *iavf_state_str(enum iavf_state_t state) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index d79ead5e8d0c..6f171d1d85b7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -532,7 +532,7 @@ static int iavf_set_priv_flags(struct net_device *netdev, u32 flags) if (changed_flags & IAVF_FLAG_LEGACY_RX) { if (netif_running(netdev)) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); } } @@ -672,7 +672,7 @@ static int iavf_set_ringparam(struct net_device *netdev, if (netif_running(netdev)) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); } return 0; @@ -1433,7 +1433,7 @@ static int 
iavf_add_fdir_ethtool(struct iavf_adapter *adapter, struct ethtool_rx adapter->aq_required |= IAVF_FLAG_AQ_ADD_FDIR_FILTER; spin_unlock_bh(&adapter->fdir_fltr_lock); - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); ret: if (err && fltr) @@ -1474,7 +1474,7 @@ static int iavf_del_fdir_ethtool(struct iavf_adapter *adapter, struct ethtool_rx spin_unlock_bh(&adapter->fdir_fltr_lock); if (fltr && fltr->state == IAVF_FDIR_FLTR_DEL_REQUEST) - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); return err; } @@ -1658,7 +1658,7 @@ iavf_set_adv_rss_hash_opt(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); if (!err) - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); mutex_unlock(&adapter->crit_lock); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index adc02adef83a..4b09785d2147 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -49,7 +49,6 @@ MODULE_DESCRIPTION("Intel(R) Ethernet Adaptive Virtual Function Network Driver") MODULE_LICENSE("GPL v2"); static const struct net_device_ops iavf_netdev_ops; -struct workqueue_struct *iavf_wq; int iavf_status_to_errno(enum iavf_status status) { @@ -277,7 +276,7 @@ void iavf_schedule_reset(struct iavf_adapter *adapter) if (!(adapter->flags & (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED))) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); } } @@ -291,7 +290,7 @@ void iavf_schedule_reset(struct iavf_adapter *adapter) void iavf_schedule_request_stats(struct iavf_adapter *adapter) { adapter->aq_required |= IAVF_FLAG_AQ_REQUEST_STATS; - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, 
&adapter->watchdog_task, 0); } /** @@ -411,7 +410,7 @@ static irqreturn_t iavf_msix_aq(int irq, void *data) if (adapter->state != __IAVF_REMOVE) /* schedule work on the private workqueue */ - queue_work(iavf_wq, &adapter->adminq_task); + queue_work(adapter->wq, &adapter->adminq_task); return IRQ_HANDLED; } @@ -1034,7 +1033,7 @@ int iavf_replace_primary_mac(struct iavf_adapter *adapter, /* schedule the watchdog task to immediately process the request */ if (f) { - queue_work(iavf_wq, &adapter->watchdog_task.work); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); return 0; } return -ENOMEM; @@ -1257,7 +1256,7 @@ static void iavf_up_complete(struct iavf_adapter *adapter) adapter->aq_required |= IAVF_FLAG_AQ_ENABLE_QUEUES; if (CLIENT_ENABLED(adapter)) adapter->flags |= IAVF_FLAG_CLIENT_NEEDS_OPEN; - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); } /** @@ -1414,7 +1413,7 @@ void iavf_down(struct iavf_adapter *adapter) adapter->aq_required |= IAVF_FLAG_AQ_DISABLE_QUEUES; } - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); } /** @@ -2248,7 +2247,7 @@ iavf_set_vlan_offload_features(struct iavf_adapter *adapter, if (aq_required) { adapter->aq_required |= aq_required; - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 0); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 0); } } @@ -2693,6 +2692,15 @@ static void iavf_watchdog_task(struct work_struct *work) goto restart_watchdog; } + if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES) && + adapter->netdev_registered && + !test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section) && + rtnl_trylock()) { + netdev_update_features(adapter->netdev); + rtnl_unlock(); + adapter->flags &= ~IAVF_FLAG_SETUP_NETDEV_FEATURES; + } + if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); @@ -2700,7 +2708,7 @@ static void 
iavf_watchdog_task(struct work_struct *work) adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; mutex_unlock(&adapter->crit_lock); - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); return; } @@ -2708,31 +2716,31 @@ static void iavf_watchdog_task(struct work_struct *work) case __IAVF_STARTUP: iavf_startup(adapter); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(30)); return; case __IAVF_INIT_VERSION_CHECK: iavf_init_version_check(adapter); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(30)); return; case __IAVF_INIT_GET_RESOURCES: iavf_init_get_resources(adapter); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(1)); return; case __IAVF_INIT_EXTENDED_CAPS: iavf_init_process_extended_caps(adapter); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(1)); return; case __IAVF_INIT_CONFIG_ADAPTER: iavf_init_config_adapter(adapter); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(1)); return; case __IAVF_INIT_FAILED: @@ -2751,14 +2759,14 @@ static void iavf_watchdog_task(struct work_struct *work) adapter->flags |= IAVF_FLAG_PF_COMMS_FAILED; iavf_shutdown_adminq(hw); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, (5 * HZ)); return; } /* Try again from failed step*/ iavf_change_state(adapter, adapter->last_state); mutex_unlock(&adapter->crit_lock); - 
queue_delayed_work(iavf_wq, &adapter->watchdog_task, HZ); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ); return; case __IAVF_COMM_FAILED: if (test_bit(__IAVF_IN_REMOVE_TASK, @@ -2789,13 +2797,14 @@ static void iavf_watchdog_task(struct work_struct *work) adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(10)); return; case __IAVF_RESETTING: mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, HZ * 2); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + HZ * 2); return; case __IAVF_DOWN: case __IAVF_DOWN_PENDING: @@ -2834,9 +2843,9 @@ static void iavf_watchdog_task(struct work_struct *work) adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; dev_err(&adapter->pdev->dev, "Hardware reset detected\n"); - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); mutex_unlock(&adapter->crit_lock); - queue_delayed_work(iavf_wq, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ * 2); return; } @@ -2845,12 +2854,13 @@ static void iavf_watchdog_task(struct work_struct *work) mutex_unlock(&adapter->crit_lock); restart_watchdog: if (adapter->state >= __IAVF_DOWN) - queue_work(iavf_wq, &adapter->adminq_task); + queue_work(adapter->wq, &adapter->adminq_task); if (adapter->aq_required) - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(20)); else - queue_delayed_work(iavf_wq, &adapter->watchdog_task, HZ * 2); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + HZ * 2); } /** @@ -2952,7 +2962,7 @@ static void iavf_reset_task(struct work_struct *work) */ if (!mutex_trylock(&adapter->crit_lock)) { if (adapter->state != __IAVF_REMOVE) - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, 
&adapter->reset_task); goto reset_finish; } @@ -3116,7 +3126,7 @@ continue_reset: bitmap_clear(adapter->vsi.active_cvlans, 0, VLAN_N_VID); bitmap_clear(adapter->vsi.active_svlans, 0, VLAN_N_VID); - mod_delayed_work(iavf_wq, &adapter->watchdog_task, 2); + mod_delayed_work(adapter->wq, &adapter->watchdog_task, 2); /* We were running when the reset started, so we need to restore some * state here. @@ -3208,7 +3218,7 @@ static void iavf_adminq_task(struct work_struct *work) if (adapter->state == __IAVF_REMOVE) return; - queue_work(iavf_wq, &adapter->adminq_task); + queue_work(adapter->wq, &adapter->adminq_task); goto out; } @@ -3232,24 +3242,6 @@ static void iavf_adminq_task(struct work_struct *work) } while (pending); mutex_unlock(&adapter->crit_lock); - if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES)) { - if (adapter->netdev_registered || - !test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) { - struct net_device *netdev = adapter->netdev; - - rtnl_lock(); - netdev_update_features(netdev); - rtnl_unlock(); - /* Request VLAN offload settings */ - if (VLAN_V2_ALLOWED(adapter)) - iavf_set_vlan_offload_features - (adapter, 0, netdev->features); - - iavf_set_queue_vlan_tag_loc(adapter); - } - - adapter->flags &= ~IAVF_FLAG_SETUP_NETDEV_FEATURES; - } if ((adapter->flags & (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED)) || adapter->state == __IAVF_RESETTING) @@ -4349,7 +4341,7 @@ static int iavf_change_mtu(struct net_device *netdev, int new_mtu) if (netif_running(netdev)) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); } return 0; @@ -4898,6 +4890,13 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw = &adapter->hw; hw->back = adapter; + adapter->wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, + iavf_driver_name); + if (!adapter->wq) { + err = -ENOMEM; + goto err_alloc_wq; + } + adapter->msg_enable = BIT(DEFAULT_DEBUG_LEVEL_SHIFT) - 
1; iavf_change_state(adapter, __IAVF_STARTUP); @@ -4942,7 +4941,7 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&adapter->adminq_task, iavf_adminq_task); INIT_DELAYED_WORK(&adapter->watchdog_task, iavf_watchdog_task); INIT_DELAYED_WORK(&adapter->client_task, iavf_client_task); - queue_delayed_work(iavf_wq, &adapter->watchdog_task, + queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(5 * (pdev->devfn & 0x07))); /* Setup the wait queue for indicating transition to down status */ @@ -4954,6 +4953,8 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_ioremap: + destroy_workqueue(adapter->wq); +err_alloc_wq: free_netdev(netdev); err_alloc_etherdev: pci_disable_pcie_error_reporting(pdev); @@ -5023,7 +5024,7 @@ static int __maybe_unused iavf_resume(struct device *dev_d) return err; } - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); netif_device_attach(adapter->netdev); @@ -5170,6 +5171,8 @@ static void iavf_remove(struct pci_dev *pdev) } spin_unlock_bh(&adapter->adv_rss_lock); + destroy_workqueue(adapter->wq); + free_netdev(netdev); pci_disable_pcie_error_reporting(pdev); @@ -5196,24 +5199,11 @@ static struct pci_driver iavf_driver = { **/ static int __init iavf_init_module(void) { - int ret; - pr_info("iavf: %s\n", iavf_driver_string); pr_info("%s\n", iavf_copyright); - iavf_wq = alloc_workqueue("%s", WQ_UNBOUND | WQ_MEM_RECLAIM, 1, - iavf_driver_name); - if (!iavf_wq) { - pr_err("%s: Failed to create workqueue\n", iavf_driver_name); - return -ENOMEM; - } - - ret = pci_register_driver(&iavf_driver); - if (ret) - destroy_workqueue(iavf_wq); - - return ret; + return pci_register_driver(&iavf_driver); } module_init(iavf_init_module); @@ -5227,7 +5217,6 @@ module_init(iavf_init_module); static void __exit iavf_exit_module(void) { pci_unregister_driver(&iavf_driver); - destroy_workqueue(iavf_wq); } 
module_exit(iavf_exit_module); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 24a701fd140e..365ca0c710c4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1952,7 +1952,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, if (!(adapter->flags & IAVF_FLAG_RESET_PENDING)) { adapter->flags |= IAVF_FLAG_RESET_PENDING; dev_info(&adapter->pdev->dev, "Scheduling reset task\n"); - queue_work(iavf_wq, &adapter->reset_task); + queue_work(adapter->wq, &adapter->reset_task); } break; default: @@ -2226,6 +2226,14 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, iavf_process_config(adapter); adapter->flags |= IAVF_FLAG_SETUP_NETDEV_FEATURES; + + /* Request VLAN offload settings */ + if (VLAN_V2_ALLOWED(adapter)) + iavf_set_vlan_offload_features(adapter, 0, + netdev->features); + + iavf_set_queue_vlan_tag_loc(adapter); + was_mac_changed = !ether_addr_equal(netdev->dev_addr, adapter->hw.mac.addr); diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2f0b604abc5e..713069f809ec 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -880,7 +880,7 @@ void ice_set_ethtool_repr_ops(struct net_device *netdev); void ice_set_ethtool_safe_mode_ops(struct net_device *netdev); u16 ice_get_avail_txq_count(struct ice_pf *pf); u16 ice_get_avail_rxq_count(struct ice_pf *pf); -int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx); +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked); void ice_update_vsi_stats(struct ice_vsi *vsi); void ice_update_pf_stats(struct ice_pf *pf); void diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index d02b55b6aa9c..3e08847505ce 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ 
b/drivers/net/ethernet/intel/ice/ice_common.c @@ -5524,7 +5524,7 @@ bool ice_fw_supports_report_dflt_cfg(struct ice_hw *hw) * returned by the firmware is a 16 bit * value, but is indexed * by [fls(speed) - 1] */ -static const u32 ice_aq_to_link_speed[15] = { +static const u32 ice_aq_to_link_speed[] = { SPEED_10, /* BIT(0) */ SPEED_100, SPEED_1000, @@ -5536,10 +5536,6 @@ static const u32 ice_aq_to_link_speed[15] = { SPEED_40000, SPEED_50000, SPEED_100000, /* BIT(10) */ - 0, - 0, - 0, - 0 /* BIT(14) */ }; /** @@ -5550,5 +5546,8 @@ static const u32 ice_aq_to_link_speed[15] = { */ u32 ice_get_link_speed(u16 index) { + if (index >= ARRAY_SIZE(ice_aq_to_link_speed)) + return 0; + return ice_aq_to_link_speed[index]; } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 4f24d441c35e..0a55c552189a 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -441,7 +441,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) goto out; } - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, false); out: /* enable previously downed VSIs */ @@ -731,12 +731,13 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf) /** * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs * @pf: pointer to the PF struct + * @locked: is adev device lock held * * Assumed caller has already disabled all VSIs before * calling this function. Reconfiguring DCB based on * local_dcbx_cfg. 
*/ -void ice_pf_dcb_recfg(struct ice_pf *pf) +void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; struct iidc_event *event; @@ -783,14 +784,16 @@ void ice_pf_dcb_recfg(struct ice_pf *pf) if (vsi->type == ICE_VSI_PF) ice_dcbnl_set_all(vsi); } - /* Notify the AUX drivers that TC change is finished */ - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return; + if (!locked) { + /* Notify the AUX drivers that TC change is finished */ + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return; - set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - kfree(event); + set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); + ice_send_event_to_aux(pf, event); + kfree(event); + } } /** @@ -1044,7 +1047,7 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, } /* changes in configuration update VSI */ - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, false); /* enable previously downed VSIs */ ice_dcb_ena_dis_vsi(pf, true, true); diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 4c421c842a13..800879a88c5e 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -23,7 +23,7 @@ u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index); int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked); int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg); -void ice_pf_dcb_recfg(struct ice_pf *pf); +void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked); void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi); int ice_init_pf_dcb(struct ice_pf *pf, bool locked); void ice_update_dcb_stats(struct ice_pf *pf); @@ -128,7 +128,7 @@ static inline u8 ice_get_pfc_mode(struct ice_pf *pf) return 0; } -static inline void ice_pf_dcb_recfg(struct ice_pf *pf) { } +static inline void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { } 
static inline void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { } static inline void ice_update_dcb_stats(struct ice_pf *pf) { } static inline void diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 8286e47b4bae..0fae0186bd85 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -899,7 +899,7 @@ static int ice_set_object_tx_priority(struct ice_port_info *pi, struct ice_sched { int status; - if (node->tx_priority >= 8) { + if (priority >= 8) { NL_SET_ERR_MSG_MOD(extack, "Priority should be less than 8"); return -EINVAL; } @@ -929,7 +929,7 @@ static int ice_set_object_tx_weight(struct ice_port_info *pi, struct ice_sched_n { int status; - if (node->tx_weight > 200 || node->tx_weight < 1) { + if (weight > 200 || weight < 1) { NL_SET_ERR_MSG_MOD(extack, "Weight must be between 1 and 200"); return -EINVAL; } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 4191994d8f3a..a359f1610fc1 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3641,7 +3641,9 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; int new_rx = 0, new_tx = 0; + bool locked = false; u32 curr_combined; + int ret = 0; /* do not support changing channels in Safe Mode */ if (ice_is_safe_mode(pf)) { @@ -3705,15 +3707,33 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) return -EINVAL; } - ice_vsi_recfg_qs(vsi, new_rx, new_tx); + if (pf->adev) { + mutex_lock(&pf->adev_mutex); + device_lock(&pf->adev->dev); + locked = true; + if (pf->adev->dev.driver) { + netdev_err(dev, "Cannot change channels when RDMA is active\n"); + ret = -EBUSY; + goto adev_unlock; + } + } + + ice_vsi_recfg_qs(vsi, new_rx, new_tx, locked); - if (!netif_is_rxfh_configured(dev)) 
- return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + if (!netif_is_rxfh_configured(dev)) { + ret = ice_vsi_set_dflt_rss_lut(vsi, new_rx); + goto adev_unlock; + } /* Update rss_size due to change in Rx queues */ vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); - return 0; +adev_unlock: + if (locked) { + device_unlock(&pf->adev->dev); + mutex_unlock(&pf->adev_mutex); + } + return ret; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 94aa834cd9a6..a596e07b3ce9 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -3235,9 +3235,6 @@ int ice_vsi_release(struct ice_vsi *vsi) } } - if (vsi->type == ICE_VSI_PF) - ice_devlink_destroy_pf_port(pf); - if (vsi->type == ICE_VSI_VF && vsi->agg_node && vsi->agg_node->valid) vsi->agg_node->num_vsis--; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a9a7f8b52140..8ec24f6cf6be 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -275,6 +275,8 @@ static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m) if (status && status != -EEXIST) return status; + netdev_dbg(vsi->netdev, "set promisc filter bits for VSI %i: 0x%x\n", + vsi->vsi_num, promisc_m); return 0; } @@ -300,6 +302,8 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) promisc_m, 0); } + netdev_dbg(vsi->netdev, "clear promisc filter bits for VSI %i: 0x%x\n", + vsi->vsi_num, promisc_m); return status; } @@ -414,6 +418,16 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } err = 0; vlan_ops->dis_rx_filtering(vsi); + + /* promiscuous mode implies allmulticast so + * that VSIs that are in promiscuous mode are + * subscribed to multicast packets coming to + * the port + */ + err = ice_set_promisc(vsi, + ICE_MCAST_PROMISC_BITS); + if (err) + goto out_promisc; } } else { /* Clear Rx filter to remove traffic from wire */ @@ -430,6 +444,18 @@ 
static int ice_vsi_sync_fltr(struct ice_vsi *vsi) NETIF_F_HW_VLAN_CTAG_FILTER) vlan_ops->ena_rx_filtering(vsi); } + + /* disable allmulti here, but only if allmulti is not + * still enabled for the netdev + */ + if (!(vsi->current_netdev_flags & IFF_ALLMULTI)) { + err = ice_clear_promisc(vsi, + ICE_MCAST_PROMISC_BITS); + if (err) { + netdev_err(netdev, "Error %d clearing multicast promiscuous on VSI %i\n", + err, vsi->vsi_num); + } + } } } goto exit; @@ -4195,12 +4221,13 @@ bool ice_is_wol_supported(struct ice_hw *hw) * @vsi: VSI being changed * @new_rx: new number of Rx queues * @new_tx: new number of Tx queues + * @locked: is adev device_lock held * * Only change the number of queues if new_tx, or new_rx is non-0. * * Returns 0 on success. */ -int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked) { struct ice_pf *pf = vsi->back; int err = 0, timeout = 50; @@ -4229,7 +4256,7 @@ int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) ice_vsi_close(vsi); ice_vsi_rebuild(vsi, false); - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, locked); ice_vsi_open(vsi); done: clear_bit(ICE_CFG_BUSY, pf->state); @@ -4590,7 +4617,7 @@ static void ice_print_wake_reason(struct ice_pf *pf) } /** - * ice_register_netdev - register netdev and devlink port + * ice_register_netdev - register netdev * @pf: pointer to the PF struct */ static int ice_register_netdev(struct ice_pf *pf) @@ -4602,11 +4629,6 @@ static int ice_register_netdev(struct ice_pf *pf) if (!vsi || !vsi->netdev) return -EIO; - err = ice_devlink_create_pf_port(pf); - if (err) - goto err_devlink_create; - - SET_NETDEV_DEVLINK_PORT(vsi->netdev, &pf->devlink_port); err = register_netdev(vsi->netdev); if (err) goto err_register_netdev; @@ -4617,8 +4639,6 @@ static int ice_register_netdev(struct ice_pf *pf) return 0; err_register_netdev: - ice_devlink_destroy_pf_port(pf); -err_devlink_create: free_netdev(vsi->netdev); 
vsi->netdev = NULL; clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); @@ -4636,6 +4656,7 @@ static int ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) { struct device *dev = &pdev->dev; + struct ice_vsi *vsi; struct ice_pf *pf; struct ice_hw *hw; int i, err; @@ -4918,6 +4939,18 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) pcie_print_link_status(pf->pdev); probe_done: + err = ice_devlink_create_pf_port(pf); + if (err) + goto err_create_pf_port; + + vsi = ice_get_main_vsi(pf); + if (!vsi || !vsi->netdev) { + err = -EINVAL; + goto err_netdev_reg; + } + + SET_NETDEV_DEVLINK_PORT(vsi->netdev, &pf->devlink_port); + err = ice_register_netdev(pf); if (err) goto err_netdev_reg; @@ -4955,6 +4988,8 @@ err_init_aux_unroll: err_devlink_reg_param: ice_devlink_unregister_params(pf); err_netdev_reg: + ice_devlink_destroy_pf_port(pf); +err_create_pf_port: err_send_version_unroll: ice_vsi_release_all(pf); err_alloc_sw_unroll: @@ -5083,6 +5118,7 @@ static void ice_remove(struct pci_dev *pdev) ice_setup_mc_magic_wake(pf); ice_vsi_release_all(pf); mutex_destroy(&(&pf->hw)->fdir_fltr_lock); + ice_devlink_destroy_pf_port(pf); ice_set_wake(pf); ice_free_irq_msix_misc(pf); ice_for_each_vsi(pf, i) { @@ -5531,7 +5567,7 @@ static int __init ice_module_init(void) pr_info("%s\n", ice_driver_string); pr_info("%s\n", ice_copyright); - ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME); + ice_wq = alloc_workqueue("%s", 0, 0, KBUILD_MODNAME); if (!ice_wq) { pr_err("Failed to create workqueue\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 9b762f7972ce..61f844d22512 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -5420,7 +5420,7 @@ ice_add_adv_recipe(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, */ status = ice_add_special_words(rinfo, lkup_exts, ice_is_dvm_ena(hw)); if 
(status) - goto err_free_lkup_exts; + goto err_unroll; /* Group match words into recipes using preferred recipe grouping * criteria. diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index faba0f857cd9..95f392ab9670 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1681,7 +1681,7 @@ ice_tc_forward_to_queue(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr, struct ice_vsi *ch_vsi = NULL; u16 queue = act->rx_queue; - if (queue > vsi->num_rxq) { + if (queue >= vsi->num_rxq) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter because specified queue is invalid"); return -EINVAL; diff --git a/drivers/net/ethernet/intel/ice/ice_vf_mbx.c b/drivers/net/ethernet/intel/ice/ice_vf_mbx.c index d4a4001b6e5d..f56fa94ff3d0 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_mbx.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_mbx.c @@ -39,7 +39,7 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); } -static const u32 ice_legacy_aq_to_vc_speed[15] = { +static const u32 ice_legacy_aq_to_vc_speed[] = { VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */ VIRTCHNL_LINK_SPEED_100MB, VIRTCHNL_LINK_SPEED_1GB, @@ -51,10 +51,6 @@ static const u32 ice_legacy_aq_to_vc_speed[15] = { VIRTCHNL_LINK_SPEED_40GB, VIRTCHNL_LINK_SPEED_40GB, VIRTCHNL_LINK_SPEED_40GB, - VIRTCHNL_LINK_SPEED_UNKNOWN, - VIRTCHNL_LINK_SPEED_UNKNOWN, - VIRTCHNL_LINK_SPEED_UNKNOWN, - VIRTCHNL_LINK_SPEED_UNKNOWN /* BIT(14) */ }; /** @@ -71,21 +67,20 @@ static const u32 ice_legacy_aq_to_vc_speed[15] = { */ u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) { - u32 speed; + /* convert a BIT() value into an array index */ + u32 index = fls(link_speed) - 1; - if (adv_link_support) { - /* convert a BIT() value into an array index */ - speed = ice_get_link_speed(fls(link_speed) - 1); - } else { + if 
(adv_link_support) + return ice_get_link_speed(index); + else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed)) /* Virtchnl speeds are not defined for every speed supported in * the hardware. To maintain compatibility with older AVF * drivers, while reporting the speed the new speed values are * resolved to the closest known virtchnl speeds */ - speed = ice_legacy_aq_to_vc_speed[fls(link_speed) - 1]; - } + return ice_legacy_aq_to_vc_speed[index]; - return speed; + return VIRTCHNL_LINK_SPEED_UNKNOWN; } /* The mailbox overflow detection algorithm helps to check if there diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c index 5ecc0ee9a78e..b1ffb81893d4 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c @@ -44,13 +44,17 @@ void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) /* outer VLAN ops regardless of port VLAN config */ vlan_ops->add_vlan = ice_vsi_add_vlan; - vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering; vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; if (ice_vf_is_port_vlan_ena(vf)) { /* setup outer VLAN ops */ vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; + /* all Rx traffic should be in the domain of the + * assigned port VLAN, so prevent disabling Rx VLAN + * filtering + */ + vlan_ops->dis_rx_filtering = noop_vlan; vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; @@ -63,6 +67,9 @@ void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; } else { + vlan_ops->dis_rx_filtering = + ice_vsi_dis_rx_vlan_filtering; + if (!test_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags)) vlan_ops->ena_rx_filtering = noop_vlan; else @@ -96,7 +103,14 @@ void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) vlan_ops->set_port_vlan = 
ice_vsi_set_inner_port_vlan; vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; + /* all Rx traffic should be in the domain of the + * assigned port VLAN, so prevent disabling Rx VLAN + * filtering + */ + vlan_ops->dis_rx_filtering = noop_vlan; } else { + vlan_ops->dis_rx_filtering = + ice_vsi_dis_rx_vlan_filtering; if (!test_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags)) vlan_ops->ena_rx_filtering = noop_vlan; else diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 7105de6fb344..374b7f10b549 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -800,6 +800,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) struct ice_tx_desc *tx_desc; u16 cnt = xdp_ring->count; struct ice_tx_buf *tx_buf; + u16 completed_frames = 0; u16 xsk_frames = 0; u16 last_rs; int i; @@ -809,19 +810,21 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) if ((tx_desc->cmd_type_offset_bsz & cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) { if (last_rs >= ntc) - xsk_frames = last_rs - ntc + 1; + completed_frames = last_rs - ntc + 1; else - xsk_frames = last_rs + cnt - ntc + 1; + completed_frames = last_rs + cnt - ntc + 1; } - if (!xsk_frames) + if (!completed_frames) return; - if (likely(!xdp_ring->xdp_tx_active)) + if (likely(!xdp_ring->xdp_tx_active)) { + xsk_frames = completed_frames; goto skip; + } ntc = xdp_ring->next_to_clean; - for (i = 0; i < xsk_frames; i++) { + for (i = 0; i < completed_frames; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; if (tx_buf->raw_buf) { @@ -837,7 +840,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) } skip: tx_desc->cmd_type_offset_bsz = 0; - xdp_ring->next_to_clean += xsk_frames; + xdp_ring->next_to_clean += completed_frames; if (xdp_ring->next_to_clean >= cnt) xdp_ring->next_to_clean -= cnt; if (xsk_frames) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 
3c0c35ecea10..b5b443883da9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2256,6 +2256,30 @@ static void igb_enable_mas(struct igb_adapter *adapter) } } +#ifdef CONFIG_IGB_HWMON +/** + * igb_set_i2c_bb - Init I2C interface + * @hw: pointer to hardware structure + **/ +static void igb_set_i2c_bb(struct e1000_hw *hw) +{ + u32 ctrl_ext; + s32 i2cctl; + + ctrl_ext = rd32(E1000_CTRL_EXT); + ctrl_ext |= E1000_CTRL_I2C_ENA; + wr32(E1000_CTRL_EXT, ctrl_ext); + wrfl(); + + i2cctl = rd32(E1000_I2CPARAMS); + i2cctl |= E1000_I2CBB_EN + | E1000_I2C_CLK_OE_N + | E1000_I2C_DATA_OE_N; + wr32(E1000_I2CPARAMS, i2cctl); + wrfl(); +} +#endif + void igb_reset(struct igb_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; @@ -2400,7 +2424,8 @@ void igb_reset(struct igb_adapter *adapter) * interface. */ if (adapter->ets) - mac->ops.init_thermal_sensor_thresh(hw); + igb_set_i2c_bb(hw); + mac->ops.init_thermal_sensor_thresh(hw); } } #endif @@ -3117,21 +3142,12 @@ static void igb_init_mas(struct igb_adapter *adapter) **/ static s32 igb_init_i2c(struct igb_adapter *adapter) { - struct e1000_hw *hw = &adapter->hw; s32 status = 0; - s32 i2cctl; /* I2C interface supported on i350 devices */ if (adapter->hw.mac.type != e1000_i350) return 0; - i2cctl = rd32(E1000_I2CPARAMS); - i2cctl |= E1000_I2CBB_EN - | E1000_I2C_CLK_OUT | E1000_I2C_CLK_OE_N - | E1000_I2C_DATA_OUT | E1000_I2C_DATA_OE_N; - wr32(E1000_I2CPARAMS, i2cctl); - wrfl(); - /* Initialize the i2c bus which is controlled by the registers. * This bus will use the i2c_algo_bit structure that implements * the protocol through toggling of the 4 bits in the register. @@ -3521,6 +3537,12 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->ets = true; else adapter->ets = false; + /* Only enable I2C bit banging if an external thermal + * sensor is supported. 
+ */ + if (adapter->ets) + igb_set_i2c_bb(hw); + hw->mac.ops.init_thermal_sensor_thresh(hw); if (igb_sysfs_init(adapter)) dev_err(&pdev->dev, "failed to allocate sysfs resources\n"); @@ -6794,7 +6816,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) struct timespec64 ts; u32 tsauxc; - if (pin < 0 || pin >= IGB_N_PEROUT) + if (pin < 0 || pin >= IGB_N_SDP) return; spin_lock(&adapter->tmreg_lock); @@ -6802,7 +6824,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) if (hw->mac.type == e1000_82580 || hw->mac.type == e1000_i354 || hw->mac.type == e1000_i350) { - s64 ns = timespec64_to_ns(&adapter->perout[pin].period); + s64 ns = timespec64_to_ns(&adapter->perout[tsintr_tt].period); u32 systiml, systimh, level_mask, level, rem; u64 systim, now; @@ -6850,8 +6872,8 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) ts.tv_nsec = (u32)systim; ts.tv_sec = ((u32)(systim >> 32)) & 0xFF; } else { - ts = timespec64_add(adapter->perout[pin].start, - adapter->perout[pin].period); + ts = timespec64_add(adapter->perout[tsintr_tt].start, + adapter->perout[tsintr_tt].period); } /* u32 conversion of tv_sec is safe until y2106 */ @@ -6860,7 +6882,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) tsauxc = rd32(E1000_TSAUXC); tsauxc |= TSAUXC_EN_TT0; wr32(E1000_TSAUXC, tsauxc); - adapter->perout[pin].start = ts; + adapter->perout[tsintr_tt].start = ts; spin_unlock(&adapter->tmreg_lock); } @@ -6874,7 +6896,7 @@ static void igb_extts(struct igb_adapter *adapter, int tsintr_tt) struct ptp_clock_event event; struct timespec64 ts; - if (pin < 0 || pin >= IGB_N_EXTTS) + if (pin < 0 || pin >= IGB_N_SDP) return; if (hw->mac.type == e1000_82580 || diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 44b1740dc098..1dd2a7fee8d4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2942,7 +2942,9 @@ static bool 
igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget) if (tx_buffer->next_to_watch && time_after(jiffies, tx_buffer->time_stamp + (adapter->tx_timeout_factor * HZ)) && - !(rd32(IGC_STATUS) & IGC_STATUS_TXOFF)) { + !(rd32(IGC_STATUS) & IGC_STATUS_TXOFF) && + (rd32(IGC_TDH(tx_ring->reg_idx)) != + readl(tx_ring->tail))) { /* detected Tx unit hang */ netdev_err(tx_ring->netdev, "Detected Tx Unit Hang\n" @@ -5069,6 +5071,24 @@ static int igc_change_mtu(struct net_device *netdev, int new_mtu) } /** + * igc_tx_timeout - Respond to a Tx Hang + * @netdev: network interface device structure + * @txqueue: queue number that timed out + **/ +static void igc_tx_timeout(struct net_device *netdev, + unsigned int __always_unused txqueue) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct igc_hw *hw = &adapter->hw; + + /* Do the reset outside of interrupt context */ + adapter->tx_timeout_count++; + schedule_work(&adapter->reset_task); + wr32(IGC_EICS, + (adapter->eims_enable_mask & ~adapter->eims_other)); +} + +/** * igc_get_stats64 - Get System Network Statistics * @netdev: network interface device structure * @stats: rtnl_link_stats64 pointer @@ -5495,7 +5515,7 @@ static void igc_watchdog_task(struct work_struct *work) case SPEED_100: case SPEED_1000: case SPEED_2500: - adapter->tx_timeout_factor = 7; + adapter->tx_timeout_factor = 1; break; } @@ -6320,6 +6340,7 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_set_rx_mode = igc_set_rx_mode, .ndo_set_mac_address = igc_set_mac, .ndo_change_mtu = igc_change_mtu, + .ndo_tx_timeout = igc_tx_timeout, .ndo_get_stats64 = igc_get_stats64, .ndo_fix_features = igc_fix_features, .ndo_set_features = igc_set_features, diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index c34734d432e0..4e10ced736db 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -417,10 +417,12 @@ static int igc_ptp_verify_pin(struct 
ptp_clock_info *ptp, unsigned int pin, * * We need to convert the system time value stored in the RX/TXSTMP registers * into a hwtstamp which can be used by the upper level timestamping functions. + * + * Returns 0 on success. **/ -static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, - struct skb_shared_hwtstamps *hwtstamps, - u64 systim) +static int igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, + struct skb_shared_hwtstamps *hwtstamps, + u64 systim) { switch (adapter->hw.mac.type) { case igc_i225: @@ -430,8 +432,9 @@ static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, systim & 0xFFFFFFFF); break; default: - break; + return -EINVAL; } + return 0; } /** @@ -652,7 +655,8 @@ static void igc_ptp_tx_hwtstamp(struct igc_adapter *adapter) regval = rd32(IGC_TXSTMPL); regval |= (u64)rd32(IGC_TXSTMPH) << 32; - igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval); + if (igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval)) + return; switch (adapter->link_speed) { case SPEED_10: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index bc68b8f2176d..8736ca4b2628 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -73,6 +73,8 @@ #define IXGBE_RXBUFFER_4K 4096 #define IXGBE_MAX_RXBUFFER 16384 /* largest size for a single descriptor */ +#define IXGBE_PKT_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) + /* Attempt to maximize the headroom available for incoming frames. We * use a 2K buffer for receives and need 1536/1534 to store the data for * the frame. This leaves us with 512 bytes of room. 
From that we need diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index ab8370c413f3..4507fba8747a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6778,6 +6778,18 @@ static void ixgbe_free_all_rx_resources(struct ixgbe_adapter *adapter) } /** + * ixgbe_max_xdp_frame_size - returns the maximum allowed frame size for XDP + * @adapter: device handle, pointer to adapter + */ +static int ixgbe_max_xdp_frame_size(struct ixgbe_adapter *adapter) +{ + if (PAGE_SIZE >= 8192 || adapter->flags2 & IXGBE_FLAG2_RX_LEGACY) + return IXGBE_RXBUFFER_2K; + else + return IXGBE_RXBUFFER_3K; +} + +/** * ixgbe_change_mtu - Change the Maximum Transfer Unit * @netdev: network interface device structure * @new_mtu: new value for maximum frame size @@ -6788,18 +6800,12 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) { struct ixgbe_adapter *adapter = netdev_priv(netdev); - if (adapter->xdp_prog) { - int new_frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + - VLAN_HLEN; - int i; - - for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbe_ring *ring = adapter->rx_ring[i]; + if (ixgbe_enabled_xdp_adapter(adapter)) { + int new_frame_size = new_mtu + IXGBE_PKT_HDR_PAD; - if (new_frame_size > ixgbe_rx_bufsz(ring)) { - e_warn(probe, "Requested MTU size is not supported with XDP\n"); - return -EINVAL; - } + if (new_frame_size > ixgbe_max_xdp_frame_size(adapter)) { + e_warn(probe, "Requested MTU size is not supported with XDP\n"); + return -EINVAL; } } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index bda1a6fa2ec4..e4407f09c9d3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -1500,6 +1500,9 @@ static const struct devlink_param rvu_af_dl_params[] = { BIT(DEVLINK_PARAM_CMODE_RUNTIME), 
rvu_af_dl_dwrr_mtu_get, rvu_af_dl_dwrr_mtu_set, rvu_af_dl_dwrr_mtu_validate), +}; + +static const struct devlink_param rvu_af_dl_param_exact_match[] = { DEVLINK_PARAM_DRIVER(RVU_AF_DEVLINK_PARAM_ID_NPC_EXACT_FEATURE_DISABLE, "npc_exact_feature_disable", DEVLINK_PARAM_TYPE_STRING, BIT(DEVLINK_PARAM_CMODE_RUNTIME), @@ -1556,7 +1559,6 @@ int rvu_register_dl(struct rvu *rvu) { struct rvu_devlink *rvu_dl; struct devlink *dl; - size_t size; int err; dl = devlink_alloc(&rvu_devlink_ops, sizeof(struct rvu_devlink), @@ -1578,21 +1580,32 @@ int rvu_register_dl(struct rvu *rvu) goto err_dl_health; } + err = devlink_params_register(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + if (err) { + dev_err(rvu->dev, + "devlink params register failed with error %d", err); + goto err_dl_health; + } + /* Register exact match devlink only for CN10K-B */ - size = ARRAY_SIZE(rvu_af_dl_params); if (!rvu_npc_exact_has_match_table(rvu)) - size -= 1; + goto done; - err = devlink_params_register(dl, rvu_af_dl_params, size); + err = devlink_params_register(dl, rvu_af_dl_param_exact_match, + ARRAY_SIZE(rvu_af_dl_param_exact_match)); if (err) { dev_err(rvu->dev, - "devlink params register failed with error %d", err); - goto err_dl_health; + "devlink exact match params register failed with error %d", err); + goto err_dl_exact_match; } +done: devlink_register(dl); return 0; +err_dl_exact_match: + devlink_params_unregister(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + err_dl_health: rvu_health_reporters_destroy(rvu); devlink_free(dl); @@ -1605,8 +1618,14 @@ void rvu_unregister_dl(struct rvu *rvu) struct devlink *dl = rvu_dl->dl; devlink_unregister(dl); - devlink_params_unregister(dl, rvu_af_dl_params, - ARRAY_SIZE(rvu_af_dl_params)); + + devlink_params_unregister(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + + /* Unregister exact match devlink only for CN10K-B */ + if (rvu_npc_exact_has_match_table(rvu)) + devlink_params_unregister(dl, rvu_af_dl_param_exact_match, + 
ARRAY_SIZE(rvu_af_dl_param_exact_match)); + rvu_health_reporters_destroy(rvu); devlink_free(dl); } diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index e3de9a53b2d9..e3123723522e 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -1570,8 +1570,8 @@ static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth, if (IS_ERR(pp)) return pp; - err = __xdp_rxq_info_reg(xdp_q, &eth->dummy_dev, eth->rx_napi.napi_id, - id, PAGE_SIZE); + err = __xdp_rxq_info_reg(xdp_q, &eth->dummy_dev, id, + eth->rx_napi.napi_id, PAGE_SIZE); if (err < 0) goto err_free_pp; @@ -1870,7 +1870,9 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, while (done < budget) { unsigned int pktlen, *rxdcsum; + bool has_hwaccel_tag = false; struct net_device *netdev; + u16 vlan_proto, vlan_tci; dma_addr_t dma_addr; u32 hash, reason; int mac = 0; @@ -2010,27 +2012,29 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { - if (trxd.rxd3 & RX_DMA_VTAG_V2) - __vlan_hwaccel_put_tag(skb, - htons(RX_DMA_VPID(trxd.rxd4)), - RX_DMA_VID(trxd.rxd4)); + if (trxd.rxd3 & RX_DMA_VTAG_V2) { + vlan_proto = RX_DMA_VPID(trxd.rxd4); + vlan_tci = RX_DMA_VID(trxd.rxd4); + has_hwaccel_tag = true; + } } else if (trxd.rxd2 & RX_DMA_VTAG) { - __vlan_hwaccel_put_tag(skb, htons(RX_DMA_VPID(trxd.rxd3)), - RX_DMA_VID(trxd.rxd3)); + vlan_proto = RX_DMA_VPID(trxd.rxd3); + vlan_tci = RX_DMA_VID(trxd.rxd3); + has_hwaccel_tag = true; } } /* When using VLAN untagging in combination with DSA, the * hardware treats the MTK special tag as a VLAN and untags it. 
*/ - if (skb_vlan_tag_present(skb) && netdev_uses_dsa(netdev)) { - unsigned int port = ntohs(skb->vlan_proto) & GENMASK(2, 0); + if (has_hwaccel_tag && netdev_uses_dsa(netdev)) { + unsigned int port = vlan_proto & GENMASK(2, 0); if (port < ARRAY_SIZE(eth->dsa_meta) && eth->dsa_meta[port]) skb_dst_set_noref(skb, &eth->dsa_meta[port]->dst); - - __vlan_hwaccel_clear_tag(skb); + } else if (has_hwaccel_tag) { + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan_tci); } skb_record_rx_queue(skb, 0); @@ -3111,7 +3115,7 @@ static void mtk_gdm_config(struct mtk_eth *eth, u32 config) val |= config; - if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0])) + if (eth->netdev[i] && netdev_uses_dsa(eth->netdev[i])) val |= MTK_GDMA_SPECIAL_TAG; mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i)); diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 18a50529ce7b..2d9186d32bc0 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -519,7 +519,7 @@ #define SGMII_SPEED_10 FIELD_PREP(SGMII_SPEED_MASK, 0) #define SGMII_SPEED_100 FIELD_PREP(SGMII_SPEED_MASK, 1) #define SGMII_SPEED_1000 FIELD_PREP(SGMII_SPEED_MASK, 2) -#define SGMII_DUPLEX_FULL BIT(4) +#define SGMII_DUPLEX_HALF BIT(4) #define SGMII_IF_MODE_BIT5 BIT(5) #define SGMII_REMOTE_FAULT_DIS BIT(8) #define SGMII_CODE_SYNC_SET_VAL BIT(9) @@ -1036,11 +1036,13 @@ struct mtk_soc_data { * @regmap: The register map pointing at the range used to setup * SGMII modes * @ana_rgc3: The offset refers to register ANA_RGC3 related to regmap + * @interface: Currently configured interface mode * @pcs: Phylink PCS structure */ struct mtk_pcs { struct regmap *regmap; u32 ana_rgc3; + phy_interface_t interface; struct phylink_pcs pcs; }; diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 269208a841c7..1ff024f42444 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ 
b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -615,8 +615,7 @@ mtk_foe_entry_commit_subflow(struct mtk_ppe *ppe, struct mtk_flow_entry *entry, u32 ib1_mask = mtk_get_ib1_pkt_type_mask(ppe->eth) | MTK_FOE_IB1_UDP; int type; - flow_info = kzalloc(offsetof(struct mtk_flow_entry, l2_data.end), - GFP_ATOMIC); + flow_info = kzalloc(sizeof(*flow_info), GFP_ATOMIC); if (!flow_info) return; diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index ea64fac1d425..b5e432031340 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -279,7 +279,6 @@ struct mtk_flow_entry { struct { struct mtk_flow_entry *base_flow; struct hlist_node list; - struct {} end; } l2_data; }; struct rhash_head node; diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index 5c286f2c9418..bb00de1003ac 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -43,11 +43,6 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, int advertise, link_timer; bool changed, use_an; - if (interface == PHY_INTERFACE_MODE_2500BASEX) - rgc3 = RG_PHY_SPEED_3_125G; - else - rgc3 = 0; - advertise = phylink_mii_c22_pcs_encode_advertisement(interface, advertising); if (advertise < 0) @@ -88,9 +83,22 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, bmcr = 0; } - /* Configure the underlying interface speed */ - regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, - RG_PHY_SPEED_3_125G, rgc3); + if (mpcs->interface != interface) { + /* PHYA power down */ + regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, + SGMII_PHYA_PWD, SGMII_PHYA_PWD); + + if (interface == PHY_INTERFACE_MODE_2500BASEX) + rgc3 = RG_PHY_SPEED_3_125G; + else + rgc3 = 0; + + /* Configure the underlying interface speed */ + regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, + RG_PHY_SPEED_3_125G, rgc3); + + mpcs->interface = 
interface; + } /* Update the advertisement, noting whether it has changed */ regmap_update_bits_check(mpcs->regmap, SGMSYS_PCS_ADVERTISE, @@ -108,9 +116,17 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, regmap_update_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1, SGMII_AN_RESTART | SGMII_AN_ENABLE, bmcr); - /* Release PHYA power down state */ - regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, - SGMII_PHYA_PWD, 0); + /* Release PHYA power down state + * Only removing bit SGMII_PHYA_PWD isn't enough. + * There are cases when the SGMII_PHYA_PWD register contains 0x9 which + * prevents SGMII from working. The SGMII still shows link but no traffic + * can flow. Writing 0x0 to the PHYA_PWD register fix the issue. 0x0 was + * taken from a good working state of the SGMII interface. + * Unknown how much the QPHY needs but it is racy without a sleep. + * Tested on mt7622 & mt7986. + */ + usleep_range(50, 100); + regmap_write(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, 0); return changed; } @@ -138,11 +154,11 @@ static void mtk_pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, else sgm_mode = SGMII_SPEED_1000; - if (duplex == DUPLEX_FULL) - sgm_mode |= SGMII_DUPLEX_FULL; + if (duplex != DUPLEX_FULL) + sgm_mode |= SGMII_DUPLEX_HALF; regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE, - SGMII_DUPLEX_FULL | SGMII_SPEED_MASK, + SGMII_DUPLEX_HALF | SGMII_SPEED_MASK, sgm_mode); } } @@ -171,6 +187,8 @@ int mtk_sgmii_init(struct mtk_sgmii *ss, struct device_node *r, u32 ana_rgc3) return PTR_ERR(ss->pcs[i].regmap); ss->pcs[i].pcs.ops = &mtk_pcs_ops; + ss->pcs[i].pcs.poll = true; + ss->pcs[i].interface = PHY_INTERFACE_MODE_NA; } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 3e232a65a0c3..bb95b40d25eb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -245,8 +245,9 @@ void 
mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) pages = dev->priv.dbg.pages_debugfs; debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); - debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.vfs_pages); - debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.host_pf_pages); + debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.page_counters[MLX5_VF]); + debugfs_create_u32("fw_pages_sfs", 0400, pages, &dev->priv.page_counters[MLX5_SF]); + debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.page_counters[MLX5_HOST_PF]); debugfs_create_u32("fw_pages_alloc_failed", 0400, pages, &dev->priv.fw_pages_alloc_failed); debugfs_create_u32("fw_pages_give_dropped", 0400, pages, &dev->priv.give_pages_dropped); debugfs_create_u32("fw_pages_reclaim_discard", 0400, pages, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index 21831386b26e..5b05b884b5fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -64,6 +64,7 @@ static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) MLX5_GET(mtrc_cap, out, num_string_trace); tracer->str_db.num_string_db = MLX5_GET(mtrc_cap, out, num_string_db); tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner); + tracer->str_db.loaded = false; for (i = 0; i < tracer->str_db.num_string_db; i++) { mtrc_cap_sp = MLX5_ADDR_OF(mtrc_cap, out, string_db_param[i]); @@ -756,6 +757,7 @@ static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer) if (err) mlx5_core_warn(dev, "FWTracer: Failed to set tracer configurations %d\n", err); + tracer->buff.consumer_index = 0; return err; } @@ -820,7 +822,6 @@ static void mlx5_fw_tracer_ownership_change(struct work_struct *work) mlx5_core_dbg(tracer->dev, "FWTracer: ownership changed, current=(%d)\n", tracer->owner); if (tracer->owner) { tracer->owner = false; - tracer->buff.consumer_index 
= 0; return; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 464eb3a18450..cdc87ecae5d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -87,7 +87,7 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) mlx5_host_pf_cleanup(dev); - err = mlx5_wait_for_pages(dev, &dev->priv.host_pf_pages); + err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); if (err) mlx5_core_warn(dev, "Timeout reclaiming external host PF pages err(%d)\n", err); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index 8099a21e674c..ce85b48d327d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -438,10 +438,6 @@ static int mlx5_esw_bridge_switchdev_event(struct notifier_block *nb, switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: - /* only handle the event on native eswtich of representor */ - if (!mlx5_esw_bridge_is_local(dev, rep, esw)) - break; - fdb_info = container_of(info, struct switchdev_notifier_fdb_info, info); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 1892ccb889b3..7cd36f4ac3ef 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -443,7 +443,7 @@ void mlx5e_enable_cvlan_filter(struct mlx5e_flow_steering *fs, bool promisc) void mlx5e_disable_cvlan_filter(struct mlx5e_flow_steering *fs, bool promisc) { - if (fs->vlan->cvlan_filter_disabled) + if (!fs->vlan || fs->vlan->cvlan_filter_disabled) return; fs->vlan->cvlan_filter_disabled = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index abcc614b6191..6c24f33a5ea5 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -591,7 +591,8 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param rq->ix = c->ix; rq->channel = c; rq->mdev = mdev; - rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->hw_mtu = + MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN * !params->scatter_fcs_en; rq->xdpsq = &c->rq_xdpsq; rq->stats = &c->priv->channel_stats[c->ix]->rq; rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); @@ -1014,35 +1015,6 @@ int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state) return mlx5e_rq_to_ready(rq, curr_state); } -static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) -{ - struct mlx5_core_dev *mdev = rq->mdev; - - void *in; - void *rqc; - int inlen; - int err; - - inlen = MLX5_ST_SZ_BYTES(modify_rq_in); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - - rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); - - MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); - MLX5_SET64(modify_rq_in, in, modify_bitmask, - MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS); - MLX5_SET(rqc, rqc, scatter_fcs, enable); - MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); - - err = mlx5_core_modify_rq(mdev, rq->rqn, in); - - kvfree(in); - - return err; -} - static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) { struct mlx5_core_dev *mdev = rq->mdev; @@ -3314,20 +3286,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) mlx5e_destroy_tises(priv); } -static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable) -{ - int err = 0; - int i; - - for (i = 0; i < chs->num; i++) { - err = mlx5e_modify_rq_scatter_fcs(&chs->c[i]->rq, enable); - if (err) - return err; - } - - return 0; -} - static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) { int err; @@ -3903,41 +3861,27 @@ static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable) return 
mlx5_set_ports_check(mdev, in, sizeof(in)); } +static int mlx5e_set_rx_port_ts_wrap(struct mlx5e_priv *priv, void *ctx) +{ + struct mlx5_core_dev *mdev = priv->mdev; + bool enable = *(bool *)ctx; + + return mlx5e_set_rx_port_ts(mdev, enable); +} + static int set_feature_rx_fcs(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5e_channels *chs = &priv->channels; - struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; int err; mutex_lock(&priv->state_lock); - if (enable) { - err = mlx5e_set_rx_port_ts(mdev, false); - if (err) - goto out; - - chs->params.scatter_fcs_en = true; - err = mlx5e_modify_channels_scatter_fcs(chs, true); - if (err) { - chs->params.scatter_fcs_en = false; - mlx5e_set_rx_port_ts(mdev, true); - } - } else { - chs->params.scatter_fcs_en = false; - err = mlx5e_modify_channels_scatter_fcs(chs, false); - if (err) { - chs->params.scatter_fcs_en = true; - goto out; - } - err = mlx5e_set_rx_port_ts(mdev, true); - if (err) { - mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err); - err = 0; - } - } - -out: + new_params = chs->params; + new_params.scatter_fcs_en = enable; + err = mlx5e_safe_switch_params(priv, &new_params, mlx5e_set_rx_port_ts_wrap, + &new_params.scatter_fcs_en, true); mutex_unlock(&priv->state_lock); return err; } @@ -4074,6 +4018,10 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev if (netdev->features & NETIF_F_GRO_HW) netdev_warn(netdev, "Disabling HW_GRO, not supported in switchdev mode\n"); + features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; + if (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER) + netdev_warn(netdev, "Disabling HW_VLAN CTAG FILTERING, not supported in switchdev mode\n"); + return features; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index b176648d1343..3cdcb0e0b20f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c 
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -1715,7 +1715,7 @@ void mlx5_esw_bridge_fdb_update_used(struct net_device *dev, u16 vport_num, u16 struct mlx5_esw_bridge *bridge; port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); - if (!port || port->flags & MLX5_ESW_BRIDGE_PORT_FLAG_PEER) + if (!port) return; bridge = port->bridge; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index eff92dc0927c..e09518f887a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -189,16 +189,16 @@ static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate) } } -static int mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper) +static u32 mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper) { int rate, width; rate = mlx5_ptys_rate_enum_to_int(ib_proto_oper); if (rate < 0) - return -EINVAL; + return SPEED_UNKNOWN; width = mlx5_ptys_width_enum_to_int(ib_link_width_oper); if (width < 0) - return -EINVAL; + return SPEED_UNKNOWN; return rate * width; } @@ -221,16 +221,13 @@ static int mlx5i_get_link_ksettings(struct net_device *netdev, ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); speed = mlx5i_get_speed_settings(ib_link_width_oper, ib_proto_oper); - if (speed < 0) - return -EINVAL; + link_ksettings->base.speed = speed; + link_ksettings->base.duplex = speed == SPEED_UNKNOWN ? 
DUPLEX_UNKNOWN : DUPLEX_FULL; - link_ksettings->base.duplex = DUPLEX_FULL; link_ksettings->base.port = PORT_OTHER; link_ksettings->base.autoneg = AUTONEG_DISABLE; - link_ksettings->base.speed = speed; - return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 3d5f2a4b1fed..4e1b5757528a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -2110,7 +2110,7 @@ static int __init mlx5_init(void) mlx5_core_verify_params(); mlx5_register_debugfs(); - err = pci_register_driver(&mlx5_core_driver); + err = mlx5e_init(); if (err) goto err_debug; @@ -2118,16 +2118,16 @@ static int __init mlx5_init(void) if (err) goto err_sf; - err = mlx5e_init(); + err = pci_register_driver(&mlx5_core_driver); if (err) - goto err_en; + goto err_pci; return 0; -err_en: +err_pci: mlx5_sf_driver_unregister(); err_sf: - pci_unregister_driver(&mlx5_core_driver); + mlx5e_cleanup(); err_debug: mlx5_unregister_debugfs(); return err; @@ -2135,9 +2135,9 @@ err_debug: static void __exit mlx5_cleanup(void) { - mlx5e_cleanup(); - mlx5_sf_driver_unregister(); pci_unregister_driver(&mlx5_core_driver); + mlx5_sf_driver_unregister(); + mlx5e_cleanup(); mlx5_unregister_debugfs(); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 60596357bfc7..0eb50be175cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -74,6 +74,14 @@ static u32 get_function(u16 func_id, bool ec_function) return (u32)func_id | (ec_function << 16); } +static u16 func_id_to_type(struct mlx5_core_dev *dev, u16 func_id, bool ec_function) +{ + if (!func_id) + return mlx5_core_is_ecpf(dev) && !ec_function ? MLX5_HOST_PF : MLX5_PF; + + return func_id <= mlx5_core_max_vfs(dev) ? 
MLX5_VF : MLX5_SF; +} + static struct rb_root *page_root_per_function(struct mlx5_core_dev *dev, u32 function) { struct rb_root *root; @@ -332,6 +340,7 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; int inlen = MLX5_ST_SZ_BYTES(manage_pages_in); int notify_fail = event; + u16 func_type; u64 addr; int err; u32 *in; @@ -383,11 +392,9 @@ retry: goto out_dropped; } + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] += npages; dev->priv.fw_pages += npages; - if (func_id) - dev->priv.vfs_pages += npages; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages += npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x, err %d\n", npages, ec_function, func_id, err); @@ -414,6 +421,7 @@ static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, struct rb_root *root; struct rb_node *p; int npages = 0; + u16 func_type; root = xa_load(&dev->priv.page_root_xa, function); if (WARN_ON_ONCE(!root)) @@ -428,11 +436,9 @@ static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, free_fwp(dev, fwp, fwp->free_count); } + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] -= npages; dev->priv.fw_pages -= npages; - if (func_id) - dev->priv.vfs_pages -= npages; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages -= npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x\n", npages, ec_function, func_id); @@ -498,6 +504,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, int outlen = MLX5_ST_SZ_BYTES(manage_pages_out); u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; int num_claimed; + u16 func_type; u32 *out; int err; int i; @@ -549,11 +556,9 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, if (nclaimed) *nclaimed = num_claimed; + func_type = func_id_to_type(dev, func_id, 
ec_function); + dev->priv.page_counters[func_type] -= num_claimed; dev->priv.fw_pages -= num_claimed; - if (func_id) - dev->priv.vfs_pages -= num_claimed; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages -= num_claimed; out_free: kvfree(out); @@ -706,12 +711,12 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) WARN(dev->priv.fw_pages, "FW pages counter is %d after reclaiming all pages\n", dev->priv.fw_pages); - WARN(dev->priv.vfs_pages, + WARN(dev->priv.page_counters[MLX5_VF], "VFs FW pages counter is %d after reclaiming all pages\n", - dev->priv.vfs_pages); - WARN(dev->priv.host_pf_pages, + dev->priv.page_counters[MLX5_VF]); + WARN(dev->priv.page_counters[MLX5_HOST_PF], "External host PF FW pages counter is %d after reclaiming all pages\n", - dev->priv.host_pf_pages); + dev->priv.page_counters[MLX5_HOST_PF]); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index c0e6c487c63c..3008e9ce2bbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -147,7 +147,7 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) mlx5_eswitch_disable_sriov(dev->priv.eswitch, clear_vf); - if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages)) + if (mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_VF])) mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index b851141e03de..042ca0349124 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -1138,12 +1138,14 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, rule->flow_source)) return 0; + mlx5dr_domain_nic_lock(nic_dmn); + ret = mlx5dr_matcher_select_builders(matcher, nic_matcher, dr_rule_get_ipv(¶m->outer), 
dr_rule_get_ipv(¶m->inner)); if (ret) - return ret; + goto err_unlock; hw_ste_arr_is_opt = nic_matcher->num_of_builders <= DR_RULE_MAX_STES_OPTIMIZED; if (likely(hw_ste_arr_is_opt)) { @@ -1152,12 +1154,12 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, hw_ste_arr = kzalloc((nic_matcher->num_of_builders + DR_ACTION_MAX_STES) * DR_STE_SIZE, GFP_KERNEL); - if (!hw_ste_arr) - return -ENOMEM; + if (!hw_ste_arr) { + ret = -ENOMEM; + goto err_unlock; + } } - mlx5dr_domain_nic_lock(nic_dmn); - ret = mlx5dr_matcher_add_to_tbl_nic(dmn, nic_matcher); if (ret) goto free_hw_ste; @@ -1223,7 +1225,10 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, mlx5dr_domain_nic_unlock(nic_dmn); - goto out; + if (unlikely(!hw_ste_arr_is_opt)) + kfree(hw_ste_arr); + + return 0; free_rule: dr_rule_clean_rule_members(rule, nic_rule); @@ -1238,12 +1243,12 @@ remove_from_nic_tbl: mlx5dr_matcher_remove_from_tbl_nic(dmn, nic_matcher); free_hw_ste: - mlx5dr_domain_nic_unlock(nic_dmn); - -out: - if (unlikely(!hw_ste_arr_is_opt)) + if (!hw_ste_arr_is_opt) kfree(hw_ste_arr); +err_unlock: + mlx5dr_domain_nic_unlock(nic_dmn); + return ret; } diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c index 5314c064ceae..55b484b10562 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c @@ -608,12 +608,12 @@ allocate_new: lan966x_fdma_rx_reload(rx); } - if (counter < weight && napi_complete_done(napi, counter)) - lan_wr(0xff, lan966x, FDMA_INTR_DB_ENA); - if (redirect) xdp_do_flush(); + if (counter < weight && napi_complete_done(napi, counter)) + lan_wr(0xff, lan966x, FDMA_INTR_DB_ENA); + return counter; } diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c b/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c index 0ed1ea7727c5..69e76634f9aa 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c +++ 
b/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c @@ -633,7 +633,7 @@ int sparx5_ptp_init(struct sparx5 *sparx5) /* Enable master counters */ spx5_wr(PTP_PTP_DOM_CFG_PTP_ENA_SET(0x7), sparx5, PTP_PTP_DOM_CFG); - for (i = 0; i < sparx5->port_count; i++) { + for (i = 0; i < SPX5_PORTS; i++) { port = sparx5->ports[i]; if (!port) continue; @@ -649,7 +649,7 @@ void sparx5_ptp_deinit(struct sparx5 *sparx5) struct sparx5_port *port; int i; - for (i = 0; i < sparx5->port_count; i++) { + for (i = 0; i < SPX5_PORTS; i++) { port = sparx5->ports[i]; if (!port) continue; diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index e708c2d04983..f9b8f372ec8a 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1217,9 +1217,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) unsigned int max_queues_per_port = num_online_cpus(); struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_irq_context *gic; - unsigned int max_irqs; - u16 *cpus; - cpumask_var_t req_mask; + unsigned int max_irqs, cpu; int nvec, irq; int err, i = 0, j; @@ -1240,39 +1238,31 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) goto free_irq_vector; } - if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) { - err = -ENOMEM; - goto free_irq; - } - - cpus = kcalloc(nvec, sizeof(*cpus), GFP_KERNEL); - if (!cpus) { - err = -ENOMEM; - goto free_mask; - } - for (i = 0; i < nvec; i++) - cpus[i] = cpumask_local_spread(i, gc->numa_node); - for (i = 0; i < nvec; i++) { - cpumask_set_cpu(cpus[i], req_mask); gic = &gc->irq_contexts[i]; gic->handler = NULL; gic->arg = NULL; + if (!i) + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s", + pci_name(pdev)); + else + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", + i - 1, pci_name(pdev)); + irq = pci_irq_vector(pdev, i); if (irq < 0) { err = irq; - goto free_mask; + goto free_irq; } - err = request_irq(irq, mana_gd_intr, 0, 
"mana_intr", gic); + err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); if (err) - goto free_mask; - irq_set_affinity_and_hint(irq, req_mask); - cpumask_clear(req_mask); + goto free_irq; + + cpu = cpumask_local_spread(i, gc->numa_node); + irq_set_affinity_and_hint(irq, cpumask_of(cpu)); } - free_cpumask_var(req_mask); - kfree(cpus); err = mana_gd_alloc_res_map(nvec, &gc->msix_resource); if (err) @@ -1283,13 +1273,12 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) return 0; -free_mask: - free_cpumask_var(req_mask); - kfree(cpus); free_irq: for (j = i - 1; j >= 0; j--) { irq = pci_irq_vector(pdev, j); gic = &gc->irq_contexts[j]; + + irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); } @@ -1317,6 +1306,9 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) continue; gic = &gc->irq_contexts[i]; + + /* Need to clear the hint before free_irq */ + irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); } diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 7c0897e779dc..ee052404eb55 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -605,6 +605,18 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, flow_rule_match_control(rule, &match); } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_match_vlan match; + + flow_rule_match_vlan(rule, &match); + filter->key_type = OCELOT_VCAP_KEY_ANY; + filter->vlan.vid.value = match.key->vlan_id; + filter->vlan.vid.mask = match.mask->vlan_id; + filter->vlan.pcp.value[0] = match.key->vlan_priority; + filter->vlan.pcp.mask[0] = match.mask->vlan_priority; + match_protocol = false; + } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct flow_match_eth_addrs match; @@ -737,18 +749,6 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, match_protocol = false; } - if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN)) { - struct 
flow_match_vlan match; - - flow_rule_match_vlan(rule, &match); - filter->key_type = OCELOT_VCAP_KEY_ANY; - filter->vlan.vid.value = match.key->vlan_id; - filter->vlan.vid.mask = match.mask->vlan_id; - filter->vlan.pcp.value[0] = match.key->vlan_priority; - filter->vlan.pcp.mask[0] = match.mask->vlan_priority; - match_protocol = false; - } - finished_key_parsing: if (match_protocol && proto != ETH_P_ALL) { if (filter->block_id == VCAP_ES0) { diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index 1a82f10c8853..2180ae94c744 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -335,8 +335,8 @@ static void ocelot_populate_ipv6_ptp_event_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV6; - trap->key.ipv4.proto.value[0] = IPPROTO_UDP; - trap->key.ipv4.proto.mask[0] = 0xff; + trap->key.ipv6.proto.value[0] = IPPROTO_UDP; + trap->key.ipv6.proto.mask[0] = 0xff; trap->key.ipv6.dport.value = PTP_EV_PORT; trap->key.ipv6.dport.mask = 0xffff; } @@ -355,8 +355,8 @@ static void ocelot_populate_ipv6_ptp_general_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV6; - trap->key.ipv4.proto.value[0] = IPPROTO_UDP; - trap->key.ipv4.proto.mask[0] = 0xff; + trap->key.ipv6.proto.value[0] = IPPROTO_UDP; + trap->key.ipv6.proto.mask[0] = 0xff; trap->key.ipv6.dport.value = PTP_GEN_PORT; trap->key.ipv6.dport.mask = 0xffff; } diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index 4632268695cb..063cd371033a 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -129,26 +129,31 @@ struct nfp_ipsec_cfg_mssg { }; }; -static int nfp_ipsec_cfg_cmd_issue(struct nfp_net *nn, int type, int saidx, - struct nfp_ipsec_cfg_mssg *msg) +static int nfp_net_ipsec_cfg(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry) { + unsigned 
int offset = nn->tlv_caps.mbox_off + NFP_NET_CFG_MBOX_SIMPLE_VAL; + struct nfp_ipsec_cfg_mssg *msg = (struct nfp_ipsec_cfg_mssg *)entry->msg; int i, msg_size, ret; - msg->cmd = type; - msg->sa_idx = saidx; - msg->rsp = 0; - msg_size = ARRAY_SIZE(msg->raw); + ret = nfp_net_mbox_lock(nn, sizeof(*msg)); + if (ret) + return ret; + msg_size = ARRAY_SIZE(msg->raw); for (i = 0; i < msg_size; i++) - nn_writel(nn, NFP_NET_CFG_MBOX_VAL + 4 * i, msg->raw[i]); + nn_writel(nn, offset + 4 * i, msg->raw[i]); - ret = nfp_net_mbox_reconfig(nn, NFP_NET_CFG_MBOX_CMD_IPSEC); - if (ret < 0) + ret = nfp_net_mbox_reconfig(nn, entry->cmd); + if (ret < 0) { + nn_ctrl_bar_unlock(nn); return ret; + } /* For now we always read the whole message response back */ for (i = 0; i < msg_size; i++) - msg->raw[i] = nn_readl(nn, NFP_NET_CFG_MBOX_VAL + 4 * i); + msg->raw[i] = nn_readl(nn, offset + 4 * i); + + nn_ctrl_bar_unlock(nn); switch (msg->rsp) { case NFP_IPSEC_CFG_MSSG_OK: @@ -477,7 +482,10 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) } /* Allocate saidx and commit the SA */ - err = nfp_ipsec_cfg_cmd_issue(nn, NFP_IPSEC_CFG_MSSG_ADD_SA, saidx, &msg); + msg.cmd = NFP_IPSEC_CFG_MSSG_ADD_SA; + msg.sa_idx = saidx; + err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, + sizeof(msg), nfp_net_ipsec_cfg); if (err) { xa_erase(&nn->xa_ipsec, saidx); nn_err(nn, "Failed to issue IPsec command err ret=%d\n", err); @@ -491,14 +499,17 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) static void nfp_net_xfrm_del_state(struct xfrm_state *x) { + struct nfp_ipsec_cfg_mssg msg = { + .cmd = NFP_IPSEC_CFG_MSSG_INV_SA, + .sa_idx = x->xso.offload_handle - 1, + }; struct net_device *netdev = x->xso.dev; - struct nfp_ipsec_cfg_mssg msg; struct nfp_net *nn; int err; nn = netdev_priv(netdev); - err = nfp_ipsec_cfg_cmd_issue(nn, NFP_IPSEC_CFG_MSSG_INV_SA, - x->xso.offload_handle - 1, &msg); + err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, + sizeof(msg), 
nfp_net_ipsec_cfg); if (err) nn_warn(nn, "Failed to invalidate SA in hardware\n"); diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index a8678d5612ee..060a77f2265d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -460,6 +460,7 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, sizeof(struct nfp_tun_neigh_v4); unsigned long cookie = (unsigned long)neigh; struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_neigh_lag lag_info; struct nfp_neigh_entry *nn_entry; u32 port_id; u8 mtype; @@ -468,6 +469,11 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, if (!port_id) return; + if ((port_id & NFP_FL_LAG_OUT) == NFP_FL_LAG_OUT) { + memset(&lag_info, 0, sizeof(struct nfp_tun_neigh_lag)); + nfp_flower_lag_get_info_from_netdev(app, netdev, &lag_info); + } + spin_lock_bh(&priv->predt_lock); nn_entry = rhashtable_lookup_fast(&priv->neigh_table, &cookie, neigh_table_params); @@ -515,7 +521,7 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, neigh_ha_snapshot(common->dst_addr, neigh, netdev); if ((port_id & NFP_FL_LAG_OUT) == NFP_FL_LAG_OUT) - nfp_flower_lag_get_info_from_netdev(app, netdev, lag); + memcpy(lag, &lag_info, sizeof(struct nfp_tun_neigh_lag)); common->port_id = cpu_to_be32(port_id); if (rhashtable_insert_fast(&priv->neigh_table, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 432d79d691c2..939cfce15830 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -617,9 +617,10 @@ struct nfp_net_dp { * @vnic_no_name: For non-port PF vNIC make ndo_get_phys_port_name return * -EOPNOTSUPP to keep backwards compatibility (set by app) * @port: Pointer to nfp_port structure if vNIC is a port - * @mc_lock: Protect mc_addrs list - * 
@mc_addrs: List of mc addrs to add/del to HW - * @mc_work: Work to update mc addrs + * @mbox_amsg: Asynchronously processed message via mailbox + * @mbox_amsg.lock: Protect message list + * @mbox_amsg.list: List of message to process + * @mbox_amsg.work: Work to process message asynchronously * @app_priv: APP private data for this vNIC */ struct nfp_net { @@ -721,13 +722,25 @@ struct nfp_net { struct nfp_port *port; - spinlock_t mc_lock; - struct list_head mc_addrs; - struct work_struct mc_work; + struct { + spinlock_t lock; + struct list_head list; + struct work_struct work; + } mbox_amsg; void *app_priv; }; +struct nfp_mbox_amsg_entry { + struct list_head list; + int (*cfg)(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry); + u32 cmd; + char msg[]; +}; + +int nfp_net_sched_mbox_amsg_work(struct nfp_net *nn, u32 cmd, const void *data, size_t len, + int (*cb)(struct nfp_net *, struct nfp_mbox_amsg_entry *)); + /* Functions to read/write from/to a BAR * Performs any endian conversion necessary. 
*/ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 18fc9971f1c8..70d7484c82af 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1334,14 +1334,54 @@ err_unlock: return err; } -struct nfp_mc_addr_entry { - u8 addr[ETH_ALEN]; - u32 cmd; - struct list_head list; -}; +int nfp_net_sched_mbox_amsg_work(struct nfp_net *nn, u32 cmd, const void *data, size_t len, + int (*cb)(struct nfp_net *, struct nfp_mbox_amsg_entry *)) +{ + struct nfp_mbox_amsg_entry *entry; + + entry = kmalloc(sizeof(*entry) + len, GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + memcpy(entry->msg, data, len); + entry->cmd = cmd; + entry->cfg = cb; + + spin_lock_bh(&nn->mbox_amsg.lock); + list_add_tail(&entry->list, &nn->mbox_amsg.list); + spin_unlock_bh(&nn->mbox_amsg.lock); + + schedule_work(&nn->mbox_amsg.work); + + return 0; +} + +static void nfp_net_mbox_amsg_work(struct work_struct *work) +{ + struct nfp_net *nn = container_of(work, struct nfp_net, mbox_amsg.work); + struct nfp_mbox_amsg_entry *entry, *tmp; + struct list_head tmp_list; + + INIT_LIST_HEAD(&tmp_list); + + spin_lock_bh(&nn->mbox_amsg.lock); + list_splice_init(&nn->mbox_amsg.list, &tmp_list); + spin_unlock_bh(&nn->mbox_amsg.lock); + + list_for_each_entry_safe(entry, tmp, &tmp_list, list) { + int err = entry->cfg(nn, entry); + + if (err) + nn_err(nn, "Config cmd %d to HW failed %d.\n", entry->cmd, err); + + list_del(&entry->list); + kfree(entry); + } +} -static int nfp_net_mc_cfg(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) +static int nfp_net_mc_cfg(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry) { + unsigned char *addr = entry->msg; int ret; ret = nfp_net_mbox_lock(nn, NFP_NET_CFG_MULTICAST_SZ); @@ -1353,26 +1393,7 @@ static int nfp_net_mc_cfg(struct nfp_net *nn, const unsigned char *addr, const u nn_writew(nn, nn->tlv_caps.mbox_off + 
NFP_NET_CFG_MULTICAST_MAC_LO, get_unaligned_be16(addr + 4)); - return nfp_net_mbox_reconfig_and_unlock(nn, cmd); -} - -static int nfp_net_mc_prep(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) -{ - struct nfp_mc_addr_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) - return -ENOMEM; - - ether_addr_copy(entry->addr, addr); - entry->cmd = cmd; - spin_lock_bh(&nn->mc_lock); - list_add_tail(&entry->list, &nn->mc_addrs); - spin_unlock_bh(&nn->mc_lock); - - schedule_work(&nn->mc_work); - - return 0; + return nfp_net_mbox_reconfig_and_unlock(nn, entry->cmd); } static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) @@ -1385,35 +1406,16 @@ static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) return -EINVAL; } - return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD); + return nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD, addr, + NFP_NET_CFG_MULTICAST_SZ, nfp_net_mc_cfg); } static int nfp_net_mc_unsync(struct net_device *netdev, const unsigned char *addr) { struct nfp_net *nn = netdev_priv(netdev); - return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL); -} - -static void nfp_net_mc_addr_config(struct work_struct *work) -{ - struct nfp_net *nn = container_of(work, struct nfp_net, mc_work); - struct nfp_mc_addr_entry *entry, *tmp; - struct list_head tmp_list; - - INIT_LIST_HEAD(&tmp_list); - - spin_lock_bh(&nn->mc_lock); - list_splice_init(&nn->mc_addrs, &tmp_list); - spin_unlock_bh(&nn->mc_lock); - - list_for_each_entry_safe(entry, tmp, &tmp_list, list) { - if (nfp_net_mc_cfg(nn, entry->addr, entry->cmd)) - nn_err(nn, "Config mc address to HW failed.\n"); - - list_del(&entry->list); - kfree(entry); - } + return nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL, addr, + NFP_NET_CFG_MULTICAST_SZ, nfp_net_mc_cfg); } static void nfp_net_set_rx_mode(struct net_device *netdev) @@ -2681,9 +2683,9 @@ int 
nfp_net_init(struct nfp_net *nn) if (!nn->dp.netdev) return 0; - spin_lock_init(&nn->mc_lock); - INIT_LIST_HEAD(&nn->mc_addrs); - INIT_WORK(&nn->mc_work, nfp_net_mc_addr_config); + spin_lock_init(&nn->mbox_amsg.lock); + INIT_LIST_HEAD(&nn->mbox_amsg.list); + INIT_WORK(&nn->mbox_amsg.work, nfp_net_mbox_amsg_work); return register_netdev(nn->dp.netdev); @@ -2704,6 +2706,6 @@ void nfp_net_clean(struct nfp_net *nn) unregister_netdev(nn->dp.netdev); nfp_net_ipsec_clean(nn); nfp_ccm_mbox_clean(nn); - flush_work(&nn->mc_work); + flush_work(&nn->mbox_amsg.work); nfp_net_reconfig_wait_posted(nn); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index 51124309ae1f..f03dcadff738 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -403,7 +403,6 @@ */ #define NFP_NET_CFG_MBOX_BASE 0x1800 #define NFP_NET_CFG_MBOX_VAL_MAX_SZ 0x1F8 -#define NFP_NET_CFG_MBOX_VAL 0x1808 #define NFP_NET_CFG_MBOX_SIMPLE_CMD 0x0 #define NFP_NET_CFG_MBOX_SIMPLE_RET 0x4 #define NFP_NET_CFG_MBOX_SIMPLE_VAL 0x8 diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index a4a89ef3f18b..cc97b3d00414 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -293,35 +293,131 @@ nfp_net_set_fec_link_mode(struct nfp_eth_table_port *eth_port, } } -static const u16 nfp_eth_media_table[] = { - [NFP_MEDIA_1000BASE_CX] = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, - [NFP_MEDIA_1000BASE_KX] = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, - [NFP_MEDIA_10GBASE_KX4] = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, - [NFP_MEDIA_10GBASE_KR] = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, - [NFP_MEDIA_10GBASE_CX4] = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, - [NFP_MEDIA_10GBASE_CR] = ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, - [NFP_MEDIA_10GBASE_SR] = 
ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, - [NFP_MEDIA_10GBASE_ER] = ETHTOOL_LINK_MODE_10000baseER_Full_BIT, - [NFP_MEDIA_25GBASE_KR] = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, - [NFP_MEDIA_25GBASE_KR_S] = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, - [NFP_MEDIA_25GBASE_CR] = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, - [NFP_MEDIA_25GBASE_CR_S] = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, - [NFP_MEDIA_25GBASE_SR] = ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, - [NFP_MEDIA_40GBASE_CR4] = ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, - [NFP_MEDIA_40GBASE_KR4] = ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, - [NFP_MEDIA_40GBASE_SR4] = ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, - [NFP_MEDIA_40GBASE_LR4] = ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT, - [NFP_MEDIA_50GBASE_KR] = ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, - [NFP_MEDIA_50GBASE_SR] = ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, - [NFP_MEDIA_50GBASE_CR] = ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, - [NFP_MEDIA_50GBASE_LR] = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, - [NFP_MEDIA_50GBASE_ER] = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, - [NFP_MEDIA_50GBASE_FR] = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, - [NFP_MEDIA_100GBASE_KR4] = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, - [NFP_MEDIA_100GBASE_SR4] = ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, - [NFP_MEDIA_100GBASE_CR4] = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, - [NFP_MEDIA_100GBASE_KP4] = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, - [NFP_MEDIA_100GBASE_CR10] = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, +static const struct nfp_eth_media_link_mode { + u16 ethtool_link_mode; + u16 speed; +} nfp_eth_media_table[NFP_MEDIA_LINK_MODES_NUMBER] = { + [NFP_MEDIA_1000BASE_CX] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + .speed = NFP_SPEED_1G, + }, + [NFP_MEDIA_1000BASE_KX] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + .speed = NFP_SPEED_1G, + }, + [NFP_MEDIA_10GBASE_KX4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + .speed = 
NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_KR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_CX4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_CR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_SR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_ER] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseER_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_25GBASE_KR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_KR_S] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_CR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_CR_S] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_SR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_40GBASE_CR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_40GBASE_KR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_40GBASE_SR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_40GBASE_LR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_50GBASE_KR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_SR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, + .speed = NFP_SPEED_50G, + 
}, + [NFP_MEDIA_50GBASE_CR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_LR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_ER] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_FR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_100GBASE_KR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_SR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_CR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_KP4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_CR10] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, +}; + +static const unsigned int nfp_eth_speed_map[NFP_SUP_SPEED_NUMBER] = { + [NFP_SPEED_1G] = SPEED_1000, + [NFP_SPEED_10G] = SPEED_10000, + [NFP_SPEED_25G] = SPEED_25000, + [NFP_SPEED_40G] = SPEED_40000, + [NFP_SPEED_50G] = SPEED_50000, + [NFP_SPEED_100G] = SPEED_100000, }; static void nfp_add_media_link_mode(struct nfp_port *port, @@ -334,8 +430,12 @@ static void nfp_add_media_link_mode(struct nfp_port *port, }; struct nfp_cpp *cpp = port->app->cpp; - if (nfp_eth_read_media(cpp, ðm)) + if (nfp_eth_read_media(cpp, ðm)) { + bitmap_fill(port->speed_bitmap, NFP_SUP_SPEED_NUMBER); return; + } + + bitmap_zero(port->speed_bitmap, NFP_SUP_SPEED_NUMBER); for (u32 i = 0; i < 2; i++) { supported_modes[i] = le64_to_cpu(ethm.supported_modes[i]); @@ -344,20 +444,26 @@ static void nfp_add_media_link_mode(struct nfp_port *port, for (u32 i = 0; i 
< NFP_MEDIA_LINK_MODES_NUMBER; i++) { if (i < 64) { - if (supported_modes[0] & BIT_ULL(i)) - __set_bit(nfp_eth_media_table[i], + if (supported_modes[0] & BIT_ULL(i)) { + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.supported); + __set_bit(nfp_eth_media_table[i].speed, + port->speed_bitmap); + } if (advertised_modes[0] & BIT_ULL(i)) - __set_bit(nfp_eth_media_table[i], + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.advertising); } else { - if (supported_modes[1] & BIT_ULL(i - 64)) - __set_bit(nfp_eth_media_table[i], + if (supported_modes[1] & BIT_ULL(i - 64)) { + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.supported); + __set_bit(nfp_eth_media_table[i].speed, + port->speed_bitmap); + } if (advertised_modes[1] & BIT_ULL(i - 64)) - __set_bit(nfp_eth_media_table[i], + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.advertising); } } @@ -468,6 +574,22 @@ nfp_net_set_link_ksettings(struct net_device *netdev, if (cmd->base.speed != SPEED_UNKNOWN) { u32 speed = cmd->base.speed / eth_port->lanes; + bool is_supported = false; + + for (u32 i = 0; i < NFP_SUP_SPEED_NUMBER; i++) { + if (cmd->base.speed == nfp_eth_speed_map[i] && + test_bit(i, port->speed_bitmap)) { + is_supported = true; + break; + } + } + + if (!is_supported) { + netdev_err(netdev, "Speed %u is not supported.\n", + cmd->base.speed); + err = -EINVAL; + goto err_bad_set; + } if (req_aneg) { netdev_err(netdev, "Speed changing is not allowed when working on autoneg mode.\n"); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index f8cd157ca1d7..9c04f9f0e2c9 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -38,6 +38,16 @@ enum nfp_port_flags { NFP_PORT_CHANGED = 0, }; +enum { + NFP_SPEED_1G, + NFP_SPEED_10G, + NFP_SPEED_25G, + NFP_SPEED_40G, + NFP_SPEED_50G, + NFP_SPEED_100G, + NFP_SUP_SPEED_NUMBER +}; + /** 
* struct nfp_port - structure representing NFP port * @netdev: backpointer to associated netdev @@ -52,6 +62,7 @@ enum nfp_port_flags { * @eth_forced: for %NFP_PORT_PHYS_PORT port is forced UP or DOWN, don't change * @eth_port: for %NFP_PORT_PHYS_PORT translated ETH Table port entry * @eth_stats: for %NFP_PORT_PHYS_PORT MAC stats if available + * @speed_bitmap: for %NFP_PORT_PHYS_PORT supported speed bitmap * @pf_id: for %NFP_PORT_PF_PORT, %NFP_PORT_VF_PORT ID of the PCI PF (0-3) * @vf_id: for %NFP_PORT_VF_PORT ID of the PCI VF within @pf_id * @pf_split: for %NFP_PORT_PF_PORT %true if PCI PF has more than one vNIC @@ -78,6 +89,7 @@ struct nfp_port { bool eth_forced; struct nfp_eth_table_port *eth_port; u8 __iomem *eth_stats; + DECLARE_BITMAP(speed_bitmap, NFP_SUP_SPEED_NUMBER); }; /* NFP_PORT_PF_PORT, NFP_PORT_VF_PORT */ struct { diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 626b9113e7c4..d911f4fd9af6 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -708,9 +708,16 @@ void ionic_q_post(struct ionic_queue *q, bool ring_doorbell, ionic_desc_cb cb, q->lif->index, q->name, q->hw_type, q->hw_index, q->head_idx, ring_doorbell); - if (ring_doorbell) + if (ring_doorbell) { ionic_dbell_ring(lif->kern_dbpage, q->hw_type, q->dbval | q->head_idx); + + q->dbell_jiffies = jiffies; + + if (q_to_qcq(q)->napi_qcq) + mod_timer(&q_to_qcq(q)->napi_qcq->napi_deadline, + jiffies + IONIC_NAPI_DEADLINE); + } } static bool ionic_q_is_posted(struct ionic_queue *q, unsigned int pos) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index 2a1d7b9c07e7..bce3ca38669b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -25,6 +25,12 @@ #define IONIC_DEV_INFO_REG_COUNT 32 #define IONIC_DEV_CMD_REG_COUNT 32 +#define IONIC_NAPI_DEADLINE 
(HZ / 200) /* 5ms */ +#define IONIC_ADMIN_DOORBELL_DEADLINE (HZ / 2) /* 500ms */ +#define IONIC_TX_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ +#define IONIC_RX_MIN_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ +#define IONIC_RX_MAX_DOORBELL_DEADLINE (HZ * 5) /* 5s */ + struct ionic_dev_bar { void __iomem *vaddr; phys_addr_t bus_addr; @@ -216,6 +222,8 @@ struct ionic_queue { struct ionic_lif *lif; struct ionic_desc_info *info; u64 dbval; + unsigned long dbell_deadline; + unsigned long dbell_jiffies; u16 head_idx; u16 tail_idx; unsigned int index; @@ -361,4 +369,8 @@ void ionic_q_service(struct ionic_queue *q, struct ionic_cq_info *cq_info, int ionic_heartbeat_check(struct ionic *ionic); bool ionic_is_fw_running(struct ionic_dev *idev); +bool ionic_adminq_poke_doorbell(struct ionic_queue *q); +bool ionic_txq_poke_doorbell(struct ionic_queue *q); +bool ionic_rxq_poke_doorbell(struct ionic_queue *q); + #endif /* _IONIC_DEV_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 4dd16c487f2b..63a78a9ac241 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -16,6 +16,7 @@ #include "ionic.h" #include "ionic_bus.h" +#include "ionic_dev.h" #include "ionic_lif.h" #include "ionic_txrx.h" #include "ionic_ethtool.h" @@ -200,6 +201,13 @@ void ionic_link_status_check_request(struct ionic_lif *lif, bool can_sleep) } } +static void ionic_napi_deadline(struct timer_list *timer) +{ + struct ionic_qcq *qcq = container_of(timer, struct ionic_qcq, napi_deadline); + + napi_schedule(&qcq->napi); +} + static irqreturn_t ionic_isr(int irq, void *data) { struct napi_struct *napi = data; @@ -269,6 +277,7 @@ static int ionic_qcq_enable(struct ionic_qcq *qcq) .oper = IONIC_Q_ENABLE, }, }; + int ret; idev = &lif->ionic->idev; dev = lif->ionic->dev; @@ -276,16 +285,24 @@ static int ionic_qcq_enable(struct ionic_qcq *qcq) dev_dbg(dev, "q_enable.index %d q_enable.qtype %d\n", 
ctx.cmd.q_control.index, ctx.cmd.q_control.type); + if (qcq->flags & IONIC_QCQ_F_INTR) + ionic_intr_clean(idev->intr_ctrl, qcq->intr.index); + + ret = ionic_adminq_post_wait(lif, &ctx); + if (ret) + return ret; + + if (qcq->napi.poll) + napi_enable(&qcq->napi); + if (qcq->flags & IONIC_QCQ_F_INTR) { irq_set_affinity_hint(qcq->intr.vector, &qcq->intr.affinity_mask); - napi_enable(&qcq->napi); - ionic_intr_clean(idev->intr_ctrl, qcq->intr.index); ionic_intr_mask(idev->intr_ctrl, qcq->intr.index, IONIC_INTR_MASK_CLEAR); } - return ionic_adminq_post_wait(lif, &ctx); + return 0; } static int ionic_qcq_disable(struct ionic_lif *lif, struct ionic_qcq *qcq, int fw_err) @@ -316,6 +333,7 @@ static int ionic_qcq_disable(struct ionic_lif *lif, struct ionic_qcq *qcq, int f synchronize_irq(qcq->intr.vector); irq_set_affinity_hint(qcq->intr.vector, NULL); napi_disable(&qcq->napi); + del_timer_sync(&qcq->napi_deadline); } /* If there was a previous fw communcation error, don't bother with @@ -451,6 +469,7 @@ static void ionic_link_qcq_interrupts(struct ionic_qcq *src_qcq, n_qcq->intr.vector = src_qcq->intr.vector; n_qcq->intr.index = src_qcq->intr.index; + n_qcq->napi_qcq = src_qcq->napi_qcq; } static int ionic_alloc_qcq_interrupt(struct ionic_lif *lif, struct ionic_qcq *qcq) @@ -564,13 +583,15 @@ static int ionic_qcq_alloc(struct ionic_lif *lif, unsigned int type, } if (flags & IONIC_QCQ_F_NOTIFYQ) { - int q_size, cq_size; + int q_size; - /* q & cq need to be contiguous in case of notifyq */ + /* q & cq need to be contiguous in NotifyQ, so alloc it all in q + * and don't alloc qc. We leave new->qc_size and new->qc_base + * as 0 to be sure we don't try to free it later. 
+ */ q_size = ALIGN(num_descs * desc_size, PAGE_SIZE); - cq_size = ALIGN(num_descs * cq_desc_size, PAGE_SIZE); - - new->q_size = PAGE_SIZE + q_size + cq_size; + new->q_size = PAGE_SIZE + q_size + + ALIGN(num_descs * cq_desc_size, PAGE_SIZE); new->q_base = dma_alloc_coherent(dev, new->q_size, &new->q_base_pa, GFP_KERNEL); if (!new->q_base) { @@ -773,8 +794,14 @@ static int ionic_lif_txq_init(struct ionic_lif *lif, struct ionic_qcq *qcq) dev_dbg(dev, "txq->hw_type %d\n", q->hw_type); dev_dbg(dev, "txq->hw_index %d\n", q->hw_index); - if (test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state)) + q->dbell_deadline = IONIC_TX_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + + if (test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state)) { netif_napi_add(lif->netdev, &qcq->napi, ionic_tx_napi); + qcq->napi_qcq = qcq; + timer_setup(&qcq->napi_deadline, ionic_napi_deadline, 0); + } qcq->flags |= IONIC_QCQ_F_INITED; @@ -828,11 +855,17 @@ static int ionic_lif_rxq_init(struct ionic_lif *lif, struct ionic_qcq *qcq) dev_dbg(dev, "rxq->hw_type %d\n", q->hw_type); dev_dbg(dev, "rxq->hw_index %d\n", q->hw_index); + q->dbell_deadline = IONIC_RX_MIN_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + if (test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state)) netif_napi_add(lif->netdev, &qcq->napi, ionic_rx_napi); else netif_napi_add(lif->netdev, &qcq->napi, ionic_txrx_napi); + qcq->napi_qcq = qcq; + timer_setup(&qcq->napi_deadline, ionic_napi_deadline, 0); + qcq->flags |= IONIC_QCQ_F_INITED; return 0; @@ -1150,6 +1183,7 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget) struct ionic_dev *idev = &lif->ionic->idev; unsigned long irqflags; unsigned int flags = 0; + bool resched = false; int rx_work = 0; int tx_work = 0; int n_work = 0; @@ -1187,6 +1221,16 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget) ionic_intr_credits(idev->intr_ctrl, intr->index, credits, flags); } + if (!a_work && ionic_adminq_poke_doorbell(&lif->adminqcq->q)) + resched = true; + if (lif->hwstamp_rxq && 
!rx_work && ionic_rxq_poke_doorbell(&lif->hwstamp_rxq->q)) + resched = true; + if (lif->hwstamp_txq && !tx_work && ionic_txq_poke_doorbell(&lif->hwstamp_txq->q)) + resched = true; + if (resched) + mod_timer(&lif->adminqcq->napi_deadline, + jiffies + IONIC_NAPI_DEADLINE); + return work_done; } @@ -3245,8 +3289,14 @@ static int ionic_lif_adminq_init(struct ionic_lif *lif) dev_dbg(dev, "adminq->hw_type %d\n", q->hw_type); dev_dbg(dev, "adminq->hw_index %d\n", q->hw_index); + q->dbell_deadline = IONIC_ADMIN_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + netif_napi_add(lif->netdev, &qcq->napi, ionic_adminq_napi); + qcq->napi_qcq = qcq; + timer_setup(&qcq->napi_deadline, ionic_napi_deadline, 0); + napi_enable(&qcq->napi); if (qcq->flags & IONIC_QCQ_F_INTR) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index a53984bf3544..734519895614 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -74,8 +74,10 @@ struct ionic_qcq { struct ionic_queue q; struct ionic_cq cq; struct ionic_intr_info intr; + struct timer_list napi_deadline; struct napi_struct napi; unsigned int flags; + struct ionic_qcq *napi_qcq; struct dentry *dentry; }; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index a13530ec4dd8..08c42b039d92 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -289,6 +289,35 @@ static void ionic_adminq_cb(struct ionic_queue *q, complete_all(&ctx->work); } +bool ionic_adminq_poke_doorbell(struct ionic_queue *q) +{ + struct ionic_lif *lif = q->lif; + unsigned long now, then, dif; + unsigned long irqflags; + + spin_lock_irqsave(&lif->adminq_lock, irqflags); + + if (q->tail_idx == q->head_idx) { + spin_unlock_irqrestore(&lif->adminq_lock, irqflags); + return false; + } + + now = READ_ONCE(jiffies); + then = 
q->dbell_jiffies; + dif = now - then; + + if (dif > q->dbell_deadline) { + ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, + q->dbval | q->head_idx); + + q->dbell_jiffies = now; + } + + spin_unlock_irqrestore(&lif->adminq_lock, irqflags); + + return true; +} + int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx) { struct ionic_desc_info *desc_info; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 0c3977416cd1..f761780f0162 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -22,6 +22,67 @@ static inline void ionic_rxq_post(struct ionic_queue *q, bool ring_dbell, ionic_q_post(q, ring_dbell, cb_func, cb_arg); } +bool ionic_txq_poke_doorbell(struct ionic_queue *q) +{ + unsigned long now, then, dif; + struct netdev_queue *netdev_txq; + struct net_device *netdev; + + netdev = q->lif->netdev; + netdev_txq = netdev_get_tx_queue(netdev, q->index); + + HARD_TX_LOCK(netdev, netdev_txq, smp_processor_id()); + + if (q->tail_idx == q->head_idx) { + HARD_TX_UNLOCK(netdev, netdev_txq); + return false; + } + + now = READ_ONCE(jiffies); + then = q->dbell_jiffies; + dif = now - then; + + if (dif > q->dbell_deadline) { + ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, + q->dbval | q->head_idx); + + q->dbell_jiffies = now; + } + + HARD_TX_UNLOCK(netdev, netdev_txq); + + return true; +} + +bool ionic_rxq_poke_doorbell(struct ionic_queue *q) +{ + unsigned long now, then, dif; + + /* no lock, called from rx napi or txrx napi, nothing else can fill */ + + if (q->tail_idx == q->head_idx) + return false; + + now = READ_ONCE(jiffies); + then = q->dbell_jiffies; + dif = now - then; + + if (dif > q->dbell_deadline) { + ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, + q->dbval | q->head_idx); + + q->dbell_jiffies = now; + + dif = 2 * q->dbell_deadline; + if (dif > IONIC_RX_MAX_DOORBELL_DEADLINE) + dif = 
IONIC_RX_MAX_DOORBELL_DEADLINE; + + q->dbell_deadline = dif; + } + + return true; +} + static inline struct netdev_queue *q_to_ndq(struct ionic_queue *q) { return netdev_get_tx_queue(q->lif->netdev, q->index); @@ -424,6 +485,12 @@ void ionic_rx_fill(struct ionic_queue *q) ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, q->dbval | q->head_idx); + + q->dbell_deadline = IONIC_RX_MIN_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + + mod_timer(&q_to_qcq(q)->napi_qcq->napi_deadline, + jiffies + IONIC_NAPI_DEADLINE); } void ionic_rx_empty(struct ionic_queue *q) @@ -511,6 +578,9 @@ int ionic_tx_napi(struct napi_struct *napi, int budget) work_done, flags); } + if (!work_done && ionic_txq_poke_doorbell(&qcq->q)) + mod_timer(&qcq->napi_deadline, jiffies + IONIC_NAPI_DEADLINE); + return work_done; } @@ -544,23 +614,29 @@ int ionic_rx_napi(struct napi_struct *napi, int budget) work_done, flags); } + if (!work_done && ionic_rxq_poke_doorbell(&qcq->q)) + mod_timer(&qcq->napi_deadline, jiffies + IONIC_NAPI_DEADLINE); + return work_done; } int ionic_txrx_napi(struct napi_struct *napi, int budget) { - struct ionic_qcq *qcq = napi_to_qcq(napi); + struct ionic_qcq *rxqcq = napi_to_qcq(napi); struct ionic_cq *rxcq = napi_to_cq(napi); unsigned int qi = rxcq->bound_q->index; + struct ionic_qcq *txqcq; struct ionic_dev *idev; struct ionic_lif *lif; struct ionic_cq *txcq; + bool resched = false; u32 rx_work_done = 0; u32 tx_work_done = 0; u32 flags = 0; lif = rxcq->bound_q->lif; idev = &lif->ionic->idev; + txqcq = lif->txqcqs[qi]; txcq = &lif->txqcqs[qi]->cq; tx_work_done = ionic_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT, @@ -572,7 +648,7 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) ionic_rx_fill(rxcq->bound_q); if (rx_work_done < budget && napi_complete_done(napi, rx_work_done)) { - ionic_dim_update(qcq, 0); + ionic_dim_update(rxqcq, 0); flags |= IONIC_INTR_CRED_UNMASK; rxcq->bound_intr->rearm_count++; } @@ -583,6 +659,13 @@ int ionic_txrx_napi(struct napi_struct 
*napi, int budget) tx_work_done + rx_work_done, flags); } + if (!rx_work_done && ionic_rxq_poke_doorbell(&rxqcq->q)) + resched = true; + if (!tx_work_done && ionic_txq_poke_doorbell(&txqcq->q)) + resched = true; + if (resched) + mod_timer(&rxqcq->napi_deadline, jiffies + IONIC_NAPI_DEADLINE); + return rx_work_done; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 7c2af482192d..cb1746bc0e0c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1438,6 +1438,10 @@ int qede_poll(struct napi_struct *napi, int budget) rx_work_done = (likely(fp->type & QEDE_FASTPATH_RX) && qede_has_rx_work(fp->rxq)) ? qede_rx_int(fp, budget) : 0; + + if (fp->xdp_xmit & QEDE_XDP_REDIRECT) + xdp_do_flush(); + /* Handle case where we are called by netpoll with a budget of 0 */ if (rx_work_done < budget || !budget) { if (!qede_poll_is_more_work(fp)) { @@ -1457,9 +1461,6 @@ int qede_poll(struct napi_struct *napi, int budget) qede_update_tx_producer(fp->xdp_tx); } - if (fp->xdp_xmit & QEDE_XDP_REDIRECT) - xdp_do_flush_map(); - return rx_work_done; } diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index b4e0fc7f65bd..0f54849a3823 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1101,14 +1101,14 @@ static void ravb_error_interrupt(struct net_device *ndev) ravb_write(ndev, ~(EIS_QFS | EIS_RESERVED), EIS); if (eis & EIS_QFS) { ris2 = ravb_read(ndev, RIS2); - ravb_write(ndev, ~(RIS2_QFF0 | RIS2_RFFF | RIS2_RESERVED), + ravb_write(ndev, ~(RIS2_QFF0 | RIS2_QFF1 | RIS2_RFFF | RIS2_RESERVED), RIS2); /* Receive Descriptor Empty int */ if (ris2 & RIS2_QFF0) priv->stats[RAVB_BE].rx_over_errors++; - /* Receive Descriptor Empty int */ + /* Receive Descriptor Empty int */ if (ris2 & RIS2_QFF1) priv->stats[RAVB_NC].rx_over_errors++; @@ -2973,6 +2973,9 @@ static int __maybe_unused 
ravb_suspend(struct device *dev) else ret = ravb_close(ndev); + if (priv->info->ccc_gac) + ravb_ptp_stop(ndev); + return ret; } @@ -3011,6 +3014,9 @@ static int __maybe_unused ravb_resume(struct device *dev) /* Restore descriptor base address table */ ravb_write(ndev, priv->desc_bat_dma, DBAT); + if (priv->info->ccc_gac) + ravb_ptp_init(ndev, priv->pdev); + if (netif_running(ndev)) { if (priv->wol_enabled) { ret = ravb_wol_restore(ndev); diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 6441892636db..2370c7797a0a 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1074,8 +1074,11 @@ static struct device_node *rswitch_get_port_node(struct rswitch_device *rdev) port = NULL; goto out; } - if (index == rdev->etha->index) + if (index == rdev->etha->index) { + if (!of_device_is_available(port)) + port = NULL; break; + } } out: @@ -1106,7 +1109,7 @@ static int rswitch_etha_get_params(struct rswitch_device *rdev) port = rswitch_get_port_node(rdev); if (!port) - return -ENODEV; + return 0; /* ignored */ err = of_get_phy_mode(port, &rdev->etha->phy_interface); of_node_put(port); @@ -1324,13 +1327,13 @@ static int rswitch_ether_port_init_all(struct rswitch_private *priv) { int i, err; - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + rswitch_for_each_enabled_port(priv, i) { err = rswitch_ether_port_init_one(priv->rdev[i]); if (err) goto err_init_one; } - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + rswitch_for_each_enabled_port(priv, i) { err = rswitch_serdes_init(priv->rdev[i]); if (err) goto err_serdes; @@ -1339,12 +1342,12 @@ static int rswitch_ether_port_init_all(struct rswitch_private *priv) return 0; err_serdes: - for (i--; i >= 0; i--) + rswitch_for_each_enabled_port_continue_reverse(priv, i) rswitch_serdes_deinit(priv->rdev[i]); i = RSWITCH_NUM_PORTS; err_init_one: - for (i--; i >= 0; i--) + rswitch_for_each_enabled_port_continue_reverse(priv, i) 
rswitch_ether_port_deinit_one(priv->rdev[i]); return err; @@ -1608,6 +1611,7 @@ static int rswitch_device_alloc(struct rswitch_private *priv, int index) netif_napi_add(ndev, &rdev->napi, rswitch_poll); port = rswitch_get_port_node(rdev); + rdev->disabled = !port; err = of_get_ethdev_address(port, ndev); of_node_put(port); if (err) { @@ -1707,16 +1711,16 @@ static int rswitch_init(struct rswitch_private *priv) if (err) goto err_ether_port_init_all; - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + rswitch_for_each_enabled_port(priv, i) { err = register_netdev(priv->rdev[i]->ndev); if (err) { - for (i--; i >= 0; i--) + rswitch_for_each_enabled_port_continue_reverse(priv, i) unregister_netdev(priv->rdev[i]->ndev); goto err_register_netdev; } } - for (i = 0; i < RSWITCH_NUM_PORTS; i++) + rswitch_for_each_enabled_port(priv, i) netdev_info(priv->rdev[i]->ndev, "MAC address %pM\n", priv->rdev[i]->ndev->dev_addr); diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index edbdd1b98d3d..49efb0f31c77 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -13,6 +13,17 @@ #define RSWITCH_MAX_NUM_QUEUES 128 #define RSWITCH_NUM_PORTS 3 +#define rswitch_for_each_enabled_port(priv, i) \ + for (i = 0; i < RSWITCH_NUM_PORTS; i++) \ + if (priv->rdev[i]->disabled) \ + continue; \ + else + +#define rswitch_for_each_enabled_port_continue_reverse(priv, i) \ + for (i--; i >= 0; i--) \ + if (priv->rdev[i]->disabled) \ + continue; \ + else #define TX_RING_SIZE 1024 #define RX_RING_SIZE 1024 @@ -938,6 +949,7 @@ struct rswitch_device { struct rswitch_gwca_queue *tx_queue; struct rswitch_gwca_queue *rx_queue; u8 ts_tag; + bool disabled; int port; struct rswitch_etha *etha; diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 0556542d7a6b..3a86f1213a05 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -1003,8 +1003,11 @@ static int 
efx_pci_probe_post_io(struct efx_nic *efx) /* Determine netdevice features */ net_dev->features |= (efx->type->offload_features | NETIF_F_SG | NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_RXALL); - if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) + if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) { net_dev->features |= NETIF_F_TSO6; + if (efx_has_cap(efx, TX_TSO_V2_ENCAP)) + net_dev->hw_enc_features |= NETIF_F_TSO6; + } /* Check whether device supports TSO */ if (!efx->type->tso_versions || !efx->type->tso_versions(efx)) net_dev->features &= ~NETIF_F_ALL_TSO; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 835caa15d55f..732774645c1a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -560,6 +560,8 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->has_gmac4 = 1; plat_dat->pmt = 1; plat_dat->tso_en = of_property_read_bool(np, "snps,tso"); + if (of_device_is_compatible(np, "qcom,qcs404-ethqos")) + plat_dat->rx_clk_runs_in_lpi = 1; ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 413f66017219..e95d35f1e5a0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -541,9 +541,9 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index, return 0; } - val |= PPSCMDx(index, 0x2); val |= TRGTMODSELx(index, 0x2); val |= PPSEN0; + writel(val, ioaddr + MAC_PPS_CONTROL); writel(cfg->start.tv_sec, ioaddr + MAC_PPSx_TARGET_TIME_SEC(index)); @@ -568,6 +568,7 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index, writel(period - 1, ioaddr + MAC_PPSx_WIDTH(index)); /* Finally, activate it */ + val |= PPSCMDx(index, 0x2); writel(val, ioaddr + MAC_PPS_CONTROL); 
return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b7e5af58ab75..1a5b8dab5e9b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1080,7 +1080,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, stmmac_mac_set(priv, priv->ioaddr, true); if (phy && priv->dma_cap.eee) { - priv->eee_active = phy_init_eee(phy, 1) >= 0; + priv->eee_active = + phy_init_eee(phy, !priv->plat->rx_clk_runs_in_lpi) >= 0; priv->eee_enabled = stmmac_eee_init(priv); priv->tx_lpi_enabled = priv->eee_enabled; stmmac_set_eee_pls(priv, priv->hw, true); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index eb6d9cd8e93f..0046a4ee6e64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -559,7 +559,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst"); plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode"); - if (plat->force_thresh_dma_mode) { + if (plat->force_thresh_dma_mode && plat->force_sf_dma_mode) { plat->force_sf_dma_mode = 0; dev_warn(&pdev->dev, "force_sf_dma_mode is ignored if force_thresh_dma_mode is set.\n"); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index ecbde83b5243..6cda4b7c10cb 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -501,7 +501,15 @@ static int am65_cpsw_nuss_common_stop(struct am65_cpsw_common *common) k3_udma_glue_disable_tx_chn(common->tx_chns[i].tx_chn); } + reinit_completion(&common->tdown_complete); k3_udma_glue_tdown_rx_chn(common->rx_chns.rx_chn, true); + + if (common->pdata.quirks & AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ) { + i = 
wait_for_completion_timeout(&common->tdown_complete, msecs_to_jiffies(1000)); + if (!i) + dev_err(common->dev, "rx teardown timeout\n"); + } + napi_disable(&common->napi_rx); for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) @@ -721,6 +729,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, if (cppi5_desc_is_tdcm(desc_dma)) { dev_dbg(dev, "%s RX tdown flow: %u\n", __func__, flow_idx); + if (common->pdata.quirks & AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ) + complete(&common->tdown_complete); return 0; } @@ -2672,7 +2682,7 @@ static const struct am65_cpsw_pdata j721e_pdata = { }; static const struct am65_cpsw_pdata am64x_cpswxg_pdata = { - .quirks = 0, + .quirks = AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ, .ale_dev_id = "am64-cpswxg", .fdqring_mode = K3_RINGACC_RING_MODE_RING, }; diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index 4b75620f8d28..e5f1c44788c1 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -90,6 +90,7 @@ struct am65_cpsw_rx_chn { }; #define AM65_CPSW_QUIRK_I2027_NO_TX_CSUM BIT(0) +#define AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ BIT(1) struct am65_cpsw_pdata { u32 quirks; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 9352dad58996..79f4e13620a4 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -987,9 +987,6 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, void netvsc_dma_unmap(struct hv_device *hv_dev, struct hv_netvsc_packet *packet) { - u32 page_count = packet->cp_partial ? 
- packet->page_buf_cnt - packet->rmsg_pgcnt : - packet->page_buf_cnt; int i; if (!hv_is_isolation_supported()) @@ -998,7 +995,7 @@ void netvsc_dma_unmap(struct hv_device *hv_dev, if (!packet->dma_range) return; - for (i = 0; i < page_count; i++) + for (i = 0; i < packet->page_buf_cnt; i++) dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma, packet->dma_range[i].mapping_size, DMA_TO_DEVICE); @@ -1028,9 +1025,7 @@ static int netvsc_dma_map(struct hv_device *hv_dev, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 page_count = packet->cp_partial ? - packet->page_buf_cnt - packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->page_buf_cnt; dma_addr_t dma; int i; @@ -1039,7 +1034,7 @@ static int netvsc_dma_map(struct hv_device *hv_dev, packet->dma_range = kcalloc(page_count, sizeof(*packet->dma_range), - GFP_KERNEL); + GFP_ATOMIC); if (!packet->dma_range) return -ENOMEM; diff --git a/drivers/net/mdio/mdio-mux-meson-g12a.c b/drivers/net/mdio/mdio-mux-meson-g12a.c index 4a2e94faf57e..c4542ecf5623 100644 --- a/drivers/net/mdio/mdio-mux-meson-g12a.c +++ b/drivers/net/mdio/mdio-mux-meson-g12a.c @@ -4,6 +4,7 @@ */ #include <linux/bitfield.h> +#include <linux/delay.h> #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/device.h> @@ -150,6 +151,7 @@ static const struct clk_ops g12a_ephy_pll_ops = { static int g12a_enable_internal_mdio(struct g12a_mdio_mux *priv) { + u32 value; int ret; /* Enable the phy clock */ @@ -163,18 +165,25 @@ static int g12a_enable_internal_mdio(struct g12a_mdio_mux *priv) /* Initialize ephy control */ writel(EPHY_G12A_ID, priv->regs + ETH_PHY_CNTL0); - writel(FIELD_PREP(PHY_CNTL1_ST_MODE, 3) | - FIELD_PREP(PHY_CNTL1_ST_PHYADD, EPHY_DFLT_ADD) | - FIELD_PREP(PHY_CNTL1_MII_MODE, EPHY_MODE_RMII) | - PHY_CNTL1_CLK_EN | - PHY_CNTL1_CLKFREQ | - PHY_CNTL1_PHY_ENB, - priv->regs + ETH_PHY_CNTL1); + + /* Make sure we get a 0 -> 1 transition on the enable bit */ + value = FIELD_PREP(PHY_CNTL1_ST_MODE, 
3) | + FIELD_PREP(PHY_CNTL1_ST_PHYADD, EPHY_DFLT_ADD) | + FIELD_PREP(PHY_CNTL1_MII_MODE, EPHY_MODE_RMII) | + PHY_CNTL1_CLK_EN | + PHY_CNTL1_CLKFREQ; + writel(value, priv->regs + ETH_PHY_CNTL1); writel(PHY_CNTL2_USE_INTERNAL | PHY_CNTL2_SMI_SRC_MAC | PHY_CNTL2_RX_CLK_EPHY, priv->regs + ETH_PHY_CNTL2); + value |= PHY_CNTL1_PHY_ENB; + writel(value, priv->regs + ETH_PHY_CNTL1); + + /* The phy needs a bit of time to power up */ + mdelay(10); + return 0; } diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index a6f05e35d91f..b7cb71817780 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -233,7 +233,8 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_ENERGY_DET_INT_EN | DP83822_LINK_QUAL_INT_EN); - if (!dp83822->fx_enabled) + /* Private data pointer is NULL on DP83825/26 */ + if (!dp83822 || !dp83822->fx_enabled) misr_status |= DP83822_ANEG_COMPLETE_INT_EN | DP83822_DUP_MODE_CHANGE_INT_EN | DP83822_SPEED_CHANGED_INT_EN; @@ -253,7 +254,8 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_PAGE_RX_INT_EN | DP83822_EEE_ERROR_CHANGE_INT_EN); - if (!dp83822->fx_enabled) + /* Private data pointer is NULL on DP83825/26 */ + if (!dp83822 || !dp83822->fx_enabled) misr_status |= DP83822_ANEG_ERR_INT_EN | DP83822_WOL_PKT_INT_EN; diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index c49062ad72c6..a6015cd03bff 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -261,6 +261,8 @@ static struct phy_driver meson_gxl_phy[] = { .handle_interrupt = meson_gxl_handle_interrupt, .suspend = genphy_suspend, .resume = genphy_resume, + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, { PHY_ID_MATCH_EXACT(0x01803301), .name = "Meson G12A Internal PHY", @@ -271,6 +273,8 @@ static struct phy_driver meson_gxl_phy[] = { .handle_interrupt = meson_gxl_handle_interrupt, .suspend = genphy_suspend, .resume = genphy_resume, + .read_mmd = 
genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, }; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 716870a4499c..607aa786c8cb 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1517,7 +1517,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, * another mac interface, so we should create a device link between * phy dev and mac dev. */ - if (phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) + if (dev && phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) phydev->devlink = device_link_add(dev->dev.parent, &phydev->mdio.dev, DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 09cc65c0da93..4d2519cdb801 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1812,10 +1812,9 @@ int phylink_fwnode_phy_connect(struct phylink *pl, ret = phy_attach_direct(pl->netdev, phy_dev, flags, pl->link_interface); - if (ret) { - phy_device_free(phy_dev); + phy_device_free(phy_dev); + if (ret) return ret; - } ret = phylink_bringup_phy(pl, phy_dev, pl->link_config.interface); if (ret) diff --git a/drivers/net/usb/kalmia.c b/drivers/net/usb/kalmia.c index 9f2b70ef39aa..613fc6910f14 100644 --- a/drivers/net/usb/kalmia.c +++ b/drivers/net/usb/kalmia.c @@ -65,8 +65,8 @@ kalmia_send_init_packet(struct usbnet *dev, u8 *init_msg, u8 init_msg_len, init_msg, init_msg_len, &act_len, KALMIA_USB_TIMEOUT); if (status != 0) { netdev_err(dev->net, - "Error sending init packet. Status %i, length %i\n", - status, act_len); + "Error sending init packet. Status %i\n", + status); return status; } else if (act_len != init_msg_len) { @@ -83,8 +83,8 @@ kalmia_send_init_packet(struct usbnet *dev, u8 *init_msg, u8 init_msg_len, if (status != 0) netdev_err(dev->net, - "Error receiving init result. Status %i, length %i\n", - status, act_len); + "Error receiving init result. 
Status %i\n", + status); else if (act_len != expected_len) netdev_err(dev->net, "Unexpected init result length: %i\n", act_len); diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c index 2c82fbcaab22..7a2b0094de51 100644 --- a/drivers/net/usb/plusb.c +++ b/drivers/net/usb/plusb.c @@ -57,9 +57,7 @@ static inline int pl_vendor_req(struct usbnet *dev, u8 req, u8 val, u8 index) { - return usbnet_read_cmd(dev, req, - USB_DIR_IN | USB_TYPE_VENDOR | - USB_RECIP_DEVICE, + return usbnet_write_cmd(dev, req, USB_TYPE_VENDOR | USB_RECIP_DEVICE, val, index, NULL, 0); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 18b3de854aeb..61e33e4dd0cd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1677,13 +1677,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget) received = virtnet_receive(rq, budget, &xdp_xmit); + if (xdp_xmit & VIRTIO_XDP_REDIR) + xdp_do_flush(); + /* Out of packets? */ if (received < budget) virtqueue_napi_complete(napi, rq->vq, received); - if (xdp_xmit & VIRTIO_XDP_REDIR) - xdp_do_flush(); - if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_get_sq(vi); if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { @@ -2158,8 +2158,8 @@ static int virtnet_close(struct net_device *dev) cancel_delayed_work_sync(&vi->refill); for (i = 0; i < vi->max_queue_pairs; i++) { - xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); napi_disable(&vi->rq[i].napi); + xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); virtnet_napi_tx_disable(&vi->sq[i].napi); } diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 56267c327f0b..682987040ea8 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1546,31 +1546,6 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, rxd->len = rbi->len; } -#ifdef VMXNET3_RSS - if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE && - (adapter->netdev->features & NETIF_F_RXHASH)) { - enum pkt_hash_types hash_type; - - switch 
(rcd->rssType) { - case VMXNET3_RCD_RSS_TYPE_IPV4: - case VMXNET3_RCD_RSS_TYPE_IPV6: - hash_type = PKT_HASH_TYPE_L3; - break; - case VMXNET3_RCD_RSS_TYPE_TCPIPV4: - case VMXNET3_RCD_RSS_TYPE_TCPIPV6: - case VMXNET3_RCD_RSS_TYPE_UDPIPV4: - case VMXNET3_RCD_RSS_TYPE_UDPIPV6: - hash_type = PKT_HASH_TYPE_L4; - break; - default: - hash_type = PKT_HASH_TYPE_L3; - break; - } - skb_set_hash(ctx->skb, - le32_to_cpu(rcd->rssHash), - hash_type); - } -#endif skb_record_rx_queue(ctx->skb, rq->qid); skb_put(ctx->skb, rcd->len); @@ -1653,6 +1628,31 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, u32 mtu = adapter->netdev->mtu; skb->len += skb->data_len; +#ifdef VMXNET3_RSS + if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE && + (adapter->netdev->features & NETIF_F_RXHASH)) { + enum pkt_hash_types hash_type; + + switch (rcd->rssType) { + case VMXNET3_RCD_RSS_TYPE_IPV4: + case VMXNET3_RCD_RSS_TYPE_IPV6: + hash_type = PKT_HASH_TYPE_L3; + break; + case VMXNET3_RCD_RSS_TYPE_TCPIPV4: + case VMXNET3_RCD_RSS_TYPE_TCPIPV6: + case VMXNET3_RCD_RSS_TYPE_UDPIPV4: + case VMXNET3_RCD_RSS_TYPE_UDPIPV6: + hash_type = PKT_HASH_TYPE_L4; + break; + default: + hash_type = PKT_HASH_TYPE_L3; + break; + } + skb_set_hash(skb, + le32_to_cpu(rcd->rssHash), + hash_type); + } +#endif vmxnet3_rx_csum(adapter, skb, (union Vmxnet3_GenericDesc *)rcd); skb->protocol = eth_type_trans(skb, adapter->netdev); diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c index 7eff3531b9a5..7ff33c1d6ac7 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c @@ -152,6 +152,15 @@ static irqreturn_t t7xx_dpmaif_isr_handler(int irq, void *data) } t7xx_pcie_mac_clear_int(dpmaif_ctrl->t7xx_dev, isr_para->pcie_int); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t t7xx_dpmaif_isr_thread(int irq, void *data) +{ + struct dpmaif_isr_para *isr_para = data; + struct dpmaif_ctrl *dpmaif_ctrl = isr_para->dpmaif_ctrl; + t7xx_dpmaif_irq_cb(isr_para); 
t7xx_pcie_mac_set_int(dpmaif_ctrl->t7xx_dev, isr_para->pcie_int); return IRQ_HANDLED; @@ -188,7 +197,7 @@ static void t7xx_dpmaif_register_pcie_irq(struct dpmaif_ctrl *dpmaif_ctrl) t7xx_pcie_mac_clear_int(t7xx_dev, int_type); t7xx_dev->intr_handler[int_type] = t7xx_dpmaif_isr_handler; - t7xx_dev->intr_thread[int_type] = NULL; + t7xx_dev->intr_thread[int_type] = t7xx_dpmaif_isr_thread; t7xx_dev->callback_param[int_type] = isr_para; t7xx_pcie_mac_clear_int_status(t7xx_dev, int_type); diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c index aa2174a10437..f4ff2198b5ef 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c @@ -840,14 +840,13 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget) if (!rxq->que_started) { atomic_set(&rxq->rx_processing, 0); + pm_runtime_put_autosuspend(rxq->dpmaif_ctrl->dev); dev_err(rxq->dpmaif_ctrl->dev, "Work RXQ: %d has not been started\n", rxq->index); return work_done; } - if (!rxq->sleep_lock_pending) { - pm_runtime_get_noresume(rxq->dpmaif_ctrl->dev); + if (!rxq->sleep_lock_pending) t7xx_pci_disable_sleep(t7xx_dev); - } ret = try_wait_for_completion(&t7xx_dev->sleep_lock_acquire); if (!ret) { @@ -876,22 +875,22 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget) napi_complete_done(napi, work_done); t7xx_dpmaif_clr_ip_busy_sts(&rxq->dpmaif_ctrl->hw_info); t7xx_dpmaif_dlq_unmask_rx_done(&rxq->dpmaif_ctrl->hw_info, rxq->index); + t7xx_pci_enable_sleep(rxq->dpmaif_ctrl->t7xx_dev); + pm_runtime_mark_last_busy(rxq->dpmaif_ctrl->dev); + pm_runtime_put_autosuspend(rxq->dpmaif_ctrl->dev); + atomic_set(&rxq->rx_processing, 0); } else { t7xx_dpmaif_clr_ip_busy_sts(&rxq->dpmaif_ctrl->hw_info); } - t7xx_pci_enable_sleep(rxq->dpmaif_ctrl->t7xx_dev); - pm_runtime_mark_last_busy(rxq->dpmaif_ctrl->dev); - pm_runtime_put_noidle(rxq->dpmaif_ctrl->dev); - atomic_set(&rxq->rx_processing, 0); - return 
work_done; } void t7xx_dpmaif_irq_rx_done(struct dpmaif_ctrl *dpmaif_ctrl, const unsigned int que_mask) { struct dpmaif_rx_queue *rxq; - int qno; + struct dpmaif_ctrl *ctrl; + int qno, ret; qno = ffs(que_mask) - 1; if (qno < 0 || qno > DPMAIF_RXQ_NUM - 1) { @@ -900,6 +899,18 @@ void t7xx_dpmaif_irq_rx_done(struct dpmaif_ctrl *dpmaif_ctrl, const unsigned int } rxq = &dpmaif_ctrl->rxq[qno]; + ctrl = rxq->dpmaif_ctrl; + /* We need to make sure that the modem has been resumed before + * calling napi. This can't be done inside the polling function + * as we could be blocked waiting for device to be resumed, + * which can't be done from softirq context the poll function + * is running in. + */ + ret = pm_runtime_resume_and_get(ctrl->dev); + if (ret < 0 && ret != -EACCES) { + dev_err(ctrl->dev, "Failed to resume device: %d\n", ret); + return; + } napi_schedule(&rxq->napi); } diff --git a/drivers/net/wwan/t7xx/t7xx_netdev.c b/drivers/net/wwan/t7xx/t7xx_netdev.c index 494a28e386a3..3ef4a8a4f8fd 100644 --- a/drivers/net/wwan/t7xx/t7xx_netdev.c +++ b/drivers/net/wwan/t7xx/t7xx_netdev.c @@ -27,6 +27,7 @@ #include <linux/list.h> #include <linux/netdev_features.h> #include <linux/netdevice.h> +#include <linux/pm_runtime.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/wwan.h> @@ -45,12 +46,25 @@ static void t7xx_ccmni_enable_napi(struct t7xx_ccmni_ctrl *ctlb) { - int i; + struct dpmaif_ctrl *ctrl; + int i, ret; + + ctrl = ctlb->hif_ctrl; if (ctlb->is_napi_en) return; for (i = 0; i < RXQ_NUM; i++) { + /* The usage count has to be bumped every time before calling + * napi_schedule. It will be decresed in the poll routine, + * right after napi_complete_done is called. 
+ */ + ret = pm_runtime_resume_and_get(ctrl->dev); + if (ret < 0) { + dev_err(ctrl->dev, "Failed to resume device: %d\n", + ret); + return; + } napi_enable(ctlb->napi[i]); napi_schedule(ctlb->napi[i]); } diff --git a/drivers/net/wwan/t7xx/t7xx_pci.c b/drivers/net/wwan/t7xx/t7xx_pci.c index 871f2a27a398..226fc1703e90 100644 --- a/drivers/net/wwan/t7xx/t7xx_pci.c +++ b/drivers/net/wwan/t7xx/t7xx_pci.c @@ -121,6 +121,8 @@ void t7xx_pci_pm_init_late(struct t7xx_pci_dev *t7xx_dev) iowrite32(T7XX_L1_BIT(0), IREG_BASE(t7xx_dev) + ENABLE_ASPM_LOWPWR); atomic_set(&t7xx_dev->md_pm_state, MTK_PM_RESUMED); + pm_runtime_mark_last_busy(&t7xx_dev->pdev->dev); + pm_runtime_allow(&t7xx_dev->pdev->dev); pm_runtime_put_noidle(&t7xx_dev->pdev->dev); } diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 79d93126453d..77b06d54cc62 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -102,6 +102,25 @@ config NVDIMM_KEYS depends on ENCRYPTED_KEYS depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m +config NVDIMM_KMSAN + bool + depends on KMSAN + help + KMSAN, and other memory debug facilities, increase the size of + 'struct page' to contain extra metadata. This collides with + the NVDIMM capability to store a potentially + larger-than-"System RAM" size 'struct page' array in a + reservation of persistent memory rather than limited / + precious DRAM. However, that reservation needs to persist for + the life of the given NVDIMM namespace. If you are using KMSAN + to debug an issue unrelated to NVDIMMs or DAX then say N to this + option. Otherwise, say Y but understand that any namespaces + (with the page array stored pmem) created with this build of + the kernel will permanently reserve and strand excess + capacity compared to the CONFIG_KMSAN=n case. + + Select N if unsure. 
+ config NVDIMM_TEST_BUILD tristate "Build the unit test core" depends on m diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 85ca5b4da3cf..ec5219680092 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -652,7 +652,7 @@ void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns); #if IS_ENABLED(CONFIG_ND_CLAIM) /* max struct page size independent of kernel config */ -#define MAX_STRUCT_PAGE_SIZE 128 +#define MAX_STRUCT_PAGE_SIZE 64 int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 61af072ac98f..af7d9301520c 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -13,6 +13,8 @@ #include "pfn.h" #include "nd.h" +static const bool page_struct_override = IS_ENABLED(CONFIG_NVDIMM_KMSAN); + static void nd_pfn_release(struct device *dev) { struct nd_region *nd_region = to_nd_region(dev->parent); @@ -758,12 +760,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) return -ENXIO; } - /* - * Note, we use 64 here for the standard size of struct page, - * debugging options may cause it to be larger in which case the - * implementation will limit the pfns advertised through - * ->direct_access() to those that are included in the memmap. - */ start = nsio->res.start; size = resource_size(&nsio->res); npfns = PHYS_PFN(size - SZ_8K); @@ -782,20 +778,33 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) } end_trunc = start + size - ALIGN_DOWN(start + size, align); if (nd_pfn->mode == PFN_MODE_PMEM) { + unsigned long page_map_size = MAX_STRUCT_PAGE_SIZE * npfns; + /* * The altmap should be padded out to the block size used * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. * - * Also make sure size of struct page is less than 128. 
We - * want to make sure we use large enough size here so that - * we don't have a dynamic reserve space depending on - * struct page size. But we also want to make sure we notice - * when we end up adding new elements to struct page. + * Also make sure size of struct page is less than + * MAX_STRUCT_PAGE_SIZE. The goal here is compatibility in the + * face of production kernel configurations that reduce the + * 'struct page' size below MAX_STRUCT_PAGE_SIZE. For debug + * kernel configurations that increase the 'struct page' size + * above MAX_STRUCT_PAGE_SIZE, the page_struct_override allows + * for continuing with the capacity that will be wasted when + * reverting to a production kernel configuration. Otherwise, + * those configurations are blocked by default. */ - BUILD_BUG_ON(sizeof(struct page) > MAX_STRUCT_PAGE_SIZE); - offset = ALIGN(start + SZ_8K + MAX_STRUCT_PAGE_SIZE * npfns, align) - - start; + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE) { + if (page_struct_override) + page_map_size = sizeof(struct page) * npfns; + else { + dev_err(&nd_pfn->dev, + "Memory debug options prevent using pmem for the page map\n"); + return -EINVAL; + } + } + offset = ALIGN(start + SZ_8K + page_map_size, align) - start; } else if (nd_pfn->mode == PFN_MODE_RAM) offset = ALIGN(start + SZ_8K, align) - start; else @@ -818,7 +827,10 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) pfn_sb->version_minor = cpu_to_le16(4); pfn_sb->end_trunc = cpu_to_le32(end_trunc); pfn_sb->align = cpu_to_le32(nd_pfn->align); - pfn_sb->page_struct_size = cpu_to_le16(MAX_STRUCT_PAGE_SIZE); + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE && page_struct_override) + pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page)); + else + pfn_sb->page_struct_size = cpu_to_le16(MAX_STRUCT_PAGE_SIZE); pfn_sb->page_size = cpu_to_le32(PAGE_SIZE); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c 
index 4424f53a8a0a..bdb97496ba2d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -45,6 +45,8 @@ struct nvme_dhchap_queue_context { int sess_key_len; }; +static struct workqueue_struct *nvme_auth_wq; + #define nvme_auth_flags_from_qid(qid) \ (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED #define nvme_auth_queue_from_qid(ctrl, qid) \ @@ -866,7 +868,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) chap = &ctrl->dhchap_ctxs[qid]; cancel_work_sync(&chap->auth_work); - queue_work(nvme_wq, &chap->auth_work); + queue_work(nvme_auth_wq, &chap->auth_work); return 0; } EXPORT_SYMBOL_GPL(nvme_auth_negotiate); @@ -1008,10 +1010,15 @@ EXPORT_SYMBOL_GPL(nvme_auth_free); int __init nvme_init_auth(void) { + nvme_auth_wq = alloc_workqueue("nvme-auth-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_auth_wq) + return -ENOMEM; + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); if (!nvme_chap_buf_cache) - return -ENOMEM; + goto err_destroy_workqueue; nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, mempool_free_slab, nvme_chap_buf_cache); @@ -1021,6 +1028,8 @@ int __init nvme_init_auth(void) return 0; err_destroy_chap_buf_cache: kmem_cache_destroy(nvme_chap_buf_cache); +err_destroy_workqueue: + destroy_workqueue(nvme_auth_wq); return -ENOMEM; } @@ -1028,4 +1037,5 @@ void __exit nvme_exit_auth(void) { mempool_destroy(nvme_chap_buf_pool); kmem_cache_destroy(nvme_chap_buf_cache); + destroy_workqueue(nvme_auth_wq); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7be562a4e1aa..8b6421141162 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1093,7 +1093,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) if (ns) { if (ns->head->effects) effects = le32_to_cpu(ns->head->effects->iocs[opcode]); - if (ns->head->ids.csi == NVME_CAP_CSS_NVM) + if (ns->head->ids.csi == NVME_CSI_NVM) effects 
|= nvme_known_nvm_effects(opcode); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn_once(ctrl->device, @@ -4921,7 +4921,9 @@ out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); blk_put_queue(ctrl->admin_q); out_free_tagset: - blk_mq_free_tag_set(ctrl->admin_tagset); + blk_mq_free_tag_set(set); + ctrl->admin_q = NULL; + ctrl->fabrics_q = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); @@ -4983,6 +4985,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_free_tag_set: blk_mq_free_tag_set(set); + ctrl->connect_q = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 4564f16a0b20..456ee42a6133 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3521,13 +3521,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_fc_init_queue(ctrl, 0); - ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_fc_admin_mq_ops, - struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, - ctrl->lport->ops->fcprqst_priv_sz)); - if (ret) - goto out_free_queues; - /* * Would have been nice to init io queues tag set as well. 
* However, we require interaction from the controller @@ -3537,10 +3530,17 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); if (ret) - goto out_cleanup_tagset; + goto out_free_queues; /* at this point, teardown path changes to ref counting on nvme ctrl */ + ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, + &nvme_fc_admin_mq_ops, + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, + ctrl->lport->ops->fcprqst_priv_sz)); + if (ret) + goto fail_ctrl; + spin_lock_irqsave(&rport->lock, flags); list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); @@ -3592,8 +3592,6 @@ fail_ctrl: return ERR_PTR(-EIO); -out_cleanup_tagset: - nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_queues: kfree(ctrl->queues); out_free_ida: diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1ff8843bc4b3..c11e0cfeef0f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -110,6 +110,7 @@ struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); +static void nvme_update_attrs(struct nvme_dev *dev); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -1923,6 +1924,8 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); + + nvme_update_attrs(dev); } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2209,6 +2212,11 @@ static const struct attribute_group *nvme_pci_dev_attr_groups[] = { NULL, }; +static void nvme_update_attrs(struct nvme_dev *dev) +{ + sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group); +} + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. 
@@ -2509,18 +2517,12 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); - int dma_address_bits = 64; if (pci_enable_device_mem(pdev)) return result; pci_set_master(pdev); - if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) - dma_address_bits = 48; - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits))) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto disable; @@ -2970,7 +2972,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return NULL; + return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); mutex_init(&dev->shutdown_lock); @@ -2998,7 +3000,11 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, quirks); if (ret) goto out_put_device; - + + if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48)); + else + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); dma_set_max_seg_size(&pdev->dev, 0xffffffff); @@ -3031,8 +3037,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int result = -ENOMEM; dev = nvme_pci_alloc_dev(pdev, id); - if (!dev) - return -ENOMEM; + if (IS_ERR(dev)) + return PTR_ERR(dev); result = nvme_dev_map(dev); if (result) @@ -3102,6 +3108,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_start_ctrl(&dev->ctrl); nvme_put_ctrl(&dev->ctrl); + flush_work(&dev->ctrl.scan_work); return 0; out_disable: @@ -3422,6 +3429,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x10ec, 0x5763), /* ADATA SX6000PNP */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x8201), /* 
ADATA SX8200PNP 512GB */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index ab2627e17bb9..1ab6601fdd5c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1685,8 +1685,10 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, else { queue = nvmet_fc_alloc_target_queue(iod->assoc, 0, be16_to_cpu(rqst->assoc_cmd.sqsize)); - if (!queue) + if (!queue) { ret = VERR_QUEUE_ALLOC_FAIL; + nvmet_fc_tgt_a_put(iod->assoc); + } } } diff --git a/drivers/nvmem/brcm_nvram.c b/drivers/nvmem/brcm_nvram.c index 34130449f2d2..39aa27942f28 100644 --- a/drivers/nvmem/brcm_nvram.c +++ b/drivers/nvmem/brcm_nvram.c @@ -98,6 +98,9 @@ static int brcm_nvram_parse(struct brcm_nvram *priv) len = le32_to_cpu(header.len); data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + memcpy_fromio(data, priv->base, len); data[len - 1] = '\0'; diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 321d7d63e068..34ee9d36ee7b 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -770,31 +770,32 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) return ERR_PTR(rval); } - if (config->wp_gpio) - nvmem->wp_gpio = config->wp_gpio; - else if (!config->ignore_wp) + nvmem->id = rval; + + nvmem->dev.type = &nvmem_provider_type; + nvmem->dev.bus = &nvmem_bus_type; + nvmem->dev.parent = config->dev; + + device_initialize(&nvmem->dev); + + if (!config->ignore_wp) nvmem->wp_gpio = gpiod_get_optional(config->dev, "wp", GPIOD_OUT_HIGH); if (IS_ERR(nvmem->wp_gpio)) { - ida_free(&nvmem_ida, nvmem->id); rval = PTR_ERR(nvmem->wp_gpio); - kfree(nvmem); - return ERR_PTR(rval); + nvmem->wp_gpio = NULL; + goto err_put_device; } kref_init(&nvmem->refcnt); INIT_LIST_HEAD(&nvmem->cells); - nvmem->id = rval; nvmem->owner = config->owner; if (!nvmem->owner && config->dev->driver) nvmem->owner = config->dev->driver->owner; nvmem->stride = config->stride ?: 
1; nvmem->word_size = config->word_size ?: 1; nvmem->size = config->size; - nvmem->dev.type = &nvmem_provider_type; - nvmem->dev.bus = &nvmem_bus_type; - nvmem->dev.parent = config->dev; nvmem->root_only = config->root_only; nvmem->priv = config->priv; nvmem->type = config->type; @@ -822,11 +823,8 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) break; } - if (rval) { - ida_free(&nvmem_ida, nvmem->id); - kfree(nvmem); - return ERR_PTR(rval); - } + if (rval) + goto err_put_device; nvmem->read_only = device_property_present(config->dev, "read-only") || config->read_only || !nvmem->reg_write; @@ -835,28 +833,22 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) nvmem->dev.groups = nvmem_dev_groups; #endif - dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name); - - rval = device_register(&nvmem->dev); - if (rval) - goto err_put_device; - if (nvmem->nkeepout) { rval = nvmem_validate_keepouts(nvmem); if (rval) - goto err_device_del; + goto err_put_device; } if (config->compat) { rval = nvmem_sysfs_setup_compat(nvmem, config); if (rval) - goto err_device_del; + goto err_put_device; } if (config->cells) { rval = nvmem_add_cells(nvmem, config->cells, config->ncells); if (rval) - goto err_teardown_compat; + goto err_remove_cells; } rval = nvmem_add_cells_from_table(nvmem); @@ -867,17 +859,20 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) if (rval) goto err_remove_cells; + dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name); + + rval = device_add(&nvmem->dev); + if (rval) + goto err_remove_cells; + blocking_notifier_call_chain(&nvmem_notifier, NVMEM_ADD, nvmem); return nvmem; err_remove_cells: nvmem_device_remove_all_cells(nvmem); -err_teardown_compat: if (config->compat) nvmem_sysfs_remove_compat(nvmem, config); -err_device_del: - device_del(&nvmem->dev); err_put_device: put_device(&nvmem->dev); @@ -1242,16 +1237,21 @@ struct nvmem_cell *of_nvmem_cell_get(struct 
device_node *np, const char *id) if (!cell_np) return ERR_PTR(-ENOENT); - nvmem_np = of_get_next_parent(cell_np); - if (!nvmem_np) + nvmem_np = of_get_parent(cell_np); + if (!nvmem_np) { + of_node_put(cell_np); return ERR_PTR(-EINVAL); + } nvmem = __nvmem_device_get(nvmem_np, device_match_of_node); of_node_put(nvmem_np); - if (IS_ERR(nvmem)) + if (IS_ERR(nvmem)) { + of_node_put(cell_np); return ERR_CAST(nvmem); + } cell_entry = nvmem_find_cell_entry_by_node(nvmem, cell_np); + of_node_put(cell_np); if (!cell_entry) { __nvmem_device_put(nvmem); return ERR_PTR(-ENOENT); diff --git a/drivers/nvmem/qcom-spmi-sdam.c b/drivers/nvmem/qcom-spmi-sdam.c index 4fcb63507ecd..8499892044b7 100644 --- a/drivers/nvmem/qcom-spmi-sdam.c +++ b/drivers/nvmem/qcom-spmi-sdam.c @@ -166,6 +166,7 @@ static const struct of_device_id sdam_match_table[] = { { .compatible = "qcom,spmi-sdam" }, {}, }; +MODULE_DEVICE_TABLE(of, sdam_match_table); static struct platform_driver sdam_driver = { .driver = { diff --git a/drivers/nvmem/sunxi_sid.c b/drivers/nvmem/sunxi_sid.c index 5750e1f4bcdb..92dfe4cb10e3 100644 --- a/drivers/nvmem/sunxi_sid.c +++ b/drivers/nvmem/sunxi_sid.c @@ -41,8 +41,21 @@ static int sunxi_sid_read(void *context, unsigned int offset, void *val, size_t bytes) { struct sunxi_sid *sid = context; + u32 word; + + /* .stride = 4 so offset is guaranteed to be aligned */ + __ioread32_copy(val, sid->base + sid->value_offset + offset, bytes / 4); - memcpy_fromio(val, sid->base + sid->value_offset + offset, bytes); + val += round_down(bytes, 4); + offset += round_down(bytes, 4); + bytes = bytes % 4; + + if (!bytes) + return 0; + + /* Handle any trailing bytes */ + word = readl_relaxed(sid->base + sid->value_offset + offset); + memcpy(val, &word, bytes); return 0; } diff --git a/drivers/of/address.c b/drivers/of/address.c index c34ac33b7338..67763e5b8c0e 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -965,8 +965,19 @@ int of_dma_get_range(struct device_node *np, const struct 
bus_dma_region **map) } of_dma_range_parser_init(&parser, node); - for_each_of_range(&parser, &range) + for_each_of_range(&parser, &range) { + if (range.cpu_addr == OF_BAD_ADDR) { + pr_err("translation of DMA address(%llx) to CPU address failed node(%pOF)\n", + range.bus_addr, node); + continue; + } num_ranges++; + } + + if (!num_ranges) { + ret = -EINVAL; + goto out; + } r = kcalloc(num_ranges + 1, sizeof(*r), GFP_KERNEL); if (!r) { @@ -975,18 +986,16 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) } /* - * Record all info in the generic DMA ranges array for struct device. + * Record all info in the generic DMA ranges array for struct device, + * returning an error if we don't find any parsable ranges. */ *map = r; of_dma_range_parser_init(&parser, node); for_each_of_range(&parser, &range) { pr_debug("dma_addr(%llx) cpu_addr(%llx) size(%llx)\n", range.bus_addr, range.cpu_addr, range.size); - if (range.cpu_addr == OF_BAD_ADDR) { - pr_err("translation of DMA address(%llx) to CPU address failed node(%pOF)\n", - range.bus_addr, node); + if (range.cpu_addr == OF_BAD_ADDR) continue; - } r->cpu_start = range.cpu_addr; r->dma_start = range.bus_addr; r->size = range.size; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index f08b25195ae7..d1a68b6d03b3 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -26,7 +26,6 @@ #include <linux/serial_core.h> #include <linux/sysfs.h> #include <linux/random.h> -#include <linux/kmemleak.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include <asm/page.h> @@ -525,12 +524,9 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, size = dt_mem_next_cell(dt_root_size_cells, &prop); if (size && - early_init_dt_reserve_memory(base, size, nomap) == 0) { + early_init_dt_reserve_memory(base, size, nomap) == 0) pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); - if (!nomap) - kmemleak_alloc_phys(base, size, 
0); - } else pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 65f3b02a0e4e..f90975e00446 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -48,9 +48,10 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, err = memblock_mark_nomap(base, size); if (err) memblock_phys_free(base, size); - kmemleak_ignore_phys(base); } + kmemleak_ignore_phys(base); + return err; } diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 81c8c227ab6b..b3878a98d27f 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -525,6 +525,7 @@ static int __init of_platform_default_populate_init(void) if (IS_ENABLED(CONFIG_PPC)) { struct device_node *boot_display = NULL; struct platform_device *dev; + int display_number = 0; int ret; /* Check if we have a MacOS display without a node spec */ @@ -555,16 +556,23 @@ static int __init of_platform_default_populate_init(void) if (!of_get_property(node, "linux,opened", NULL) || !of_get_property(node, "linux,boot-display", NULL)) continue; - dev = of_platform_device_create(node, "of-display", NULL); + dev = of_platform_device_create(node, "of-display.0", NULL); + of_node_put(node); if (WARN_ON(!dev)) return -ENOMEM; boot_display = node; + display_number++; break; } for_each_node_by_type(node, "display") { + char buf[14]; + const char *of_display_format = "of-display.%d"; + if (!of_get_property(node, "linux,opened", NULL) || node == boot_display) continue; - of_platform_device_create(node, "of-display", NULL); + ret = snprintf(buf, sizeof(buf), of_display_format, display_number++); + if (ret < sizeof(buf)) + of_platform_device_create(node, buf, NULL); } } else { diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index d6af5726ddf3..2a18f7ba2398 100644 --- a/drivers/parisc/pdc_stable.c +++ 
b/drivers/parisc/pdc_stable.c @@ -274,8 +274,7 @@ pdcspath_hwpath_write(struct pdcspath_entry *entry, const char *buf, size_t coun /* We'll use a local copy of buf */ count = min_t(size_t, count, sizeof(in)-1); - strncpy(in, buf, count); - in[count] = '\0'; + strscpy(in, buf, count + 1); /* Let's clean up the target. 0xff is a blank pattern */ memset(&hwpath, 0xff, sizeof(hwpath)); @@ -388,8 +387,7 @@ pdcspath_layer_write(struct pdcspath_entry *entry, const char *buf, size_t count /* We'll use a local copy of buf */ count = min_t(size_t, count, sizeof(in)-1); - strncpy(in, buf, count); - in[count] = '\0'; + strscpy(in, buf, count + 1); /* Let's clean up the target. 0 is a blank pattern */ memset(&layers, 0, sizeof(layers)); @@ -756,8 +754,7 @@ static ssize_t pdcs_auto_write(struct kobject *kobj, /* We'll use a local copy of buf */ count = min_t(size_t, count, sizeof(in)-1); - strncpy(in, buf, count); - in[count] = '\0'; + strscpy(in, buf, count + 1); /* Current flags are stored in primary boot path entry */ pathentry = &pdcspath_entry_primary; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index fba95486caaf..5641786bd020 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1665,7 +1665,6 @@ int pci_save_state(struct pci_dev *dev) return i; pci_save_ltr_state(dev); - pci_save_aspm_l1ss_state(dev); pci_save_dpc_state(dev); pci_save_aer_state(dev); pci_save_ptm_state(dev); @@ -1772,7 +1771,6 @@ void pci_restore_state(struct pci_dev *dev) * LTR itself (in the PCIe capability). 
*/ pci_restore_ltr_state(dev); - pci_restore_aspm_l1ss_state(dev); pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); @@ -3465,11 +3463,6 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) if (error) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); - error = pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_L1SS, - 2 * sizeof(u32)); - if (error) - pci_err(dev, "unable to allocate suspend buffer for ASPM-L1SS\n"); - pci_allocate_vc_save_buffers(dev); } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9ed3b5550043..9049d07d3aae 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -566,14 +566,10 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active); void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); -void pci_save_aspm_l1ss_state(struct pci_dev *dev); -void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } -static inline void pci_save_aspm_l1ss_state(struct pci_dev *dev) { } -static inline void pci_restore_aspm_l1ss_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCIE_ECRC diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 53a1fa306e1e..4b4184563a92 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -470,31 +470,6 @@ static void pci_clear_and_set_dword(struct pci_dev *pdev, int pos, pci_write_config_dword(pdev, pos, val); } -static void aspm_program_l1ss(struct pci_dev *dev, u32 ctl1, u32 ctl2) -{ - u16 l1ss = dev->l1ss; - u32 l1_2_enable; - - /* - * Per PCIe r6.0, sec 5.5.4, T_POWER_ON in PCI_L1SS_CTL2 must be - * programmed prior to setting the L1.2 enable bits in PCI_L1SS_CTL1. 
- */ - pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL2, ctl2); - - /* - * In addition, Common_Mode_Restore_Time and LTR_L1.2_THRESHOLD in - * PCI_L1SS_CTL1 must be programmed *before* setting the L1.2 - * enable bits, even though they're all in PCI_L1SS_CTL1. - */ - l1_2_enable = ctl1 & PCI_L1SS_CTL1_L1_2_MASK; - ctl1 &= ~PCI_L1SS_CTL1_L1_2_MASK; - - pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL1, ctl1); - if (l1_2_enable) - pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL1, - ctl1 | l1_2_enable); -} - /* Calculate L1.2 PM substate timing parameters */ static void aspm_calc_l1ss_info(struct pcie_link_state *link, u32 parent_l1ss_cap, u32 child_l1ss_cap) @@ -504,6 +479,7 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, u32 t_common_mode, t_power_on, l1_2_threshold, scale, value; u32 ctl1 = 0, ctl2 = 0; u32 pctl1, pctl2, cctl1, cctl2; + u32 pl1_2_enables, cl1_2_enables; if (!(link->aspm_support & ASPM_STATE_L1_2_MASK)) return; @@ -552,21 +528,39 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, ctl2 == pctl2 && ctl2 == cctl2) return; - pctl1 &= ~(PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE); - pctl1 |= (ctl1 & (PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE)); - aspm_program_l1ss(parent, pctl1, ctl2); - - cctl1 &= ~(PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE); - cctl1 |= (ctl1 & (PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE)); - aspm_program_l1ss(child, cctl1, ctl2); + /* Disable L1.2 while updating. 
See PCIe r5.0, sec 5.5.4, 7.8.3.3 */ + pl1_2_enables = pctl1 & PCI_L1SS_CTL1_L1_2_MASK; + cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK; + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + } + + /* Program T_POWER_ON times in both ports */ + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2); + pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); + + /* Program Common_Mode_Restore_Time in upstream device */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); + + /* Program LTR_L1.2_THRESHOLD time in both ports */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0, + pl1_2_enables); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0, + cl1_2_enables); + } } static void aspm_l1ss_init(struct pcie_link_state *link) @@ -757,43 +751,6 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) PCI_L1SS_CTL1_L1SS_MASK, val); } -void pci_save_aspm_l1ss_state(struct pci_dev *dev) -{ - struct pci_cap_saved_state *save_state; - u16 l1ss = dev->l1ss; - u32 *cap; - - if (!l1ss) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - pci_read_config_dword(dev, l1ss + PCI_L1SS_CTL2, cap++); - pci_read_config_dword(dev, l1ss + PCI_L1SS_CTL1, cap++); -} - -void pci_restore_aspm_l1ss_state(struct pci_dev *dev) -{ - struct pci_cap_saved_state *save_state; - u32 *cap, ctl1, 
ctl2; - u16 l1ss = dev->l1ss; - - if (!l1ss) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - ctl2 = *cap++; - ctl1 = *cap; - aspm_program_l1ss(dev, ctl1, ctl2); -} - static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val) { pcie_capability_clear_and_set_word(pdev, PCI_EXP_LNKCTL, diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index b80a9b74662b..1deb61b22bc7 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1576,7 +1576,6 @@ static int arm_cmn_event_init(struct perf_event *event) hw->dn++; continue; } - hw->dtcs_used |= arm_cmn_node_to_xp(cmn, dn)->dtc; hw->num_dns++; if (bynodeid) break; @@ -1589,6 +1588,12 @@ static int arm_cmn_event_init(struct perf_event *event) nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } + /* + * Keep assuming non-cycles events count in all DTC domains; turns out + * it's hard to make a worthwhile optimisation around this, short of + * going all-in with domain-local counter allocation as well. 
+ */ + hw->dtcs_used = (1U << cmn->num_dtcs) - 1; return arm_cmn_validate_group(cmn, event); } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 9b593f985805..40f70f83daba 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -550,13 +550,7 @@ static void armpmu_disable(struct pmu *pmu) static bool armpmu_filter(struct pmu *pmu, int cpu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); - bool ret; - - ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus); - if (ret && armpmu->filter) - return armpmu->filter(pmu, cpu); - - return ret; + return !cpumask_test_cpu(cpu, &armpmu->supported_cpus); } static ssize_t cpus_show(struct device *dev, diff --git a/drivers/pinctrl/aspeed/pinctrl-aspeed.c b/drivers/pinctrl/aspeed/pinctrl-aspeed.c index 3945612900e6..9c6ee46ac7a0 100644 --- a/drivers/pinctrl/aspeed/pinctrl-aspeed.c +++ b/drivers/pinctrl/aspeed/pinctrl-aspeed.c @@ -93,10 +93,19 @@ static int aspeed_sig_expr_enable(struct aspeed_pinmux_data *ctx, static int aspeed_sig_expr_disable(struct aspeed_pinmux_data *ctx, const struct aspeed_sig_expr *expr) { + int ret; + pr_debug("Disabling signal %s for %s\n", expr->signal, expr->function); - return aspeed_sig_expr_set(ctx, expr, false); + ret = aspeed_sig_expr_eval(ctx, expr, true); + if (ret < 0) + return ret; + + if (ret) + return aspeed_sig_expr_set(ctx, expr, false); + + return 0; } /** @@ -114,7 +123,7 @@ static int aspeed_disable_sig(struct aspeed_pinmux_data *ctx, int ret = 0; if (!exprs) - return true; + return -EINVAL; while (*exprs && !ret) { ret = aspeed_sig_expr_disable(ctx, *exprs); diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index cc3aaba24188..e49f271de936 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -1709,6 +1709,12 @@ const struct intel_pinctrl_soc_data *intel_pinctrl_get_soc_data(struct platform_ EXPORT_SYMBOL_GPL(intel_pinctrl_get_soc_data); #ifdef CONFIG_PM_SLEEP +static bool 
__intel_gpio_is_direct_irq(u32 value) +{ + return (value & PADCFG0_GPIROUTIOXAPIC) && (value & PADCFG0_GPIOTXDIS) && + (__intel_gpio_get_gpio_mode(value) == PADCFG0_PMODE_GPIO); +} + static bool intel_pinctrl_should_save(struct intel_pinctrl *pctrl, unsigned int pin) { const struct pin_desc *pd = pin_desc_get(pctrl->pctldev, pin); @@ -1742,8 +1748,7 @@ static bool intel_pinctrl_should_save(struct intel_pinctrl *pctrl, unsigned int * See https://bugzilla.kernel.org/show_bug.cgi?id=214749. */ value = readl(intel_get_padcfg(pctrl, pin, PADCFG0)); - if ((value & PADCFG0_GPIROUTIOXAPIC) && (value & PADCFG0_GPIOTXDIS) && - (__intel_gpio_get_gpio_mode(value) == PADCFG0_PMODE_GPIO)) + if (__intel_gpio_is_direct_irq(value)) return true; return false; @@ -1873,7 +1878,12 @@ int intel_pinctrl_resume_noirq(struct device *dev) for (i = 0; i < pctrl->soc->npins; i++) { const struct pinctrl_pin_desc *desc = &pctrl->soc->pins[i]; - if (!intel_pinctrl_should_save(pctrl, desc->number)) + if (!(intel_pinctrl_should_save(pctrl, desc->number) || + /* + * If the firmware mangled the register contents too much, + * check the saved value for the Direct IRQ mode. 
+ */ + __intel_gpio_is_direct_irq(pads[i].padcfg0))) continue; intel_restore_padcfg(pctrl, desc->number, PADCFG0, pads[i].padcfg0); diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8195.c b/drivers/pinctrl/mediatek/pinctrl-mt8195.c index 89557c7ed2ab..09c4dcef9338 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8195.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8195.c @@ -659,7 +659,7 @@ static const struct mtk_pin_field_calc mt8195_pin_drv_range[] = { PIN_FIELD_BASE(10, 10, 4, 0x010, 0x10, 9, 3), PIN_FIELD_BASE(11, 11, 4, 0x000, 0x10, 24, 3), PIN_FIELD_BASE(12, 12, 4, 0x010, 0x10, 12, 3), - PIN_FIELD_BASE(13, 13, 4, 0x010, 0x10, 27, 3), + PIN_FIELD_BASE(13, 13, 4, 0x000, 0x10, 27, 3), PIN_FIELD_BASE(14, 14, 4, 0x010, 0x10, 15, 3), PIN_FIELD_BASE(15, 15, 4, 0x010, 0x10, 0, 3), PIN_FIELD_BASE(16, 16, 4, 0x010, 0x10, 18, 3), @@ -708,7 +708,7 @@ static const struct mtk_pin_field_calc mt8195_pin_drv_range[] = { PIN_FIELD_BASE(78, 78, 3, 0x000, 0x10, 15, 3), PIN_FIELD_BASE(79, 79, 3, 0x000, 0x10, 18, 3), PIN_FIELD_BASE(80, 80, 3, 0x000, 0x10, 21, 3), - PIN_FIELD_BASE(81, 81, 3, 0x000, 0x10, 28, 3), + PIN_FIELD_BASE(81, 81, 3, 0x000, 0x10, 24, 3), PIN_FIELD_BASE(82, 82, 3, 0x000, 0x10, 27, 3), PIN_FIELD_BASE(83, 83, 3, 0x010, 0x10, 0, 3), PIN_FIELD_BASE(84, 84, 3, 0x010, 0x10, 3, 3), diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 9bc6e3922e78..32c3edaf9038 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -365,6 +365,7 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) } else { debounce_enable = " ∅"; + time = 0; } snprintf(debounce_value, sizeof(debounce_value), "%u", time * unit); seq_printf(s, "debounce %s (🕑 %sus)| ", debounce_enable, debounce_value); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 99c3745da456..190923757cda 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -372,6 +372,8 @@ static int 
pcs_set_mux(struct pinctrl_dev *pctldev, unsigned fselector, if (!pcs->fmask) return 0; function = pinmux_generic_get_function(pctldev, fselector); + if (!function) + return -EINVAL; func = function->data; if (!func) return -EINVAL; diff --git a/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c index c3c8c34148f1..e22d03ce292e 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c @@ -105,7 +105,7 @@ static const struct pinctrl_pin_desc sm8450_lpi_pins[] = { static const char * const swr_tx_clk_groups[] = { "gpio0" }; static const char * const swr_tx_data_groups[] = { "gpio1", "gpio2", "gpio14" }; static const char * const swr_rx_clk_groups[] = { "gpio3" }; -static const char * const swr_rx_data_groups[] = { "gpio4", "gpio5", "gpio15" }; +static const char * const swr_rx_data_groups[] = { "gpio4", "gpio5" }; static const char * const dmic1_clk_groups[] = { "gpio6" }; static const char * const dmic1_data_groups[] = { "gpio7" }; static const char * const dmic2_clk_groups[] = { "gpio8" }; diff --git a/drivers/platform/x86/amd/Kconfig b/drivers/platform/x86/amd/Kconfig index a825af8126c8..2ce8cb2170df 100644 --- a/drivers/platform/x86/amd/Kconfig +++ b/drivers/platform/x86/amd/Kconfig @@ -8,6 +8,7 @@ source "drivers/platform/x86/amd/pmf/Kconfig" config AMD_PMC tristate "AMD SoC PMC driver" depends on ACPI && PCI && RTC_CLASS + select SERIO help The driver provides support for AMD Power Management Controller primarily responsible for S2Idle transactions that are driven from diff --git a/drivers/platform/x86/amd/pmc.c b/drivers/platform/x86/amd/pmc.c index 8d924986381b..3cbb01ec10e3 100644 --- a/drivers/platform/x86/amd/pmc.c +++ b/drivers/platform/x86/amd/pmc.c @@ -22,6 +22,7 @@ #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/rtc.h> +#include <linux/serio.h> #include <linux/suspend.h> #include <linux/seq_file.h> #include <linux/uaccess.h> 
@@ -160,6 +161,10 @@ static bool enable_stb; module_param(enable_stb, bool, 0644); MODULE_PARM_DESC(enable_stb, "Enable the STB debug mechanism"); +static bool disable_workarounds; +module_param(disable_workarounds, bool, 0644); +MODULE_PARM_DESC(disable_workarounds, "Disable workarounds for platform bugs"); + static struct amd_pmc_dev pmc; static int amd_pmc_send_cmd(struct amd_pmc_dev *dev, u32 arg, u32 *data, u8 msg, bool ret); static int amd_pmc_read_stb(struct amd_pmc_dev *dev, u32 *buf); @@ -653,6 +658,33 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev) return -EINVAL; } +static int amd_pmc_czn_wa_irq1(struct amd_pmc_dev *pdev) +{ + struct device *d; + int rc; + + if (!pdev->major) { + rc = amd_pmc_get_smu_version(pdev); + if (rc) + return rc; + } + + if (pdev->major > 64 || (pdev->major == 64 && pdev->minor > 65)) + return 0; + + d = bus_find_device_by_name(&serio_bus, NULL, "serio0"); + if (!d) + return 0; + if (device_may_wakeup(d)) { + dev_info_once(d, "Disabling IRQ1 wakeup source to avoid platform firmware bug\n"); + disable_irq_wake(1); + device_set_wakeup_enable(d, false); + } + put_device(d); + + return 0; +} + static int amd_pmc_verify_czn_rtc(struct amd_pmc_dev *pdev, u32 *arg) { struct rtc_device *rtc_device; @@ -715,8 +747,8 @@ static void amd_pmc_s2idle_prepare(void) /* Reset and Start SMU logging - to monitor the s0i3 stats */ amd_pmc_setup_smu_logging(pdev); - /* Activate CZN specific RTC functionality */ - if (pdev->cpu_id == AMD_CPU_ID_CZN) { + /* Activate CZN specific platform bug workarounds */ + if (pdev->cpu_id == AMD_CPU_ID_CZN && !disable_workarounds) { rc = amd_pmc_verify_czn_rtc(pdev, &arg); if (rc) { dev_err(pdev->dev, "failed to set RTC: %d\n", rc); @@ -782,6 +814,25 @@ static struct acpi_s2idle_dev_ops amd_pmc_s2idle_dev_ops = { .check = amd_pmc_s2idle_check, .restore = amd_pmc_s2idle_restore, }; + +static int __maybe_unused amd_pmc_suspend_handler(struct device *dev) +{ + struct amd_pmc_dev *pdev = dev_get_drvdata(dev); 
+ + if (pdev->cpu_id == AMD_CPU_ID_CZN && !disable_workarounds) { + int rc = amd_pmc_czn_wa_irq1(pdev); + + if (rc) { + dev_err(pdev->dev, "failed to adjust keyboard wakeup: %d\n", rc); + return rc; + } + } + + return 0; +} + +static SIMPLE_DEV_PM_OPS(amd_pmc_pm, amd_pmc_suspend_handler, NULL); + #endif static const struct pci_device_id pmc_pci_ids[] = { @@ -980,6 +1031,9 @@ static struct platform_driver amd_pmc_driver = { .name = "amd_pmc", .acpi_match_table = amd_pmc_acpi_ids, .dev_groups = pmc_groups, +#ifdef CONFIG_SUSPEND + .pm = &amd_pmc_pm, +#endif }, .probe = amd_pmc_probe, .remove = amd_pmc_remove, diff --git a/drivers/platform/x86/amd/pmf/auto-mode.c b/drivers/platform/x86/amd/pmf/auto-mode.c index 644af42e07cf..96a8e1832c05 100644 --- a/drivers/platform/x86/amd/pmf/auto-mode.c +++ b/drivers/platform/x86/amd/pmf/auto-mode.c @@ -275,13 +275,8 @@ int amd_pmf_reset_amt(struct amd_pmf_dev *dev) */ if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { - int mode = amd_pmf_get_pprof_modes(dev); - - if (mode < 0) - return mode; - dev_dbg(dev->dev, "resetting AMT thermals\n"); - amd_pmf_update_slider(dev, SLIDER_OP_SET, mode, NULL); + amd_pmf_set_sps_power_limits(dev); } return 0; } @@ -299,7 +294,5 @@ void amd_pmf_deinit_auto_mode(struct amd_pmf_dev *dev) void amd_pmf_init_auto_mode(struct amd_pmf_dev *dev) { amd_pmf_load_defaults_auto_mode(dev); - /* update the thermal limits for Automode */ - amd_pmf_set_automode(dev, config_store.current_mode, NULL); amd_pmf_init_metrics_table(dev); } diff --git a/drivers/platform/x86/amd/pmf/cnqf.c b/drivers/platform/x86/amd/pmf/cnqf.c index 3f9731a2ac28..4beb22a19466 100644 --- a/drivers/platform/x86/amd/pmf/cnqf.c +++ b/drivers/platform/x86/amd/pmf/cnqf.c @@ -103,7 +103,7 @@ int amd_pmf_trans_cnqf(struct amd_pmf_dev *dev, int socket_power, ktime_t time_l src = amd_pmf_cnqf_get_power_source(dev); - if (dev->current_profile == PLATFORM_PROFILE_BALANCED) { + if (is_pprof_balanced(dev)) { amd_pmf_set_cnqf(dev, 
src, config_store.current_mode, NULL); } else { /* @@ -307,13 +307,9 @@ static ssize_t cnqf_enable_store(struct device *dev, const char *buf, size_t count) { struct amd_pmf_dev *pdev = dev_get_drvdata(dev); - int mode, result, src; + int result, src; bool input; - mode = amd_pmf_get_pprof_modes(pdev); - if (mode < 0) - return mode; - result = kstrtobool(buf, &input); if (result) return result; @@ -321,11 +317,11 @@ static ssize_t cnqf_enable_store(struct device *dev, src = amd_pmf_cnqf_get_power_source(pdev); pdev->cnqf_enabled = input; - if (pdev->cnqf_enabled && pdev->current_profile == PLATFORM_PROFILE_BALANCED) { + if (pdev->cnqf_enabled && is_pprof_balanced(pdev)) { amd_pmf_set_cnqf(pdev, src, config_store.current_mode, NULL); } else { if (is_apmf_func_supported(pdev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) - amd_pmf_update_slider(pdev, SLIDER_OP_SET, mode, NULL); + amd_pmf_set_sps_power_limits(pdev); } dev_dbg(pdev->dev, "Received CnQF %s\n", input ? "on" : "off"); @@ -386,7 +382,7 @@ int amd_pmf_init_cnqf(struct amd_pmf_dev *dev) dev->cnqf_enabled = amd_pmf_check_flags(dev); /* update the thermal for CnQF */ - if (dev->cnqf_enabled && dev->current_profile == PLATFORM_PROFILE_BALANCED) { + if (dev->cnqf_enabled && is_pprof_balanced(dev)) { src = amd_pmf_cnqf_get_power_source(dev); amd_pmf_set_cnqf(dev, src, config_store.current_mode, NULL); } diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index a5f5a4bcff6d..da23639071d7 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -58,6 +58,25 @@ static bool force_load; module_param(force_load, bool, 0444); MODULE_PARM_DESC(force_load, "Force load this driver on supported older platforms (experimental)"); +static int amd_pmf_pwr_src_notify_call(struct notifier_block *nb, unsigned long event, void *data) +{ + struct amd_pmf_dev *pmf = container_of(nb, struct amd_pmf_dev, pwr_src_notifier); + + if (event != PSY_EVENT_PROP_CHANGED) + return 
NOTIFY_OK; + + if (is_apmf_func_supported(pmf, APMF_FUNC_AUTO_MODE) || + is_apmf_func_supported(pmf, APMF_FUNC_DYN_SLIDER_DC) || + is_apmf_func_supported(pmf, APMF_FUNC_DYN_SLIDER_AC)) { + if ((pmf->amt_enabled || pmf->cnqf_enabled) && is_pprof_balanced(pmf)) + return NOTIFY_DONE; + } + + amd_pmf_set_sps_power_limits(pmf); + + return NOTIFY_OK; +} + static int current_power_limits_show(struct seq_file *seq, void *unused) { struct amd_pmf_dev *dev = seq->private; @@ -366,14 +385,18 @@ static int amd_pmf_probe(struct platform_device *pdev) if (!dev->regbase) return -ENOMEM; + mutex_init(&dev->lock); + mutex_init(&dev->update_mutex); + apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); amd_pmf_init_features(dev); apmf_install_handler(dev); amd_pmf_dbgfs_register(dev); - mutex_init(&dev->lock); - mutex_init(&dev->update_mutex); + dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; + power_supply_reg_notifier(&dev->pwr_src_notifier); + dev_info(dev->dev, "registered PMF device successfully\n"); return 0; @@ -383,11 +406,12 @@ static int amd_pmf_remove(struct platform_device *pdev) { struct amd_pmf_dev *dev = platform_get_drvdata(pdev); - mutex_destroy(&dev->lock); - mutex_destroy(&dev->update_mutex); + power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_features(dev); apmf_acpi_deinit(dev); amd_pmf_dbgfs_unregister(dev); + mutex_destroy(&dev->lock); + mutex_destroy(&dev->update_mutex); kfree(dev->buf); return 0; } diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 84bbe2c6ea61..06c30cdc0573 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -169,6 +169,7 @@ struct amd_pmf_dev { struct mutex update_mutex; /* protects race between ACPI handler and metrics thread */ bool cnqf_enabled; bool cnqf_supported; + struct notifier_block pwr_src_notifier; }; struct apmf_sps_prop_granular { @@ -391,9 +392,11 @@ int amd_pmf_init_sps(struct amd_pmf_dev *dev); void 
amd_pmf_deinit_sps(struct amd_pmf_dev *dev); int apmf_get_static_slider_granular(struct amd_pmf_dev *pdev, struct apmf_static_slider_granular_output *output); +bool is_pprof_balanced(struct amd_pmf_dev *pmf); int apmf_update_fan_idx(struct amd_pmf_dev *pdev, bool manual, u32 idx); +int amd_pmf_set_sps_power_limits(struct amd_pmf_dev *pmf); /* Auto Mode Layer */ int apmf_get_auto_mode_def(struct amd_pmf_dev *pdev, struct apmf_auto_mode *data); diff --git a/drivers/platform/x86/amd/pmf/sps.c b/drivers/platform/x86/amd/pmf/sps.c index dba7e36962dc..bed762d47a14 100644 --- a/drivers/platform/x86/amd/pmf/sps.c +++ b/drivers/platform/x86/amd/pmf/sps.c @@ -70,6 +70,24 @@ void amd_pmf_update_slider(struct amd_pmf_dev *dev, bool op, int idx, } } +int amd_pmf_set_sps_power_limits(struct amd_pmf_dev *pmf) +{ + int mode; + + mode = amd_pmf_get_pprof_modes(pmf); + if (mode < 0) + return mode; + + amd_pmf_update_slider(pmf, SLIDER_OP_SET, mode, NULL); + + return 0; +} + +bool is_pprof_balanced(struct amd_pmf_dev *pmf) +{ + return (pmf->current_profile == PLATFORM_PROFILE_BALANCED) ? 
true : false; +} + static int amd_pmf_profile_get(struct platform_profile_handler *pprof, enum platform_profile_option *profile) { @@ -105,15 +123,10 @@ static int amd_pmf_profile_set(struct platform_profile_handler *pprof, enum platform_profile_option profile) { struct amd_pmf_dev *pmf = container_of(pprof, struct amd_pmf_dev, pprof); - int mode; pmf->current_profile = profile; - mode = amd_pmf_get_pprof_modes(pmf); - if (mode < 0) - return mode; - amd_pmf_update_slider(pmf, SLIDER_OP_SET, mode, NULL); - return 0; + return amd_pmf_set_sps_power_limits(pmf); } int amd_pmf_init_sps(struct amd_pmf_dev *dev) @@ -123,6 +136,9 @@ int amd_pmf_init_sps(struct amd_pmf_dev *dev) dev->current_profile = PLATFORM_PROFILE_BALANCED; amd_pmf_load_defaults_sps(dev); + /* update SPS balanced power mode thermals */ + amd_pmf_set_sps_power_limits(dev); + dev->pprof.profile_get = amd_pmf_profile_get; dev->pprof.profile_set = amd_pmf_profile_set; diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index ca33df7ea550..9333f82cfa8a 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -64,29 +64,6 @@ struct apple_gmux_data { static struct apple_gmux_data *apple_gmux_data; -/* - * gmux port offsets. Many of these are not yet used, but may be in the - * future, and it's useful to have them documented here anyhow. 
- */ -#define GMUX_PORT_VERSION_MAJOR 0x04 -#define GMUX_PORT_VERSION_MINOR 0x05 -#define GMUX_PORT_VERSION_RELEASE 0x06 -#define GMUX_PORT_SWITCH_DISPLAY 0x10 -#define GMUX_PORT_SWITCH_GET_DISPLAY 0x11 -#define GMUX_PORT_INTERRUPT_ENABLE 0x14 -#define GMUX_PORT_INTERRUPT_STATUS 0x16 -#define GMUX_PORT_SWITCH_DDC 0x28 -#define GMUX_PORT_SWITCH_EXTERNAL 0x40 -#define GMUX_PORT_SWITCH_GET_EXTERNAL 0x41 -#define GMUX_PORT_DISCRETE_POWER 0x50 -#define GMUX_PORT_MAX_BRIGHTNESS 0x70 -#define GMUX_PORT_BRIGHTNESS 0x74 -#define GMUX_PORT_VALUE 0xc2 -#define GMUX_PORT_READ 0xd0 -#define GMUX_PORT_WRITE 0xd4 - -#define GMUX_MIN_IO_LEN (GMUX_PORT_BRIGHTNESS + 4) - #define GMUX_INTERRUPT_ENABLE 0xff #define GMUX_INTERRUPT_DISABLE 0x00 @@ -249,23 +226,6 @@ static void gmux_write32(struct apple_gmux_data *gmux_data, int port, gmux_pio_write32(gmux_data, port, val); } -static bool gmux_is_indexed(struct apple_gmux_data *gmux_data) -{ - u16 val; - - outb(0xaa, gmux_data->iostart + 0xcc); - outb(0x55, gmux_data->iostart + 0xcd); - outb(0x00, gmux_data->iostart + 0xce); - - val = inb(gmux_data->iostart + 0xcc) | - (inb(gmux_data->iostart + 0xcd) << 8); - - if (val == 0x55aa) - return true; - - return false; -} - /** * DOC: Backlight control * @@ -605,60 +565,43 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) int ret = -ENXIO; acpi_status status; unsigned long long gpe; + bool indexed = false; + u32 version; if (apple_gmux_data) return -EBUSY; + if (!apple_gmux_detect(pnp, &indexed)) { + pr_info("gmux device not present\n"); + return -ENODEV; + } + gmux_data = kzalloc(sizeof(*gmux_data), GFP_KERNEL); if (!gmux_data) return -ENOMEM; pnp_set_drvdata(pnp, gmux_data); res = pnp_get_resource(pnp, IORESOURCE_IO, 0); - if (!res) { - pr_err("Failed to find gmux I/O resource\n"); - goto err_free; - } - gmux_data->iostart = res->start; gmux_data->iolen = resource_size(res); - if (gmux_data->iolen < GMUX_MIN_IO_LEN) { - pr_err("gmux I/O region too small (%lu < 
%u)\n", - gmux_data->iolen, GMUX_MIN_IO_LEN); - goto err_free; - } - if (!request_region(gmux_data->iostart, gmux_data->iolen, "Apple gmux")) { pr_err("gmux I/O already in use\n"); goto err_free; } - /* - * Invalid version information may indicate either that the gmux - * device isn't present or that it's a new one that uses indexed - * io - */ - - ver_major = gmux_read8(gmux_data, GMUX_PORT_VERSION_MAJOR); - ver_minor = gmux_read8(gmux_data, GMUX_PORT_VERSION_MINOR); - ver_release = gmux_read8(gmux_data, GMUX_PORT_VERSION_RELEASE); - if (ver_major == 0xff && ver_minor == 0xff && ver_release == 0xff) { - if (gmux_is_indexed(gmux_data)) { - u32 version; - mutex_init(&gmux_data->index_lock); - gmux_data->indexed = true; - version = gmux_read32(gmux_data, - GMUX_PORT_VERSION_MAJOR); - ver_major = (version >> 24) & 0xff; - ver_minor = (version >> 16) & 0xff; - ver_release = (version >> 8) & 0xff; - } else { - pr_info("gmux device not present\n"); - ret = -ENODEV; - goto err_release; - } + if (indexed) { + mutex_init(&gmux_data->index_lock); + gmux_data->indexed = true; + version = gmux_read32(gmux_data, GMUX_PORT_VERSION_MAJOR); + ver_major = (version >> 24) & 0xff; + ver_minor = (version >> 16) & 0xff; + ver_release = (version >> 8) & 0xff; + } else { + ver_major = gmux_read8(gmux_data, GMUX_PORT_VERSION_MAJOR); + ver_minor = gmux_read8(gmux_data, GMUX_PORT_VERSION_MINOR); + ver_release = gmux_read8(gmux_data, GMUX_PORT_VERSION_RELEASE); } pr_info("Found gmux version %d.%d.%d [%s]\n", ver_major, ver_minor, ver_release, (gmux_data->indexed ? 
"indexed" : "classic")); diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 104188d70988..1038dfdcdd32 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -225,6 +225,7 @@ struct asus_wmi { int tablet_switch_event_code; u32 tablet_switch_dev_id; + bool tablet_switch_inverted; enum fan_type fan_type; enum fan_type gpu_fan_type; @@ -493,6 +494,13 @@ static bool asus_wmi_dev_is_present(struct asus_wmi *asus, u32 dev_id) } /* Input **********************************************************************/ +static void asus_wmi_tablet_sw_report(struct asus_wmi *asus, bool value) +{ + input_report_switch(asus->inputdev, SW_TABLET_MODE, + asus->tablet_switch_inverted ? !value : value); + input_sync(asus->inputdev); +} + static void asus_wmi_tablet_sw_init(struct asus_wmi *asus, u32 dev_id, int event_code) { struct device *dev = &asus->platform_device->dev; @@ -501,7 +509,7 @@ static void asus_wmi_tablet_sw_init(struct asus_wmi *asus, u32 dev_id, int event result = asus_wmi_get_devstate_simple(asus, dev_id); if (result >= 0) { input_set_capability(asus->inputdev, EV_SW, SW_TABLET_MODE); - input_report_switch(asus->inputdev, SW_TABLET_MODE, result); + asus_wmi_tablet_sw_report(asus, result); asus->tablet_switch_dev_id = dev_id; asus->tablet_switch_event_code = event_code; } else if (result == -ENODEV) { @@ -534,6 +542,7 @@ static int asus_wmi_input_init(struct asus_wmi *asus) case asus_wmi_no_tablet_switch: break; case asus_wmi_kbd_dock_devid: + asus->tablet_switch_inverted = true; asus_wmi_tablet_sw_init(asus, ASUS_WMI_DEVID_KBD_DOCK, NOTIFY_KBD_DOCK_CHANGE); break; case asus_wmi_lid_flip_devid: @@ -573,10 +582,8 @@ static void asus_wmi_tablet_mode_get_state(struct asus_wmi *asus) return; result = asus_wmi_get_devstate_simple(asus, asus->tablet_switch_dev_id); - if (result >= 0) { - input_report_switch(asus->inputdev, SW_TABLET_MODE, result); - input_sync(asus->inputdev); - } + if (result >= 0) + 
asus_wmi_tablet_sw_report(asus, result); } /* dGPU ********************************************************************/ diff --git a/drivers/platform/x86/dell/dell-wmi-base.c b/drivers/platform/x86/dell/dell-wmi-base.c index 0a259a27459f..502783a7adb1 100644 --- a/drivers/platform/x86/dell/dell-wmi-base.c +++ b/drivers/platform/x86/dell/dell-wmi-base.c @@ -261,6 +261,9 @@ static const struct key_entry dell_wmi_keymap_type_0010[] = { { KE_KEY, 0x57, { KEY_BRIGHTNESSDOWN } }, { KE_KEY, 0x58, { KEY_BRIGHTNESSUP } }, + /*Speaker Mute*/ + { KE_KEY, 0x109, { KEY_MUTE} }, + /* Mic mute */ { KE_KEY, 0x150, { KEY_MICMUTE } }, diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index 5e7e6659a849..322cfaeda17b 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -141,6 +141,7 @@ static u8 gigabyte_wmi_detect_sensor_usability(struct wmi_device *wdev) static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M DS3H-CF"), + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M DS3H WIFI-CF"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M S2H V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE AX V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE"), diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 0a99058be813..2ef201b625b3 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -90,6 +90,7 @@ enum hp_wmi_event_ids { HPWMI_PEAKSHIFT_PERIOD = 0x0F, HPWMI_BATTERY_CHARGE_PERIOD = 0x10, HPWMI_SANITIZATION_MODE = 0x17, + HPWMI_OMEN_KEY = 0x1D, HPWMI_SMART_EXPERIENCE_APP = 0x21, }; @@ -216,6 +217,8 @@ static const struct key_entry hp_wmi_keymap[] = { { KE_KEY, 0x213b, { KEY_INFO } }, { KE_KEY, 0x2169, { KEY_ROTATE_DISPLAY } }, { KE_KEY, 0x216a, { KEY_SETUP } }, + { KE_KEY, 0x21a5, { KEY_PROG2 } }, /* HP Omen Key */ + { KE_KEY, 0x21a7, { KEY_FN_ESC } }, { KE_KEY, 0x21a9, { 
KEY_TOUCHPAD_OFF } }, { KE_KEY, 0x121a9, { KEY_TOUCHPAD_ON } }, { KE_KEY, 0x231b, { KEY_HELP } }, @@ -548,7 +551,7 @@ static int __init hp_wmi_enable_hotkeys(void) static int hp_wmi_set_block(void *data, bool blocked) { - enum hp_wmi_radio r = (enum hp_wmi_radio) data; + enum hp_wmi_radio r = (long)data; int query = BIT(r + 8) | ((!blocked) << r); int ret; @@ -810,6 +813,7 @@ static void hp_wmi_notify(u32 value, void *context) case HPWMI_SMART_ADAPTER: break; case HPWMI_BEZEL_BUTTON: + case HPWMI_OMEN_KEY: key_code = hp_wmi_read_int(HPWMI_HOTKEY_QUERY); if (key_code < 0) break; diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index bb81b8b1f7e9..89c5374e33b3 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -408,14 +408,23 @@ static const struct intel_vsec_platform_info dg1_info = { .quirks = VSEC_QUIRK_NO_DVSEC | VSEC_QUIRK_EARLY_HW, }; +/* MTL info */ +static const struct intel_vsec_platform_info mtl_info = { + .quirks = VSEC_QUIRK_NO_WATCHER | VSEC_QUIRK_NO_CRASHLOG, +}; + #define PCI_DEVICE_ID_INTEL_VSEC_ADL 0x467d #define PCI_DEVICE_ID_INTEL_VSEC_DG1 0x490e +#define PCI_DEVICE_ID_INTEL_VSEC_MTL_M 0x7d0d +#define PCI_DEVICE_ID_INTEL_VSEC_MTL_S 0xad0d #define PCI_DEVICE_ID_INTEL_VSEC_OOBMSM 0x09a7 #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_ADL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_DG1, &dg1_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_MTL_M, &mtl_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_MTL_S, &mtl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_OOBMSM, &(struct intel_vsec_platform_info) {}) }, { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index a95946800ae9..32c10457399e 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ 
b/drivers/platform/x86/thinkpad_acpi.c @@ -5563,7 +5563,7 @@ static int light_sysfs_set(struct led_classdev *led_cdev, static enum led_brightness light_sysfs_get(struct led_classdev *led_cdev) { - return (light_get_status() == 1) ? LED_FULL : LED_OFF; + return (light_get_status() == 1) ? LED_ON : LED_OFF; } static struct tpacpi_led_classdev tpacpi_led_thinklight = { @@ -10496,8 +10496,7 @@ static int dytc_profile_set(struct platform_profile_handler *pprof, if (err) goto unlock; } - } - if (dytc_capabilities & BIT(DYTC_FC_PSC)) { + } else if (dytc_capabilities & BIT(DYTC_FC_PSC)) { err = dytc_command(DYTC_SET_COMMAND(DYTC_FUNCTION_PSC, perfmode, 1), &output); if (err) goto unlock; @@ -10525,14 +10524,16 @@ static void dytc_profile_refresh(void) err = dytc_command(DYTC_CMD_MMC_GET, &output); else err = dytc_cql_command(DYTC_CMD_GET, &output); - } else if (dytc_capabilities & BIT(DYTC_FC_PSC)) + funcmode = DYTC_FUNCTION_MMC; + } else if (dytc_capabilities & BIT(DYTC_FC_PSC)) { err = dytc_command(DYTC_CMD_GET, &output); - + /* Check if we are PSC mode, or have AMT enabled */ + funcmode = (output >> DYTC_GET_FUNCTION_BIT) & 0xF; + } mutex_unlock(&dytc_mutex); if (err) return; - funcmode = (output >> DYTC_GET_FUNCTION_BIT) & 0xF; perfmode = (output >> DYTC_GET_MODE_BIT) & 0xF; convert_dytc_to_profile(funcmode, perfmode, &profile); if (profile != dytc_current_profile) { diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index f00995390fdf..13802a3c3591 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -1098,6 +1098,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, }, { + /* Chuwi Vi8 (CWI501) */ + .driver_data = (void *)&chuwi_vi8_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Insyde"), + DMI_MATCH(DMI_PRODUCT_NAME, "i86"), + DMI_MATCH(DMI_BIOS_VERSION, "CHUWI.W86JLBNR01"), + }, + }, + { /* Chuwi Vi8 (CWI506) */ .driver_data = (void *)&chuwi_vi8_data, .matches = { 
diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index e991cccdb6e9..1e8bc6cc1e12 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -188,9 +188,10 @@ static int efi_set_time(struct device *dev, struct rtc_time *tm) static int efi_procfs(struct device *dev, struct seq_file *seq) { - efi_time_t eft, alm; - efi_time_cap_t cap; - efi_bool_t enabled, pending; + efi_time_t eft, alm; + efi_time_cap_t cap; + efi_bool_t enabled, pending; + struct rtc_device *rtc = dev_get_drvdata(dev); memset(&eft, 0, sizeof(eft)); memset(&alm, 0, sizeof(alm)); @@ -213,23 +214,25 @@ static int efi_procfs(struct device *dev, struct seq_file *seq) /* XXX fixme: convert to string? */ seq_printf(seq, "Timezone\t: %u\n", eft.timezone); - seq_printf(seq, - "Alarm Time\t: %u:%u:%u.%09u\n" - "Alarm Date\t: %u-%u-%u\n" - "Alarm Daylight\t: %u\n" - "Enabled\t\t: %s\n" - "Pending\t\t: %s\n", - alm.hour, alm.minute, alm.second, alm.nanosecond, - alm.year, alm.month, alm.day, - alm.daylight, - enabled == 1 ? "yes" : "no", - pending == 1 ? "yes" : "no"); - - if (eft.timezone == EFI_UNSPECIFIED_TIMEZONE) - seq_puts(seq, "Timezone\t: unspecified\n"); - else - /* XXX fixme: convert to string? */ - seq_printf(seq, "Timezone\t: %u\n", alm.timezone); + if (test_bit(RTC_FEATURE_ALARM, rtc->features)) { + seq_printf(seq, + "Alarm Time\t: %u:%u:%u.%09u\n" + "Alarm Date\t: %u-%u-%u\n" + "Alarm Daylight\t: %u\n" + "Enabled\t\t: %s\n" + "Pending\t\t: %s\n", + alm.hour, alm.minute, alm.second, alm.nanosecond, + alm.year, alm.month, alm.day, + alm.daylight, + enabled == 1 ? "yes" : "no", + pending == 1 ? "yes" : "no"); + + if (eft.timezone == EFI_UNSPECIFIED_TIMEZONE) + seq_puts(seq, "Timezone\t: unspecified\n"); + else + /* XXX fixme: convert to string? 
*/ + seq_printf(seq, "Timezone\t: %u\n", alm.timezone); + } /* * now prints the capabilities @@ -269,7 +272,10 @@ static int __init efi_rtc_probe(struct platform_device *dev) rtc->ops = &efi_rtc_ops; clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->features); - set_bit(RTC_FEATURE_ALARM_WAKEUP_ONLY, rtc->features); + if (efi_rt_services_supported(EFI_RT_SUPPORTED_WAKEUP_SERVICES)) + set_bit(RTC_FEATURE_ALARM_WAKEUP_ONLY, rtc->features); + else + clear_bit(RTC_FEATURE_ALARM, rtc->features); device_init_wakeup(&dev->dev, true); diff --git a/drivers/rtc/rtc-sunplus.c b/drivers/rtc/rtc-sunplus.c index e8e2ab1103fc..4b578e4d44f6 100644 --- a/drivers/rtc/rtc-sunplus.c +++ b/drivers/rtc/rtc-sunplus.c @@ -240,8 +240,8 @@ static int sp_rtc_probe(struct platform_device *plat_dev) if (IS_ERR(sp_rtc->reg_base)) return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->reg_base), "%s devm_ioremap_resource fail\n", RTC_REG_NAME); - dev_dbg(&plat_dev->dev, "res = 0x%x, reg_base = 0x%lx\n", - sp_rtc->res->start, (unsigned long)sp_rtc->reg_base); + dev_dbg(&plat_dev->dev, "res = %pR, reg_base = %p\n", + sp_rtc->res, sp_rtc->reg_base); sp_rtc->irq = platform_get_irq(plat_dev, 0); if (sp_rtc->irq < 0) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 49cc18a87473..29a2865b8e2e 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -981,6 +981,9 @@ queue_rtpg: * * Returns true if and only if alua_rtpg_work() will be called asynchronously. * That function is responsible for calling @qdata->fn(). + * + * Context: may be called from atomic context (alua_check()) only if the caller + * holds an sdev reference. 
*/ static bool alua_rtpg_queue(struct alua_port_group *pg, struct scsi_device *sdev, @@ -989,8 +992,6 @@ static bool alua_rtpg_queue(struct alua_port_group *pg, int start_queue = 0; unsigned long flags; - might_sleep(); - if (WARN_ON_ONCE(!pg) || scsi_device_get(sdev)) return false; diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 4dbf51e2623a..f6da34850af9 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -5850,7 +5850,7 @@ static int hpsa_scsi_host_alloc(struct ctlr_info *h) { struct Scsi_Host *sh; - sh = scsi_host_alloc(&hpsa_driver_template, sizeof(h)); + sh = scsi_host_alloc(&hpsa_driver_template, sizeof(struct ctlr_info)); if (sh == NULL) { dev_err(&h->pdev->dev, "scsi_host_alloc failed\n"); return -ENOMEM; diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 1d1cf641937c..0454d94e8cf0 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -849,7 +849,7 @@ static int iscsi_sw_tcp_host_get_param(struct Scsi_Host *shost, enum iscsi_host_param param, char *buf) { struct iscsi_sw_tcp_host *tcp_sw_host = iscsi_host_priv(shost); - struct iscsi_session *session = tcp_sw_host->session; + struct iscsi_session *session; struct iscsi_conn *conn; struct iscsi_tcp_conn *tcp_conn; struct iscsi_sw_tcp_conn *tcp_sw_conn; @@ -859,6 +859,7 @@ static int iscsi_sw_tcp_host_get_param(struct Scsi_Host *shost, switch (param) { case ISCSI_HOST_PARAM_IPADDRESS: + session = tcp_sw_host->session; if (!session) return -ENOTCONN; @@ -959,11 +960,13 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max, if (!cls_session) goto remove_host; session = cls_session->dd_data; - tcp_sw_host = iscsi_host_priv(shost); - tcp_sw_host->session = session; if (iscsi_tcp_r2tpool_alloc(session)) goto remove_session; + + /* We are now fully setup so expose the session to sysfs. 
*/ + tcp_sw_host = iscsi_host_priv(shost); + tcp_sw_host->session = session; return cls_session; remove_session: @@ -983,10 +986,17 @@ static void iscsi_sw_tcp_session_destroy(struct iscsi_cls_session *cls_session) if (WARN_ON_ONCE(session->leadconn)) return; + iscsi_session_remove(cls_session); + /* + * Our get_host_param needs to access the session, so remove the + * host from sysfs before freeing the session to make sure userspace + * is no longer accessing the callout. + */ + iscsi_host_remove(shost, false); + iscsi_tcp_r2tpool_free(cls_session->dd_data); - iscsi_session_teardown(cls_session); - iscsi_host_remove(shost, false); + iscsi_session_free(cls_session); iscsi_host_free(shost); } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index ef2fc860257e..127f3d7f19dc 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -3104,17 +3104,32 @@ dec_session_count: } EXPORT_SYMBOL_GPL(iscsi_session_setup); -/** - * iscsi_session_teardown - destroy session, host, and cls_session - * @cls_session: iscsi session +/* + * iscsi_session_remove - Remove session from iSCSI class. */ -void iscsi_session_teardown(struct iscsi_cls_session *cls_session) +void iscsi_session_remove(struct iscsi_cls_session *cls_session) { struct iscsi_session *session = cls_session->dd_data; - struct module *owner = cls_session->transport->owner; struct Scsi_Host *shost = session->host; iscsi_remove_session(cls_session); + /* + * host removal only has to wait for its children to be removed from + * sysfs, and iscsi_tcp needs to do iscsi_host_remove before freeing + * the session, so drop the session count here.
+ */ + iscsi_host_dec_session_cnt(shost); +} +EXPORT_SYMBOL_GPL(iscsi_session_remove); + +/** + * iscsi_session_free - Free iscsi session and its resources + * @cls_session: iscsi session + */ +void iscsi_session_free(struct iscsi_cls_session *cls_session) +{ + struct iscsi_session *session = cls_session->dd_data; + struct module *owner = cls_session->transport->owner; iscsi_pool_free(&session->cmdpool); kfree(session->password); @@ -3132,10 +3147,19 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->discovery_parent_type); iscsi_free_session(cls_session); - - iscsi_host_dec_session_cnt(shost); module_put(owner); } +EXPORT_SYMBOL_GPL(iscsi_session_free); + +/** + * iscsi_session_teardown - destroy session and cls_session + * @cls_session: iscsi session + */ +void iscsi_session_teardown(struct iscsi_cls_session *cls_session) +{ + iscsi_session_remove(cls_session); + iscsi_session_free(cls_session); +} EXPORT_SYMBOL_GPL(iscsi_session_teardown); /** diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1426b9b03612..9feb0323bc44 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -588,8 +588,6 @@ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; - might_sleep(); - put_device(&sdev->sdev_gendev); module_put(mod); } diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 7a6904a3928e..f9b18fdc7b3c 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1232,8 +1232,7 @@ static int scsi_probe_and_add_lun(struct scsi_target *starget, * that no LUN is present, so don't add sdev in these cases. * Two specific examples are: * 1) NetApp targets: return PQ=1, PDT=0x1f - * 2) IBM/2145 targets: return PQ=1, PDT=0 - * 3) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved" + * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved" * in the UFI 1.0 spec (we cannot rely on reserved bits).
* * References: @@ -1247,8 +1246,8 @@ static int scsi_probe_and_add_lun(struct scsi_target *starget, * PDT=00h Direct-access device (floppy) * PDT=1Fh none (no FDD connected to the requested logical unit) */ - if (((result[0] >> 5) == 1 || - (starget->pdt_1f_for_no_lun && (result[0] & 0x1f) == 0x1f)) && + if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) && + (result[0] & 0x1f) == 0x1f && !scsi_is_wlun(lun)) { SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev, "scsi scan: peripheral device type" diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 981d1bab2120..8ef9a5494340 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -451,6 +451,8 @@ static void scsi_device_dev_release(struct device *dev) struct scsi_vpd *vpd_pgb0 = NULL, *vpd_pgb1 = NULL, *vpd_pgb2 = NULL; unsigned long flags; + might_sleep(); + scsi_dh_release_device(sdev); parent = sdev->sdev_gendev.parent; diff --git a/drivers/spi/spi-dw-core.c b/drivers/spi/spi-dw-core.c index 99edddf9958b..c3bfb6c84cab 100644 --- a/drivers/spi/spi-dw-core.c +++ b/drivers/spi/spi-dw-core.c @@ -366,7 +366,7 @@ static void dw_spi_irq_setup(struct dw_spi *dws) * will be adjusted at the final stage of the IRQ-based SPI transfer * execution so not to lose the leftover of the incoming data. 
*/ - level = min_t(u16, dws->fifo_len / 2, dws->tx_len); + level = min_t(unsigned int, dws->fifo_len / 2, dws->tx_len); dw_writel(dws, DW_SPI_TXFTLR, level); dw_writel(dws, DW_SPI_RXFTLR, level - 1); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 15f174f4e056..3f33934f5429 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2220,11 +2220,26 @@ void spi_flush_queue(struct spi_controller *ctlr) /*-------------------------------------------------------------------------*/ #if defined(CONFIG_OF) +static void of_spi_parse_dt_cs_delay(struct device_node *nc, + struct spi_delay *delay, const char *prop) +{ + u32 value; + + if (!of_property_read_u32(nc, prop, &value)) { + if (value > U16_MAX) { + delay->value = DIV_ROUND_UP(value, 1000); + delay->unit = SPI_DELAY_UNIT_USECS; + } else { + delay->value = value; + delay->unit = SPI_DELAY_UNIT_NSECS; + } + } +} + static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { u32 value; - u16 cs_setup; int rc; /* Mode (clock phase/polarity/etc.) 
*/ @@ -2310,10 +2325,8 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, if (!of_property_read_u32(nc, "spi-max-frequency", &value)) spi->max_speed_hz = value; - if (!of_property_read_u16(nc, "spi-cs-setup-delay-ns", &cs_setup)) { - spi->cs_setup.value = cs_setup; - spi->cs_setup.unit = SPI_DELAY_UNIT_NSECS; - } + /* Device CS delays */ + of_spi_parse_dt_cs_delay(nc, &spi->cs_setup, "spi-cs-setup-delay-ns"); return 0; } diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 1935ca613447..a1ea093795cf 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -90,9 +90,21 @@ MODULE_PARM_DESC(bufsiz, "data bytes in biggest supported SPI message"); /*-------------------------------------------------------------------------*/ static ssize_t +spidev_sync_unlocked(struct spi_device *spi, struct spi_message *message) +{ + ssize_t status; + + status = spi_sync(spi, message); + if (status == 0) + status = message->actual_length; + + return status; +} + +static ssize_t spidev_sync(struct spidev_data *spidev, struct spi_message *message) { - int status; + ssize_t status; struct spi_device *spi; mutex_lock(&spidev->spi_lock); @@ -101,12 +113,10 @@ spidev_sync(struct spidev_data *spidev, struct spi_message *message) if (spi == NULL) status = -ESHUTDOWN; else - status = spi_sync(spi, message); - - if (status == 0) - status = message->actual_length; + status = spidev_sync_unlocked(spi, message); mutex_unlock(&spidev->spi_lock); + return status; } @@ -294,7 +304,7 @@ static int spidev_message(struct spidev_data *spidev, spi_message_add_tail(k_tmp, &msg); } - status = spidev_sync(spidev, &msg); + status = spidev_sync_unlocked(spidev->spi, &msg); if (status < 0) goto done; diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index bac111456fa1..2b95b4550a63 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -73,8 +73,8 @@ static bool __target_check_io_state(struct se_cmd 
*se_cmd, { struct se_session *sess = se_cmd->se_sess; - assert_spin_locked(&sess->sess_cmd_lock); - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_held(&sess->sess_cmd_lock); + /* * If command already reached CMD_T_COMPLETE state within * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown, diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 27295bda3e0b..b1c6231defad 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -11,6 +11,7 @@ #include <linux/tee_drv.h> #include <linux/uaccess.h> #include <linux/uio.h> +#include <linux/highmem.h> #include "tee_private.h" static void shm_put_kernel_pages(struct page **pages, size_t page_count) @@ -24,38 +25,20 @@ static void shm_put_kernel_pages(struct page **pages, size_t page_count) static int shm_get_kernel_pages(unsigned long start, size_t page_count, struct page **pages) { + struct page *page; size_t n; - int rc; - - if (is_vmalloc_addr((void *)start)) { - struct page *page; - - for (n = 0; n < page_count; n++) { - page = vmalloc_to_page((void *)(start + PAGE_SIZE * n)); - if (!page) - return -ENOMEM; - get_page(page); - pages[n] = page; - } - rc = page_count; - } else { - struct kvec *kiov; - - kiov = kcalloc(page_count, sizeof(*kiov), GFP_KERNEL); - if (!kiov) - return -ENOMEM; - - for (n = 0; n < page_count; n++) { - kiov[n].iov_base = (void *)(start + n * PAGE_SIZE); - kiov[n].iov_len = PAGE_SIZE; - } + if (WARN_ON_ONCE(is_vmalloc_addr((void *)start) || + is_kmap_addr((void *)start))) + return -EINVAL; - rc = get_kernel_pages(kiov, page_count, 0, pages); - kfree(kiov); + page = virt_to_page(start); + for (n = 0; n < page_count; n++) { + pages[n] = page + n; + get_page(pages[n]); } - return rc; + return page_count; } static void release_registered_pages(struct tee_shm *shm) diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c index 62c0aa5d0783..0a4eaa307156 100644 --- 
a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c +++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c @@ -44,11 +44,13 @@ static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone, int trip, int *temp) { struct int34x_thermal_zone *d = zone->devdata; - int i; + int i, ret = 0; if (d->override_ops && d->override_ops->get_trip_temp) return d->override_ops->get_trip_temp(zone, trip, temp); + mutex_lock(&d->trip_mutex); + if (trip < d->aux_trip_nr) *temp = d->aux_trips[trip]; else if (trip == d->crt_trip_id) @@ -66,10 +68,12 @@ static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone, } } if (i == INT340X_THERMAL_MAX_ACT_TRIP_COUNT) - return -EINVAL; + ret = -EINVAL; } - return 0; + mutex_unlock(&d->trip_mutex); + + return ret; } static int int340x_thermal_get_trip_type(struct thermal_zone_device *zone, @@ -77,11 +81,13 @@ static int int340x_thermal_get_trip_type(struct thermal_zone_device *zone, enum thermal_trip_type *type) { struct int34x_thermal_zone *d = zone->devdata; - int i; + int i, ret = 0; if (d->override_ops && d->override_ops->get_trip_type) return d->override_ops->get_trip_type(zone, trip, type); + mutex_lock(&d->trip_mutex); + if (trip < d->aux_trip_nr) *type = THERMAL_TRIP_PASSIVE; else if (trip == d->crt_trip_id) @@ -99,10 +105,12 @@ static int int340x_thermal_get_trip_type(struct thermal_zone_device *zone, } } if (i == INT340X_THERMAL_MAX_ACT_TRIP_COUNT) - return -EINVAL; + ret = -EINVAL; } - return 0; + mutex_unlock(&d->trip_mutex); + + return ret; } static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone, @@ -180,6 +188,8 @@ int int340x_thermal_read_trips(struct int34x_thermal_zone *int34x_zone) int trip_cnt = int34x_zone->aux_trip_nr; int i; + mutex_lock(&int34x_zone->trip_mutex); + int34x_zone->crt_trip_id = -1; if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_CRT", &int34x_zone->crt_temp)) @@ -207,6 +217,8 @@ int int340x_thermal_read_trips(struct 
int34x_thermal_zone *int34x_zone) int34x_zone->act_trips[i].valid = true; } + mutex_unlock(&int34x_zone->trip_mutex); + return trip_cnt; } EXPORT_SYMBOL_GPL(int340x_thermal_read_trips); @@ -230,6 +242,8 @@ struct int34x_thermal_zone *int340x_thermal_zone_add(struct acpi_device *adev, if (!int34x_thermal_zone) return ERR_PTR(-ENOMEM); + mutex_init(&int34x_thermal_zone->trip_mutex); + int34x_thermal_zone->adev = adev; int34x_thermal_zone->override_ops = override_ops; @@ -281,6 +295,7 @@ err_thermal_zone: acpi_lpat_free_conversion_table(int34x_thermal_zone->lpat_table); kfree(int34x_thermal_zone->aux_trips); err_trip_alloc: + mutex_destroy(&int34x_thermal_zone->trip_mutex); kfree(int34x_thermal_zone); return ERR_PTR(ret); } @@ -292,6 +307,7 @@ void int340x_thermal_zone_remove(struct int34x_thermal_zone thermal_zone_device_unregister(int34x_thermal_zone->zone); acpi_lpat_free_conversion_table(int34x_thermal_zone->lpat_table); kfree(int34x_thermal_zone->aux_trips); + mutex_destroy(&int34x_thermal_zone->trip_mutex); kfree(int34x_thermal_zone); } EXPORT_SYMBOL_GPL(int340x_thermal_zone_remove); diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h index 3b4971df1b33..8f9872afd0d3 100644 --- a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h +++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h @@ -32,6 +32,7 @@ struct int34x_thermal_zone { struct thermal_zone_device_ops *override_ops; void *priv_data; struct acpi_lpat_conversion_table *lpat_table; + struct mutex trip_mutex; }; struct int34x_thermal_zone *int340x_thermal_zone_add(struct acpi_device *, diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c index 37d6af2ec427..7fa66501792d 100644 --- a/drivers/tty/serial/8250/8250_dma.c +++ b/drivers/tty/serial/8250/8250_dma.c @@ -43,15 +43,23 @@ static void __dma_rx_complete(struct uart_8250_port *p) struct uart_8250_dma *dma = p->dma; 
struct tty_port *tty_port = &p->port.state->port; struct dma_tx_state state; + enum dma_status dma_status; int count; - dma->rx_running = 0; - dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); + /* + * New DMA Rx can be started during the completion handler before it + * could acquire port's lock and it might still be ongoing. Don't to + * anything in such case. + */ + dma_status = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); + if (dma_status == DMA_IN_PROGRESS) + return; count = dma->rx_size - state.residue; tty_insert_flip_string(tty_port, dma->rx_buf, count); p->port.icount.rx += count; + dma->rx_running = 0; tty_flip_buffer_push(tty_port); } @@ -62,9 +70,14 @@ static void dma_rx_complete(void *param) struct uart_8250_dma *dma = p->dma; unsigned long flags; - __dma_rx_complete(p); - spin_lock_irqsave(&p->port.lock, flags); + if (dma->rx_running) + __dma_rx_complete(p); + + /* + * Cannot be combined with the previous check because __dma_rx_complete() + * changes dma->rx_running. 
+ */ if (!dma->rx_running && (serial_lsr_in(p) & UART_LSR_DR)) p->dma->rx_dma(p); spin_unlock_irqrestore(&p->port.lock, flags); diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index a1490033aa16..409e91d6829a 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -797,25 +797,11 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) spin_unlock(&port->lock); } - if (stm32_usart_rx_dma_enabled(port)) - return IRQ_WAKE_THREAD; - else - return IRQ_HANDLED; -} - -static irqreturn_t stm32_usart_threaded_interrupt(int irq, void *ptr) -{ - struct uart_port *port = ptr; - struct tty_port *tport = &port->state->port; - struct stm32_port *stm32_port = to_stm32_port(port); - unsigned int size; - unsigned long flags; - /* Receiver timeout irq for DMA RX */ - if (!stm32_port->throttled) { - spin_lock_irqsave(&port->lock, flags); + if (stm32_usart_rx_dma_enabled(port) && !stm32_port->throttled) { + spin_lock(&port->lock); size = stm32_usart_receive_chars(port, false); - uart_unlock_and_check_sysrq_irqrestore(port, flags); + uart_unlock_and_check_sysrq(port); if (size) tty_flip_buffer_push(tport); } @@ -1015,10 +1001,8 @@ static int stm32_usart_startup(struct uart_port *port) u32 val; int ret; - ret = request_threaded_irq(port->irq, stm32_usart_interrupt, - stm32_usart_threaded_interrupt, - IRQF_ONESHOT | IRQF_NO_SUSPEND, - name, port); + ret = request_irq(port->irq, stm32_usart_interrupt, + IRQF_NO_SUSPEND, name, port); if (ret) return ret; @@ -1601,13 +1585,6 @@ static int stm32_usart_of_dma_rx_probe(struct stm32_port *stm32port, struct dma_slave_config config; int ret; - /* - * Using DMA and threaded handler for the console could lead to - * deadlocks. 
- */ - if (uart_console(port)) - return -ENODEV; - stm32port->rx_buf = dma_alloc_coherent(dev, RX_BUF_L, &stm32port->rx_dma_buf, GFP_KERNEL); diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index 1850bacdb5b0..f566eb1839dc 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -386,10 +386,6 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) uni_mode = use_unicode(inode); attr = use_attributes(inode); - ret = -ENXIO; - vc = vcs_vc(inode, &viewed); - if (!vc) - goto unlock_out; ret = -EINVAL; if (pos < 0) @@ -407,6 +403,11 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned int this_round, skip = 0; int size; + ret = -ENXIO; + vc = vcs_vc(inode, &viewed); + if (!vc) + goto unlock_out; + /* Check whether we are above size each round, * as copy_to_user at the end of this loop * could sleep. diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index bda61be5f035..3a1c4d31e010 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -1234,12 +1234,14 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba) * clock scaling is in progress */ ufshcd_scsi_block_requests(hba); + mutex_lock(&hba->wb_mutex); down_write(&hba->clk_scaling_lock); if (!hba->clk_scaling.is_allowed || ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) { ret = -EBUSY; up_write(&hba->clk_scaling_lock); + mutex_unlock(&hba->wb_mutex); ufshcd_scsi_unblock_requests(hba); goto out; } @@ -1251,12 +1253,16 @@ out: return ret; } -static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock) +static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, int err, bool scale_up) { - if (writelock) - up_write(&hba->clk_scaling_lock); - else - up_read(&hba->clk_scaling_lock); + up_write(&hba->clk_scaling_lock); + + /* Enable Write Booster if we have scaled up else disable it */ + if (ufshcd_enable_wb_if_scaling_up(hba) && !err) + 
ufshcd_wb_toggle(hba, scale_up); + + mutex_unlock(&hba->wb_mutex); + ufshcd_scsi_unblock_requests(hba); ufshcd_release(hba); } @@ -1273,7 +1279,6 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock) static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up) { int ret = 0; - bool is_writelock = true; ret = ufshcd_clock_scaling_prepare(hba); if (ret) @@ -1302,15 +1307,8 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up) } } - /* Enable Write Booster if we have scaled up else disable it */ - if (ufshcd_enable_wb_if_scaling_up(hba)) { - downgrade_write(&hba->clk_scaling_lock); - is_writelock = false; - ufshcd_wb_toggle(hba, scale_up); - } - out_unprepare: - ufshcd_clock_scaling_unprepare(hba, is_writelock); + ufshcd_clock_scaling_unprepare(hba, ret, scale_up); return ret; } @@ -6066,9 +6064,11 @@ static void ufshcd_force_error_recovery(struct ufs_hba *hba) static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow) { + mutex_lock(&hba->wb_mutex); down_write(&hba->clk_scaling_lock); hba->clk_scaling.is_allowed = allow; up_write(&hba->clk_scaling_lock); + mutex_unlock(&hba->wb_mutex); } static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend) @@ -9793,6 +9793,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Initialize mutex for exception event control */ mutex_init(&hba->ee_ctrl_mutex); + mutex_init(&hba->wb_mutex); init_rwsem(&hba->clk_scaling_lock); ufshcd_init_clk_gating(hba); diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 079e183cf3bf..934b3d997702 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -526,6 +526,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* DJI CineSSD */ { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, + /* Alcor Link AK9563 SC Reader used in 2022 Lenovo ThinkPads */ + { USB_DEVICE(0x2ce3, 0x9563), .driver_info = USB_QUIRK_NO_LPM }, + /* DELL USB 
GEN2 */ { USB_DEVICE(0x413c, 0xb062), .driver_info = USB_QUIRK_NO_LPM | USB_QUIRK_RESET_RESUME }, diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index b0a0351d2d8b..959fc925ca7c 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -901,7 +901,7 @@ static int dwc3_qcom_probe(struct platform_device *pdev) qcom->mode = usb_get_dr_mode(&qcom->dwc3->dev); /* enable vbus override for device mode */ - if (qcom->mode == USB_DR_MODE_PERIPHERAL) + if (qcom->mode != USB_DR_MODE_HOST) dwc3_qcom_vbus_override_enable(qcom, true); /* register extcon to override sw_vbus on Vbus change later */ diff --git a/drivers/usb/fotg210/fotg210-udc.c b/drivers/usb/fotg210/fotg210-udc.c index 87cca81bf4ac..eb076746f032 100644 --- a/drivers/usb/fotg210/fotg210-udc.c +++ b/drivers/usb/fotg210/fotg210-udc.c @@ -1014,7 +1014,6 @@ static int fotg210_udc_start(struct usb_gadget *g, int ret; /* hook up the driver */ - driver->driver.bus = NULL; fotg210->driver = driver; if (!IS_ERR_OR_NULL(fotg210->phy)) { diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 523a961b910b..8ad354741380 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -279,8 +279,10 @@ static int __ffs_ep0_queue_wait(struct ffs_data *ffs, char *data, size_t len) struct usb_request *req = ffs->ep0req; int ret; - if (!req) + if (!req) { + spin_unlock_irq(&ffs->ev.waitq.lock); return -EINVAL; + } req->zero = len < le16_to_cpu(ffs->ev.setup.wLength); diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index 08726e4c68a5..0219cd79493a 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -1142,6 +1142,7 @@ afunc_bind(struct usb_configuration *cfg, struct usb_function *fn) } std_as_out_if0_desc.bInterfaceNumber = ret; std_as_out_if1_desc.bInterfaceNumber = ret; + std_as_out_if1_desc.bNumEndpoints = 1; uac2->as_out_intf = ret; 
uac2->as_out_alt = 0; diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 8f12f3f8f6ee..e06022873df1 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -798,6 +798,7 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, net->max_mtu = GETHER_MAX_MTU_SIZE; dev->gadget = g; + SET_NETDEV_DEV(net, &g->dev); SET_NETDEV_DEVTYPE(net, &gadget_type); status = register_netdev(net); @@ -872,6 +873,8 @@ int gether_register_netdev(struct net_device *net) struct usb_gadget *g; int status; + if (!net->dev.parent) + return -EINVAL; dev = netdev_priv(net); g = dev->gadget; @@ -902,6 +905,7 @@ void gether_set_gadget(struct net_device *net, struct usb_gadget *g) dev = netdev_priv(net); dev->gadget = g; + SET_NETDEV_DEV(net, &g->dev); } EXPORT_SYMBOL_GPL(gether_set_gadget); diff --git a/drivers/usb/gadget/udc/bcm63xx_udc.c b/drivers/usb/gadget/udc/bcm63xx_udc.c index 2cdb07905bde..d04d72f5816e 100644 --- a/drivers/usb/gadget/udc/bcm63xx_udc.c +++ b/drivers/usb/gadget/udc/bcm63xx_udc.c @@ -1830,7 +1830,6 @@ static int bcm63xx_udc_start(struct usb_gadget *gadget, bcm63xx_select_phy_mode(udc, true); udc->driver = driver; - driver->driver.bus = NULL; udc->gadget.dev.of_node = udc->dev->of_node; spin_unlock_irqrestore(&udc->lock, flags); diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.c b/drivers/usb/gadget/udc/fsl_qe_udc.c index bf745358e28e..3b1cc8fa30c8 100644 --- a/drivers/usb/gadget/udc/fsl_qe_udc.c +++ b/drivers/usb/gadget/udc/fsl_qe_udc.c @@ -2285,7 +2285,6 @@ static int fsl_qe_start(struct usb_gadget *gadget, /* lock is needed but whether should use this lock or another */ spin_lock_irqsave(&udc->lock, flags); - driver->driver.bus = NULL; /* hook up the driver */ udc->driver = driver; udc->gadget.speed = driver->max_speed; diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index 50435e804118..a67873a074b7 100644 --- 
a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -1943,7 +1943,6 @@ static int fsl_udc_start(struct usb_gadget *g, /* lock is needed but whether should use this lock or another */ spin_lock_irqsave(&udc_controller->lock, flags); - driver->driver.bus = NULL; /* hook up the driver */ udc_controller->driver = driver; spin_unlock_irqrestore(&udc_controller->lock, flags); diff --git a/drivers/usb/gadget/udc/fusb300_udc.c b/drivers/usb/gadget/udc/fusb300_udc.c index 9af8b415f303..5954800d652c 100644 --- a/drivers/usb/gadget/udc/fusb300_udc.c +++ b/drivers/usb/gadget/udc/fusb300_udc.c @@ -1311,7 +1311,6 @@ static int fusb300_udc_start(struct usb_gadget *g, struct fusb300 *fusb300 = to_fusb300(g); /* hook up the driver */ - driver->driver.bus = NULL; fusb300->driver = driver; return 0; diff --git a/drivers/usb/gadget/udc/goku_udc.c b/drivers/usb/gadget/udc/goku_udc.c index bdc56b24b5c9..5ffb3d5c635b 100644 --- a/drivers/usb/gadget/udc/goku_udc.c +++ b/drivers/usb/gadget/udc/goku_udc.c @@ -1375,7 +1375,6 @@ static int goku_udc_start(struct usb_gadget *g, struct goku_udc *dev = to_goku_udc(g); /* hook up the driver */ - driver->driver.bus = NULL; dev->driver = driver; /* diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 22096f8505de..85cdc0af3bf9 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -1906,7 +1906,6 @@ static int gr_udc_start(struct usb_gadget *gadget, spin_lock(&dev->lock); /* Hook up the driver */ - driver->driver.bus = NULL; dev->driver = driver; /* Get ready for host detection */ diff --git a/drivers/usb/gadget/udc/m66592-udc.c b/drivers/usb/gadget/udc/m66592-udc.c index c7e421b449f3..06e21cee431b 100644 --- a/drivers/usb/gadget/udc/m66592-udc.c +++ b/drivers/usb/gadget/udc/m66592-udc.c @@ -1454,7 +1454,6 @@ static int m66592_udc_start(struct usb_gadget *g, struct m66592 *m66592 = to_m66592(g); /* hook up the driver */ - driver->driver.bus = NULL; 
m66592->driver = driver; m66592_bset(m66592, M66592_VBSE | M66592_URST, M66592_INTENB0); diff --git a/drivers/usb/gadget/udc/max3420_udc.c b/drivers/usb/gadget/udc/max3420_udc.c index 3074da00c3df..ddf0ed3eb4f2 100644 --- a/drivers/usb/gadget/udc/max3420_udc.c +++ b/drivers/usb/gadget/udc/max3420_udc.c @@ -1108,7 +1108,6 @@ static int max3420_udc_start(struct usb_gadget *gadget, spin_lock_irqsave(&udc->lock, flags); /* hook up the driver */ - driver->driver.bus = NULL; udc->driver = driver; udc->gadget.speed = USB_SPEED_FULL; diff --git a/drivers/usb/gadget/udc/mv_u3d_core.c b/drivers/usb/gadget/udc/mv_u3d_core.c index 598654a3cb41..411b6179782c 100644 --- a/drivers/usb/gadget/udc/mv_u3d_core.c +++ b/drivers/usb/gadget/udc/mv_u3d_core.c @@ -1243,7 +1243,6 @@ static int mv_u3d_start(struct usb_gadget *g, } /* hook up the driver ... */ - driver->driver.bus = NULL; u3d->driver = driver; u3d->ep0_dir = USB_DIR_OUT; diff --git a/drivers/usb/gadget/udc/mv_udc_core.c b/drivers/usb/gadget/udc/mv_udc_core.c index fdb17d86cd65..b397f3a848cf 100644 --- a/drivers/usb/gadget/udc/mv_udc_core.c +++ b/drivers/usb/gadget/udc/mv_udc_core.c @@ -1359,7 +1359,6 @@ static int mv_udc_start(struct usb_gadget *gadget, spin_lock_irqsave(&udc->lock, flags); /* hook up the driver ... */ - driver->driver.bus = NULL; udc->driver = driver; udc->usb_state = USB_STATE_ATTACHED; diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 84605a4d0715..538c1b9a2883 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -1451,7 +1451,6 @@ static int net2272_start(struct usb_gadget *_gadget, dev->ep[i].irqs = 0; /* hook up the driver ... */ dev->softconnect = 1; - driver->driver.bus = NULL; dev->driver = driver; /* ... 
then enable host detection and ep0; and we're ready diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c index d6a68631354a..1b929c519cd7 100644 --- a/drivers/usb/gadget/udc/net2280.c +++ b/drivers/usb/gadget/udc/net2280.c @@ -2423,7 +2423,6 @@ static int net2280_start(struct usb_gadget *_gadget, dev->ep[i].irqs = 0; /* hook up the driver ... */ - driver->driver.bus = NULL; dev->driver = driver; retval = device_create_file(&dev->pdev->dev, &dev_attr_function); diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c index bea346e362b2..f660ebfa1379 100644 --- a/drivers/usb/gadget/udc/omap_udc.c +++ b/drivers/usb/gadget/udc/omap_udc.c @@ -2066,7 +2066,6 @@ static int omap_udc_start(struct usb_gadget *g, udc->softconnect = 1; /* hook up the driver */ - driver->driver.bus = NULL; udc->driver = driver; spin_unlock_irqrestore(&udc->lock, flags); diff --git a/drivers/usb/gadget/udc/pch_udc.c b/drivers/usb/gadget/udc/pch_udc.c index 9bb7a9d7a2fb..4f8617210d85 100644 --- a/drivers/usb/gadget/udc/pch_udc.c +++ b/drivers/usb/gadget/udc/pch_udc.c @@ -2908,7 +2908,6 @@ static int pch_udc_start(struct usb_gadget *g, { struct pch_udc_dev *dev = to_pch_udc(g); - driver->driver.bus = NULL; dev->driver = driver; /* get ready for ep0 traffic */ diff --git a/drivers/usb/gadget/udc/snps_udc_core.c b/drivers/usb/gadget/udc/snps_udc_core.c index 52ea4dcf6a92..2fc5d4d277bc 100644 --- a/drivers/usb/gadget/udc/snps_udc_core.c +++ b/drivers/usb/gadget/udc/snps_udc_core.c @@ -1933,7 +1933,6 @@ static int amd5536_udc_start(struct usb_gadget *g, struct udc *dev = to_amd5536_udc(g); u32 tmp; - driver->driver.bus = NULL; dev->driver = driver; /* Some gadget drivers use both ep0 directions. 
diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c index 9a6860285fbe..50b24096eb7f 100644 --- a/drivers/usb/typec/altmodes/displayport.c +++ b/drivers/usb/typec/altmodes/displayport.c @@ -535,10 +535,10 @@ int dp_altmode_probe(struct typec_altmode *alt) /* FIXME: Port can only be DFP_U. */ /* Make sure we have compatiple pin configurations */ - if (!(DP_CAP_DFP_D_PIN_ASSIGN(port->vdo) & - DP_CAP_UFP_D_PIN_ASSIGN(alt->vdo)) && - !(DP_CAP_UFP_D_PIN_ASSIGN(port->vdo) & - DP_CAP_DFP_D_PIN_ASSIGN(alt->vdo))) + if (!(DP_CAP_PIN_ASSIGN_DFP_D(port->vdo) & + DP_CAP_PIN_ASSIGN_UFP_D(alt->vdo)) && + !(DP_CAP_PIN_ASSIGN_UFP_D(port->vdo) & + DP_CAP_PIN_ASSIGN_DFP_D(alt->vdo))) return -ENODEV; ret = sysfs_create_group(&alt->dev.kobj, &dp_altmode_group); diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 1292241d581a..1cf8947c6d66 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1269,6 +1269,9 @@ err_unregister: con->port = NULL; } + kfree(ucsi->connector); + ucsi->connector = NULL; + err_reset: memset(&ucsi->cap, 0, sizeof(ucsi->cap)); ucsi_reset_ppm(ucsi); @@ -1300,7 +1303,8 @@ static void ucsi_resume_work(struct work_struct *work) int ucsi_resume(struct ucsi *ucsi) { - queue_work(system_long_wq, &ucsi->resume_work); + if (ucsi->connector) + queue_work(system_long_wq, &ucsi->resume_work); return 0; } EXPORT_SYMBOL_GPL(ucsi_resume); @@ -1420,6 +1424,9 @@ void ucsi_unregister(struct ucsi *ucsi) /* Disable notifications */ ucsi->ops->async_write(ucsi, UCSI_CONTROL, &cmd, sizeof(cmd)); + if (!ucsi->connector) + return; + for (i = 0; i < ucsi->cap.num_connectors; i++) { cancel_work_sync(&ucsi->connector[i].work); ucsi_unregister_partner(&ucsi->connector[i]); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index f9c0044c6442..44b29289aa19 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -849,7 +849,7 @@ 
static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = ifcvf_init_hw(vf, pdev); if (ret) { IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); - return ret; + goto err; } for (i = 0; i < vf->nr_vring; i++) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 23c24fe98c00..2209372f236d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1856,24 +1856,33 @@ unwind: * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when * hugetlbfs is in use. */ -static void vfio_test_domain_fgsp(struct vfio_domain *domain) +static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions) { - struct page *pages; int ret, order = get_order(PAGE_SIZE * 2); + struct vfio_iova *region; + struct page *pages; + dma_addr_t start; pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (!pages) return; - ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, - IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE); - if (!ret) { - size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); + list_for_each_entry(region, regions, list) { + start = ALIGN(region->start, PAGE_SIZE * 2); + if (start >= region->end || (region->end - start < PAGE_SIZE * 2)) + continue; - if (unmapped == PAGE_SIZE) - iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE); - else - domain->fgsp = true; + ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2, + IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE); + if (!ret) { + size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE); + + if (unmapped == PAGE_SIZE) + iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE); + else + domain->fgsp = true; + } + break; } __free_pages(pages, order); @@ -2326,7 +2335,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } } - vfio_test_domain_fgsp(domain); + vfio_test_domain_fgsp(domain, &iova_copy); /* replay mappings on new domains */ ret = vfio_iommu_replay(iommu, 
domain); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9af19b0cf3b7..4c538b30fd76 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1511,6 +1511,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) nvq = &n->vqs[index]; mutex_lock(&vq->mutex); + if (fd == -1) + vhost_clear_msg(&n->dev); + /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index dca6346d75b3..d5ecb8876fc9 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -80,7 +80,7 @@ struct vhost_scsi_cmd { struct scatterlist *tvc_prot_sgl; struct page **tvc_upages; /* Pointer to response header iovec */ - struct iovec tvc_resp_iov; + struct iovec *tvc_resp_iov; /* Pointer to vhost_scsi for our device */ struct vhost_scsi *tvc_vhost; /* Pointer to vhost_virtqueue for the cmd */ @@ -563,7 +563,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) memcpy(v_rsp.sense, cmd->tvc_sense_buf, se_cmd->scsi_sense_length); - iov_iter_init(&iov_iter, ITER_DEST, &cmd->tvc_resp_iov, + iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov, cmd->tvc_in_iovs, sizeof(v_rsp)); ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter); if (likely(ret == sizeof(v_rsp))) { @@ -594,6 +594,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd; struct vhost_scsi_nexus *tv_nexus; struct scatterlist *sg, *prot_sg; + struct iovec *tvc_resp_iov; struct page **pages; int tag; @@ -613,6 +614,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, sg = cmd->tvc_sgl; prot_sg = cmd->tvc_prot_sgl; pages = cmd->tvc_upages; + tvc_resp_iov = cmd->tvc_resp_iov; memset(cmd, 0, sizeof(*cmd)); cmd->tvc_sgl = sg; cmd->tvc_prot_sgl = prot_sg; @@ -625,6 +627,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, cmd->tvc_data_direction = data_direction; cmd->tvc_nexus = 
tv_nexus; cmd->inflight = vhost_scsi_get_inflight(vq); + cmd->tvc_resp_iov = tvc_resp_iov; memcpy(cmd->tvc_cdb, cdb, VHOST_SCSI_MAX_CDB_SIZE); @@ -935,7 +938,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) struct iov_iter in_iter, prot_iter, data_iter; u64 tag; u32 exp_data_len, data_direction; - int ret, prot_bytes, c = 0; + int ret, prot_bytes, i, c = 0; u16 lun; u8 task_attr; bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI); @@ -1092,7 +1095,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) } cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - cmd->tvc_resp_iov = vq->iov[vc.out]; + for (i = 0; i < vc.in ; i++) + cmd->tvc_resp_iov[i] = vq->iov[vc.out + i]; cmd->tvc_in_iovs = vc.in; pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", @@ -1461,6 +1465,7 @@ static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq) kfree(tv_cmd->tvc_sgl); kfree(tv_cmd->tvc_prot_sgl); kfree(tv_cmd->tvc_upages); + kfree(tv_cmd->tvc_resp_iov); } sbitmap_free(&svq->scsi_tags); @@ -1508,6 +1513,14 @@ static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds) goto out; } + tv_cmd->tvc_resp_iov = kcalloc(UIO_MAXIOV, + sizeof(struct iovec), + GFP_KERNEL); + if (!tv_cmd->tvc_resp_iov) { + pr_err("Unable to allocate tv_cmd->tvc_resp_iov\n"); + goto out; + } + tv_cmd->tvc_prot_sgl = kcalloc(VHOST_SCSI_PREALLOC_PROT_SGLS, sizeof(struct scatterlist), GFP_KERNEL); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cbe72bfd2f1f..43c9770b86e5 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -661,7 +661,7 @@ void vhost_dev_stop(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_stop); -static void vhost_clear_msg(struct vhost_dev *dev) +void vhost_clear_msg(struct vhost_dev *dev) { struct vhost_msg_node *node, *n; @@ -679,6 +679,7 @@ static void vhost_clear_msg(struct vhost_dev *dev) spin_unlock(&dev->iotlb_lock); } +EXPORT_SYMBOL_GPL(vhost_clear_msg); void 
vhost_dev_cleanup(struct vhost_dev *dev) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index d9109107af08..790b296271f1 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -181,6 +181,7 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); bool vhost_vq_access_ok(struct vhost_virtqueue *vq); bool vhost_log_access_ok(struct vhost_dev *); +void vhost_clear_msg(struct vhost_dev *dev); int vhost_get_vq_desc(struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_count, diff --git a/drivers/video/fbdev/atmel_lcdfb.c b/drivers/video/fbdev/atmel_lcdfb.c index 1fc8de4ecbeb..8187a7c4f910 100644 --- a/drivers/video/fbdev/atmel_lcdfb.c +++ b/drivers/video/fbdev/atmel_lcdfb.c @@ -49,7 +49,6 @@ struct atmel_lcdfb_info { struct clk *lcdc_clk; struct backlight_device *backlight; - u8 bl_power; u8 saved_lcdcon; u32 pseudo_palette[16]; @@ -109,22 +108,7 @@ static u32 contrast_ctr = ATMEL_LCDC_PS_DIV8 static int atmel_bl_update_status(struct backlight_device *bl) { struct atmel_lcdfb_info *sinfo = bl_get_data(bl); - int power = sinfo->bl_power; - int brightness = bl->props.brightness; - - /* REVISIT there may be a meaningful difference between - * fb_blank and power ... there seem to be some cases - * this doesn't handle correctly. 
- */ - if (bl->props.fb_blank != sinfo->bl_power) - power = bl->props.fb_blank; - else if (bl->props.power != sinfo->bl_power) - power = bl->props.power; - - if (brightness < 0 && power == FB_BLANK_UNBLANK) - brightness = lcdc_readl(sinfo, ATMEL_LCDC_CONTRAST_VAL); - else if (power != FB_BLANK_UNBLANK) - brightness = 0; + int brightness = backlight_get_brightness(bl); lcdc_writel(sinfo, ATMEL_LCDC_CONTRAST_VAL, brightness); if (contrast_ctr & ATMEL_LCDC_POL_POSITIVE) @@ -133,8 +117,6 @@ static int atmel_bl_update_status(struct backlight_device *bl) else lcdc_writel(sinfo, ATMEL_LCDC_CONTRAST_CTR, contrast_ctr); - bl->props.fb_blank = bl->props.power = sinfo->bl_power = power; - return 0; } @@ -155,8 +137,6 @@ static void init_backlight(struct atmel_lcdfb_info *sinfo) struct backlight_properties props; struct backlight_device *bl; - sinfo->bl_power = FB_BLANK_UNBLANK; - if (sinfo->backlight) return; diff --git a/drivers/video/fbdev/aty/aty128fb.c b/drivers/video/fbdev/aty/aty128fb.c index dd31b9d7d337..36a9ac05a340 100644 --- a/drivers/video/fbdev/aty/aty128fb.c +++ b/drivers/video/fbdev/aty/aty128fb.c @@ -1766,12 +1766,10 @@ static int aty128_bl_update_status(struct backlight_device *bd) unsigned int reg = aty_ld_le32(LVDS_GEN_CNTL); int level; - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK || - !par->lcd_on) + if (!par->lcd_on) level = 0; else - level = bd->props.brightness; + level = backlight_get_brightness(bd); reg |= LVDS_BL_MOD_EN | LVDS_BLON; if (level > 0) { diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index d59215a4992e..b02e4e645035 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -2219,13 +2219,7 @@ static int aty_bl_update_status(struct backlight_device *bd) { struct atyfb_par *par = bl_get_data(bd); unsigned int reg = aty_ld_lcd(LCD_MISC_CNTL, par); - int level; - - if (bd->props.power != FB_BLANK_UNBLANK || - 
bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; + int level = backlight_get_brightness(bd); reg |= (BLMOD_EN | BIASMOD_EN); if (level > 0) { diff --git a/drivers/video/fbdev/aty/radeon_backlight.c b/drivers/video/fbdev/aty/radeon_backlight.c index d2c1263ad260..427adc838f77 100644 --- a/drivers/video/fbdev/aty/radeon_backlight.c +++ b/drivers/video/fbdev/aty/radeon_backlight.c @@ -57,11 +57,7 @@ static int radeon_bl_update_status(struct backlight_device *bd) * backlight. This provides some greater power saving and the display * is useless without backlight anyway. */ - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; + level = backlight_get_brightness(bd); del_timer_sync(&rinfo->lvds_timer); radeon_engine_idle(); diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index c730253ab85c..583cbcf09446 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -313,7 +313,7 @@ void fb_deferred_io_open(struct fb_info *info, } EXPORT_SYMBOL_GPL(fb_deferred_io_open); -void fb_deferred_io_cleanup(struct fb_info *info) +void fb_deferred_io_release(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; struct page *page; @@ -327,6 +327,14 @@ void fb_deferred_io_cleanup(struct fb_info *info) page = fb_deferred_io_page(info, i); page->mapping = NULL; } +} +EXPORT_SYMBOL_GPL(fb_deferred_io_release); + +void fb_deferred_io_cleanup(struct fb_info *info) +{ + struct fb_deferred_io *fbdefio = info->fbdefio; + + fb_deferred_io_release(info); kvfree(info->pagerefs); mutex_destroy(&fbdefio->lock); diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 14a7d404062c..1b14c21af2b7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2495,9 +2495,12 @@ static int fbcon_set_font(struct vc_data *vc, struct 
console_font *font, h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) return -EINVAL; + if (font->width > 32 || font->height > 32) + return -EINVAL; + /* Make sure drawing engine can handle the font */ - if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || - !(info->pixmap.blit_y & (1 << (font->height - 1)))) + if (!(info->pixmap.blit_x & BIT(font->width - 1)) || + !(info->pixmap.blit_y & BIT(font->height - 1))) return -EINVAL; /* Make sure driver can handle the font length */ diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 3a6c8458eb8d..ab3545a00abc 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1454,6 +1454,10 @@ __releases(&info->lock) struct fb_info * const info = file->private_data; lock_fb_info(info); +#if IS_ENABLED(CONFIG_FB_DEFERRED_IO) + if (info->fbdefio) + fb_deferred_io_release(info); +#endif if (info->fbops->fb_release) info->fbops->fb_release(info,1); module_put(info->fbops->owner); diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index b0e690f41025..79e5bfbdd34c 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -1050,7 +1050,7 @@ static u32 fb_get_vblank(u32 hfreq) } /** - * fb_get_hblank_by_freq - get horizontal blank time given hfreq + * fb_get_hblank_by_hfreq - get horizontal blank time given hfreq * @hfreq: horizontal freq * @xres: horizontal resolution in pixels * diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c index b945b68984b9..76771e126d0a 100644 --- a/drivers/video/fbdev/mx3fb.c +++ b/drivers/video/fbdev/mx3fb.c @@ -283,12 +283,7 @@ static int mx3fb_bl_get_brightness(struct backlight_device *bl) static int mx3fb_bl_update_status(struct backlight_device *bl) { struct mx3fb_data *fbd = bl_get_data(bl); - int brightness = bl->props.brightness; - - if (bl->props.power != FB_BLANK_UNBLANK) - brightness = 0; - if (bl->props.fb_blank != FB_BLANK_UNBLANK) - 
brightness = 0; + int brightness = backlight_get_brightness(bl); fbd->backlight_level = (fbd->backlight_level & ~0xFF) | brightness; diff --git a/drivers/video/fbdev/nvidia/nv_backlight.c b/drivers/video/fbdev/nvidia/nv_backlight.c index 2ce53529f636..503a7a683855 100644 --- a/drivers/video/fbdev/nvidia/nv_backlight.c +++ b/drivers/video/fbdev/nvidia/nv_backlight.c @@ -49,17 +49,11 @@ static int nvidia_bl_update_status(struct backlight_device *bd) { struct nvidia_par *par = bl_get_data(bd); u32 tmp_pcrt, tmp_pmc, fpcontrol; - int level; + int level = backlight_get_brightness(bd); if (!par->FlatPanel) return 0; - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; - tmp_pmc = NV_RD32(par->PMC, 0x10F0) & 0x0000FFFF; tmp_pcrt = NV_RD32(par->PCRTC0, 0x081C) & 0xFFFFFFFC; fpcontrol = NV_RD32(par->PRAMDAC, 0x0848) & 0xCFFFFFCC; diff --git a/drivers/video/fbdev/nvidia/nvidia.c b/drivers/video/fbdev/nvidia/nvidia.c index 1960916098d4..e60a276b4855 100644 --- a/drivers/video/fbdev/nvidia/nvidia.c +++ b/drivers/video/fbdev/nvidia/nvidia.c @@ -1197,17 +1197,17 @@ static int nvidia_set_fbinfo(struct fb_info *info) return nvidiafb_check_var(&info->var, info); } -static u32 nvidia_get_chipset(struct fb_info *info) +static u32 nvidia_get_chipset(struct pci_dev *pci_dev, + volatile u32 __iomem *REGS) { - struct nvidia_par *par = info->par; - u32 id = (par->pci_dev->vendor << 16) | par->pci_dev->device; + u32 id = (pci_dev->vendor << 16) | pci_dev->device; printk(KERN_INFO PFX "Device ID: %x \n", id); if ((id & 0xfff0) == 0x00f0 || (id & 0xfff0) == 0x02e0) { /* pci-e */ - id = NV_RD32(par->REGS, 0x1800); + id = NV_RD32(REGS, 0x1800); if ((id & 0x0000ffff) == 0x000010DE) id = 0x10DE0000 | (id >> 16); @@ -1220,12 +1220,11 @@ static u32 nvidia_get_chipset(struct fb_info *info) return id; } -static u32 nvidia_get_arch(struct fb_info *info) +static u32 nvidia_get_arch(u32 Chipset) { - struct nvidia_par 
*par = info->par; u32 arch = 0; - switch (par->Chipset & 0x0ff0) { + switch (Chipset & 0x0ff0) { case 0x0100: /* GeForce 256 */ case 0x0110: /* GeForce2 MX */ case 0x0150: /* GeForce2 */ @@ -1278,16 +1277,44 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) struct fb_info *info; unsigned short cmd; int ret; + volatile u32 __iomem *REGS; + int Chipset; + u32 Architecture; NVTRACE_ENTER(); assert(pd != NULL); + if (pci_enable_device(pd)) { + printk(KERN_ERR PFX "cannot enable PCI device\n"); + return -ENODEV; + } + + /* enable IO and mem if not already done */ + pci_read_config_word(pd, PCI_COMMAND, &cmd); + cmd |= (PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + pci_write_config_word(pd, PCI_COMMAND, cmd); + + nvidiafb_fix.mmio_start = pci_resource_start(pd, 0); + nvidiafb_fix.mmio_len = pci_resource_len(pd, 0); + + REGS = ioremap(nvidiafb_fix.mmio_start, nvidiafb_fix.mmio_len); + if (!REGS) { + printk(KERN_ERR PFX "cannot ioremap MMIO base\n"); + return -ENODEV; + } + + Chipset = nvidia_get_chipset(pd, REGS); + Architecture = nvidia_get_arch(Chipset); + if (Architecture == 0) { + printk(KERN_ERR PFX "unknown NV_ARCH\n"); + goto err_out; + } + ret = aperture_remove_conflicting_pci_devices(pd, "nvidiafb"); if (ret) - return ret; + goto err_out; info = framebuffer_alloc(sizeof(struct nvidia_par), &pd->dev); - if (!info) goto err_out; @@ -1298,11 +1325,6 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) if (info->pixmap.addr == NULL) goto err_out_kfree; - if (pci_enable_device(pd)) { - printk(KERN_ERR PFX "cannot enable PCI device\n"); - goto err_out_enable; - } - if (pci_request_regions(pd, "nvidiafb")) { printk(KERN_ERR PFX "cannot request PCI regions\n"); goto err_out_enable; @@ -1318,34 +1340,17 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) par->paneltweak = paneltweak; par->reverse_i2c = reverse_i2c; - /* enable IO and mem if not already done */ - pci_read_config_word(pd, 
PCI_COMMAND, &cmd); - cmd |= (PCI_COMMAND_IO | PCI_COMMAND_MEMORY); - pci_write_config_word(pd, PCI_COMMAND, cmd); - - nvidiafb_fix.mmio_start = pci_resource_start(pd, 0); nvidiafb_fix.smem_start = pci_resource_start(pd, 1); - nvidiafb_fix.mmio_len = pci_resource_len(pd, 0); - - par->REGS = ioremap(nvidiafb_fix.mmio_start, nvidiafb_fix.mmio_len); - if (!par->REGS) { - printk(KERN_ERR PFX "cannot ioremap MMIO base\n"); - goto err_out_free_base0; - } + par->REGS = REGS; - par->Chipset = nvidia_get_chipset(info); - par->Architecture = nvidia_get_arch(info); - - if (par->Architecture == 0) { - printk(KERN_ERR PFX "unknown NV_ARCH\n"); - goto err_out_arch; - } + par->Chipset = Chipset; + par->Architecture = Architecture; sprintf(nvidiafb_fix.id, "NV%x", (pd->device & 0x0ff0) >> 4); if (NVCommonSetup(info)) - goto err_out_arch; + goto err_out_free_base0; par->FbAddress = nvidiafb_fix.smem_start; par->FbMapSize = par->RamAmountKBytes * 1024; @@ -1401,7 +1406,6 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) goto err_out_iounmap_fb; } - printk(KERN_INFO PFX "PCI nVidia %s framebuffer (%dMB @ 0x%lX)\n", info->fix.id, @@ -1415,15 +1419,14 @@ err_out_iounmap_fb: err_out_free_base1: fb_destroy_modedb(info->monspecs.modedb); nvidia_delete_i2c_busses(par); -err_out_arch: - iounmap(par->REGS); - err_out_free_base0: +err_out_free_base0: pci_release_regions(pd); err_out_enable: kfree(info->pixmap.addr); err_out_kfree: framebuffer_release(info); err_out: + iounmap(REGS); return -ENODEV; } diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c index 4fc4b26a8d30..ba94a0a7bd4f 100644 --- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c +++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c @@ -331,13 +331,7 @@ static int dsicm_bl_update_status(struct backlight_device *dev) struct panel_drv_data *ddata = dev_get_drvdata(&dev->dev); struct omap_dss_device *in = 
ddata->in; int r; - int level; - - if (dev->props.fb_blank == FB_BLANK_UNBLANK && - dev->props.power == FB_BLANK_UNBLANK) - level = dev->props.brightness; - else - level = 0; + int level = backlight_get_brightness(dev); dev_dbg(&ddata->pdev->dev, "update brightness to %d\n", level); diff --git a/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c b/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c index bc5a44c2a144..ae937854403b 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c @@ -10,6 +10,7 @@ #define DSS_SUBSYS_NAME "DISPLAY" #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/sysfs.h> @@ -36,7 +37,7 @@ static ssize_t display_enabled_store(struct omap_dss_device *dssdev, int r; bool enable; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; @@ -73,7 +74,7 @@ static ssize_t display_tear_store(struct omap_dss_device *dssdev, if (!dssdev->driver->enable_te || !dssdev->driver->get_te) return -ENOENT; - r = strtobool(buf, &te); + r = kstrtobool(buf, &te); if (r) return r; @@ -183,7 +184,7 @@ static ssize_t display_mirror_store(struct omap_dss_device *dssdev, if (!dssdev->driver->set_mirror || !dssdev->driver->get_mirror) return -ENOENT; - r = strtobool(buf, &mirror); + r = kstrtobool(buf, &mirror); if (r) return r; diff --git a/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c b/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c index ba21c4a2633d..1b644be5fe2e 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c @@ -10,6 +10,7 @@ #define DSS_SUBSYS_NAME "MANAGER" #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/platform_device.h> @@ -246,7 +247,7 @@ static ssize_t manager_trans_key_enabled_store(struct omap_overlay_manager *mgr, bool 
enable; int r; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; @@ -290,7 +291,7 @@ static ssize_t manager_alpha_blending_enabled_store( if(!dss_has_feature(FEAT_ALPHA_FIXED_ZORDER)) return -ENODEV; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; @@ -329,7 +330,7 @@ static ssize_t manager_cpr_enable_store(struct omap_overlay_manager *mgr, if (!dss_has_feature(FEAT_CPR)) return -ENODEV; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; diff --git a/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c b/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c index 601c0beb6de9..1da4fb1c77b4 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c @@ -13,6 +13,7 @@ #include <linux/err.h> #include <linux/sysfs.h> #include <linux/kobject.h> +#include <linux/kstrtox.h> #include <linux/platform_device.h> #include <video/omapfb_dss.h> @@ -210,7 +211,7 @@ static ssize_t overlay_enabled_store(struct omap_overlay *ovl, const char *buf, int r; bool enable; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c b/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c index 06dc41aa0354..831b2c2fbdf9 100644 --- a/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c @@ -15,6 +15,7 @@ #include <linux/uaccess.h> #include <linux/platform_device.h> #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/omapfb.h> @@ -96,7 +97,7 @@ static ssize_t store_mirror(struct device *dev, int r; struct fb_var_screeninfo new_var; - r = strtobool(buf, &mirror); + r = kstrtobool(buf, &mirror); if (r) return r; diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c index 644278146d3b..41edc6e79460 100644 --- a/drivers/video/fbdev/riva/fbdev.c +++ 
b/drivers/video/fbdev/riva/fbdev.c @@ -293,13 +293,7 @@ static int riva_bl_update_status(struct backlight_device *bd) { struct riva_par *par = bl_get_data(bd); U032 tmp_pcrt, tmp_pmc; - int level; - - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; + int level = backlight_get_brightness(bd); tmp_pmc = NV_RD32(par->riva.PMC, 0x10F0) & 0x0000FFFF; tmp_pcrt = NV_RD32(par->riva.PCRTC0, 0x081C) & 0xFFFFFFFC; diff --git a/drivers/watchdog/diag288_wdt.c b/drivers/watchdog/diag288_wdt.c index 4cb10877017c..6ca5d9515d85 100644 --- a/drivers/watchdog/diag288_wdt.c +++ b/drivers/watchdog/diag288_wdt.c @@ -86,7 +86,7 @@ static int __diag288(unsigned int func, unsigned int timeout, "1:\n" EX_TABLE(0b, 1b) : "+d" (err) : "d"(__func), "d"(__timeout), - "d"(__action), "d"(__len) : "1", "cc"); + "d"(__action), "d"(__len) : "1", "cc", "memory"); return err; } @@ -268,12 +268,21 @@ static int __init diag288_init(void) char ebc_begin[] = { 194, 197, 199, 201, 213 }; + char *ebc_cmd; watchdog_set_nowayout(&wdt_dev, nowayout_info); if (MACHINE_IS_VM) { - if (__diag288_vm(WDT_FUNC_INIT, 15, - ebc_begin, sizeof(ebc_begin)) != 0) { + ebc_cmd = kmalloc(sizeof(ebc_begin), GFP_KERNEL); + if (!ebc_cmd) { + pr_err("The watchdog cannot be initialized\n"); + return -ENOMEM; + } + memcpy(ebc_cmd, ebc_begin, sizeof(ebc_begin)); + ret = __diag288_vm(WDT_FUNC_INIT, 15, + ebc_cmd, sizeof(ebc_begin)); + kfree(ebc_cmd); + if (ret != 0) { pr_err("The watchdog cannot be initialized\n"); return -EINVAL; } diff --git a/fs/9p/acl.c b/fs/9p/acl.c index c397c51f80d9..eed551d8555f 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -139,7 +139,7 @@ struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu } -struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct v9fs_session_info *v9ses; @@ 
-151,7 +151,7 @@ struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, return v9fs_get_cached_acl(d_inode(dentry), type); } -int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int retval; @@ -195,7 +195,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, goto err_out; } - if (!inode_owner_or_capable(&init_user_ns, inode)) { + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { retval = -EPERM; goto err_out; } @@ -206,7 +206,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr iattr = {}; struct posix_acl *acl_mode = acl; - retval = posix_acl_update_mode(&init_user_ns, inode, + retval = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl_mode); if (retval) @@ -225,7 +225,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * FIXME should we update ctime ? * What is the following setxattr update the mode ? 
*/ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + v9fs_vfs_setattr_dotl(&nop_mnt_idmap, dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 4c60a2bce5de..333cfcc281da 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -10,9 +10,9 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); -struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 6acabc2e7dc9..f3f74d197b5d 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -151,7 +151,7 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); -extern int v9fs_vfs_rename(struct user_namespace *mnt_userns, +extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bc417da7e9c1..75106b9f293d 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -60,7 +60,7 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); -int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, +int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int 
v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b740017634ef..b6ba22975781 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/stat.h> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 27a04a226d97..4344e7a7865f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, { int err = 0; - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -672,7 +672,7 @@ error: /** * v9fs_vfs_create - VFS hook to create a regular file - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: The parent directory * @dentry: The name of file to be created * @mode: The UNIX file mode to set @@ -684,7 +684,7 @@ error: */ static int -v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); @@ -704,14 +704,14 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory * */ -static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int err; @@ -908,7 +908,7 @@ int 
v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode @@ -918,7 +918,7 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) */ int -v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1018,7 +1018,7 @@ error: /** * v9fs_vfs_getattr - retrieve file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1027,7 +1027,7 @@ error: */ static int -v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -1038,7 +1038,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1051,7 +1051,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1060,13 +1060,13 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, 
/** * v9fs_vfs_setattr - set file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ -static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, +static int v9fs_vfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; @@ -1077,7 +1077,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(&init_user_ns, dentry, iattr); + retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; @@ -1135,7 +1135,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1300,7 +1300,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1310,7 +1310,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, */ static int -v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", @@ -1356,7 +1356,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation @@ -1365,7 +1365,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, */ static int 
-v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index f806b3f11649..3bed3eb3a0e2 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -30,7 +30,7 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** @@ -211,7 +211,7 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions @@ -219,10 +219,10 @@ int v9fs_open_to_dotl_flags(int flags) * */ static int -v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_create_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) { - return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0); + return v9fs_vfs_mknod_dotl(idmap, dir, dentry, omode, 0); } static int @@ -356,14 +356,14 @@ out: /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory * */ -static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, +static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode) { @@ -450,7 +450,7 @@ error: } static int -v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, +v9fs_vfs_getattr_dotl(struct
mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -462,7 +462,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -479,7 +479,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -529,13 +529,13 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ -int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, +int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; @@ -548,7 +548,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(&init_user_ns, dentry, iattr); + retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; @@ -597,7 +597,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, truncate_setsize(inode, iattr->ia_size); v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ @@ -687,7 +687,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, 
struct inode *inode, } static int -v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err; @@ -817,7 +817,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation @@ -825,7 +825,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index b6984311e00a..50f7f3f6b55e 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -150,7 +150,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/Makefile b/fs/Makefile index 606c029e1c9b..05f89b5c962f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o remap_range.o + kernel_read_file.o mnt_idmapping.o remap_range.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o mpage.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 06b7c92343ad..223f0283d20f 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -144,7 +144,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct 
super_block *sb, struct object_info *obj); int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +int adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index ee22278b0cfc..c3ac613d0975 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -294,7 +294,7 @@ out: * later. */ int -adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -302,7 +302,7 @@ adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/affs.h b/fs/affs/affs.h index bfa89e131ead..60685ec76d98 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -167,17 +167,17 @@ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct user_namespace *mnt_userns, struct inode *dir, +extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); -extern int affs_symlink(struct 
user_namespace *mnt_userns, +extern int affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); -extern int affs_rename2(struct user_namespace *mnt_userns, +extern int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); @@ -185,7 +185,7 @@ extern int affs_rename2(struct user_namespace *mnt_userns, /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); -extern int affs_notify_change(struct user_namespace *mnt_userns, +extern int affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 2352a75bd9d6..27f77a52c5c8 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -216,7 +216,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } int -affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -224,7 +224,7 @@ affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto out; @@ -250,7 +250,7 @@ affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, affs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index bcab18956b4f..d12ccfd2a83d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -242,7 +242,7 @@ affs_unlink(struct inode *dir, struct dentry *dentry) 
} int -affs_create(struct user_namespace *mnt_userns, struct inode *dir, +affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -274,7 +274,7 @@ affs_create(struct user_namespace *mnt_userns, struct inode *dir, } int -affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -313,7 +313,7 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) } int -affs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct super_block *sb = dir->i_sb; @@ -503,7 +503,7 @@ done: return retval; } -int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b7c1f8c84b38..82690d1dd49a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,17 +28,17 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, in loff_t fpos, u64 ino, unsigned dtype); static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode 
*dir, struct dentry *dentry); -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content); -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -1332,7 +1332,7 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_operation *op; @@ -1630,7 +1630,7 @@ static const struct afs_operation_ops afs_create_operation = { /* * create a regular file on an AFS filesystem */ -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct afs_operation *op; @@ -1760,7 +1760,7 @@ static const struct afs_operation_ops afs_symlink_operation = { /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content) { struct afs_operation *op; @@ -1897,7 +1897,7 @@ static const struct afs_operation_ops afs_rename_operation = { /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int 
flags) { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index bbcc5afd1576..9c6dea3139f5 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -451,7 +451,7 @@ static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = locks_inode(file); + struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode; afs_lock_type_t type; @@ -701,7 +701,7 @@ error: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); @@ -721,7 +721,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -763,7 +763,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -798,7 +798,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); @@ -861,7 +861,7 @@ 
static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6d3a3dbe4928..0167e96e5198 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -737,7 +737,7 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -761,7 +761,7 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; @@ -870,7 +870,7 @@ static const struct afs_operation_ops afs_setattr_operation = { /* * set the attributes of an inode */ -int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { const unsigned int supported = diff --git a/fs/afs/internal.h b/fs/afs/internal.h index fd8567b98e2b..ad8523d0d038 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> @@ -1170,9 +1171,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); 
-extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *, +extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); -extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *); +extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -1387,7 +1388,7 @@ extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); -extern int afs_permission(struct user_namespace *, struct inode *, int); +extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 7c6a63a30394..6a7744c9e2a2 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -395,7 +395,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int afs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 7751b0b3f81d..9048d8ccc715 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -97,7 +97,7 @@ static const struct afs_operation_ops afs_store_acl_operation = { * Set a file's AFS3 ACL. 
*/ static int afs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -228,7 +228,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { * Set a file's YFS ACL. */ static int afs_xattr_set_yfs(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -361,6 +361,9 @@ static int aio_ring_mremap(struct vm_area_struct *vma) spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); + if (!table) + goto out_unlock; + for (i = 0; i < table->nr; i++) { struct kioctx *ctx; @@ -374,6 +377,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) } } +out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; diff --git a/fs/attr.c b/fs/attr.c index b45f30e516fa..aca9ff7aed33 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/evm.h> #include <linux/ima.h> @@ -23,7 +24,7 @@ /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. @@ -33,7 +34,7 @@ * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. 
*/ -int setattr_should_drop_sgid(struct user_namespace *mnt_userns, +int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; @@ -42,8 +43,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; - if (!in_group_or_capable(mnt_userns, inode, - i_gid_into_vfsgid(mnt_userns, inode))) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } @@ -51,7 +51,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. @@ -63,7 +63,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. 
*/ -int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, +int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -73,7 +73,7 @@ int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; - kill |= setattr_should_drop_sgid(mnt_userns, inode); + kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; @@ -84,24 +84,24 @@ EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -static bool chown_ok(struct user_namespace *mnt_userns, +static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) @@ -111,28 +111,28 @@ static bool chown_ok(struct user_namespace *mnt_userns, /** * chgrp_ok - verify permissions to chgrp inode - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -static bool chgrp_ok(struct user_namespace *mnt_userns, +static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) @@ -142,7 +142,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, /** * setattr_prepare - check if attribute changes to a dentry are allowed - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * @@ -152,16 +152,16 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap.
* * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, +int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -183,34 +183,34 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) + !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && - !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) + !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ - if (!in_group_or_capable(mnt_userns, inode, vfsgid)) + if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. 
*/ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } @@ -219,7 +219,7 @@ kill_priv: if (ia_valid & ATTR_KILL_PRIV) { int error; - error = security_inode_killpriv(mnt_userns, dentry); + error = security_inode_killpriv(idmap, dentry); if (error) return error; } @@ -276,7 +276,7 @@ EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy - copy simple metadata updates into the generic inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * @@ -289,23 +289,23 @@ EXPORT_SYMBOL(inode_newsize_ok); * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. 
*/ -void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, +void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -314,15 +314,15 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_or_capable(mnt_userns, inode, - i_gid_into_vfsgid(mnt_userns, inode))) + if (!in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } } EXPORT_SYMBOL(setattr_copy); -int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, +int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; @@ -340,8 +340,8 @@ int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, if (IS_IMMUTABLE(inode)) return -EPERM; - if (!inode_owner_or_capable(mnt_userns, inode)) { - error = inode_permission(mnt_userns, inode, MAY_WRITE); + if (!inode_owner_or_capable(idmap, inode)) { + error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } @@ -352,7 +352,7 @@ EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesytem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated @@ -371,13 +371,13 @@ EXPORT_SYMBOL(may_setattr); * the file open for write, as there can be no conflicting delegation in * that case. 
* - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. */ -int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; @@ -388,7 +388,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, WARN_ON_ONCE(!inode_is_locked(inode)); - error = may_setattr(mnt_userns, inode, ia_valid); + error = may_setattr(idmap, inode, ia_valid); if (error) return error; @@ -453,11 +453,11 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; @@ -465,13 +465,13 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * gids unless those uids & gids are being made valid. 
*/ if (!(ia_valid & ATTR_UID) && - !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; - error = security_inode_setattr(mnt_userns, dentry, attr); + error = security_inode_setattr(idmap, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); @@ -479,13 +479,13 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, return error; if (inode->i_op->setattr) - error = inode->i_op->setattr(mnt_userns, dentry, attr); + error = inode->i_op->setattr(idmap, dentry, attr); else - error = simple_setattr(mnt_userns, dentry, attr); + error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); - ima_inode_post_setattr(mnt_userns, dentry); + ima_inode_post_setattr(idmap, dentry); evm_inode_post_setattr(dentry, ia_valid); } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index ca03c1cae2be..6baf90b08e0e 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,12 +10,12 @@ #include "autofs_i.h" -static int autofs_dir_permission(struct user_namespace *, struct inode *, int); -static int autofs_dir_symlink(struct user_namespace *, struct inode *, +static int autofs_dir_permission(struct mnt_idmap *, struct inode *, int); +static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct user_namespace *, struct inode *, +static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT @@ -543,7 +543,7 @@ static struct dentry *autofs_lookup(struct inode *dir, 
return NULL; } -static int autofs_dir_permission(struct user_namespace *mnt_userns, +static int autofs_dir_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (mask & MAY_WRITE) { @@ -560,10 +560,10 @@ static int autofs_dir_permission(struct user_namespace *mnt_userns, return -EACCES; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } -static int autofs_dir_symlink(struct user_namespace *mnt_userns, +static int autofs_dir_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -720,7 +720,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct user_namespace *mnt_userns, +static int autofs_dir_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 92737166203f..db649487d58c 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -27,7 +27,7 @@ static const struct file_operations bad_file_ops = .open = bad_file_open, }; -static int bad_inode_create(struct user_namespace *mnt_userns, +static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -51,14 +51,14 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_symlink(struct user_namespace *mnt_userns, +static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } -static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; @@ -69,13 +69,13 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int 
bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } -static int bad_inode_rename2(struct user_namespace *mnt_userns, +static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -89,20 +89,20 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct user_namespace *mnt_userns, +static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } -static int bad_inode_getattr(struct user_namespace *mnt_userns, +static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } -static int bad_inode_setattr(struct user_namespace *mnt_userns, +static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; @@ -146,14 +146,14 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, return -EIO; } -static int bad_inode_tmpfile(struct user_namespace *mnt_userns, +static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } -static int bad_inode_set_acl(struct user_namespace *mnt_userns, +static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 34d4f68f786b..040d5140e426 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,7 +75,7 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int err; @@ -96,7 +96,7 @@ static int 
bfs_create(struct user_namespace *mnt_userns, struct inode *dir, } set_bit(ino, info->si_imap); info->si_freei--; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; @@ -199,7 +199,7 @@ out_brelse: return error; } -static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad6..90d53209755b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers @@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3da1779e8b79..7427449a04a3 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -110,7 +110,7 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry 
*dentry, +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; @@ -118,7 +118,7 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(mnt_userns, inode, + ret = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (ret) return ret; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 39bd36e6eeb7..a270e71ec05f 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -6,7 +6,7 @@ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 46851511b661..90e40d5ceccd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return false; @@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct * could be a snapshot sharing this extent buffer. 
*/ if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; @@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return; @@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx ASSERT(level >= 0); if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); + gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); @@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .have_delayed_delete_refs = false, }; int level; + bool leaf_cached; + bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) @@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, walk_ctx.time_seq = elem.seq; } + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. 
+ */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + walk_ctx.ignore_extent_item_pos = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; @@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ctx->prev_extents_cache_slot = slot; } +out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 8affc88b0e0a..d8b90f95b157 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -14,19 +14,31 @@ #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" +#include "file-item.h" static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + atomic_set(&bbio->pending_ios, 1); } /* @@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, * a mempool. 
*/ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private) { struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); + btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { + struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - struct btrfs_bio *bbio; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); + btrfs_bio(bio)->file_offset = orig_bbio->file_offset; + if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; + atomic_inc(&orig_bbio->pending_ios); return bio; } +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. 
+ */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + bio_put(&bbio->bio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + bbio->end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + 
btrfs_submit_bio(&repair_bbio->bio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. + */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + 
btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. */ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) @@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bbio->end_io(bbio); + /* Metadata reads are checked and repaired by the submitter. 
*/ + if (bbio->bio.bi_opf & REQ_META) + bbio->end_io(bbio); + else + btrfs_check_read_bio(bbio, bbio->bio.bi_private); } static void btrfs_simple_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); + btrfs_log_dev_io_error(bio, dev); if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); } } @@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, 
sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } + /* Do not leak our private flag into the block layer. */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; if (!bioc) { - /* Single mirror read/write fast path */ + /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); + btrfs_submit_dev_bio(smap->dev, bio); } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ + /* Parity RAID write or read recovery. 
*/ bio->bi_private = bioc; bio->bi_end_io = btrfs_raid56_end_io; if (bio_op(bio) == REQ_OP_READ) @@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror else raid56_parity_write(bio, bioc); } else { - /* Write to multiple mirrors */ + /* Write to multiple mirrors. */ int total_devs = bioc->num_stripes; - int dev_nr; bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) btrfs_submit_mirrored_bio(bioc, dev_nr); } } +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. 
+ */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try to defer the submission to a workqueue to parallelize the + * checksum calculation. + */ + if (atomic_read(&bbio->inode->sync_writers)) + return false; + + /* + * Submit metadata writes synchronously if the checksum implementation + * is fast, or we are on a zoned device that wants I/O to be submitted + * in order. + */ + if (bbio->bio.bi_opf & REQ_META) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) + return false; + } + + return true; +} + +/* + * Submit bio to an async queue. + * + * Return true if the work has been succesfuly submitted, else false. 
+ */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + if (op_is_sync(bbio->bio.bi_opf)) + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *orig_bbio = bbio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + bool use_append = btrfs_use_zone_append(bbio); + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + blk_status_t ret; + int error; + + btrfs_bio_counter_inc_blocked(fs_info); + error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (error) { + ret = errno_to_blk_status(error); + goto fail; + } + + map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { + bio = btrfs_split_bio(fs_info, bio, map_length, use_append); + bbio = btrfs_bio(bio); + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. 
+ */ + if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) + goto fail_put_bio; + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + if (ret) + goto fail_put_bio; + } + + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. + */ + if (!(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + goto done; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail_put_bio; + } + } + + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); +done: + return map_length == length; + +fail_put_bio: + if (map_length < length) + bio_put(bio); +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(orig_bbio, ret); + /* Do not submit another chunk */ + return true; +} + +void btrfs_submit_bio(struct bio *bio, int mirror_num) +{ + while (!btrfs_submit_chunk(bio, mirror_num)) + ; +} + /* * Submit a repair write. * @@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * - * The I/O is issued sychronously to block the repair read completion from + * The I/O is issued synchronously to block the repair read completion from * freeing the bio. 
*/ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, @@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) return -ENOMEM; + if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), 0)) + goto out_free_bioset; + if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_clone_bioset; + if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, + sizeof(struct btrfs_failed_bio))) + goto out_free_repair_bioset; return 0; + +out_free_repair_bioset: + bioset_exit(&btrfs_repair_bioset); +out_free_clone_bioset: + bioset_exit(&btrfs_clone_bioset); +out_free_bioset: + bioset_exit(&btrfs_bioset); + return -ENOMEM; } void __cold btrfs_bioset_exit(void) { + mempool_exit(&btrfs_failed_bio_pool); + bioset_exit(&btrfs_repair_bioset); + bioset_exit(&btrfs_clone_bioset); bioset_exit(&btrfs_bioset); } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index b12f84b3b341..873ff85817f0 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -26,32 +26,23 @@ struct btrfs_fs_info; typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. + * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and + * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ + /* Inode and offset into it that this I/O operates on. */ + struct btrfs_inode *inode; u64 file_offset; - /* @device is for stripe IO submission. 
*/ - struct btrfs_device *device; union { - /* For data checksum verification. */ + /* + * Data checksumming and original I/O information for internal + * use in the btrfs_submit_bio machinery. + */ struct { u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + struct bvec_iter saved_iter; }; /* For metadata parentness verification. */ @@ -62,7 +53,9 @@ struct btrfs_bio { btrfs_bio_end_io_t end_io; void *private; - /* For read end I/O handling */ + /* For internal use in read end I/O handling */ + unsigned int mirror_num; + atomic_t pending_ios; struct work_struct end_io_work; /* @@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) bbio->end_io(bbio); } -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} +/* Bio only refers to one ordered extent. */ +#define REQ_BTRFS_ONE_ORDERED REQ_DRV -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
- * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct bio *bio, int mirror_num); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 708d843daa72..5b10401d803b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> #include <linux/list_sort.h> #include "misc.h" #include "ctree.h" @@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. 
+ */ +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + int ret = 0; + u64 search_offset; + u64 search_end = block_group->start + block_group->length; + struct btrfs_path *path; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + key->objectid = block_group->start + search_offset; + key->type = BTRFS_EXTENT_ITEM_KEY; + key->offset = 0; + + while (1) { + ret = btrfs_search_forward(extent_root, key, path, 0); + if (ret != 0) + goto out; + /* Success; sampled an extent item in the block group */ + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->objectid >= block_group->start && + key->objectid + key->offset <= search_end) + goto out; + + /* We can't possibly find a valid extent item anymore */ + if (key->objectid >= search_end) { + ret = 1; + break; + } + if (key->type < BTRFS_EXTENT_ITEM_KEY) + key->type = BTRFS_EXTENT_ITEM_KEY; + else + key->objectid++; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + } +out: + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + btrfs_free_path(path); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. 
+ * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. + * + * Returns: 0 on success, negative error code on error. 
+ */ +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) +{ + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } + +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); + load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { @@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group - * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. 
*/ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len) + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; - if (bdev && map->stripes[i].dev->bdev != bdev) - continue; - stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); @@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, NULL, + ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { @@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. 
+ */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. 
A call to find_free_extent got desperate enough to set + * 'force_wrong_size_class'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cf..6e4a0b429ac3 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. 
This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,7 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len); + u64 physical, u64 **logical, int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); + #endif /* BTRFS_BLOCK_GROUP_H */ diff 
--git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 195c09e20609..9dc21622806e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,12 +93,6 @@ struct btrfs_inode { /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; - /* special utility tree used to record which mirrors have already been - * tried when checksums fail for a given block - */ - struct rb_root io_failure_tree; - spinlock_t io_failure_lock; - /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate @@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); @@ 
-469,7 +453,7 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args); void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static void finish_compressed_bio_read(struct compressed_bio *cb) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { + struct compressed_bio *cb = bbio->private; unsigned int index; struct page *page; - if (cb->status == BLK_STS_OK) + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; + else cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); /* Release the compressed pages */ @@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); -} - -/* - * Verify the checksums and kick off repair if needed on the uncompressed data - * before decompressing it into the original bio and freeing the uncompressed - * pages. 
- */ -static void end_compressed_bio_read(struct btrfs_bio *bbio) -{ - struct compressed_bio *cb = bbio->private; - struct inode *inode = cb->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); - bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bbio->bio.bi_status; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (!status && - (!csum || !btrfs_check_data_csum(bi, bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(bi, start, bv.bv_page, - bv.bv_offset); - } else { - int ret; - - refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, - true); - if (ret) { - refcount_dec(&cb->pending_ios); - status = errno_to_blk_status(ret); - } - } - } - - if (status) - cb->status = status; - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_compressed_bio_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = bbio->private; - - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - - if (refcount_dec_and_test(&cb->pending_ios)) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - - btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - } - bio_put(&bbio->bio); -} - -/* - * Allocate a compressed_bio, which will be used to read/write on-disk - * (aka, compressed) * data. - * - * @cb: The compressed_bio structure, which records all the needed - * information to bind the compressed data to the uncompressed - * page cache. 
- * @disk_byten: The logical bytenr where the compressed data will be read - * from or written to. - * @endio_func: The endio function to call after the IO for compressed data - * is finished. - * @next_stripe_start: Return value of logical bytenr of where next stripe starts. - * Let the caller know to only fill the bio up to the stripe - * boundary. - */ - - -static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, - btrfs_bio_end_io_t endio_func, - u64 *next_stripe_start) -{ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - struct btrfs_io_geometry geom; - struct extent_map *em; - struct bio *bio; - int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); - if (IS_ERR(em)) { - bio_put(bio); - return ERR_CAST(em); - } - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); - - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); - free_extent_map(em); - if (ret < 0) { - bio_put(bio); - return ERR_PTR(ret); - } - *next_stripe_start = disk_bytenr + geom.len; - refcount_inc(&cb->pending_ios); - return bio; + bio_put(&bbio->bio); } /* @@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct bio *bio = NULL; struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; - u64 next_stripe_start; blk_status_t ret = BLK_STS_OK; - int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; - const bool use_append = btrfs_use_zone_append(inode, disk_start); - const enum req_op bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - if (blkcg_css) + if (blkcg_css) { kthread_associate_blkcg(blkcg_css); + write_flags |= REQ_CGROUP_PUNT; + } + + write_flags |= REQ_BTRFS_ONE_ORDERED; + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, + BTRFS_I(cb->inode), end_compressed_bio_write, cb); + bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = start; while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; @@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int real_size; unsigned int added; struct page *page = compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!bio) { - bio = alloc_compressed_bio(cb, cur_disk_bytenr, - bio_op | write_flags, end_compressed_bio_write, - &next_stripe_start); - if (IS_ERR(bio)) { - ret = errno_to_blk_status(PTR_ERR(bio)); - break; - } - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_bytenr != next_stripe_start); /* * We have various limits on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - if (use_append) - added = bio_add_zone_append_page(bio, page, real_size, - offset_in_page(offset)); - else - added = bio_add_page(bio, page, real_size, - offset_in_page(offset)); - /* Reached zoned boundary */ - if (added == 0) - submit = true; - + added = bio_add_page(bio, page, real_size, offset_in_page(offset)); + /* + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always succeed. + */ + ASSERT(added == real_size); cur_disk_bytenr += added; - /* Reached stripe boundary */ - if (cur_disk_bytenr == next_stripe_start) - submit = true; - - /* Finished the range */ - if (cur_disk_bytenr == disk_start + compressed_len) - submit = true; - - if (submit) { - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - break; - } - } - - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, bio, 0); - bio = NULL; - } cond_resched(); } + /* Finished the range. 
*/ + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(bio, 0); if (blkcg_css) kthread_associate_blkcg(NULL); - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_write(cb); return ret; } @@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio = NULL; + struct bio *comp_bio; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; - u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; @@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; + comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), + end_compressed_bio_read, cb); + comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); + while (cur_disk_byte < disk_bytenr + compressed_len) { u64 offset = cur_disk_byte - disk_bytenr; unsigned int index = offset >> PAGE_SHIFT; unsigned int real_size; unsigned int added; struct page *page = cb->compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!comp_bio) { - comp_bio = alloc_compressed_bio(cb, cur_disk_byte, - REQ_OP_READ, end_compressed_bio_read, - &next_stripe_start); - if (IS_ERR(comp_bio)) { - cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); - break; - } - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_byte != next_stripe_start); + /* * We have various limit on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); @@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ ASSERT(added == real_size); cur_disk_byte += added; - - /* Reached stripe boundary, need to submit */ - if (cur_disk_byte == next_stripe_start) - submit = true; - - /* Has finished the range, need to submit */ - if (cur_disk_byte == disk_bytenr + compressed_len) - submit = true; - - if (submit) { - /* Save the original iter for read repair */ - if (bio_op(comp_bio) == REQ_OP_READ) - btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - - /* - * Save the initial offset of this chunk, as there - * is no direct correlation between compressed pages and - * the original file offset. The field is only used for - * priting error messages. - */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(comp_bio), ret); - break; - } - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, comp_bio, mirror_num); - comp_bio = NULL; - } } if (memstall) psi_memstall_leave(&pflags); - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); + /* + * Stash the initial offset of this chunk, as there is no direct + * correlation between compressed pages and the original file offset. + * The field is only used for printing error messages anyway. 
+ */ + btrfs_bio(comp_bio)->file_offset = file_offset; + + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(comp_bio, mirror_num); return; fail: @@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; @@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * For now is's a naive and optimistic 'return true', we'll extend the logic to * quickly (compared to direct compression) detect data characteristics - * (compressible/uncompressible) to avoid wasting CPU time on uncompressible + * (compressible/incompressible) to avoid wasting CPU time on incompressible * data. * * The following types of analysis can be performed: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 6209d40a1e08..a5e3377db9ad 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of outstanding bios */ - refcount_t pending_ios; - /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4754c9101a4c..a5b6bb54545f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; @@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * Search for a key in the given extent_buffer. * - * The lower boundary for the search is specified by the slot number @low. Use a - * value of 0 to search over the whole extent buffer. + * The lower boundary for the search is specified by the slot number @first_slot. 
+ * Use a value of 0 to search over the whole extent buffer. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, int low, - const struct btrfs_key *key, int *slot) +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; - int high = btrfs_header_nritems(eb); + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. + */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); - if (low > high) { + if (unlikely(low > high)) { btrfs_err(eb->fs_info, - "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", __func__, low, high, eb->start, btrfs_header_owner(eb), btrfs_header_level(eb)); return -EINVAL; @@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, return 1; } -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
- */ -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot) -{ - return generic_bin_search(eb, 0, key, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_generic_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) * min slot controls the lowest index we're willing to push to the * right. 
We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, @@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (left_nritems) btrfs_mark_buffer_dirty(left); else - btrfs_clean_tree_block(left); + btrfs_clear_buffer_dirty(trans, left); btrfs_mark_buffer_dirty(right); @@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clean_tree_block(path->nodes[0]); + btrfs_clear_buffer_dirty(trans, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(path, min_data_size, empty, - right, free_space, left_nritems, min_slot); + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -3259,7 +3255,8 @@ out_unlock: * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) @@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (right_nritems) btrfs_mark_buffer_dirty(right); else - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EUCLEAN; goto out; } - return __push_leaf_left(path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6965703a81b6..97897107fab5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); + +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); + +/* + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
+ */ +static inline int btrfs_bin_search(struct extent_buffer *eb, + const struct btrfs_key *key, + int *slot) +{ + return btrfs_generic_bin_search(eb, 0, key, slot); +} + int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a7644..8065341d831a 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -999,7 +999,7 @@ next: } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range. diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e2..886ffb232eac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); 
ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -524,7 +521,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } @@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. */ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: @@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, 
record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4..2eb34abf700f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ff2e524d9937..317aeff6c1da 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + lockdep_assert_held(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) return; @@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); @@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + bool queued; + spin_lock(&discard_ctl->lock); + queued = !list_empty(&block_group->discard_list); + if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; @@ -121,6 
+128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); @@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = false; + bool queued = false; spin_lock(&discard_ctl->lock); @@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, } block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); + /* + * If the block group is currently running in the discard workfn, we + * don't want to deref it, since it's still being used by the workfn. + * The workfn will notice this case and deref the block group when it is + * finished. 
+ */ + if (queued && !running) + btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -214,10 +233,12 @@ again: if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { - if (btrfs_is_block_group_data_only(block_group)) + if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); - else + } else { list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { @@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; + /* + * If the block group was removed from the discard list while it was + * running in this workfn, then we didn't deref it, since this function + * still owned that reference. But we set the discard_ctl->block_group + * back to NULL, so we can use that condition to know that now we need + * to deref the block_group. + */ + if (discard_ctl->block_group == NULL) + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); @@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. 
+ */ + btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3aa04224315e..b53f0e30ce2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -79,23 +79,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) } /* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct btrfs_inode *inode; - struct bio *bio; - enum btrfs_wq_submit_cmd submit_cmd; - int mirror_num; - - /* Optional parameter for used by direct io */ - u64 dio_file_offset; - struct btrfs_work work; - blk_status_t status; -}; - -/* * Compute the csum of a btree block and store the result to provided buffer. 
*/ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return csum_one_extent_buffer(eb); } +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct bvec_iter iter; + struct bio_vec bv; + int ret = 0; + + bio_for_each_segment(bv, &bbio->bio, iter) { + ret = csum_dirty_buffer(fs_info, &bv); + if (ret) + break; + } + + return errno_to_blk_status(ret); +} + static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -700,172 +699,6 @@ err: return ret; } -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - switch (async->submit_cmd) { - case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->bio); - break; - case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); - break; - case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, - async->bio, async->dio_file_offset); - break; - } - if (ret) - async->status = ret; -} - -/* - * In order to insert checksums into the metadata in large chunks, we wait - * until bio submission time. All the pages in the bio are checksummed and - * sums are attached onto the ordered extent record. - * - * At IO completion time the csums attached on the ordered extent record are - * inserted into the tree. 
- */ -static void run_one_async_done(struct btrfs_work *work) -{ - struct async_submit_bio *async = - container_of(work, struct async_submit_bio, work); - struct btrfs_inode *inode = async->inode; - struct btrfs_bio *bbio = btrfs_bio(async->bio); - - /* If an error occurred we just want to clean up the bio and move on */ - if (async->status) { - btrfs_bio_end_io(bbio, async->status); - return; - } - - /* - * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. - */ - async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -/* - * Submit bio to an async queue. - * - * Retrun: - * - true if the work has been succesfuly submitted - * - false in case of error - */ -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return false; - - async->inode = inode; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_cmd = cmd; - - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); - - async->dio_file_offset = dio_file_offset; - - async->status = 0; - - if (op_is_sync(bio->bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); - return true; -} - -static blk_status_t btree_csum_one_bio(struct bio *bio) -{ - struct bio_vec *bvec; - struct btrfs_root *root; - int ret = 0; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - 
bio_for_each_segment_all(bvec, bio, iter_all) { - root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec); - if (ret) - break; - } - - return errno_to_blk_status(ret); -} - -blk_status_t btree_submit_bio_start(struct bio *bio) -{ - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_submit_bio. - */ - return btree_csum_one_bio(bio); -} - -static bool should_async_write(struct btrfs_fs_info *fs_info, - struct btrfs_inode *bi) -{ - if (btrfs_is_zoned(fs_info)) - return false; - if (atomic_read(&bi->sync_writers)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - return true; -} - -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_bio *bbio = btrfs_bio(bio); - blk_status_t ret; - - bio->bi_opf |= REQ_META; - bbio->is_metadata = 1; - - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - - /* - * Kthread helpers are used to submit writes so that checksumming can - * happen in parallel across all CPUs. 
- */ - if (should_async_write(fs_info, inode) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) - return; - - ret = btree_csum_one_bio(bio); - if (ret) { - btrfs_bio_end_io(bbio, ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -void btrfs_clean_tree_block(struct extent_buffer *buf) -{ - struct btrfs_fs_info *fs_info = buf->fs_info; - if (btrfs_header_generation(buf) == - fs_info->running_transaction->transid) { - btrfs_assert_tree_write_locked(buf); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); - clear_extent_buffer_dirty(buf); - } - } -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) goto sleep; } + if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) + btrfs_sysfs_feature_update(fs_info); + btrfs_run_delayed_iputs(fs_info); again = btrfs_clean_one_deleted_snapshot(fs_info); @@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; + + btrfs_tree_lock(eb); wait_on_extent_buffer_writeback(eb); + btrfs_clear_buffer_dirty(NULL, eb); + btrfs_tree_unlock(eb); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags)) - clear_extent_buffer_dirty(eb); free_extent_buffer_stale(eb); } } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2f295eb6103..4d5772330110 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 
owner_root, int level); -void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, @@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, struct btrfs_tree_parent_check *check); -enum btrfs_wq_submit_cmd { - WQ_SUBMIT_METADATA, - WQ_SUBMIT_DATA, - WQ_SUBMIT_DATA_DIO, -}; - -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct bio *bio); +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa69..29a225836e28 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct 
extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1625,7 +1625,7 @@ search: } /* - * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * Search a range in the state tree for a given mask. If 'filled' == 1, this * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 * is returned if any bit in the range is found set. */ diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index e3eeec380844..21766e49ec02 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -6,7 +6,6 @@ #include "misc.h" struct extent_changeset; -struct io_failure_record; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9..824c657f59e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,14 +32,12 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" -#include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" @@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle 
*trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); @@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. - */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. 
- */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3535,8 @@ refill_cluster: if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. 
*/ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = 
false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4277,7 @@ search: &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4319,7 @@ search: } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4421,6 +4344,9 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4455,7 +4381,8 @@ have_block_group: ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,8 +4395,7 @@ have_block_group: ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: @@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); __btrfs_tree_lock(buf, nest); - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); @@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } } - /* make block locked assertion in 
btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ + if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } - btrfs_clean_tree_block(eb); + btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae5425253603..0c958fc1b3b8 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,87 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. 
+ */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9bd32daa9b9a..c25fa74d7615 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -36,6 +36,7 @@ #include "file.h" #include "dev-replace.h" #include "super.h" +#include "transaction.h" static struct kmem_cache *extent_buffer_cache; @@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { struct bio *bio; int mirror_num; enum btrfs_compression_type compress_type; - u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; @@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct btrfs_inode *inode; + struct inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = BTRFS_I(bv->bv_page->mapping->host); + inode = bv->bv_page->mapping->host; mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - - if (!is_data_inode(&inode->vfs_inode)) { + if (!is_data_inode(inode)) { if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should 
have the parent_check, @@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - btrfs_submit_metadata_bio(inode, bio, mirror_num); - } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_submit_data_write_bio(inode, bio, mirror_num); - } else { - btrfs_submit_data_read_bio(inode, bio, mirror_num, - bio_ctrl->compress_type); + bio->bi_opf |= REQ_META; } + if (btrfs_op(bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + btrfs_submit_compressed_read(inode, bio, mirror_num); + else + btrfs_submit_bio(bio, mirror_num); + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } -static int insert_failrec(struct btrfs_inode *inode, - struct io_failure_record *failrec) -{ - struct rb_node *exist; - - spin_lock(&inode->io_failure_lock); - exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, - &failrec->rb_node); - spin_unlock(&inode->io_failure_lock); - - return (exist == NULL) ? 
0 : -EEXIST; -} - -static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) -{ - struct rb_node *node; - struct io_failure_record *failrec = ERR_PTR(-ENOENT); - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search(&inode->io_failure_tree, start); - if (node) - failrec = rb_entry(node, struct io_failure_record, rb_node); - spin_unlock(&inode->io_failure_lock); - return failrec; -} - -static void free_io_failure(struct btrfs_inode *inode, - struct io_failure_record *rec) -{ - spin_lock(&inode->io_failure_lock); - rb_erase(&rec->rb_node, &inode->io_failure_tree); - spin_unlock(&inode->io_failure_lock); - - kfree(rec); -} - -static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == failrec->num_copies) - return cur_mirror + 1 - failrec->num_copies; - return cur_mirror + 1; -} - -static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == 1) - return failrec->num_copies; - return cur_mirror - 1; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - u64 ino = btrfs_ino(inode); - u64 locked_start, locked_end; - struct io_failure_record *failrec; - int mirror; - int ret; - - failrec = get_failrec(inode, start); - if (IS_ERR(failrec)) - return 0; - - BUG_ON(!failrec->this_mirror); - - if (sb_rdonly(fs_info->sb)) - goto out; - - ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, - &locked_end, EXTENT_LOCKED, NULL); - if (ret || locked_start > failrec->bytenr || - locked_end < failrec->bytenr + failrec->len - 1) - goto out; - - mirror = failrec->this_mirror; - do { - mirror = prev_mirror(failrec, mirror); - 
btrfs_repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, mirror); - } while (mirror != failrec->failed_mirror); - -out: - free_io_failure(inode, failrec); - return 0; -} - -/* - * Can be called when - * - hold extent lock - * - under ordered extent - * - the inode is freeing - */ -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct io_failure_record *failrec; - struct rb_node *node, *next; - - if (RB_EMPTY_ROOT(&inode->io_failure_tree)) - return; - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search_first(&inode->io_failure_tree, start); - while (node) { - failrec = rb_entry(node, struct io_failure_record, rb_node); - if (failrec->bytenr > end) - break; - - next = rb_next(node); - rb_erase(&failrec->rb_node, &inode->io_failure_tree); - kfree(failrec); - - node = next; - } - spin_unlock(&inode->io_failure_lock); -} - -static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - struct btrfs_bio *bbio, - unsigned int bio_offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - const u32 sectorsize = fs_info->sectorsize; - int ret; - - failrec = get_failrec(BTRFS_I(inode), start); - if (!IS_ERR(failrec)) { - btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->bytenr, failrec->len); - /* - * when data can be on disk more than twice, add to failrec here - * (e.g. with a list for failed_mirror) to make - * clean_io_failure() clean all those errors at once. 
- */ - ASSERT(failrec->this_mirror == bbio->mirror_num); - ASSERT(failrec->len == fs_info->sectorsize); - return failrec; - } - - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return ERR_PTR(-ENOMEM); - - RB_CLEAR_NODE(&failrec->rb_node); - failrec->bytenr = start; - failrec->len = sectorsize; - failrec->failed_mirror = bbio->mirror_num; - failrec->this_mirror = bbio->mirror_num; - failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - - btrfs_debug(fs_info, - "new io failure record logical %llu start %llu", - failrec->logical, start); - - failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); - if (failrec->num_copies == 1) { - /* - * We only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. No - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "cannot repair logical %llu num_copies %d", - failrec->logical, failrec->num_copies); - kfree(failrec); - return ERR_PTR(-EIO); - } - - /* Set the bits in the private failure tree */ - ret = insert_failrec(BTRFS_I(inode), failrec); - if (ret) { - kfree(failrec); - return ERR_PTR(ret); - } - - return failrec; -} - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered) -{ - u64 start = failed_bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *failed_bio = &failed_bbio->bio; - const int icsum = bio_offset >> fs_info->sectorsize_bits; - struct bio *repair_bio; - struct btrfs_bio *repair_bbio; - - btrfs_debug(fs_info, - "repair read error: read error at %llu", start); - - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - - failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); - if (IS_ERR(failrec)) - return PTR_ERR(failrec); - - /* - * There are 
two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for btrfs_repair_io_failure to do the rest for us. - */ - failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); - if (failrec->this_mirror == failrec->failed_mirror) { - btrfs_debug(fs_info, - "failed to repair num_copies %d this_mirror %d failed_mirror %d", - failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(inode, failrec); - return -EIO; - } - - repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, - failed_bbio->private); - repair_bbio = btrfs_bio(repair_bio); - repair_bbio->file_offset = start; - repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - - if (failed_bbio->csum) { - const u32 csum_size = fs_info->csum_size; - - repair_bbio->csum = repair_bbio->csum_inline; - memcpy(repair_bbio->csum, - failed_bbio->csum + csum_size * icsum, csum_size); - } - - bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_bbio->iter = repair_bio->bi_iter; - - btrfs_debug(fs_info, - "repair read error: submitting new read to mirror %d", - failrec->this_mirror); - - /* - * At this point we have a bio, so any errors from bio submission will - * be handled by the endio on the repair_bio, so we can't return an - * error here. 
- */ - if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, - failrec->this_mirror, 0); - else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); - - return BLK_STS_OK; -} - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static void end_sector_io(struct page *page, u64 offset, bool uptodate) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - const u32 sectorsize = inode->root->fs_info->sectorsize; - - end_page_read(page, uptodate, offset, sectorsize); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); -} - -static void submit_data_read_repair(struct inode *inode, - struct btrfs_bio *failed_bbio, - u32 bio_offset, const struct bio_vec *bvec, - unsigned int error_bitmap) -{ - const unsigned int pgoff = bvec->bv_offset; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct page *page = bvec->bv_page; - const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; - const u64 end = start + bvec->bv_len - 1; - const u32 sectorsize = fs_info->sectorsize; - const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int i; - - BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); - - /* This repair is only for data */ - ASSERT(is_data_inode(inode)); - - /* We're here because we had some read errors or csum mismatch */ - ASSERT(error_bitmap); - - /* - * We only get called on buffered IO, thus page must be mapped and bio - * must not be cloned. 
- */ - ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); - - /* Iterate through all the sectors in the range */ - for (i = 0; i < nr_bits; i++) { - const unsigned int offset = i * sectorsize; - bool uptodate = false; - int ret; - - if (!(error_bitmap & (1U << i))) { - /* - * This sector has no error, just end the page read - * and unlock the range. - */ - uptodate = true; - goto next; - } - - ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, - bio_offset + offset, page, pgoff + offset, - true); - if (!ret) { - /* - * We have submitted the read repair, the page release - * will be handled by the endio function of the - * submitted repair bio. - * Thus we don't need to do any thing here. - */ - continue; - } - /* - * Continue on failed repair, otherwise the remaining sectors - * will not be properly unlocked. - */ -next: - end_sector_io(page, start + offset, uptodate); - } -} - /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) @@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) u64 start; u64 end; struct bvec_iter_all iter_all; - bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; - if (first_bvec) { - btrfs_record_physical_zoned(inode, start, bio); - first_bvec = false; - } - end_extent_writepage(page, error, start, end); btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); @@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - unsigned int error_bitmap = (unsigned int)-1; - bool repair = false; u64 start; u64 end; u32 len; 
@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) len = bvec->bv_len; mirror = bbio->mirror_num; - if (likely(uptodate)) { - if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(bbio, - bio_offset, page, start, end); - if (error_bitmap) - uptodate = false; - } else { - if (btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror)) - uptodate = false; - } - } + if (uptodate && !is_data_inode(inode) && + btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) + uptodate = false; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); - /* * Zero out the remaining part if this range straddles * i_size. @@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (is_data_inode(inode)) { - /* - * Only try to repair bios that actually made it to a - * device. If the bio failed to be submitted mirror - * is 0 and we need to fail it without retrying. - * - * This also includes the high level bios for compressed - * extents - these never make it to a device and repair - * is already handled on the lower compressed bio. - */ - if (mirror > 0) - repair = true; - } else { + } else if (!is_data_inode(inode)) { struct extent_buffer *eb; eb = find_extent_buffer_readpage(fs_info, page, start); @@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) atomic_dec(&eb->io_pages); } - if (repair) { - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. 
- */ - submit_data_read_repair(inode, bbio, bio_offset, bvec, - error_bitmap); - } else { - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); - } + /* Update page status and unlock. */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig = false; - int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + ASSERT(bio_ctrl->len_to_oe_boundary); if (bio_ctrl->compress_type != compress_type) return 0; @@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (!contig) return 0; - real_size = min(bio_ctrl->len_to_oe_boundary, - bio_ctrl->len_to_stripe_boundary) - bio_size; - real_size = min(real_size, size); + real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); /* * If real_size is 0, never call bio_add_*_page(), as even size is 0, @@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (real_size == 0) return 0; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); - else - ret = bio_add_page(bio, page, real_size, pg_offset); - - return ret; + return bio_add_page(bio, page, real_size, pg_offset); } -static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, 
u64 file_offset) +static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_io_geometry geom; struct btrfs_ordered_extent *ordered; - struct extent_map *em; - u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); - int ret; /* - * Pages for compressed extent are never submitted to disk directly, - * thus it has no real boundary, just set them to U32_MAX. - * - * The split happens for real compressed bio, which happens in - * btrfs_submit_compressed_read/write(). + * Limit the extent to the ordered boundary for Zone Append. + * Compressed bios aren't submitted directly, so it doesn't apply to + * them. */ - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - bio_ctrl->len_to_stripe_boundary = U32_MAX; - return 0; - } - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), - logical, &geom); - free_extent_map(em); - if (ret < 0) { - return ret; - } - if (geom.len > U32_MAX) - bio_ctrl->len_to_stripe_boundary = U32_MAX; - else - bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - - if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; - } - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!ordered) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && + btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); + return; + } } - bio_ctrl->len_to_oe_boundary = min_t(u32, 
U32_MAX, - ordered->disk_bytenr + ordered->disk_num_bytes - logical); - btrfs_put_ordered_extent(ordered); - return 0; + bio_ctrl->len_to_oe_boundary = U32_MAX; } -static int alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, - blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, blk_opf_t opf, + u64 disk_bytenr, u32 offset, u64 file_offset, + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; - int ret; - ASSERT(bio_ctrl->end_io_func); - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, + NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = file_offset; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); - if (ret < 0) - goto error; + calc_bio_boundaries(bio_ctrl, inode, file_offset); if (wbc) { /* - * For Zone append we need the correct block_device that we are - * going to write to set in the bio to be able to respect the - * hardware limitation. Look it up here: + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. 
*/ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *dev; - - dev = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto error; - } - - bio_set_dev(bio, dev->bdev); - } else { - /* - * Otherwise pick the last added device to support - * cgroup writeback. For multi-device file systems this - * means blk-cgroup policies have to always be set on the - * last added/replaced device. This is a bit odd but has - * been like that for a long time. - */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - } + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, bio); - } else { - ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } - return 0; -error: - bio_ctrl->bio = NULL; - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return ret; } /* @@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, enum btrfs_compression_type compress_type, bool force_bio_submit) { - int ret = 0; struct btrfs_inode *inode = BTRFS_I(page->mapping->host); unsigned int cur = pg_offset; @@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { - ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - disk_bytenr, offset, - page_offset(page) + cur, - compress_type); - if (ret < 0) - return ret; + alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, + offset, page_offset(page) + cur, + compress_type); } /* * We must go through btrfs_bio_add_page() to ensure each @@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * find_next_dirty_byte() are all exclusive */ iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - - if (btrfs_use_zone_append(inode, em->block_start)) - op = REQ_OP_ZONE_APPEND; - free_extent_map(em); em = NULL; @@ -2361,13 +1911,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) mapping_set_error(page->mapping, -EIO); /* 
- * If we error out, we should add back the dirty_metadata_bytes - * to make it consistent. - */ - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - eb->len, fs_info->dirty_metadata_batch); - - /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's @@ -3826,6 +3369,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); @@ -4019,6 +3563,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); @@ -4722,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -void clear_extent_buffer_dirty(const struct extent_buffer *eb) +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; int i; int num_pages; struct page *page; + btrfs_assert_tree_write_locked(eb); + + if (trans && btrfs_header_generation(eb) != trans->transid) + return; + + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + return; + + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, + fs_info->dirty_metadata_batch); + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c82448b2e0..4341ad978fb8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,6 +11,8 @@ #include "ulist.h" 
#include "misc.h" +struct btrfs_trans_handle; + enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -60,11 +62,9 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_fs_info; -struct io_failure_record; struct extent_io_tree; struct btrfs_tree_parent_check; @@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -void clear_extent_buffer_dirty(const struct extent_buffer *eb); bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); @@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the sector is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - /* Use rb_simple_node for search/insert */ - struct { - struct rb_node rb_node; - u64 bytenr; - }; - struct page *page; - u64 len; - u64 logical; - int this_mirror; - int failed_mirror; - int num_copies; -}; - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5de73466b2ca..41c77a100853 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, /* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. - * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
*/ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_bio *bbio = NULL; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_bytenr; - u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; blk_status_t ret = BLK_STS_OK; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; @@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst if (!path) return BLK_STS_RESOURCE; - if (!dst) { - bbio = btrfs_bio(bio); - - if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } - } else { - bbio->csum = bbio->csum_inline; + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { + btrfs_free_path(path); + return BLK_STS_RESOURCE; } - csum = bbio->csum; } else { - csum = dst; + bbio->csum = bbio->csum_inline; } /* @@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. 
*/ - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> fs_info->sectorsize_bits; - csum_dst = csum + sector_offset * csum_size; + csum_dst = bbio->csum + sector_offset * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); - if (bbio) - btrfs_bio_free_csum(bbio); + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + bbio->csum = NULL; break; } @@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst memset(csum_dst, 0, csum_size); count = 1; - if (BTRFS_I(inode)->root->root_key.objectid == + if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; int ret; - ret = search_file_offset_in_bio(bio, inode, + ret = search_file_offset_in_bio(bio, + &inode->vfs_inode, cur_disk_bytenr, &file_offset); if (ret) set_extent_bits(io_tree, file_offset, @@ -784,23 +772,16 @@ fail: /* * Calculate checksums of the data contained inside a bio. - * - * @inode: Owner of the data inside the bio - * @bio: Contains the data to be checksummed - * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the - * file offsets are determined from the page offsets in the bio. - * Otherwise, this is the starting file offset of the bio vecs in - * @bio, which must be contiguous. - * @one_ordered: If true, @bio only refers to one ordered extent. 
*/ -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered) +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; - const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; @@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (use_page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; - if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); /* @@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - 1); for (i = 0; i < blockcount; i++) { - if (!one_ordered && + if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && !in_range(offset, ordered->file_offset, ordered->num_bytes)) { unsigned long bytes_left; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 031225668434..cd7f2ae515c0 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); 
-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af046d22300e..5cc5a1faaef5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - btrfs_clean_tree_block(free_space_root->node); + btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 5553e1f8afe8..31c1648bc0b4 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct 
btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 37b86acfcbcf..4c477eae6891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/btrfs_tree.h> #include <linux/sizes.h> @@ -125,6 +126,12 @@ enum { */ BTRFS_FS_NO_OVERCOMMIT, + /* + * Indicate if we have some features changed, this is mostly for + * cleaner thread to update the sysfs interface. + */ + BTRFS_FS_FEATURE_CHANGED, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -742,8 +749,10 @@ struct btrfs_fs_info { */ u64 zone_size; - /* Max size to emit ZONE_APPEND write command */ + /* Constraints for ZONE_APPEND commands: */ + struct queue_limits limits; u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd43..6c18dc9a1831 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,27 +84,12 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct btrfs_inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ + /* Range of I/O */ u64 file_offset; - /* Used for bio::bi_size */ u32 bytes; - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. 
- */ - refcount_t refs; - - /* Array of checksums */ - u8 *csums; - /* This must be last */ - struct bio bio; + struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; @@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { @@ -2536,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) -{ - return btrfs_csum_one_bio(inode, bio, (u64)-1, false); -} - -/* * Split an extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). 
@@ -2663,19 +2635,19 @@ out: return ret; } -static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - struct bio *bio, loff_t file_offset) +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; struct btrfs_ordered_extent *ordered; - u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 file_len; - u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; u64 pre, post; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); + ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; @@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; - ret = split_zoned_em(inode, file_offset, file_len, pre, post); + ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); out: btrfs_put_ordered_extent(ordered); @@ -2723,75 +2695,6 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(inode, bio, - page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - - /* - * If we need to checksum, and the I/O is not issued by fsync and - * friends, that is ->sync_writers != 0, defer the submission to a - * workqueue to parallelize it. - * - * Csum items for reloc roots have already been cloned at this point, - * so they are handled as part of the no-checksum case. 
- */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { - if (!atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) - return; - - ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - btrfs_submit_bio(fs_info, bio, mirror_num); -} - -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing the bio - * if there were any errors, so just return here. - */ - btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); - return; - } - - /* Save the original iter for read repair */ - btrfs_bio(bio)->iter = bio->bi_iter; - - /* - * Lookup bio sums does extra checks around whether we need to csum or - * not, which is why we ignore skip_sum here. - */ - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. @@ -2969,7 +2872,7 @@ again: unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid bdev implies a write on a sequential zone */ - if (ordered_extent->bdev) { + /* A valid ->physical implies a write on a sequential zone. 
*/ + if (ordered_extent->physical != (u64)-1) { btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } - btrfs_free_io_failure_record(inode, start, end); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* - * check_data_csum - verify checksum of one sector of uncompressed data - * @inode: inode - * @bbio: btrfs_bio which contains the csum + * Verify the checksum of a single data sector. + * + * @bbio: btrfs_io_bio which contains the csum + * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) - * @page: page where is the data to be verified - * @pgoff: offset inside the page + * @bv: bio_vec to check * - * The length of such check is always one sector size. + * Check if the checksum on a data block is valid. When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero. * - * When csum mismatch is detected, we will also report the error and fill the - * corrupted range with zero. (Thus it needs the extra parameters) + * Return %true if the sector is ok or had no checksum to start with, else %false. 
*/ -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff) +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len = fs_info->sectorsize; + u64 file_offset = bbio->file_offset + bio_offset; + u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - ASSERT(pgoff + len <= PAGE_SIZE); + ASSERT(bv->bv_len == fs_info->sectorsize); - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (!bbio->csum) + return true; - if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) + if (btrfs_is_data_reloc_root(inode->root) && + test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); + return true; + } + + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) goto zeroit; - return 0; + return true; zeroit: - btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, - csum, csum_expected, bbio->mirror_num); - if (bbio->device) - btrfs_dev_stat_inc_and_print(bbio->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_page(page, pgoff, len); - return -EIO; -} - -/* - * When reads are done, we need to check csums to verify the data is correct. - * if there's a match, we allow the bio to finish. If not, the code in - * extent_io.c will try to find good copies for us. - * - * @bio_offset: offset to the beginning of the bio (in bytes) - * @start: file offset of the range start - * @end: file offset of the range end (inclusive) - * - * Return a bitmap where bit set means a csum mismatch, and bit not set means - * csum match. 
- */ -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - const u32 sectorsize = root->fs_info->sectorsize; - u32 pg_off; - unsigned int result = 0; - - /* - * This only happens for NODATASUM or compressed read. - * Normally this should be covered by above check for compressed read - * or the next check for NODATASUM. Just do a quicker exit here. - */ - if (bbio->csum == NULL) - return 0; - - if (inode->flags & BTRFS_INODE_NODATASUM) - return 0; - - if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) - return 0; - - ASSERT(page_offset(page) <= start && - end <= page_offset(page) + PAGE_SIZE - 1); - for (pg_off = offset_in_page(start); - pg_off < offset_in_page(end); - pg_off += sectorsize, bio_offset += sectorsize) { - u64 file_offset = pg_off + page_offset(page); - int ret; - - if (btrfs_is_data_reloc_root(root) && - test_range_bit(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, 1, NULL)) { - /* Skip the range without csum for data reloc inode */ - clear_extent_bits(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); - continue; - } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); - if (ret < 0) { - const int nr_bit = (pg_off - offset_in_page(start)) >> - root->fs_info->sectorsize_bits; - - result |= (1U << nr_bit); - } - } - return result; + btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, + bbio->mirror_num); + if (dev) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + memzero_bvec(bv); + return false; } /* @@ -4987,7 +4834,7 @@ again: unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - 
btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -5281,7 +5128,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) return ret; } -static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -5291,7 +5138,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -5302,12 +5149,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } if (attr->ia_valid) { - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + err = posix_acl_chmod(idmap, dentry, inode->i_mode); } return err; @@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; @@ -6724,7 +6569,7 @@ out_inode: return err; } -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -6732,13 +6577,13 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, 
inode->i_mode, rdev); return btrfs_create_common(dir, dentry, inode); } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -6746,7 +6591,7 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; @@ -6837,7 +6682,7 @@ fail: return err; } -static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -6845,7 +6690,7 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); + inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; return btrfs_create_common(dir, dentry, inode); @@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - - if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) - iomap->flags |= IOMAP_F_ZONE_APPEND; - free_extent_map(em); return 0; @@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } -static void btrfs_dio_private_put(struct btrfs_dio_private *dip) -{ - /* - * This implies a barrier so that stores to dio_bio->bi_status before - * this and loads of dio_bio->bi_status after this are fully ordered. - */ - if (!refcount_dec_and_test(&dip->refs)) - return; - - if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(dip->inode, NULL, - dip->file_offset, dip->bytes, - !dip->bio.bi_status); - } else { - unlock_extent(&dip->inode->io_tree, - dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - kfree(dip->csums); - bio_endio(&dip->bio); -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); - - refcount_inc(&dip->refs); - btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); -} - -static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, - struct btrfs_bio *bbio, - const bool uptodate) -{ - struct inode *inode = &dip->inode->vfs_inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - blk_status_t err = BLK_STS_OK; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (uptodate && - (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, - 
bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(BTRFS_I(inode), start, - bv.bv_page, bv.bv_offset); - } else { - int ret; - - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, false); - if (ret) - err = errno_to_blk_status(ret); - } - } - - return err; -} - -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) +static void btrfs_dio_end_io(struct btrfs_bio *bbio) { - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - -static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = bbio->private; + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; - blk_status_t err = bio->bi_status; - - if (err) - btrfs_warn(dip->inode->root->fs_info, - "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), - bio->bi_opf, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); - - if (err) - dip->bio.bi_status = err; - - btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); - - bio_put(bio); - btrfs_dio_private_put(dip); -} -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - blk_status_t ret; - - /* Save the original iter for read repair */ - if (btrfs_op(bio) == BTRFS_MAP_READ) - btrfs_bio(bio)->iter = bio->bi_iter; - - if (inode->flags & BTRFS_INODE_NODATASUM) - goto map; + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } - if 
(btrfs_op(bio) == BTRFS_MAP_WRITE) { - /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, - WQ_SUBMIT_DATA_DIO)) - return; + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, + dip->bytes, !bio->bi_status); + else + unlock_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. - */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } else { - btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, - file_offset - dip->file_offset); - } -map: - btrfs_submit_bio(fs_info, bio, 0); + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); } -static void btrfs_submit_direct(const struct iomap_iter *iter, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) { + struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = - container_of(dio_bio, struct btrfs_dio_private, bio); - struct inode *inode = iter->inode; - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const bool raid56 = (btrfs_data_alloc_profile(fs_info) & - BTRFS_BLOCK_GROUP_RAID56_MASK); - struct bio *bio; - u64 start_sector; - int async_submit = 0; - u64 submit_len; - u64 clone_offset = 0; - u64 clone_len; - u64 logical; - int ret; - blk_status_t status; - struct btrfs_io_geometry geom; + container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - struct extent_map *em = NULL; - - dip->inode = BTRFS_I(inode); - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - refcount_set(&dip->refs, 1); - 
dip->csums = NULL; - - if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - unsigned int nr_sectors = - (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - - /* - * Load the csums up front to reduce csum tree searches and - * contention when submitting bios. - */ - status = BLK_STS_RESOURCE; - dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip->csums) - goto out_err; - - status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); - if (status != BLK_STS_OK) - goto out_err; - } - - start_sector = dio_bio->bi_iter.bi_sector; - submit_len = dio_bio->bi_iter.bi_size; - - do { - logical = start_sector << 9; - em = btrfs_get_chunk_map(fs_info, logical, submit_len); - if (IS_ERR(em)) { - status = errno_to_blk_status(PTR_ERR(em)); - em = NULL; - goto out_err_em; - } - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, &geom); - if (ret) { - status = errno_to_blk_status(ret); - goto out_err_em; - } - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); + btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + bbio->file_offset = file_offset; - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. - */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, - btrfs_end_dio_bio, dip); - btrfs_bio(bio)->file_offset = file_offset; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - status = extract_ordered_extent(BTRFS_I(inode), bio, - file_offset); - if (status) { - bio_put(bio); - goto out_err; - } - } - - ASSERT(submit_len >= clone_len); - submit_len -= clone_len; - - /* - * Increase the count before we submit the bio so we know - * the end IO handler won't happen before we increase the - * count. Otherwise, the dip might get freed before we're - * done setting it up. - * - * We transfer the initial reference to the last bio, so we - * don't need to increment the reference count for the last one. 
- */ - if (submit_len > 0) { - refcount_inc(&dip->refs); - /* - * If we are submitting more than one bio, submit them - * all asynchronously. The exception is RAID 5 or 6, as - * asynchronous checksums make it difficult to collect - * full stripe writes. - */ - if (!raid56) - async_submit = 1; - } - - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); - - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; - file_offset += clone_len; - - free_extent_map(em); - } while (submit_len > 0); - return; + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; -out_err_em: - free_extent_map(em); -out_err: - dio_bio->bi_status = status; - btrfs_dio_private_put(dip); + dio_data->submitted += bio->bi_iter.bi_size; + btrfs_submit_bio(bio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_submit_direct, + .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; @@ -8552,7 +8173,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -8802,7 +8423,7 @@ out: return ret; } -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; @@ -8813,7 +8434,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, * Subvolumes don't inherit the sgid bit or the parent's gid if * the parent's sgid bit is set. This is probably a bug. 
*/ - inode_init_owner(mnt_userns, inode, NULL, + inode_init_owner(idmap, inode, NULL, S_IFDIR | (~current_umask() & S_IRWXUGO)); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); - spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bio), + offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) goto fail; @@ -9004,7 +8623,7 @@ fail: return -ENOMEM; } -static int btrfs_getattr(struct user_namespace *mnt_userns, +static int btrfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -9034,7 +8653,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9289,14 +8908,14 @@ out_notrans: return ret; } -static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, +static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { - inode_init_owner(mnt_userns, inode, dir, + inode_init_owner(idmap, inode, dir, S_IFCHR | WHITEOUT_MODE); 
inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); @@ -9304,7 +8923,7 @@ static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, return inode; } -static int btrfs_rename(struct user_namespace *mnt_userns, +static int btrfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -9376,7 +8995,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, filemap_flush(old_inode->i_mapping); if (flags & RENAME_WHITEOUT) { - whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + whiteout_args.inode = new_whiteout_inode(idmap, old_dir); if (!whiteout_args.inode) { ret = -ENOMEM; goto out_fscrypt_names; @@ -9545,7 +9164,7 @@ out_fscrypt_names: return ret; } -static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -9558,7 +9177,7 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); else - ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); @@ -9758,7 +9377,7 @@ out: return ret; } -static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -9786,7 +9405,7 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + 
inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; @@ -10075,7 +9694,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_permission(struct user_namespace *mnt_userns, +static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10088,10 +9707,10 @@ static int btrfs_permission(struct user_namespace *mnt_userns, if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } -static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -10109,7 +9728,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; @@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; - bool skip_csum; }; -static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, - struct bio *bio, int mirror_num) -{ - struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (!priv->skip_csum) { - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) - return ret; - } - - atomic_inc(&priv->pending); - btrfs_submit_bio(fs_info, bio, mirror_num); - return BLK_STS_OK; -} - -static 
blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) -{ - const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->private; - struct btrfs_inode *inode = priv->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - u32 bio_offset = 0; - - if (priv->skip_csum || !uptodate) - return bbio->bio.bi_status; - - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; - for (i = 0; i < nr_sectors; i++) { - ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(inode, bbio, bio_offset, - bvec->bv_page, pgoff)) - return BLK_STS_IOERR; - bio_offset += sectorsize; - pgoff += sectorsize; - } - } - return BLK_STS_OK; -} - static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; - blk_status_t status; - status = btrfs_encoded_read_verify_csum(bbio); - if (status) { + if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the @@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). 
*/ - WRITE_ONCE(priv->status, status); + WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .inode = inode, .file_offset = file_offset, .pending = ATOMIC_INIT(1), - .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), }; unsigned long i = 0; u64 cur = 0; - int ret; init_waitqueue_head(&priv.wait); - /* - * Submit bios for the extent, splitting due to bio or stripe limits as - * necessary. - */ + /* Submit bios for the extent, splitting due to bio limits as necessary. */ while (cur < disk_io_size) { - struct extent_map *em; - struct btrfs_io_geometry geom; struct bio *bio = NULL; - u64 remaining; + u64 remaining = disk_io_size - cur; - em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, - disk_io_size - cur); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - } else { - ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, - disk_bytenr + cur, &geom); - free_extent_map(em); - } - if (ret) { - WRITE_ONCE(priv.status, errno_to_blk_status(ret)); - break; - } - remaining = min(geom.len, disk_io_size - cur); while (bio || remaining) { size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + inode, btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = @@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { - blk_status_t status; - - status = submit_encoded_read_bio(inode, bio, 0); - if (status) { - WRITE_ONCE(priv.status, status); - bio_put(bio); - goto out; - } + atomic_inc(&priv.pending); + btrfs_submit_bio(bio, 0); bio = NULL; continue; } @@ 
-10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } -out: if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. */ @@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e348bd2ccde..84626c8ad5bf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -243,7 +243,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int btrfs_fileattr_set(struct user_namespace *mnt_userns, +int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -578,7 +578,7 @@ static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit return num_items; } -static noinline int create_subvol(struct user_namespace *mnt_userns, +static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { @@ -623,7 +623,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, if (ret < 0) goto out_root_item; - new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { ret = -ENOMEM; goto out_anon_dev; @@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * exists). 
*/ btrfs_tree_lock(leaf); - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); @@ -898,7 +898,7 @@ free_pending: * nfs_async_unlink(). */ -static int btrfs_may_delete(struct user_namespace *mnt_userns, +static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -909,12 +909,12 @@ static int btrfs_may_delete(struct user_namespace *mnt_userns, BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, d_inode(victim)) || + if (check_sticky(idmap, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; @@ -933,16 +933,16 @@ static int btrfs_may_delete(struct user_namespace *mnt_userns, } /* copy of may_create in fs/namei.c() */ -static inline int btrfs_may_create(struct user_namespace *mnt_userns, +static inline int btrfs_may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* @@ -951,7 +951,7 @@ static inline int btrfs_may_create(struct user_namespace *mnt_userns, * inside this filesystem so it's quite a bit simpler. 
*/ static noinline int btrfs_mksubvol(const struct path *parent, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, @@ -967,12 +967,12 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(mnt_userns, name, parent->dentry, namelen); + dentry = lookup_one(idmap, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(mnt_userns, dir, dentry); + error = btrfs_may_create(idmap, dir, dentry); if (error) goto out_dput; @@ -993,7 +993,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, inherit); + error = create_subvol(idmap, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1007,7 +1007,7 @@ out_unlock: } static noinline int btrfs_mksnapshot(const struct path *parent, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *root, bool readonly, @@ -1037,7 +1037,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - ret = btrfs_mksubvol(parent, mnt_userns, name, namelen, + ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); out: if (snapshot_force_cow) @@ -1240,7 +1240,7 @@ out_drop: } static noinline int __btrfs_ioctl_snap_create(struct file *file, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1268,7 +1268,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, mnt_userns, name, + ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, 
readonly, inherit); } else { struct fd src = fdget(fd); @@ -1283,14 +1283,14 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(mnt_userns, src_inode)) { + } else if (!inode_owner_or_capable(idmap, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else { - ret = btrfs_mksnapshot(&file->f_path, mnt_userns, + ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, BTRFS_I(src_inode)->root, readonly, inherit); @@ -1317,7 +1317,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); @@ -1377,7 +1377,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, readonly, inherit); if (ret) @@ -1422,7 +1422,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(file_mnt_user_ns(file), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -1870,7 +1870,7 @@ out: return ret; } -static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, +static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { @@ -1962,7 +1962,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(mnt_userns, temp_inode, + 
ret = inode_permission(idmap, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { @@ -2101,7 +2101,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) return -EACCES; } - ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args); + ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2335,7 +2335,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int err = 0; @@ -2428,7 +2428,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * anywhere in the filesystem the user wouldn't be able * to delete without an idmapped mount. 
*/ - if (old_dir != dir && mnt_userns != &init_user_ns) { + if (old_dir != dir && idmap != &nop_mnt_idmap) { err = -EOPNOTSUPP; goto free_parent; } @@ -2471,7 +2471,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto free_subvol_name; - dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -2513,13 +2513,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC); + err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(mnt_userns, dir, dentry, 1); + err = btrfs_may_delete(idmap, dir, dentry, 1); if (err) goto out_dput; @@ -2582,7 +2582,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) * running and allows defrag on files open in read-only mode. 
*/ if (!capable(CAP_SYS_ADMIN) && - inode_permission(&init_user_ns, inode, MAY_WRITE)) { + inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) { ret = -EPERM; goto out; } @@ -3907,7 +3907,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, } static long _btrfs_ioctl_set_received_subvol(struct file *file, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); @@ -3919,7 +3919,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -4024,7 +4024,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; - ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64); if (ret) goto out; @@ -4058,7 +4058,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, if (IS_ERR(sa)) return PTR_ERR(sa); - ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa); if (ret) goto out; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 8a855d5ac2fa..d51b9a2f2f6e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -6,7 +6,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, +int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void 
btrfs_sync_inode_flags_to_i_flags(struct inode *inode); diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c new file mode 100644 index 000000000000..0fe0ae54ac67 --- /dev/null +++ b/fs/btrfs/lru_cache.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include "lru_cache.h" +#include "messages.h" + +/* + * Initialize a cache object. + * + * @cache: The cache. + * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size, it's the user's responsibility to + * trim the cache in that case. + */ +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) +{ + INIT_LIST_HEAD(&cache->lru_list); + mt_init(&cache->entries); + cache->size = 0; + cache->max_size = max_size; +} + +static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, + u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + list_for_each_entry(entry, head, list) { + if (entry->key == key && entry->gen == gen) + return entry; + } + + return NULL; +} + +/* + * Look up an entry in the cache. + * + * @cache: The cache. + * @key: The key of the entry we are looking for. + * @gen: Generation associated to the key. + * + * Returns the entry associated with the key or NULL if none found. + */ +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen) +{ + struct list_head *head; + struct btrfs_lru_cache_entry *entry; + + head = mtree_load(&cache->entries, key); + if (!head) + return NULL; + + entry = match_entry(head, key, gen); + if (entry) + list_move_tail(&entry->lru_list, &cache->lru_list); + + return entry; +} + +/* + * Remove an entry from the cache. + * + * @cache: The cache to remove from. + * @entry: The entry to remove from the cache. + * + * Note: this also frees the memory used by the entry.
+ */ +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry) +{ + struct list_head *prev = entry->list.prev; + + ASSERT(cache->size > 0); + ASSERT(!mtree_empty(&cache->entries)); + + list_del(&entry->list); + list_del(&entry->lru_list); + + if (list_empty(prev)) { + struct list_head *head; + + /* + * If previous element in the list entry->list is now empty, it + * means it's a head entry not pointing to any cached entries, + * so remove it from the maple tree and free it. + */ + head = mtree_erase(&cache->entries, entry->key); + ASSERT(head == prev); + kfree(head); + } + + kfree(entry); + cache->size--; +} + +/* + * Store an entry in the cache. + * + * @cache: The cache. + * @new_entry: The entry to store. + * + * Returns 0 on success and < 0 on error. + */ +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp) +{ + const u64 key = new_entry->key; + struct list_head *head; + int ret; + + head = kmalloc(sizeof(*head), gfp); + if (!head) + return -ENOMEM; + + ret = mtree_insert(&cache->entries, key, head, gfp); + if (ret == 0) { + INIT_LIST_HEAD(head); + list_add_tail(&new_entry->list, head); + } else if (ret == -EEXIST) { + kfree(head); + head = mtree_load(&cache->entries, key); + ASSERT(head != NULL); + if (match_entry(head, key, new_entry->gen) != NULL) + return -EEXIST; + list_add_tail(&new_entry->list, head); + } else if (ret < 0) { + kfree(head); + return ret; + } + + if (cache->max_size > 0 && cache->size == cache->max_size) { + struct btrfs_lru_cache_entry *lru_entry; + + lru_entry = list_first_entry(&cache->lru_list, + struct btrfs_lru_cache_entry, + lru_list); + btrfs_lru_cache_remove(cache, lru_entry); + } + + list_add_tail(&new_entry->lru_list, &cache->lru_list); + cache->size++; + + return 0; +} + +/* + * Empty a cache. + * + * @cache: The cache to empty. + * + * Removes all entries from the cache.
+ */ +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) +{ + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) + btrfs_lru_cache_remove(cache, entry); + + ASSERT(cache->size == 0); + ASSERT(mtree_empty(&cache->entries)); +} diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h new file mode 100644 index 000000000000..de3e18bce24a --- /dev/null +++ b/fs/btrfs/lru_cache.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_LRU_CACHE_H +#define BTRFS_LRU_CACHE_H + +#include <linux/maple_tree.h> +#include <linux/list.h> + +/* + * A cache entry. This is meant to be embedded in a structure of a user of + * this module. Similar to how struct list_head and struct rb_node are used. + * + * Note: it should be embedded as the first element in a struct (offset 0), and + * this module assumes it was allocated with kmalloc(), so it calls kfree() when + * it needs to free an entry. + */ +struct btrfs_lru_cache_entry { + struct list_head lru_list; + u64 key; + /* + * Optional generation associated to a key. Use 0 if not needed/used. + * Entries with the same key and different generations are stored in a + * linked list, so use this only for cases where there's a small number + * of different generations. + */ + u64 gen; + /* + * The maple tree uses unsigned long type for the keys, which is 32 bits + * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to + * use something like inode numbers as keys, which are always a u64, we + * have to deal with this in a special way - we store the key in the + * entry itself, as a u64, and the values inserted into the maple tree + * are linked lists of entries - so in case we are on a 64 bits system, + * that list always has a single entry, while on 32 bits systems it + * may have more than one, with each entry having the same value for + * their lower 32 bits of the u64 key. 
+ */ + struct list_head list; +}; + +struct btrfs_lru_cache { + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + unsigned int size; + /* Maximum number of entries the cache can have. */ + unsigned int max_size; +}; + +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + +static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) +{ + return cache->size; +} + +static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +{ + return cache->size >= cache->max_size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen); +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp); +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry); +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); + +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc..71f6d8302d50 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608..fde5aaa6e7c9 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -293,36 +293,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * We only mark the 
transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - -/* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. */ diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9..8c516ee58ff9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include <linux/types.h> struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1..6c24b69e2d0a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. 
* - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. */ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); - if (wait) { - if (!freespace_inode) - btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } /* @@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 89f82b78f590..eb40cb39f842 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { * command in a 
workqueue context */ u64 physical; - struct block_device *bdev; }; static inline void @@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index af97413abcf4..52a7d2fa2284 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del(&quota_root->dirty_list); btrfs_tree_lock(quota_root->node); - btrfs_clean_tree_block(quota_root->node); + btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a2cf754912d..642828c1b299 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* - * Return the total numer of errors found in the vertical stripe of @sector_nr. + * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. @@ -1183,7 +1183,15 @@ not_found: trace_info->stripe_nr = -1; } -/* Generate PQ for one veritical stripe. */ +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +/* Generate PQ for one vertical stripe.
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; @@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; @@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, return 0; error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); + bio_list_put(bio_list); return -EIO; } @@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Uptodate directly for + * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, @@ -1425,13 +1431,20 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int i; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; - bitmap_set(rbio->error_bitmap, total_sector_nr, - bio_size >> rbio->bioc->fs_info->sectorsize_bits); + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. + */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. 
*/ @@ -1490,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) wake_up(&rbio->io_wait); } -static void submit_read_bios(struct btrfs_raid_bio *rbio, +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; @@ -1507,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } submit_bio(bio); } -} - -static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) -{ - struct bio *bio; - int total_sector_nr; - int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - - /* - * Build a list of bios to read all sectors (including data and P/Q). - * - * This behaviro is to compensate the later csum verification and - * recovery. - */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; - - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; - } - return 0; - -cleanup: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) @@ -1660,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - ret = PTR_ERR(rbio); - goto fail; + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1674,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if 
(rbio_is_full(rbio)) - goto queue_rbio; - - cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - return; } -queue_rbio: + /* * Either we don't have any existing plug, or we're doing a full stripe, - * can queue the rmw work now. + * queue the rmw work now. */ start_async_work(rbio, rmw_rbio_work); - - return; - -fail: - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); } static int verify_one_sector(struct btrfs_raid_bio *rbio, @@ -1765,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* - * No errors in the veritical stripe, skip it. Can happen for recovery + * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. 
*/ if (!found_errors) @@ -1886,7 +1859,7 @@ pstripe: sector->uptodate = 1; } if (failb >= 0) { - ret = verify_one_sector(rbio, faila, sector_nr); + ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup; @@ -1941,14 +1914,25 @@ out: return ret; } -static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. @@ -1979,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret < 0) - goto error; + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } } - return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - - return -EIO; -} - -static int recover_rbio(struct btrfs_raid_bio *rbio) -{ - struct bio_list bio_list; - struct bio *bio; - int ret; - - /* - * Either we're doing recover for a read failure or degraded write, - * caller should have set error bitmap correctly. - */ - ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); - bio_list_init(&bio_list); - - /* For recovery, we need to read all sectors including P/Q. 
*/ - ret = alloc_rbio_pages(rbio); - if (ret < 0) - goto out; - - index_rbio_pages(rbio); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); - out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) @@ -2196,11 +2134,9 @@ no_csum: static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; /* * Fill the data csums we need for data verification. We need to fill @@ -2209,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ fill_data_csums(rbio); - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. 
+ */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. */ - ret = recover_sectors(rbio); - return ret; -out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) @@ -2282,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) return false; } -static int rmw_rbio(struct btrfs_raid_bio *rbio) +static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2294,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) - return ret; + goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ - if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) - goto write; - - /* - * Now we're doing sub-stripe write, also need all data stripes to do - * the full RMW. - */ - ret = alloc_rbio_data_pages(rbio); - if (ret < 0) - return ret; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. 
+ */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; - index_rbio_pages(rbio); + index_rbio_pages(rbio); - ret = rmw_read_wait_recover(rbio); - if (ret < 0) - return ret; + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } -write: /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe @@ -2348,7 +2290,7 @@ write: bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) - return ret; + goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); @@ -2365,32 +2307,22 @@ write: break; } } - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* @@ -2498,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2629,8 +2560,7 @@ submit_write: return 0; cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); return ret; } @@ -2725,15 +2655,12 @@ out: return ret; } -static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct bio *bio; 
+ struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2762,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret) - goto error; + if (ret) { + bio_list_put(&bio_list); + return ret; + } } + + submit_read_wait_bio_list(rbio, &bio_list); return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; } -static int scrub_rbio(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; - struct bio_list bio_list; int sector_nr; int ret; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = scrub_assemble_read_bios(rbio, &bio_list); + ret = scrub_assemble_read_bios(rbio); if (ret < 0) - goto cleanup; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) - goto cleanup; + goto out; /* * We have every sector properly prepared. 
Can finish the scrub @@ -2817,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) break; } } - return ret; - -cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - ret = scrub_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 7c73a443939e..df0e0abdeb1f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -65,7 +65,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* * Checksum buffer if the rbio is for data. The buffer should cover - * all data sectors (exlcuding P/Q sectors). + * all data sectors (excluding P/Q sectors). */ u8 *csum_buf; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce..ef13a9d4e370 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 
*/ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b346795f66..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { }; #ifndef CONFIG_64BIT -/* This structure is for archtectures whose (void *) is smaller than u64 */ +/* This structure is for architectures whose (void *) is smaller than u64 */ struct scrub_page_private { u64 logical; }; @@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; } - if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + } - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + } shash->tfm = fs_info->csum_shash; 
crypto_shash_init(shash); @@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if (sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + } +out: return sblock->header_error || sblock->checksum_error; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e65e6b6600a7..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -32,6 +32,7 @@ #include "file-item.h" #include "ioctl.h" #include "verity.h" +#include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -80,23 +81,23 @@ struct clone_root { bool found_ref; }; -#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 -#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +#define SEND_MAX_NAME_CACHE_SIZE 256 /* - * Limit the root_ids array of struct backref_cache_entry to 12 elements. - * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. 
* The most common case is to have a single root for cloning, which corresponds - * to the send root. Having the user specify more than 11 clone roots is not + * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of - * cloning roots that lead down to a leaf is more than 12. + * cloning roots that lead down to a leaf is more than 17. */ -#define SEND_MAX_BACKREF_CACHE_ROOTS 12 +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. - * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding - * maple tree's internal nodes, is 16K. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 @@ -107,15 +108,31 @@ struct clone_root { * x86_64). */ struct backref_cache_entry { - /* List to link to the cache's lru list. */ - struct list_head list; - /* The key for this entry in the cache. */ - u64 key; + struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories that were already + * created. 
The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -174,9 +191,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; - struct list_head name_cache_list; - int name_cache_size; + struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we @@ -285,13 +300,11 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; - struct { - u64 last_reloc_trans; - struct list_head lru_list; - struct maple_tree entries; - /* Number of entries stored in the cache. */ - int size; - } backref_cache; + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -321,21 +334,15 @@ struct orphan_dir_info { u64 ino; u64 gen; u64 last_dir_index_offset; + u64 dir_high_seq_ino; }; struct name_cache_entry { - struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * The key in the entry is an inode number, and the generation matches + * the inode's generation. 
*/ - struct list_head radix_list; - u64 ino; - u64 gen; + struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; @@ -344,6 +351,9 @@ struct name_cache_entry { char name[]; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 @@ -956,14 +966,12 @@ out: static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, return 0; } -static void empty_backref_cache(struct send_ctx *sctx) -{ - struct backref_cache_entry *entry; - struct backref_cache_entry *tmp; - - list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) - kfree(entry); - - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mtree_destroy(&sctx->backref_cache.entries); - sctx->backref_cache.size = 0; -} - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { @@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (sctx->backref_cache.size == 0) + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) return false; /* @@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. 
*/ - if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { - empty_backref_cache(sctx); + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); return false; } - entry = mtree_load(&sctx->backref_cache.entries, key); - if (!entry) + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) return false; + entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; - list_move_tail(&entry->list, &sctx->backref_cache.lru_list); return true; } @@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { @@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. + * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. 
*/ - - if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { - struct backref_cache_entry *lru_entry; - struct backref_cache_entry *mt_entry; - - lru_entry = list_first_entry(&sctx->backref_cache.lru_list, - struct backref_cache_entry, list); - mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); - ASSERT(mt_entry == lru_entry); - list_del(&mt_entry->list); - kfree(mt_entry); - sctx->backref_cache.size--; - } - - ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, - new_entry, GFP_NOFS); + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ @@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, return; } - list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); - /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (sctx->backref_cache.size == 0) - sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; - - sctx->backref_cache.size++; + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, @@ -1886,7 +1868,8 @@ enum inode_state { inode_state_did_delete, }; -static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; @@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 
0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; @@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { @@ -1953,14 +1940,15 @@ out: return ret; } -static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; - ret = get_cur_inode_state(sctx, ino, gen); + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) goto out; @@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { - int ret = 0; - u64 gen; + int ret; + u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) - goto out; + return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. 
*/ - if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->parent_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; /* * Check if the overwritten ref was already processed. If yes, the ref @@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) - goto out; + return ret; - ret = 1; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; - } else { - ret = 0; + return 1; } -out: - return ret; + return 0; } /* @@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 ino, u64 ino_gen, const char *name, int name_len) { - int ret = 0; - u64 gen; + int ret; u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) - goto out; + return ret; - if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->send_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. 
+ */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { + if (ret == -ENOENT) { /* was never and will never be overwritten */ - ret = 0; - goto out; + return 0; + } else if (ret < 0) { + return ret; } - ret = get_inode_gen(sctx->send_root, ow_inode, &gen); - if (ret < 0) - goto out; + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; - if (ow_inode == ino && gen == ino_gen) { - ret = 0; - goto out; + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; } /* @@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ - if ((ow_inode < sctx->send_progress) || - (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && - gen == sctx->cur_inode_gen)) - ret = 1; - else - ret = 0; + if (ow_inode < sctx->send_progress) + return 1; -out: - return ret; + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; } /* @@ -2285,113 +2264,16 @@ out: return ret; } -/* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, - * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. - * In case of error, nce is kfreed. 
- */ -static int name_cache_insert(struct send_ctx *sctx, - struct name_cache_entry *nce) +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) { - int ret = 0; - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); - if (!nce_head) { - kfree(nce); - return -ENOMEM; - } - INIT_LIST_HEAD(nce_head); - - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) { - kfree(nce_head); - kfree(nce); - return ret; - } - } - list_add_tail(&nce->radix_list, nce_head); - list_add_tail(&nce->list, &sctx->name_cache_list); - sctx->name_cache_size++; - - return ret; -} + struct btrfs_lru_cache_entry *entry; -static void name_cache_delete(struct send_ctx *sctx, - struct name_cache_entry *nce) -{ - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - btrfs_err(sctx->send_root->fs_info, - "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", - nce->ino, sctx->name_cache_size); - } - - list_del(&nce->radix_list); - list_del(&nce->list); - sctx->name_cache_size--; - - /* - * We may not get to the final release of nce_head if the lookup fails - */ - if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); - kfree(nce_head); - } -} - -static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, - u64 ino, u64 gen) -{ - struct list_head *nce_head; - struct name_cache_entry *cur; - - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); - if (!nce_head) + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { - if (cur->ino == ino && cur->gen == gen) - return cur; - } - return NULL; -} - -/* - * Remove some entries from the beginning of 
name_cache_list. - */ -static void name_cache_clean_unused(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) - return; - - while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } -} - -static void name_cache_free(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - while (!list_empty(&sctx->name_cache_list)) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } + return container_of(entry, struct name_cache_entry, entry); } /* @@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct name_cache_entry *nce = NULL; + struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same @@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { - name_cache_delete(sctx, nce); - kfree(nce); + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { - /* - * Removes the entry from the list and adds it back to - * the end. This marks the entry as recently used so - * that name_cache_clean_unused does not remove it. - */ - list_move_tail(&nce->list, &sctx->name_cache_list); - *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). 
*/ - ret = is_inode_existent(sctx, ino, gen); + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) goto out; @@ -2497,8 +2371,8 @@ out_cache: goto out; } - nce->ino = ino; - nce->gen = gen; + nce->entry.key = ino; + nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); @@ -2510,10 +2384,11 @@ out_cache: else nce->need_later_update = 1; - nce_ret = name_cache_insert(sctx, nce); - if (nce_ret < 0) + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); ret = nce_ret; - name_cache_clean_unused(sctx); + } out: return ret; @@ -2884,6 +2759,63 @@ out: } /* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. 
*/ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. @@ -2971,6 +2903,23 @@ out: return ret; } +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details. 
@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) struct btrfs_key di_key; struct btrfs_dir_item *di; + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; + cache_dir_created(sctx, dir); break; } } @@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) return 0; } - return send_create_inode(sctx, sctx->cur_ino); + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; } struct recorded_ref { @@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); @@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; @@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. 
@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. + */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. */ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - odi = get_orphan_dir_info(sctx, dir, dir_gen); - if (odi) - key.offset = odi->last_dir_index_offset; + key.offset = (odi ? 
odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; @@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } - if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; + if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } @@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, out: btrfs_free_path(path); - return ret; + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) @@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } gen = odi->gen; - ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); + ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) @@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3619,7 +3628,7 @@ finish: if (ret < 0) goto 
out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) @@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * the source path when performing its rename * operation. */ - if (is_waiting_for_move(sctx, ow_inode)) { - wdm = get_waiting_dir_move(sctx, - ow_inode); - ASSERT(wdm); + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) wdm->orphanized = true; - } /* * Make sure we clear our orphanized inode's @@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); - if (nce) { - name_cache_delete(sctx, nce); - kfree(nce); - } + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); /* * ow_inode might currently be an ancestor of @@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. 
*/ - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { @@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; + cache_dir_created(sctx, cur->dir); } } @@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { @@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, cur->dir_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { @@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. 
*/ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in @@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8073,10 +8092,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside - * access_ok. + * access_ok. 
Also set an upper limit for allocation size so this can't + * easily exhaust memory. Max number of clone sources is about 200K. */ - if (arg->clone_sources_count > - ULONG_MAX / sizeof(struct clone_root) - 1) { + if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { ret = -EINVAL; goto out; } @@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); - INIT_LIST_HEAD(&sctx->name_cache_list); - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mt_init(&sctx->backref_cache.entries); + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). 
+ */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; @@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->pending_dir_moves = RB_ROOT; - sctx->waiting_dir_moves = RB_ROOT; - sctx->orphan_dirs = RB_ROOT; - sctx->rbtree_new_refs = RB_ROOT; - sctx->rbtree_deleted_refs = RB_ROOT; - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, GFP_KERNEL); @@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8358,11 +8389,12 @@ out: kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); - name_cache_free(sctx); - close_current_inode(sctx); - empty_backref_cache(sctx); + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 433ce221dc5c..581845bc206a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } /* - * Metadata in mixed block goup profiles are accounted in data + * Metadata in mixed block group profiles are accounted in 
data */ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 45615ce36498..8c5efa5813b3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -static struct kobj_type btrfs_raid_ktype = { +static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -static struct kobj_type space_info_ktype = { +static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) complete(&fs_devs->kobj_unregister); } -static struct kobj_type btrfs_ktype = { +static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; @@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) complete(&device->kobj_unregister); } -static struct kobj_type devid_ktype = { +static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, @@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) kfree(kobj); } -static struct kobj_type qgroups_ktype = { +static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, @@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) memset(&qgroup->kobj, 0, sizeof(*kobj)); } -static struct kobj_type qgroup_ktype = { +static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, @@ -2272,36 +2272,23 @@ void 
btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. Call after any changes to incompat/compat_ro flags */ -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set) +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 __maybe_unused features; - int __maybe_unused ret; + int ret; if (!fs_info) return; - /* - * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not - * safe when called from some contexts (eg. balance) - */ - features = get_features(fs_info, set); - ASSERT(bit & supported_feature_masks[set]); - - fs_devs = fs_info->fs_devices; - fsid_kobj = &fs_devs->fsid_kobj; - + fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; - /* - * FIXME: this is too heavy to update just one value, ideally we'd like - * to use sysfs_update_group but some refactoring is needed first. 
- */ - sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); - ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret < 0) + btrfs_warn(fs_info, + "failed to update /sys/fs/btrfs/%pU/features: %d", + fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index bacef43f7267..86c7eef12873 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); int __init btrfs_init_sysfs(void); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 181469fc0bb3..ca09cf9afce8 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -64,7 +64,7 @@ struct inode *btrfs_new_test_inode(void) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - inode_init_owner(&init_user_ns, inode, NULL, S_IFREG); + inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG); return inode; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, 
btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8c52e89688c..18329ebcb1cb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + /* If we have features changed, wake up the cleaner to update sysfs. */ + if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && + fs_info->cleaner_kthread) + wake_up_process(fs_info->cleaner_kthread); + ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, @@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. 
+ */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8..fa728ab80826 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. 
*/ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d43261545264..200cea6e49e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * the walk control struct is used to pass state down the chain when * processing the log tree. 
The stage field tells us which part @@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, bytenr, blocksize); if (ret) { @@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add( trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, bytenr); } } @@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, path->nodes[*level]->start, path->nodes[*level]->len); @@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, path->nodes[*level]->start); } @@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = 
btrfs_pin_reserved_extent(trans, next->start, next->len); if (ret) goto out; btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, next->start); } } @@ -3576,17 +3566,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3644,6 +3636,18 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and force a transaction commit. 
+ */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3693,7 +3697,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3756,7 +3759,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -3780,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3821,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; + } else if (ret > 0) { + ret = 0; } goto done; @@ -3845,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; } else if (ret < 0) { - err = ret; goto done; } @@ -3867,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) + if (ret > 0) { ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. 
*/ - if (ret != 0) goto done; /* @@ -3883,8 +3887,8 @@ search: ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3895,10 +3899,10 @@ search: */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3929,7 +3933,7 @@ done: btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3940,15 +3944,13 @@ done: * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* @@ -4044,7 +4046,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4056,8 +4057,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } @@ -5593,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. 
*/ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { - btrfs_set_log_full_commit(trans); + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -6455,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. */ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58..bdeb5216718f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; @@ -24,8 +29,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index bf9eb693a6a7..c5ff16f9e9fa 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -783,30 +783,25 @@ again: /* * fsverity op that writes a Merkle tree block into the btree. 
* - * @inode: inode to write a Merkle tree block for - * @buf: Merkle tree data block to write - * @index: index of the block in the Merkle tree - * @log_blocksize: log base 2 of the Merkle tree block size - * - * Note that the block size could be different from the page size, so it is not - * safe to assume that index is a page index. + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree block to write + * @pos: the position of the block in the Merkle tree (in bytes) + * @size: the Merkle tree block size (in bytes) * * Returns 0 on success or negative error code on failure */ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - u64 off = index << log_blocksize; - u64 len = 1ULL << log_blocksize; loff_t merkle_pos = merkle_file_pos(inode); if (merkle_pos < 0) return merkle_pos; - if (merkle_pos > inode->i_sb->s_maxbytes - off - len) + if (merkle_pos > inode->i_sb->s_maxbytes - pos - size) return -EFBIG; return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, - off, buf, len); + pos, buf, size); } const struct fsverity_operations btrfs_verityops = { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcfef75b97da..7823168c08a6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -403,6 +403,7 @@ void btrfs_free_device(struct btrfs_device *device) static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, @@ -727,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( /* * Handle the case where the scanned device is part of an fs whose last * metadata UUID change reverted it to the original FSID. 
At the same - * time * fs_devices was first created by another constitutent device + * time fs_devices was first created by another constituent device * which didn't fully observe the operation. This results in an * btrfs_fs_devices created with metadata/fsid different AND * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the @@ -1181,9 +1182,22 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) + if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without the needing device-scan step. Therefore, it can be + * fully freed. + */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); @@ -1600,7 +1614,7 @@ again: if (ret < 0) goto out; - while (1) { + while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -1623,6 +1637,9 @@ again: if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; + if (key.offset > search_end) + break; + if (key.offset > search_start) { hole_size = key.offset - search_start; dev_extent_hole_check(device, &search_start, &hole_size, @@ -1683,6 +1700,7 @@ next: else ret = 0; + ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -6266,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } -/* - * Calculate the geometry of a particular (address, len) tuple. This - * information is used to calculate how big a particular bio can get before it - * straddles a stripe. 
- * - * @fs_info: the filesystem - * @em: mapping containing the logical extent - * @op: type of operation - write or read - * @logical: address that we want to figure out the geometry of - * @io_geom: pointer used to return values - * - * Returns < 0 in case a chunk for the given logical address cannot be found, - * usually shouldn't happen unless @logical is corrupted, 0 otherwise. - */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom) +static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) { - struct map_lookup *map; - u64 len; - u64 offset; - u64 stripe_offset; - u64 stripe_nr; - u32 stripe_len; - u64 raid56_full_stripe_start = (u64)-1; - int data_stripes; + u32 stripe_len = map->stripe_len; ASSERT(op != BTRFS_MAP_DISCARD); - map = em->map_lookup; - /* Offset of this logical address in the chunk */ - offset = logical - em->start; - /* Len of a stripe in a chunk */ - stripe_len = map->stripe_len; /* - * Stripe_nr is where this block falls in - * stripe_offset is the offset of this block in its stripe. + * Stripe_nr is the stripe where this block falls. stripe_offset is + * the offset of this block in its stripe. */ - stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); - ASSERT(stripe_offset < U32_MAX); + *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + ASSERT(*stripe_offset < U32_MAX); - data_stripes = nr_data_stripes(map); + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - /* Only stripe based profiles needs to check against stripe length. 
*/ - if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { - u64 max_len = stripe_len - stripe_offset; + *full_stripe_start = + div64_u64(offset, full_stripe_len) * full_stripe_len; /* - * In case of raid56, we need to know the stripe aligned start + * For writes to RAID56, allow to write a full stripe set, but + * no straddling of stripe sets. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * data_stripes; - raid56_full_stripe_start = offset; - - /* - * Allow a write of a full stripe, but make sure we - * don't allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - - /* - * For writes to RAID[56], allow a full stripeset across - * all disks. For other RAID types and for RAID[56] - * reads, just allow a single stripe (on a single disk). - */ - if (op == BTRFS_MAP_WRITE) { - max_len = stripe_len * data_stripes - - (offset - raid56_full_stripe_start); - } - } - len = min_t(u64, em->len - offset, max_len); - } else { - len = em->len - offset; + if (op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - *full_stripe_start); } - io_geom->len = len; - io_geom->offset = offset; - io_geom->stripe_len = stripe_len; - io_geom->stripe_nr = stripe_nr; - io_geom->stripe_offset = stripe_offset; - io_geom->raid56_stripe_offset = raid56_full_stripe_start; - - return 0; + /* + * For other RAID types and for RAID56 reads, allow a single stripe (on + * a single disk). 
+ */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) + return stripe_len - *stripe_offset; + return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, @@ -6369,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, { struct extent_map *em; struct map_lookup *map; + u64 map_offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6387,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - struct btrfs_io_geometry geom; + u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); @@ -6395,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); - if (ret < 0) - return ret; - map = em->map_lookup; - - *length = geom.len; - stripe_len = geom.stripe_len; - stripe_nr = geom.stripe_nr; - stripe_offset = geom.stripe_offset; - raid56_full_stripe_start = geom.raid56_stripe_offset; data_stripes = nr_data_stripes(map); + stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, + &stripe_offset, &raid56_full_stripe_start); + *length = min_t(u64, em->len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6b7a05f6cf82..7e51f2238f72 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -53,21 +53,6 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; -struct btrfs_io_geometry { - /* remaining bytes before crossing a stripe */ - u64 len; - /* offset of logical address in chunk */ - u64 offset; - /* length of single IO stripe */ - u32 stripe_len; - /* offset of 
address in stripe */ - u32 stripe_offset; - /* number of stripe where address falls */ - u64 stripe_nr; - /* offset of raid56 stripe into the chunk */ - u64 raid56_stripe_offset; -}; - /* * Use sequence counter to get consistent device stat data on * 32-bit processors. @@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 0ed4b119a7ca..0ebeaf4e81f9 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -370,7 +370,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 01a13de11832..da7bb9187b68 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = 
kvmalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f503e8e42d4..f95b2c94d619 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "bio.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { @@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; int ret; if (!*nr_zones) @@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { unsigned int i; + u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return -EIO; /* Populate cache */ - if (zinfo->zone_cache) + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); + } return 0; } @@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); - /* - * We limit max_zone_append_size also by max_segments * - * PAGE_SIZE. Technically, we can have multiple pages per segment. 
But, - * since btrfs adds the pages one by one to a bio, and btrfs cannot - * increase the metadata reservation even if it increases the number of - * extents, it is safe to stick with the limit. - * - * With the zoned emulation, we can have non-zoned device on the zoned - * mode. In this case, we don't have a valid max zone append size. So, - * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. - */ - if (bdev_is_zoned(bdev)) { - zone_info->max_zone_append_size = min_t(u64, - (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); - } else { - zone_info->max_zone_append_size = - (u64)bdev_max_segments(bdev) << PAGE_SHIFT; - } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { + struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; - u64 max_zone_append_size = 0; int ret; /* @@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); + blk_set_stacking_limits(lim); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; @@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) zone_info->zone_size, zone_size); return -EINVAL; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = zone_info->max_zone_append_size; + + /* + * With the zoned emulation, we can have non-zoned device on the + * zoned mode. In this case, we don't have a valid max zone + * append size. 
+ */ + if (bdev_is_zoned(device->bdev)) { + blk_stack_limits(lim, + &bdev_get_queue(device->bdev)->limits, + 0); + } } /* @@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, - fs_info->sectorsize); + /* + * Also limit max_zone_append_size by max_segments * PAGE_SIZE. + * Technically, we can have multiple pages per segment. But, since + * we add the pages one by one to a bio, and cannot increase the + * metadata reservation even if it increases the number of extents, it + * is safe to stick with the limit. + */ + fs_info->max_zone_append_size = ALIGN_DOWN( + min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, + (u64)lim->max_sectors << SECTOR_SHIFT, + (u64)lim->max_segments << PAGE_SHIFT), + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; if (fs_info->max_zone_append_size < fs_info->max_extent_size) fs_info->max_extent_size = fs_info->max_zone_append_size; @@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +bool btrfs_use_zone_append(struct btrfs_bio *bbio) { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; @@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) + return false; + /* * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the * extent layout the relocation code has. 
@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) return ret; } -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio) +void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { + const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; struct btrfs_ordered_extent *ordered; - const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - if (bio_op(bio) != REQ_OP_ZONE_APPEND) - return; - - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); if (WARN_ON(!ordered)) return; ordered->physical = physical; - ordered->bdev = bio->bi_bdev; - btrfs_put_ordered_extent(ordered); } @@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) struct extent_map *em; struct btrfs_ordered_sum *sum; u64 orig_logical = ordered->disk_bytenr; - u64 *logical = NULL; - int nr, stripe_len; + struct map_lookup *map; + u64 physical = ordered->physical; + u64 chunk_start_phys; + u64 logical; - /* Zoned devices should not have partitions. 
So, we can assume it is 0 */ - ASSERT(!bdev_is_partition(ordered->bdev)); - if (WARN_ON(!ordered->bdev)) + em = btrfs_get_chunk_map(fs_info, orig_logical, 1); + if (IS_ERR(em)) return; + map = em->map_lookup; + chunk_start_phys = map->stripes[0].physical; - if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, - ordered->physical, &logical, &nr, - &stripe_len))) - goto out; - - WARN_ON(nr != 1); + if (WARN_ON_ONCE(map->num_stripes > 1) || + WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || + WARN_ON_ONCE(physical < chunk_start_phys) || + WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { + free_extent_map(em); + return; + } + logical = em->start + (physical - map->stripes[0].physical); + free_extent_map(em); - if (orig_logical == *logical) - goto out; + if (orig_logical == logical) + return; - ordered->disk_bytenr = *logical; + ordered->disk_bytenr = logical; em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = *logical; + em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); list_for_each_entry(sum, &ordered->list, list) { - if (*logical < orig_logical) - sum->bytenr -= orig_logical - *logical; + if (logical < orig_logical) + sum->bytenr -= orig_logical - logical; else - sum->bytenr += *logical - orig_logical; + sum->bytenr += logical - orig_logical; } - -out: - kfree(logical); } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - 
map = em->map_lookup; - /* We only support single profile for now */ - device = map->stripes[0].dev; - - free_extent_map(em); - - return device; -} - /* * Activate block group and underlying device zones * diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d80..c0570d35fea2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio); +bool btrfs_use_zone_append(struct btrfs_bio *bbio); +void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, @@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); @@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction 
*trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; } -static inline void btrfs_record_physical_zoned(struct inode *inode, - u64 file_offset, struct bio *bio) +static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } @@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } -static inline struct btrfs_device *btrfs_zoned_get_device( - struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; diff --git a/fs/buffer.c b/fs/buffer.c index d9c6d1fbb6dd..623e77d6ef77 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -48,6 +48,7 @@ #include <linux/sched/mm.h> #include <trace/events/block.h> #include <linux/fscrypt.h> +#include <linux/fsverity.h> #include "internal.h" @@ -295,20 +296,53 @@ still_busy: return; } -struct decrypt_bh_ctx { +struct postprocess_bh_ctx { struct work_struct work; struct buffer_head *bh; }; +static void verify_bh(struct work_struct *work) +{ + struct postprocess_bh_ctx *ctx = + container_of(work, struct postprocess_bh_ctx, work); + struct buffer_head *bh = ctx->bh; + bool valid; + + valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size, + bh_offset(bh)); + end_buffer_async_read(bh, valid); + kfree(ctx); +} + +static bool need_fsverity(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + struct inode *inode = page->mapping->host; + + return fsverity_active(inode) && + /* needed by ext4 */ + page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + static void decrypt_bh(struct work_struct *work) { - struct decrypt_bh_ctx *ctx = - container_of(work, struct decrypt_bh_ctx, work); + struct postprocess_bh_ctx *ctx = + container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; 
int err; - err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size, - bh_offset(bh)); + err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page), + bh->b_size, bh_offset(bh)); + if (err == 0 && need_fsverity(bh)) { + /* + * We use different work queues for decryption and for verity + * because verity may require reading metadata pages that need + * decryption, and we shouldn't recurse to the same workqueue. + */ + INIT_WORK(&ctx->work, verify_bh); + fsverity_enqueue_verify_work(&ctx->work); + return; + } end_buffer_async_read(bh, err == 0); kfree(ctx); } @@ -319,15 +353,24 @@ static void decrypt_bh(struct work_struct *work) */ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { - /* Decrypt if needed */ - if (uptodate && - fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) { - struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + struct inode *inode = bh->b_page->mapping->host; + bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); + bool verify = need_fsverity(bh); + + /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */ + if (uptodate && (decrypt || verify)) { + struct postprocess_bh_ctx *ctx = + kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { - INIT_WORK(&ctx->work, decrypt_bh); ctx->bh = bh; - fscrypt_enqueue_decrypt_work(&ctx->work); + if (decrypt) { + INIT_WORK(&ctx->work, decrypt_bh); + fscrypt_enqueue_decrypt_work(&ctx->work); + } else { + INIT_WORK(&ctx->work, verify_bh); + fsverity_enqueue_verify_work(&ctx->work); + } return; } uptodate = 0; @@ -2245,6 +2288,11 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) int nr, i; int fully_mapped = 1; bool page_error = false; + loff_t limit = i_size_read(inode); + + /* This is needed for ext4. 
*/ + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) + limit = inode->i_sb->s_maxbytes; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); @@ -2253,7 +2301,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) bbits = block_size_bits(blocksize); iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits); - lblock = (i_size_read(inode)+blocksize-1) >> bbits; + lblock = (limit+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index a69073a1d3f0..40052bdb3365 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -138,7 +138,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object) newattrs.ia_size = oi_size & PAGE_MASK; ret = cachefiles_inject_remove_error(); if (ret == 0) - ret = notify_change(&init_user_ns, file->f_path.dentry, + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, &newattrs, NULL); if (ret < 0) goto truncate_failed; @@ -148,7 +148,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object) newattrs.ia_size = ni_size; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = notify_change(&init_user_ns, file->f_path.dentry, + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, &newattrs, NULL); truncate_failed: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 03ca8f2f657a..82219a8f6084 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -130,7 +130,7 @@ retry: goto mkdir_error; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); + ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); @@ -245,7 +245,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, ret = cachefiles_inject_remove_error(); if (ret == 0) { - ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL); + ret = 
vfs_unlink(&nop_mnt_idmap, d_backing_inode(dir), dentry, NULL); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); } @@ -382,10 +382,10 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_idmap = &nop_mnt_idmap, .old_dir = d_inode(dir), .old_dentry = rep, - .new_mnt_userns = &init_user_ns, + .new_mnt_idmap = &nop_mnt_idmap, .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; @@ -451,7 +451,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) { - file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); @@ -714,7 +714,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, ret = cachefiles_inject_read_error(); if (ret == 0) - ret = vfs_link(object->file->f_path.dentry, &init_user_ns, + ret = vfs_link(object->file->f_path.dentry, &nop_mnt_idmap, d_inode(fan), dentry, NULL); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(fan), ret, diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 00b087c14995..bcb6173943ee 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -65,7 +65,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, sizeof(struct cachefiles_xattr) + len, 0); if (ret < 0) { trace_cachefiles_vfs_error(object, file_inode(file), ret, @@ -108,7 +108,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file xlen = cachefiles_inject_read_error(); if (xlen == 0) - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, 
tlen); + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen); if (xlen != tlen) { if (xlen < 0) trace_cachefiles_vfs_error(object, file_inode(file), xlen, @@ -150,7 +150,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, ret = cachefiles_inject_remove_error(); if (ret == 0) - ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); + ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(dentry), ret, cachefiles_trace_remxattr_error); @@ -207,7 +207,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len, 0); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, @@ -249,7 +249,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) xlen = cachefiles_inject_read_error(); if (xlen == 0) - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len); + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len); if (xlen != len) { if (xlen < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index c7e8dd5b58d4..6945a938d396 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,7 +85,7 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; @@ -105,7 +105,7 @@ int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, + ret = posix_acl_update_mode(&nop_mnt_idmap, inode, &new_mode, &acl); if (ret) goto out; 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c74871e37c9..cac4083e387a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -305,7 +305,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; struct page **pages; @@ -313,6 +313,11 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; @@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p idx %lu\n", page, page->index); + if (ceph_inode_is_shutdown(inode)) + return -EIO; + /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req = NULL; - struct ceph_cap_flush *prealloc_cf; + struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; @@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (inline_version == CEPH_INLINE_NONE) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f75ad432f375..7cc20772eac9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -9,6 +9,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> +#include 
<linux/filelock.h> #include "super.h" #include "mds_client.h" @@ -4078,6 +4079,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; + bool close_sessions = false; dout("handle_caps from mds%d\n", session->s_mds); @@ -4215,9 +4217,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, - false, &realm); + if (ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm)) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + goto done; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -4277,6 +4283,11 @@ done_unlocked: iput(inode); out: ceph_put_string(extra_info.pool_ns); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); + return; flush_cap_releases: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6c7026cc8988..0ced8b570e42 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -845,7 +845,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) return PTR_ERR(result); } -static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -905,13 +905,13 @@ out: return err; } -static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return ceph_mknod(mnt_userns, dir, dentry, mode, 0); + return ceph_mknod(idmap, dir, dentry, mode, 0); } -static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_symlink(struct mnt_idmap *idmap, struct 
inode *dir, struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -970,7 +970,7 @@ out: return err; } -static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -1269,7 +1269,7 @@ out: return err; } -static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 764598e1efd9..b5cff85925a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2011,6 +2011,9 @@ static int ceph_zero_partial_object(struct inode *inode, loff_t zero = 0; int op; + if (ceph_inode_is_shutdown(inode)) + return -EIO; + if (!length) { op = offset ? 
CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; length = &zero; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 23d05ec87fcc..8e5f41d45283 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2227,7 +2227,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* * setattr */ -int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -2240,7 +2240,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err != 0) return err; @@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); + err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); return err; } @@ -2397,7 +2397,7 @@ out: * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. 
*/ -int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int err; @@ -2408,7 +2408,7 @@ int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); return err; } @@ -2417,10 +2417,10 @@ static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@ -2431,11 +2431,10 @@ static int statx_to_caps(u32 want, umode_t mode) mask |= CEPH_CAP_LINK_SHARED; } - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED; - if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED; return mask; @@ -2445,7 +2444,7 @@ static int statx_to_caps(u32 want, umode_t mode) * Get all the attributes. If we have sufficient caps for the requested attrs, * then we can avoid talking to the MDS at all. 
*/ -int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -2466,7 +2465,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return err; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = ceph_present_inode(inode); /* @@ -2478,6 +2477,11 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, valid_mask |= STATX_BTIME; } + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = sb->s_dev; else @@ -2519,6 +2523,8 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 9c8dc8a55e7e..cb51c7e9c8e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/filelock.h> #include <linux/ceph/pagelist.h> static u64 lock_secret; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 26a0a8b9975e..27a245d959c0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return ERR_PTR(-EIO); + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); @@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc, int mstate; int mds = session->s_mds; + if 
(READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return -EIO; + /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); dout("open_session to mds%d (%s)\n", mds, @@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc, return; } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { + dout("do_request metadata corrupted\n"); + err = -EIO; + goto finish; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); @@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) u64 tid; int err, result; int mds = session->s_mds; + bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { pr_err("mdsc_handle_reply got corrupt (short) reply\n"); @@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, rinfo->snapblob, + err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); + if (err) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + if (err == -EIO) + ceph_msg_dump(msg); + goto out_err; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -3412,6 +3431,10 @@ out_err: req->r_end_latency, err); out: ceph_mdsc_put_request(req); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } @@ -3662,6 +3685,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_FLUSHMSG: + /* flush cap releases */ + spin_lock(&session->s_cap_lock); + if (session->s_num_cap_releases) + ceph_flush_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + send_flushmsg_ack(mdsc, session, seq); break; @@ -5011,7 +5040,7 @@ 
static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) } /* - * called after sb is ro. + * called after sb is ro or when metadata corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { @@ -5301,7 +5330,8 @@ static void mds_peer_reset(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e4151852184e..87007203f130 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/iversion.h> @@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -884,6 +887,27 @@ fail: if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + /* + * When receiving a corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this kclient and then this kclient + * must be remounted to continue after the corrupted metadata + * fixed in the MDS side. 
+ */ + WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) + pr_err("%s failed to blocklist %s: %d\n", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + + WARN(1, "%s: %s%sdo remount to continue%s", + __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + ret ? "" : " was blocklisted, ", + err == -EIO ? " after corrupted snaptrace is fixed" : ""); + return err; } @@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; + bool close_sessions = false; /* decode */ if (msg->front.iov_len < sizeof(*h)) @@ -1092,8 +1117,12 @@ skip_inode: * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ - ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY, NULL); + if (ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, + NULL)) { + close_sessions = true; + goto bad; + } if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -1112,6 +1141,9 @@ bad: out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ed3be75bb9a..6ecca2c6d137 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -100,6 +100,17 @@ struct ceph_mount_options { char *mon_addr; }; +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, + CEPH_MOUNT_FENCE_IO, +}; + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { @@ -1039,12 +1050,12 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } -extern int ceph_permission(struct user_namespace *mnt_userns, +extern int ceph_permission(struct 
mnt_idmap *idmap, struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); -extern int ceph_setattr(struct user_namespace *mnt_userns, +extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(struct user_namespace *mnt_userns, +extern int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); void ceph_inode_shutdown(struct inode *inode); @@ -1117,7 +1128,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int, bool); -int ceph_set_acl(struct user_namespace *mnt_userns, +int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f31350cda960..f65b07cc33a2 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1285,7 +1285,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index bbf58c2439da..9a2d390bd06f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1674,7 +1674,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } -struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) @@ -1738,7 +1738,7 @@ out: #endif } -int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, 
+int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 10e00c624922..cb7c5460a80b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/init.h> @@ -345,7 +346,7 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) return -EOPNOTSUPP; } -static int cifs_permission(struct user_namespace *mnt_userns, +static int cifs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -361,7 +362,7 @@ static int cifs_permission(struct user_namespace *mnt_userns, on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static struct kmem_cache *cifs_inode_cachep; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 63a0ac2b9355..b58cd737b21e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -49,7 +49,7 @@ extern void cifs_sb_deactive(struct super_block *sb); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct user_namespace *, struct inode *, +extern int cifs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool excl); extern int cifs_atomic_open(struct inode *, struct dentry *, struct file *, unsigned, umode_t); @@ -57,12 +57,12 @@ extern struct dentry *cifs_lookup(struct inode *, struct dentry *, unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int 
cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *, +extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *, +extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename2(struct user_namespace *, struct inode *, +extern int cifs_rename2(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); @@ -72,9 +72,9 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(struct user_namespace *, const struct path *, +extern int cifs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int cifs_setattr(struct user_namespace *, struct dentry *, +extern int cifs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); @@ -124,7 +124,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, struct delayed_call *); -extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, +extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cfdd5bf701a1..cd8171a1c9a0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -26,6 +26,7 @@ #include <uapi/linux/cifs/cifs_mount.h> #include 
"../smbfs_common/smb2pdu.h" #include "smb2pdu.h" +#include <linux/filelock.h> #define SMB_PATH_MAX 260 #define CIFS_PORT 445 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1207b39686fb..b8a47704a6ef 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -225,9 +225,9 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); -extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, +extern struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -extern int cifs_set_acl(struct user_namespace *mnt_userns, +extern int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 23f10e0d6e7e..60dd4e37030a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -15,6 +15,7 @@ /* want to reuse a stale file handle and only the caller knows the file info */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/kernel.h> #include <linux/vfs.h> #include <linux/slab.h> diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ad4208bf1e32..2b6076324ffc 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -529,7 +529,7 @@ out_free_xid: return rc; } -int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_create(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode, bool excl) { int rc; @@ -579,7 +579,7 @@ out_free_xid: return rc; } -int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22dfc1f8b4f1..2870e3b6ffe8 100644 
--- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/backing-dev.h> #include <linux/stat.h> #include <linux/fcntl.h> @@ -3889,7 +3890,7 @@ uncached_fill_pages(struct TCP_Server_Info *server, rdata->got_bytes += result; } - return rdata->got_bytes > 0 && result != -ECONNABORTED ? + return result != -ECONNABORTED && rdata->got_bytes > 0 ? rdata->got_bytes : result; } @@ -4665,7 +4666,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, rdata->got_bytes += result; } - return rdata->got_bytes > 0 && result != -ECONNABORTED ? + return result != -ECONNABORTED && rdata->got_bytes > 0 ? rdata->got_bytes : result; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f145a59af89b..11cdc7cfe0ba 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1910,7 +1910,7 @@ posix_mkdir_get_info: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode) { int rc = 0; @@ -2138,7 +2138,7 @@ do_rename_exit: } int -cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, +cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *source_dentry, struct inode *target_dir, struct dentry *target_dentry, unsigned int flags) { @@ -2496,7 +2496,7 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -2537,7 +2537,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, return rc; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blksize = 
cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2752,7 +2752,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(&init_user_ns, direntry, attrs); + rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); if (rc < 0) goto out; @@ -2859,7 +2859,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } - setattr_copy(&init_user_ns, inode, attrs); + setattr_copy(&nop_mnt_idmap, inode, attrs); mark_inode_dirty(inode); /* force revalidate when any of these times are set since some @@ -2903,7 +2903,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(&init_user_ns, direntry, attrs); + rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); if (rc < 0) goto cifs_setattr_exit; @@ -3058,7 +3058,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } - setattr_copy(&init_user_ns, inode, attrs); + setattr_copy(&nop_mnt_idmap, inode, attrs); mark_inode_dirty(inode); cifs_setattr_exit: @@ -3068,7 +3068,7 @@ cifs_setattr_exit: } int -cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, +cifs_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index a5a097a69983..4510dea77be3 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -569,7 +569,7 @@ cifs_hl_exit: } int -cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, +cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname) { int rc = -EOPNOTSUPP; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 
ba6cc50af390..9f1dd04b555a 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -7,6 +7,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 90789aaa6567..8c816b25ce7c 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1405,6 +1405,7 @@ void smbd_destroy(struct TCP_Server_Info *server) destroy_workqueue(info->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); + server->smbd_conn = NULL; } /* diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5f2fb2fd2e37..50e762fa1a14 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -89,7 +89,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 9be281bbcc06..dd6277d87afb 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -46,12 +46,12 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, +int coda_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(struct user_namespace *, const struct path *, struct kstat *, +int coda_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int coda_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); /* this file: helpers */ char *coda_f2s(struct CodaFid 
*f); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 328d7a684b63..8450b1bd354b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -73,7 +73,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig } -int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, +int coda_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int error; @@ -133,7 +133,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct user_namespace *mnt_userns, struct inode *dir, +static int coda_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *de, umode_t mode, bool excl) { int error; @@ -166,7 +166,7 @@ err_out: return error; } -static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *de, umode_t mode) { struct inode *inode; @@ -228,7 +228,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, } -static int coda_symlink(struct user_namespace *mnt_userns, +static int coda_symlink(struct mnt_idmap *idmap, struct inode *dir_inode, struct dentry *de, const char *symname) { @@ -295,7 +295,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) } /* rename */ -static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int coda_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2185328b65c7..d661e6cf17ac 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -251,16 +251,16 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(struct user_namespace *mnt_userns, const struct path *path, +int coda_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat 
*stat, u32 request_mask, unsigned int flags) { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); return err; } -int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de, +int coda_setattr(struct mnt_idmap *idmap, struct dentry *de, struct iattr *iattr) { struct inode *inode = d_inode(de); diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index cb9fd59a688c..36e35c15561a 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,7 @@ #include "coda_linux.h" /* pioctl ops */ -static int coda_ioctl_permission(struct user_namespace *mnt_userns, +static int coda_ioctl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,7 +41,7 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct user_namespace *mnt_userns, +static int coda_ioctl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return (mask & MAY_EXEC) ? 
-EACCES : 0; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index c0395363eab9..e710a1782382 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -77,7 +77,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); -extern int configfs_setattr(struct user_namespace *mnt_userns, +extern int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); @@ -91,7 +91,7 @@ extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; -extern int configfs_symlink(struct user_namespace *mnt_userns, +extern int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ec6519e1ca3b..4afcbbe63e68 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1251,7 +1251,7 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index b601610e9907..1c15edbe70ff 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -32,7 +32,7 @@ static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; -int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr 
*iattr) { struct inode * inode = d_inode(dentry); @@ -60,7 +60,7 @@ int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* attributes were changed atleast once in past */ - error = simple_setattr(mnt_userns, dentry, iattr); + error = simple_setattr(idmap, dentry, iattr); if (error) return error; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 0623c3edcfb9..69133ec1fac2 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -137,7 +137,7 @@ static int get_target(const char *symname, struct path *path, } -int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int ret; @@ -196,7 +196,7 @@ int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (dentry->d_inode || d_unhashed(dentry)) ret = -EEXIST; else - ret = inode_permission(&init_user_ns, dir, + ret = inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); if (!ret) ret = type->ct_item_ops->allow_link(parent_item, target_item); diff --git a/fs/coredump.c b/fs/coredump.c index de78bde2991b..68619329ec65 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -644,7 +644,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } } else { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL; @@ -722,8 +722,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * a process dumps core while its cwd is e.g. on a vfat * filesystem. 
*/ - mnt_userns = file_mnt_user_ns(cprm.file); - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + idmap = file_mnt_idmap(cprm.file); + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); @@ -736,7 +736,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(mnt_userns, cprm.file->f_path.dentry, + if (do_truncate(idmap, cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; } @@ -838,6 +838,30 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} +EXPORT_SYMBOL(dump_skip); + +#ifdef CONFIG_ELF_CORE static int dump_emit_page(struct coredump_params *cprm, struct page *page) { struct bio_vec bvec = { @@ -871,30 +895,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 1; } -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) -{ - if (cprm->to_skip) { - if (!__dump_skip(cprm, cprm->to_skip)) - return 0; - cprm->to_skip = 0; - } - return __dump_emit(cprm, addr, nr); -} -EXPORT_SYMBOL(dump_emit); - -void dump_skip_to(struct coredump_params *cprm, unsigned long pos) -{ - cprm->to_skip = pos - cprm->pos; -} -EXPORT_SYMBOL(dump_skip_to); - -void dump_skip(struct coredump_params *cprm, size_t nr) -{ - cprm->to_skip += nr; -} -EXPORT_SYMBOL(dump_skip); - -#ifdef CONFIG_ELF_CORE int dump_user_range(struct coredump_params *cprm, unsigned long 
start, unsigned long len) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 1b4403136d05..d57d0a020f71 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -30,13 +30,11 @@ */ bool fscrypt_decrypt_bio(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, - bv->bv_offset); + bio_for_each_folio_all(fi, bio) { + int err = fscrypt_decrypt_pagecache_blocks(fi.folio, fi.length, + fi.offset); if (err) { bio->bi_status = errno_to_blk_status(err); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index e78be66bbf01..bf642479269a 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -237,41 +237,43 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a - * pagecache page - * @page: The locked pagecache page containing the block(s) to decrypt + * pagecache folio + * @folio: The locked pagecache folio containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. - * @offs: Byte offset within @page of the first block to decrypt. Must be + * @offs: Byte offset within @folio of the first block to decrypt. Must be * a multiple of the filesystem's block size. * - * The specified block(s) are decrypted in-place within the pagecache page, - * which must still be locked and not uptodate. Normally, blocksize == - * PAGE_SIZE and the whole page is decrypted at once. + * The specified block(s) are decrypted in-place within the pagecache folio, + * which must still be locked and not uptodate. * * This is for use by the filesystem's ->readahead() method. 
* * Return: 0 on success; -errno on failure */ -int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs) +int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, + size_t offs) { - const struct inode *inode = page->mapping->host; + const struct inode *inode = folio->mapping->host; const unsigned int blockbits = inode->i_blkbits; const unsigned int blocksize = 1 << blockbits; - u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + + u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) + (offs >> blockbits); - unsigned int i; + size_t i; int err; - if (WARN_ON_ONCE(!PageLocked(page))) + if (WARN_ON_ONCE(!folio_test_locked(folio))) return -EINVAL; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) return -EINVAL; for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + struct page *page = folio_page(folio, i >> PAGE_SHIFT); + err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, - page, blocksize, i, GFP_NOFS); + page, blocksize, i & ~PAGE_MASK, + GFP_NOFS); if (err) return err; } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 316a778cec0f..0fec2dfc36eb 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -573,6 +573,9 @@ fscrypt_find_master_key(struct super_block *sb, int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); @@ -651,6 +654,7 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); +const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union 
fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 78dd2ff306bd..78086f8dbda5 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -211,10 +211,6 @@ static int allocate_filesystem_keyring(struct super_block *sb) * are still available at this time; this is important because after user file * accesses have been allowed, this function may need to evict keys from the * keyslots of an inline crypto engine, which requires the block device(s). - * - * This is also called when the super_block is being freed. This is needed to - * avoid a memory leak if mounting fails after the "test_dummy_encryption" - * option was processed, as in that case the unmount-time call isn't made. */ void fscrypt_destroy_keyring(struct super_block *sb) { @@ -778,34 +774,26 @@ out: /** * fscrypt_add_test_dummy_key() - add the test dummy encryption key * @sb: the filesystem instance to add the key to - * @dummy_policy: the encryption policy for test_dummy_encryption + * @key_spec: the key specifier of the test dummy encryption key * - * If needed, add the key for the test_dummy_encryption mount option to the - * filesystem. To prevent misuse of this mount option, a per-boot random key is - * used instead of a hardcoded one. This makes it so that any encrypted files - * created using this option won't be accessible after a reboot. + * Add the key for the test_dummy_encryption mount option to the filesystem. To + * prevent misuse of this mount option, a per-boot random key is used instead of + * a hardcoded one. This makes it so that any encrypted files created using + * this option won't be accessible after a reboot. 
* * Return: 0 on success, -errno on failure */ int fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy) + struct fscrypt_key_specifier *key_spec) { - const union fscrypt_policy *policy = dummy_policy->policy; - struct fscrypt_key_specifier key_spec; struct fscrypt_master_key_secret secret; int err; - if (!policy) - return 0; - err = fscrypt_policy_to_key_spec(policy, &key_spec); - if (err) - return err; fscrypt_get_test_dummy_secret(&secret); - err = add_master_key(sb, &secret, &key_spec); + err = add_master_key(sb, &secret, key_spec); wipe_master_key_secret(&secret); return err; } -EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key); /* * Verify that the current user has added a master key with the given identifier diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 94757ccd3056..aa94fba9d17e 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -438,6 +438,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { + struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; @@ -450,8 +451,26 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); - if (!mk) { + mk = fscrypt_find_master_key(sb, &mk_spec); + if (unlikely(!mk)) { + const union fscrypt_policy *dummy_policy = + fscrypt_get_dummy_policy(sb); + + /* + * Add the test_dummy_encryption key on-demand. In principle, + * it should be added at mount time. Do it here instead so that + * the individual filesystems don't need to worry about adding + * this key at mount time and cleaning up on mount failure. 
+ */ + if (dummy_policy && + fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { + err = fscrypt_add_test_dummy_key(sb, &mk_spec); + if (err) + return err; + mk = fscrypt_find_master_key(sb, &mk_spec); + } + } + if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 893661b52376..3b5fcb6402ea 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -53,8 +53,7 @@ int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, } } -static const union fscrypt_policy * -fscrypt_get_dummy_policy(struct super_block *sb) +const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb) { if (!sb->s_cop->get_dummy_policy) return NULL; @@ -506,7 +505,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) return -EFAULT; policy.version = version; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -1271,8 +1271,9 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - ret = copy_mc_to_kernel(daddr, saddr, length); - if (ret) + if (copy_mc_to_kernel(daddr, saddr, length) == 0) + ret = length; + else ret = -EIO; out_unlock: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2e8e112b1993..bf397f6a6a33 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -42,7 +42,7 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. 
*/ -static int debugfs_setattr(struct user_namespace *mnt_userns, +static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; @@ -52,7 +52,7 @@ static int debugfs_setattr(struct user_namespace *mnt_userns, if (ret) return ret; } - return simple_setattr(&init_user_ns, dentry, ia); + return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { @@ -837,7 +837,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, take_dentry_name_snapshot(&old_name, old_dentry); - error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry, + error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1105ce3c80cb..b3b86dbdc187 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,7 +4,6 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP - select SRCU help A general purpose distributed lock manager for kernel or userspace applications. 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d0b4e2181a5f..9f344d76afa3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -381,23 +381,23 @@ static int threads_start(void) { int error; - error = dlm_scand_start(); + /* Thread for sending/receiving messages for all lockspace's */ + error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm_scand thread %d", error); + log_print("cannot start dlm midcomms %d", error); goto fail; } - /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_midcomms_start(); + error = dlm_scand_start(); if (error) { - log_print("cannot start dlm midcomms %d", error); - goto scand_fail; + log_print("cannot start dlm_scand thread %d", error); + goto midcomms_fail; } return 0; - scand_fail: - dlm_scand_stop(); + midcomms_fail: + dlm_midcomms_stop(); fail: return error; } @@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rcom_spin); get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); ls->ls_recover_status = 0; - ls->ls_recover_seq = 0; + ls->ls_recover_seq = get_random_u64(); ls->ls_recover_args = NULL; init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); @@ -820,6 +820,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) return rv; } + if (ls_count == 1) + dlm_midcomms_version_wait(); + dlm_device_deregister(ls); if (force < 3 && dlm_user_daemon_available()) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4450721ec83c..61cd6c2628fa 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -61,6 +61,7 @@ #include "memory.h" #include "config.h" +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) #define NEEDED_RMEM (4*1024*1024) struct connection { @@ -99,6 +100,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* receive worker */ struct work_struct swork; /* send worker */ + wait_queue_head_t shutdown_wait; unsigned char 
rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; @@ -282,6 +284,7 @@ static void dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); spin_lock_init(&con->addrs_lock); + init_waitqueue_head(&con->shutdown_wait); } /* @@ -790,6 +793,43 @@ static void close_connection(struct connection *con, bool and_other) up_write(&con->sock_lock); } +static void shutdown_connection(struct connection *con, bool and_other) +{ + int ret; + + if (con->othercon && and_other) + shutdown_connection(con->othercon, false); + + flush_workqueue(io_workqueue); + down_read(&con->sock_lock); + /* nothing to shutdown */ + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + up_read(&con->sock_lock); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, !con->sock, + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + close_connection(con, false); +} + static struct processqueue_entry *new_processqueue_entry(int nodeid, int buflen) { @@ -1488,6 +1528,7 @@ static void process_recv_sockets(struct work_struct *work) break; case DLM_IO_EOF: close_connection(con, false); + wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; case DLM_IO_RESCHED: @@ -1695,6 +1736,9 @@ static int work_start(void) void dlm_lowcomms_shutdown(void) { + struct connection *con; + int i, idx; + /* stop lowcomms_listen_data_ready calls */ lock_sock(listen_con.sock->sk); listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; @@ -1703,29 +1747,20 @@ void dlm_lowcomms_shutdown(void) cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); - flush_workqueue(process_workqueue); -} - -void 
dlm_lowcomms_shutdown_node(int nodeid, bool force) -{ - struct connection *con; - int idx; - idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, 0); - if (WARN_ON_ONCE(!con)) { - srcu_read_unlock(&connections_srcu, idx); - return; - } + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + shutdown_connection(con, true); + stop_connection_io(con); + flush_workqueue(process_workqueue); + close_connection(con, true); - flush_work(&con->swork); - stop_connection_io(con); - WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); - close_connection(con, true); - clean_one_writequeue(con); - if (con->othercon) - clean_one_writequeue(con->othercon); - allow_connection_io(con); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + } + } srcu_read_unlock(&connections_srcu, idx); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index eb7a08641fcf..cdbaa452fc05 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -51,7 +51,7 @@ int __init dlm_memory_init(void) cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), __alignof__(struct dlm_callback), 0, NULL); - if (!rsb_cache) + if (!cb_cache) goto cb; return 0; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fc015a6abe17..c02c43e4980a 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -146,8 +146,8 @@ /* init value for sequence numbers for testing purpose only e.g. 
overflows */ #define DLM_SEQ_INIT 0 -/* 3 minutes wait to sync ending of dlm */ -#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +/* 5 seconds wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000) #define DLM_VERSION_NOT_SET 0 struct midcomms_node { @@ -375,7 +375,7 @@ static int dlm_send_ack(int nodeid, uint32_t seq) struct dlm_msg *msg; char *ppc; - msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc, NULL, NULL); if (!msg) return -ENOMEM; @@ -402,10 +402,11 @@ static int dlm_send_fin(struct midcomms_node *node, struct dlm_mhandle *mh; char *ppc; - mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc); if (!mh) return -ENOMEM; + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); mh->ack_rcv = ack_rcv; m_header = (struct dlm_header *)ppc; @@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node, pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh, NULL, 0); - set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); return 0; } @@ -467,7 +467,7 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -498,18 +498,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, switch (p->header.h_cmd) { case DLM_FIN: - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); - spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: - node->state = DLM_CLOSE_WAIT; - pr_debug("switch node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); + dlm_send_ack(node->nodeid, node->seq_next); + /* passive shutdown 
DLM_LAST_ACK case 1 * additional we check if the node is used by * cluster manager events at all. @@ -518,34 +514,38 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 1\n", node->nodeid, dlm_state_str(node->state)); - spin_unlock(&node->state_lock); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + } else { + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); } break; case DLM_FIN_WAIT1: + dlm_send_ack(node->nodeid, node->seq_next); node->state = DLM_CLOSING; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: + dlm_send_ack(node->nodeid, node->seq_next); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_LAST_ACK: /* probably remove_member caught it, do nothing */ break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); - - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); @@ -564,12 +564,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", seq, node->seq_next, node->nodeid); } - - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); } static struct midcomms_node * @@ -612,16 +606,8 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, case DLM_ESTABLISHED: break; default: - /* some invalid state passive shutdown - * was failed, we try to 
reset and - * hope it will go on. - */ - log_print("reset node %d because shutdown stuck", - node->nodeid); - - midcomms_node_reset(node); - node->state = DLM_ESTABLISHED; - break; + spin_unlock(&node->state_lock); + return NULL; } spin_unlock(&node->state_lock); } @@ -671,6 +657,7 @@ static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, node->nodeid); break; @@ -840,6 +827,7 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, node->nodeid); break; @@ -1214,8 +1202,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: + /* held rcu read lock here, because we sending the + * dlm message out, when we do that we could receive + * an ack back which releases the mhandle and we + * get a use after free. 
+ */ + rcu_read_lock(); dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); + rcu_read_unlock(); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1266,7 +1261,6 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_CLOSED: /* not valid but somehow we got what we want */ @@ -1274,7 +1268,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -1362,11 +1356,11 @@ void dlm_midcomms_remove_member(int nodeid) case DLM_CLOSE_WAIT: /* passive shutdown DLM_LAST_ACK case 2 */ node->state = DLM_LAST_ACK; - spin_unlock(&node->state_lock); - pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + break; case DLM_LAST_ACK: /* probably receive fin caught it, do nothing */ break; @@ -1374,7 +1368,7 @@ void dlm_midcomms_remove_member(int nodeid) /* already gone, do nothing */ break; default: - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); break; } @@ -1382,12 +1376,6 @@ void dlm_midcomms_remove_member(int nodeid) spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); - srcu_read_unlock(&nodes_srcu, idx); } static void midcomms_node_release(struct rcu_head *rcu) @@ -1395,9 +1383,31 @@ static void midcomms_node_release(struct rcu_head *rcu) struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); 
WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); kfree(node); } +void dlm_midcomms_version_wait(void) +{ + struct midcomms_node *node; + int i, idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + ret = wait_event_timeout(node->shutdown_wait, + node->version != DLM_VERSION_NOT_SET || + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + pr_debug("version wait timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + } + } + srcu_read_unlock(&nodes_srcu, idx); +} + static void midcomms_shutdown(struct midcomms_node *node) { int ret; @@ -1418,11 +1428,11 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state = DLM_FIN_WAIT1; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); + dlm_send_fin(node, dlm_act_fin_ack_rcv); break; case DLM_CLOSED: /* we have what we want */ - spin_unlock(&node->state_lock); - return; + break; default: /* busy to enter DLM_FIN_WAIT1, wait until passive * done in shutdown_wait to enter DLM_CLOSED. 
@@ -1431,29 +1441,20 @@ static void midcomms_shutdown(struct midcomms_node *node) } spin_unlock(&node->state_lock); - if (node->state == DLM_FIN_WAIT1) { - dlm_send_fin(node, dlm_act_fin_ack_rcv); - - if (DLM_DEBUG_FENCE_TERMINATION) - msleep(5000); - } + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); /* wait for other side dlm + fin */ ret = wait_event_timeout(node->shutdown_wait, node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); - midcomms_node_reset(node); - dlm_lowcomms_shutdown_node(node->nodeid, true); - return; - } - - pr_debug("active shutdown done for node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - dlm_lowcomms_shutdown_node(node->nodeid, false); + else + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); } void dlm_midcomms_shutdown(void) @@ -1461,8 +1462,6 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; - dlm_lowcomms_shutdown(); - mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1480,6 +1479,8 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index bea1cee4279c..9f8c9605013d 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); void 
dlm_midcomms_stop(void); diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 737f185aad8d..ed4357e62f35 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -4,6 +4,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/poll.h> #include <linux/dlm.h> diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e3f5d7f3c8a0..bd3f3c755b24 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1105,7 +1105,7 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, } inode_lock(lower_inode); - rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + rc = __vfs_setxattr(&nop_mnt_idmap, lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, page_virt, size, 0); if (!rc && ecryptfs_inode) fsstack_copy_attr_all(ecryptfs_inode, lower_inode); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f3cd00fac9c3..144ace9e0dd9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -139,7 +139,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry, + rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); } if (rc) { @@ -180,7 +180,7 @@ ecryptfs_do_create(struct inode *directory_inode, rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_create(&init_user_ns, lower_dir, + rc = vfs_create(&nop_mnt_idmap, lower_dir, lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " @@ -191,7 +191,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL); + vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, lower_dir); @@ -253,7 +253,7 @@ out: * Returns zero on success; non-zero on error 
condition */ static int -ecryptfs_create(struct user_namespace *mnt_userns, +ecryptfs_create(struct mnt_idmap *idmap, struct inode *directory_inode, struct dentry *ecryptfs_dentry, umode_t mode, bool excl) { @@ -434,7 +434,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); if (!rc) - rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir, + rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; @@ -456,7 +456,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } -static int ecryptfs_symlink(struct user_namespace *mnt_userns, +static int ecryptfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -478,7 +478,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry, + rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -495,7 +495,7 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int rc; @@ -504,7 +504,7 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_mkdir(&init_user_ns, lower_dir, + rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, lower_dentry, mode); if (rc || d_really_is_negative(lower_dentry)) goto out; @@ -533,7 +533,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (d_unhashed(lower_dentry)) rc = -EINVAL; 
else - rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry); + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); } if (!rc) { clear_nlink(d_inode(dentry)); @@ -548,7 +548,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int rc; @@ -557,7 +557,7 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_mknod(&init_user_ns, lower_dir, + rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; @@ -574,7 +574,7 @@ out: } static int -ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -616,10 +616,10 @@ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out_lock; } - rd.old_mnt_userns = &init_user_ns; + rd.old_mnt_idmap = &nop_mnt_idmap; rd.old_dir = d_inode(lower_old_dir_dentry); rd.old_dentry = lower_old_dentry; - rd.new_mnt_userns = &init_user_ns; + rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_dir = d_inode(lower_new_dir_dentry); rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); @@ -856,7 +856,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); inode_lock(d_inode(lower_dentry)); - rc = notify_change(&init_user_ns, lower_dentry, + rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); } @@ -864,16 +864,16 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) } static int -ecryptfs_permission(struct user_namespace *mnt_userns, struct inode 
*inode, +ecryptfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { - return inode_permission(&init_user_ns, + return inode_permission(&nop_mnt_idmap, ecryptfs_inode_to_lower(inode), mask); } /** * ecryptfs_setattr - * @mnt_userns: user namespace of the target mount + * @idmap: idmap of the target mount * @dentry: dentry handle to the inode to modify * @ia: Structure with flags of what to change and values * @@ -884,7 +884,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, * All other metadata changes will be passed right to the lower filesystem, * and we will just update our inode to look like the lower. */ -static int ecryptfs_setattr(struct user_namespace *mnt_userns, +static int ecryptfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int rc = 0; @@ -939,7 +939,7 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns, } mutex_unlock(&crypt_stat->cs_mutex); - rc = setattr_prepare(&init_user_ns, dentry, ia); + rc = setattr_prepare(&nop_mnt_idmap, dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { @@ -965,14 +965,14 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns, lower_ia.ia_valid &= ~ATTR_MODE; inode_lock(d_inode(lower_dentry)); - rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL); + rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; } -static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, +static int ecryptfs_getattr_link(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), 
stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -998,7 +998,7 @@ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, return rc; } -static int ecryptfs_getattr(struct user_namespace *mnt_userns, +static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1011,7 +1011,7 @@ static int ecryptfs_getattr(struct user_namespace *mnt_userns, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1033,7 +1033,7 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL); + rc = __vfs_setxattr_locked(&nop_mnt_idmap, lower_dentry, name, value, size, flags, NULL); inode_unlock(lower_inode); if (!rc && inode) fsstack_copy_attr_all(inode, lower_inode); @@ -1099,7 +1099,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_removexattr(&init_user_ns, lower_dentry, name); + rc = __vfs_removexattr(&nop_mnt_idmap, lower_dentry, name); inode_unlock(lower_inode); out: return rc; @@ -1110,26 +1110,26 @@ static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa); } -static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, +static int ecryptfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc; - rc = vfs_fileattr_set(&init_user_ns, lower_dentry, fa); + rc = vfs_fileattr_set(&nop_mnt_idmap, lower_dentry, fa); 
fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); return rc; } -static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, +static struct posix_acl *ecryptfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { - return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + return vfs_get_acl(idmap, ecryptfs_dentry_to_lower(dentry), posix_acl_xattr_name(type)); } -static int ecryptfs_set_acl(struct user_namespace *mnt_userns, +static int ecryptfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { @@ -1137,7 +1137,7 @@ static int ecryptfs_set_acl(struct user_namespace *mnt_userns, struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); struct inode *lower_inode = d_inode(lower_dentry); - rc = vfs_set_acl(&init_user_ns, lower_dentry, + rc = vfs_set_acl(&nop_mnt_idmap, lower_dentry, posix_acl_xattr_name(type), acl); if (!rc) fsstack_copy_attr_all(d_inode(dentry), lower_inode); @@ -1190,7 +1190,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler, } static int ecryptfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 19af229eb7ca..373c3e5747e6 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -428,7 +428,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + rc = __vfs_setxattr(&nop_mnt_idmap, lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 617f3ad2485e..b973a2c03dde 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -70,7 
+70,7 @@ bool efivarfs_valid_name(const char *str, int len) return uuid_is_valid(s); } -static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = NULL; @@ -163,7 +163,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int -efivarfs_fileattr_set(struct user_namespace *mnt_userns, +efivarfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { unsigned int i_flags = 0; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 85490370e0ca..704fb59577e0 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -108,3 +108,21 @@ config EROFS_FS_ONDEMAND read support. If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD + bool "EROFS per-cpu decompression kthread workers" + depends on EROFS_FS_ZIP + help + Saying Y here enables per-CPU kthread workers pool to carry out + async decompression for low latencies on some architectures. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD_HIPRI + bool "EROFS high priority per-CPU kthread workers" + depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD + help + This permits EROFS to configure per-CPU kthread workers to run + at higher priority. + + If unsure, say N. 
diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..032e12dccb84 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -74,8 +74,7 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, } static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map) { erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; @@ -91,11 +90,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode, map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; } else if (tailendpacking) { - /* 2 - inode inline B: inode, [xattrs], inline last blk... */ - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(offset); map->m_plen = inode->i_size - offset; /* inline data should be located in the same meta block */ @@ -117,8 +113,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return 0; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); @@ -130,7 +125,7 @@ int erofs_map_blocks(struct inode *inode, void *kaddr; int err = 0; - trace_erofs_map_blocks_enter(inode, map, flags); + trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ @@ -140,7 +135,7 @@ int erofs_map_blocks(struct inode *inode, } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map, flags); + err = erofs_map_blocks_flatmode(inode, map); goto out; } @@ -150,7 +145,7 @@ int erofs_map_blocks(struct inode *inode, unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = 
map->m_la >> vi->chunkbits; - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); @@ -192,7 +187,7 @@ out_unlock: out: if (!err) map->m_llen = map->m_plen; - trace_erofs_map_blocks_exit(inode, map, flags, 0); + trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -255,7 +250,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_la = offset; map.m_llen = length; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index ecf28f66b97d..6970b09b8307 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -6,21 +6,6 @@ */ #include "internal.h" -static void debug_one_dentry(unsigned char d_type, const char *de_name, - unsigned int de_namelen) -{ -#ifdef CONFIG_EROFS_FS_DEBUG - /* since the on-disk name could not have the trailing '\0' */ - unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; - - memcpy(dbg_namebuf, de_name, de_namelen); - dbg_namebuf[de_namelen] = '\0'; - - erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, - de_namelen, d_type); -#endif -} - static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) @@ -52,10 +37,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, return -EFSCORRUPTED; } - debug_one_dentry(d_type, de_name, de_namelen); if (!dir_emit(ctx, de_name, de_namelen, le64_to_cpu(de->nid), d_type)) - /* stopped by some reason */ return 1; ++de; ctx->pos += sizeof(struct erofs_dirent); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 014e20962376..96a87c023128 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -9,6 +9,7 @@ static DEFINE_MUTEX(erofs_domain_list_lock); 
static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); +static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { @@ -164,18 +165,8 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct super_block *sb = folio_mapping(folio)->host->i_sb; + struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; struct erofs_fscache_request *req; - struct erofs_map_dev mdev = { - .m_deviceid = 0, - .m_pa = folio_pos(folio), - }; - - ret = erofs_map_dev(sb, &mdev); - if (ret) { - folio_unlock(folio); - return ret; - } req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); @@ -184,8 +175,8 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return PTR_ERR(req); } - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa, folio_size(folio)); + ret = erofs_fscache_read_folios_async(ctx->cookie, req, + folio_pos(folio), folio_size(folio)); if (ret) req->error = ret; @@ -207,7 +198,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) int ret; map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret) return ret; @@ -328,8 +319,6 @@ const struct address_space_operations erofs_fscache_access_aops = { static void erofs_fscache_domain_put(struct erofs_domain *domain) { - if (!domain) - return; mutex_lock(&erofs_domain_list_lock); if (refcount_dec_and_test(&domain->ref)) { list_del(&domain->list); @@ -337,8 +326,8 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) kern_unmount(erofs_pseudo_mnt); erofs_pseudo_mnt = NULL; } - mutex_unlock(&erofs_domain_list_lock); fscache_relinquish_volume(domain->volume, NULL, false); + mutex_unlock(&erofs_domain_list_lock); 
kfree(domain->domain_id); kfree(domain); return; @@ -431,19 +420,21 @@ static int erofs_fscache_register_domain(struct super_block *sb) return err; } -static -struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -452,32 +443,32 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, ret = -EINVAL; goto err; } - fscache_use_cookie(cookie, false); - ctx->cookie = cookie; - - if (flags & EROFS_REG_COOKIE_NEED_INODE) { - struct inode *const inode = new_inode(sb); - - if (!inode) { - erofs_err(sb, "failed to get anon inode for %s", name); - ret = -ENOMEM; - goto err_cookie; - } - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - - ctx->inode = inode; + /* + * Allocate anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? 
erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; } + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; return ctx; err_cookie: - fscache_unuse_cookie(ctx->cookie, NULL, NULL); - fscache_relinquish_cookie(ctx->cookie, false); + fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, false); err: kfree(ctx); return ERR_PTR(ret); @@ -492,13 +483,9 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) kfree(ctx); } -static -struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) { - int err; - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; @@ -508,55 +495,38 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, ctx->name = kstrdup(name, GFP_KERNEL); if (!ctx->name) { - err = -ENOMEM; - goto out; - } - - inode = new_inode(erofs_pseudo_mnt->mnt_sb); - if (!inode) { - err = -ENOMEM; - goto out; + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); } - ctx->domain = domain; - ctx->anon_inode = inode; - inode->i_private = ctx; refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); return ctx; -out: - erofs_fscache_relinquish_cookie(ctx); - return ERR_PTR(err); } -static -struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) { - struct inode *inode; struct erofs_fscache *ctx; struct 
erofs_domain *domain = EROFS_SB(sb)->domain; - struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + flags |= EROFS_REG_COOKIE_SHARE; mutex_lock(&erofs_domain_cookies_lock); - spin_lock(&psb->s_inode_list_lock); - list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { - ctx = inode->i_private; - if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) continue; if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { - igrab(inode); + refcount_inc(&ctx->ref); } else { erofs_err(sb, "%s already exists in domain %s", name, domain->domain_id); ctx = ERR_PTR(-EEXIST); } - spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } - spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, flags); + ctx = erofs_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } @@ -572,23 +542,22 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) { - bool drop; - struct erofs_domain *domain; + struct erofs_domain *domain = NULL; if (!ctx) return; - domain = ctx->domain; - if (domain) { - mutex_lock(&erofs_domain_cookies_lock); - drop = atomic_read(&ctx->anon_inode->i_count) == 1; - iput(ctx->anon_inode); - mutex_unlock(&erofs_domain_cookies_lock); - if (!drop) - return; - } + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_domain_put(domain); + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) @@ -596,7 +565,7 @@ int 
erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; - unsigned int flags; + unsigned int flags = 0; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -615,7 +584,6 @@ int erofs_fscache_register_fs(struct super_block *sb) * * Acquired domain/volume will be relinquished in kill_sb() on error. */ - flags = EROFS_REG_COOKIE_NEED_INODE; if (sbi->domain_id) flags |= EROFS_REG_COOKIE_NEED_NOEXIST; fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d3b8736fa124..4be7dda3cd24 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -14,7 +14,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = iloc(sbi, vi->nid); + const erofs_off_t inode_loc = erofs_iloc(inode); erofs_blk_t blkaddr, nblks = 0; void *kaddr; @@ -308,52 +308,54 @@ out_unlock: } /* - * erofs nid is 64bits, but i_ino is 'unsigned long', therefore - * we should do more for 32-bit platform to find the right inode. + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
*/ -static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +static ino_t erofs_squash_ino(erofs_nid_t nid) { - const erofs_nid_t nid = *(erofs_nid_t *)opaque; + ino_t ino = (ino_t)nid; + + if (sizeof(ino_t) < sizeof(erofs_nid_t)) + ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return ino; +} - return EROFS_I(inode)->nid == nid; +static int erofs_iget5_eq(struct inode *inode, void *opaque) +{ + return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque; } -static int erofs_iget_set_actor(struct inode *inode, void *opaque) +static int erofs_iget5_set(struct inode *inode, void *opaque) { const erofs_nid_t nid = *(erofs_nid_t *)opaque; - inode->i_ino = erofs_inode_hash(nid); + inode->i_ino = erofs_squash_ino(nid); + EROFS_I(inode)->nid = nid; return 0; } struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { - const unsigned long hashval = erofs_inode_hash(nid); struct inode *inode; - inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, - erofs_iget_set_actor, &nid); + inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq, + erofs_iget5_set, &nid); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - int err; - struct erofs_inode *vi = EROFS_I(inode); - - vi->nid = nid; + int err = erofs_fill_inode(inode); - err = erofs_fill_inode(inode); - if (!err) { - unlock_new_inode(inode); - } else { + if (err) { iget_failed(inode); - inode = ERR_PTR(err); + return ERR_PTR(err); } + unlock_new_inode(inode); } return inode; } -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -366,7 +368,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } diff --git 
a/fs/erofs/internal.h b/fs/erofs/internal.h index bb8501c0ff5b..3f3561d37d1b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -12,7 +12,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -108,9 +107,12 @@ struct erofs_domain { struct erofs_fscache { struct fscache_cookie *cookie; - struct inode *inode; - struct inode *anon_inode; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ struct erofs_domain *domain; + struct list_head node; + refcount_t ref; char *name; }; @@ -271,11 +273,6 @@ struct erofs_buf { #define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) #define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) -static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) -{ - return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); -} - #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ { \ @@ -340,13 +337,14 @@ struct erofs_inode { struct inode vfs_inode; }; -#define EROFS_I(ptr) \ - container_of(ptr, struct erofs_inode, vfs_inode) +#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) -static inline unsigned long erofs_inode_datablocks(struct inode *inode) +static inline erofs_off_t erofs_iloc(struct inode *inode) { - /* since i_size cannot be changed */ - return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + return blknr_to_addr(sbi->meta_blkaddr) + + (EROFS_I(inode)->nid << sbi->islotbits); } static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, @@ -382,31 +380,18 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); } -extern const struct super_operations erofs_sops; -extern struct file_system_type erofs_fs_type; - 
-extern const struct address_space_operations erofs_raw_access_aops; -extern const struct address_space_operations z_erofs_aops; - -enum { - BH_Encoded = BH_PrivateStart, - BH_FullMapped, - BH_Fragment, - BH_Partialref, -}; - /* Has a disk mapping */ -#define EROFS_MAP_MAPPED (1 << BH_Mapped) +#define EROFS_MAP_MAPPED 0x0001 /* Located in metadata (could be copied from bd_inode) */ -#define EROFS_MAP_META (1 << BH_Meta) +#define EROFS_MAP_META 0x0002 /* The extent is encoded */ -#define EROFS_MAP_ENCODED (1 << BH_Encoded) +#define EROFS_MAP_ENCODED 0x0004 /* The length of extent is full */ -#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +#define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +#define EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ -#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) +#define EROFS_MAP_PARTIAL_REF 0x0020 struct erofs_map_blocks { struct erofs_buf buf; @@ -419,17 +404,15 @@ struct erofs_map_blocks { unsigned int m_flags; }; -/* Flags used by erofs_map_blocks_flatmode() */ -#define EROFS_GET_BLOCKS_RAW 0x0001 /* * Used to get the exact decompressed length, e.g. fiemap (consider lookback * approach instead if possible since it's more metadata lightweight.) 
*/ -#define EROFS_GET_BLOCKS_FIEMAP 0x0002 +#define EROFS_GET_BLOCKS_FIEMAP 0x0001 /* Used to map the whole extent if non-negligible data is requested for LZMA */ -#define EROFS_GET_BLOCKS_READMORE 0x0004 +#define EROFS_GET_BLOCKS_READMORE 0x0002 /* Used to map tail extent for tailpacking inline or fragment pcluster */ -#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 +#define EROFS_GET_BLOCKS_FINDTAIL 0x0004 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -437,24 +420,6 @@ enum { Z_EROFS_COMPRESSION_RUNTIME_MAX }; -/* zmap.c */ -extern const struct iomap_ops z_erofs_iomap_report_ops; - -#ifdef CONFIG_EROFS_FS_ZIP -int z_erofs_fill_inode(struct inode *inode); -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags); -#else -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } -static inline int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) -{ - return -EOPNOTSUPP; -} -#endif /* !CONFIG_EROFS_FS_ZIP */ - struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; @@ -465,8 +430,27 @@ struct erofs_map_dev { unsigned int m_deviceid; }; -/* data.c */ +extern struct file_system_type erofs_fs_type; +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations z_erofs_aops; +extern const struct address_space_operations erofs_fscache_access_aops; + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; +extern const struct inode_operations erofs_dir_iops; + extern const struct file_operations erofs_file_fops; +extern const struct file_operations erofs_dir_fops; + +extern const struct iomap_ops z_erofs_iomap_report_ops; + +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 
0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); void *erofs_bread(struct erofs_buf *buf, struct inode *inode, @@ -476,37 +460,14 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags); - -/* inode.c */ -static inline unsigned long erofs_inode_hash(erofs_nid_t nid) -{ -#if BITS_PER_LONG == 32 - return (nid >> 32) ^ (nid & 0xffffffff); -#else - return nid; -#endif -} - -extern const struct inode_operations erofs_generic_iops; -extern const struct inode_operations erofs_symlink_iops; -extern const struct inode_operations erofs_fast_symlink_iops; - +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); - -/* namei.c */ -extern const struct inode_operations erofs_dir_iops; - int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type); -/* dir.c */ -extern const struct file_operations erofs_dir_fops; - static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { int retried = 0; @@ -522,23 +483,19 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) return NULL; } -/* pcpubuf.c */ void *erofs_get_pcpubuf(unsigned int requiredpages); void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); -/* sysfs.c */ int erofs_register_sysfs(struct 
super_block *sb); void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -/* utils.c / zdata.c */ struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); -static inline void erofs_pagepool_add(struct page **pagepool, - struct page *page) +static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); *pagepool = page; @@ -564,6 +521,9 @@ int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -581,6 +541,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA @@ -601,23 +562,15 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } return 0; } -#endif /* !CONFIG_EROFS_FS_ZIP */ +#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ -/* flags for erofs_fscache_register_cookie() */ -#define EROFS_REG_COOKIE_NEED_INODE 1 -#define EROFS_REG_COOKIE_NEED_NOEXIST 2 - -/* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags); + char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); - -extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int 
erofs_fscache_register_fs(struct super_block *sb) { @@ -627,8 +580,7 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) + char *name, unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index b64a108fac92..966eabc61c13 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -5,7 +5,6 @@ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" - #include <trace/events/erofs.h> struct erofs_qstr { @@ -87,19 +86,13 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, return ERR_PTR(-ENOENT); } -static void *find_target_block_classic(struct erofs_buf *target, - struct inode *dir, - struct erofs_qstr *name, - int *_ndirents) +static void *erofs_find_target_block(struct erofs_buf *target, + struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - unsigned int startprfx, endprfx; - int head, back; + int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1; + unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); - startprfx = endprfx = 0; - head = 0; - back = erofs_inode_datablocks(dir) - 1; - while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -180,8 +173,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.end = name->name + name->len; ndirents = 0; - - de = find_target_block_classic(&buf, dir, &qn, &ndirents); + de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 626a615dafc2..19b1ae79cec4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -5,7 +5,6 @@ * Copyright (C) 2021, Alibaba Cloud */ #include <linux/module.h> -#include <linux/buffer_head.h> #include <linux/statfs.h> #include <linux/parser.h> #include 
<linux/seq_file.h> @@ -969,6 +968,8 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->packed_inode); sbi->packed_inode = NULL; #endif + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index fd476961f742..435e515c0792 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -179,13 +179,13 @@ static const struct sysfs_ops erofs_attr_ops = { .store = erofs_attr_store, }; -static struct kobj_type erofs_sb_ktype = { +static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; -static struct kobj_type erofs_ktype = { +static const struct kobj_type erofs_ktype = { .sysfs_ops = &erofs_attr_ops, }; @@ -193,7 +193,7 @@ static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; -static struct kobj_type erofs_feat_ktype = { +static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h deleted file mode 100644 index 64ceb7270b5c..000000000000 --- a/fs/erofs/tagptr.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * A tagged pointer implementation - */ -#ifndef __EROFS_FS_TAGPTR_H -#define __EROFS_FS_TAGPTR_H - -#include <linux/types.h> -#include <linux/build_bug.h> - -/* - * the name of tagged pointer types are tagptr{1, 2, 3...}_t - * avoid directly using the internal structs __tagptr{1, 2, 3...} - */ -#define __MAKE_TAGPTR(n) \ -typedef struct __tagptr##n { \ - uintptr_t v; \ -} tagptr##n##_t; - -__MAKE_TAGPTR(1) -__MAKE_TAGPTR(2) -__MAKE_TAGPTR(3) -__MAKE_TAGPTR(4) - -#undef __MAKE_TAGPTR - -extern void __compiletime_error("bad tagptr tags") - __bad_tagptr_tags(void); - -extern void __compiletime_error("bad tagptr type") - __bad_tagptr_type(void); - -/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ -#define 
__tagptr_mask_1(ptr, n) \ - __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ - (1UL << (n)) - 1 : - -#define __tagptr_mask(ptr) (\ - __tagptr_mask_1(ptr, 1) ( \ - __tagptr_mask_1(ptr, 2) ( \ - __tagptr_mask_1(ptr, 3) ( \ - __tagptr_mask_1(ptr, 4) ( \ - __bad_tagptr_type(), 0))))) - -/* generate a tagged pointer from a raw value */ -#define tagptr_init(type, val) \ - ((typeof(type)){ .v = (uintptr_t)(val) }) - -/* - * directly cast a tagged pointer to the native pointer type, which - * could be used for backward compatibility of existing code. - */ -#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) - -/* encode tagged pointers */ -#define tagptr_fold(type, ptr, _tags) ({ \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ - __bad_tagptr_tags(); \ -tagptr_init(type, (uintptr_t)(ptr) | tags); }) - -/* decode tagged pointers */ -#define tagptr_unfold_ptr(tptr) \ - ((void *)((tptr).v & ~__tagptr_mask(tptr))) - -#define tagptr_unfold_tags(tptr) \ - ((tptr).v & __tagptr_mask(tptr)) - -/* operations for the tagger pointer */ -#define tagptr_eq(_tptr1, _tptr2) ({ \ - typeof(_tptr1) tptr1 = (_tptr1); \ - typeof(_tptr2) tptr2 = (_tptr2); \ - (void)(&tptr1 == &tptr2); \ -(tptr1).v == (tptr2).v; }) - -/* lock-free CAS operation */ -#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - typeof(_o) o = (_o); \ - typeof(_n) n = (_n); \ - (void)(&o == &n); \ - (void)(&o == ptptr); \ -tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) - -/* wrap WRITE_ONCE if atomic update is needed */ -#define tagptr_replace_tags(_ptptr, tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ -*ptptr; }) - -#define tagptr_set_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v |= tags; \ 
-*ptptr; }) - -#define tagptr_clear_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v &= ~tags; \ -*ptptr; }) - -#endif /* __EROFS_FS_TAGPTR_H */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a62fb8a3318a..60729b1220b6 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -22,8 +22,7 @@ static int init_inode_xattrs(struct inode *inode) struct xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; - struct super_block *sb; - struct erofs_sb_info *sbi; + struct super_block *sb = inode->i_sb; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -52,15 +51,14 @@ static int init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). */ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - erofs_err(inode->i_sb, + erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { if (vi->xattr_isize) { - erofs_err(inode->i_sb, - "bogus xattr ibody @ nid %llu", vi->nid); + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ @@ -69,11 +67,9 @@ static int init_inode_xattrs(struct inode *inode) goto out_unlock; } - sb = inode->i_sb; - sbi = EROFS_SB(sb); it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); - it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize); + it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize); /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); @@ -159,7 +155,6 @@ 
static int inline_xattr_iter_begin(struct xattr_iter *it, struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); unsigned int xattr_header_sz, inline_xattr_ofs; xattr_header_sz = inlinexattr_header_size(inode); @@ -170,9 +165,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - + it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs); + it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5200bb86e264..3247d2422bea 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -4,13 +4,178 @@ * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ -#include "zdata.h" #include "compress.h" #include <linux/prefetch.h> #include <linux/psi.h> - +#include <linux/cpuhotplug.h> #include <trace/events/erofs.h> +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. 
+ * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct mutex lock; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline bool 
z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; + + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + +#define Z_EROFS_ONSTACK_PAGES 32 + /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. 
@@ -175,35 +340,130 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * tagged pointer with 1-bit tag for all compressed pages - * tag 0 - the page is just found with an extra page reference - */ -typedef tagptr1_t compressed_page_t; +static struct workqueue_struct *z_erofs_workqueue __read_mostly; -#define tag_compressed_page_justfound(page) \ - tagptr_fold(compressed_page_t, page, 1) +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; -static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} -void z_erofs_exit_zip_subsystem(void) +static struct kthread_worker *erofs_init_percpu_worker(int cpu) { - destroy_workqueue(z_erofs_workqueue); - z_erofs_destroy_pcluster_pool(); + struct kthread_worker *worker = + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + else + sched_set_normal(worker->task, 0); + return worker; } -static inline int z_erofs_init_workqueue(void) +static int erofs_init_percpu_workers(void) { - const unsigned int onlinecpus = num_possible_cpus(); + struct kthread_worker *worker; + unsigned int cpu; - /* - * no need to spawn too many threads, limiting threads could minimum - * scheduling overhead, perhaps per-CPU threads should be better? - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI, - onlinecpus + onlinecpus / 4); - return z_erofs_workqueue ? 
0 : -ENOMEM; + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} +#else +static inline void erofs_destroy_percpu_workers(void) {} +static inline int erofs_init_percpu_workers(void) { return 0; } +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); +} +#else /* 
!CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif + +void z_erofs_exit_zip_subsystem(void) +{ + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); + destroy_workqueue(z_erofs_workqueue); + z_erofs_destroy_pcluster_pool(); } int __init z_erofs_init_zip_subsystem(void) @@ -211,10 +471,31 @@ int __init z_erofs_init_zip_subsystem(void) int err = z_erofs_create_pcluster_pool(); if (err) - return err; - err = z_erofs_init_workqueue(); + goto out_error_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto out_error_workqueue_init; + } + + err = erofs_init_percpu_workers(); if (err) - z_erofs_destroy_pcluster_pool(); + goto out_error_pcpu_worker; + + err = erofs_cpu_hotplug_init(); + if (err < 0) + goto out_error_cpuhp_init; + return err; + +out_error_cpuhp_init: + erofs_destroy_percpu_workers(); +out_error_pcpu_worker: + destroy_workqueue(z_erofs_workqueue); +out_error_workqueue_init: + z_erofs_destroy_pcluster_pool(); +out_error_pcluster_pool: return err; } @@ -319,7 +600,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; - compressed_page_t t; + void *t; /* mark pages just found for debugging */ struct page *newpage = NULL; /* the compressed page was loaded before */ @@ -329,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, page = find_get_page(mc, pcl->obj.index + i); if (page) { - t = tag_compressed_page_justfound(page); + t = (void *)((unsigned long)page | 1); } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -345,11 +626,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (!newpage) continue; set_page_private(newpage, 
Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); + t = (void *)((unsigned long)newpage | 1); } - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, - tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) continue; if (page) @@ -1151,18 +1431,24 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - erofs_release_pages(&pagepool); kvfree(bgq); } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) + int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ - if (sync) { + if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; @@ -1170,9 +1456,24 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue and sync decompression for atomic contexts only */ + /* Use (kthread_)work and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker *worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else queue_work(z_erofs_workqueue, &io->u.work); +#endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; @@ -1192,8 +1493,6 
@@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mapping; struct page *oldpage, *page; - - compressed_page_t t; int justfound; repeat: @@ -1203,10 +1502,8 @@ repeat: if (!page) goto out_allocpage; - /* process the target tagged pointer */ - t = tagptr_init(compressed_page_t, page); - justfound = tagptr_unfold_tags(t); - page = tagptr_unfold_ptr(t); + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); /* * preallocated cached pages, which is used to avoid direct reclaim @@ -1294,9 +1591,8 @@ out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_decompressqueue * -jobqueue_init(struct super_block *sb, - struct z_erofs_decompressqueue *fgq, bool *fg) +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; @@ -1306,13 +1602,19 @@ jobqueue_init(struct super_block *sb, *fg = true; goto fg_out; } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; + q->sync = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1326,20 +1628,6 @@ enum { NR_JOBQUEUES, }; -static void *jobqueueset_init(struct super_block *sb, - struct z_erofs_decompressqueue *q[], - struct z_erofs_decompressqueue *fgq, bool *fg) -{ - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. 
- */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); -} - static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) @@ -1361,8 +1649,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_decompressqueue_endio(struct bio *bio) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -1381,7 +1668,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } if (err) q->eio = true; - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + z_erofs_decompress_kickoff(q, -1); bio_put(bio); } @@ -1394,7 +1681,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - void *bi_private; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; @@ -1404,7 +1690,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned long pflags; int memstall = 0; - bi_private = jobqueueset_init(sb, q, fgq, force_fg); + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. 
+ */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1473,7 +1765,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; + bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; @@ -1500,13 +1792,13 @@ submit_bio_retry: /* * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. + * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } - z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h deleted file mode 100644 index d98c95212985..000000000000 --- a/fs/erofs/zdata.h +++ /dev/null @@ -1,178 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZDATA_H -#define __EROFS_FS_ZDATA_H - -#include "internal.h" -#include "tagptr.h" - -#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_INLINE_BVECS 2 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. 
- */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_bvec { - struct page *page; - int offset; - unsigned int end; -}; - -#define __Z_EROFS_BVSET(name, total) \ -struct name { \ - /* point to the next page which contains the following bvecs */ \ - struct page *nextpage; \ - struct z_erofs_bvec bvec[total]; \ -} -__Z_EROFS_BVSET(z_erofs_bvset,); -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); - -/* - * Structure fields follow one of the following exclusion rules. - * - * I: Modifiable by initialization/destruction paths and read-only - * for everyone else; - * - * L: Field should be protected by the pcluster lock; - * - * A: Field should be accessed / updated in atomic for parallelized code. - */ -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct mutex lock; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* L: the maximum decompression size of this round */ - unsigned int length; - - /* L: total number of bvecs */ - unsigned int vcnt; - - /* I: page offset of start position of decompression */ - unsigned short pageofs_out; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; - - union { - /* L: inline a certain number of bvec for bootstrap */ - struct z_erofs_bvset_inline bvset; - - /* I: can be used to free the pcluster by RCU. 
*/ - struct rcu_head rcu; - }; - - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - - /* I: compression algorithm format */ - unsigned char algorithmformat; - - /* L: whether partial decompression or not */ - bool partial; - - /* L: indicate several pageofs_outs or not */ - bool multibases; - - /* A: compressed bvecs (can be cached or inplaced pages) */ - struct z_erofs_bvec compressed_bvecs[]; -}; - -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - -#define Z_EROFS_PCLUSTER_NIL (NULL) - -struct z_erofs_decompressqueue { - struct super_block *sb; - atomic_t pending_bios; - z_erofs_next_pcluster_t head; - - union { - struct completion done; - struct work_struct work; - } u; - - bool eio; -}; - -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->obj.index; -} - -static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) -{ - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; -} - -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static inline void z_erofs_page_mark_eio(struct page *page) -{ - int orig; - - do { - orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t 
*)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; - - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } -} - -#define Z_EROFS_ONSTACK_PAGES 32 - -#endif diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98fb90b9af71..8bf6d30518b6 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,10 +7,6 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> -static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags); - int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -29,126 +25,6 @@ int z_erofs_fill_inode(struct inode *inode) return 0; } -static int z_erofs_fill_inode_lazy(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - struct z_erofs_map_header *h; - - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { - /* - * paired with smp_mb() at the end of the function to ensure - * fields will only be observed after the bit is set. - */ - smp_mb(); - return 0; - } - - if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) - return -ERESTARTSYS; - - err = 0; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) - goto out_unlock; - - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + - vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); - goto out_unlock; - } - - h = kaddr + erofs_blkoff(pos); - /* - * if the highest bit of the 8-byte map header is set, the whole file - * is stored in the packed inode. 
The rest bits keeps z_fragmentoff. - */ - if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { - vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; - vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); - vi->z_tailextent_headlcn = 0; - goto done; - } - vi->z_advise = le16_to_cpu(h->h_advise); - vi->z_algorithmtype[0] = h->h_algorithmtype & 15; - vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_put_metabuf; - } - - vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | - Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> 
Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - if (err < 0) - goto out_put_metabuf; - } -done: - /* paired with smp_mb() at the beginning of the function */ - smp_mb(); - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -out_put_metabuf: - erofs_put_metabuf(&buf); -out_unlock: - clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); - return err; -} - struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; @@ -169,10 +45,9 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); const erofs_off_t pos = - Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + - vi->xattr_isize) + + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; @@ -372,9 +247,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + - vi->inode_isize + vi->xattr_isize, 8) + - sizeof(struct z_erofs_map_header); + const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; @@ -732,6 +606,125 @@ unmap_out: return err; } +static int z_erofs_fill_inode_lazy(struct inode *inode) +{ + struct 
erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + int err, headnr; + erofs_off_t pos; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); + return 0; + } + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out_unlock; + } + + h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. 
+ */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto done; + } + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); + err = -EOPNOTSUPP; + goto out_put_metabuf; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_put_metabuf; + } + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_idata_size = le16_to_cpu(h->h_idata_size); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + + if (!map.m_plen || + erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map.m_plen); + err = -EFSCORRUPTED; + } + if (err < 0) + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct 
erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_put_metabuf; + } +done: + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { diff --git a/fs/exec.c b/fs/exec.c index ab913243a367..3d2b80d8d58e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1414,15 +1414,15 @@ EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); - struct user_namespace *mnt_userns = file_mnt_user_ns(file); - if (inode_permission(mnt_userns, inode, MAY_READ) < 0) { + struct mnt_idmap *idmap = file_mnt_idmap(file); + if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && - !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode)) + !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { @@ -1596,7 +1596,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; @@ -1612,15 +1612,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (!(mode & (S_ISUID|S_ISGID))) return; - mnt_userns = file_mnt_user_ns(file); + idmap = 
file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index bc6d21d7c5ad..1bf16abe3c84 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -450,9 +450,9 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode); void exfat_truncate(struct inode *inode); -int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, +int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/exfat/file.c b/fs/exfat/file.c index f5b29072775d..1fdb0a64b91d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -226,14 +226,14 @@ write_size: mutex_unlock(&sbi->s_lock); } -int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, +int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = 
ei->i_crtime.tv_sec; @@ -242,7 +242,7 @@ int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, return 0; } -int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); @@ -266,7 +266,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_TIMES_SET); } - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) goto out; @@ -293,7 +293,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_SIZE) inode->i_mtime = inode->i_ctime = current_time(inode); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); exfat_truncate_atime(&inode->i_atime); if (attr->ia_valid & ATTR_SIZE) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 5f995eba5dbb..02aab4c3a5f7 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -551,7 +551,7 @@ out: return ret; } -static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir, +static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -834,7 +834,7 @@ unlock: return err; } -static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -1285,7 +1285,7 @@ out: return ret; } -static int exfat_rename(struct user_namespace *mnt_userns, +static int exfat_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3204bd33e4e8..ab88d33d106c 
100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -524,7 +524,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_user_ns(mnt), nbuf, + nresult = lookup_one(mnt_idmap(mnt), nbuf, target_dir, strlen(nbuf)); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 440d5f1e9d47..82b17d7fc93f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,7 +219,7 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; @@ -228,7 +228,7 @@ ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&init_user_ns, inode, &mode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) return error; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3841becb94ff..4a8443a2b8ec 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl 
(struct inode *, struct inode *); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index e5cbc27ba459..4a6955a0a116 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -461,9 +461,9 @@ static int ext2_handle_dirsync(struct inode *dir) return err; } -void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, void *page_addr, struct inode *inode, - int update_times) +int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, void *page_addr, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_addr; @@ -472,7 +472,10 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, lock_page(page); err = ext2_prepare_chunk(page, pos, len); - BUG_ON(err); + if (err) { + unlock_page(page); + return err; + } de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); ext2_commit_chunk(page, pos, len); @@ -480,7 +483,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); - ext2_handle_dirsync(dir); + return ext2_handle_dirsync(dir); } /* @@ -646,7 +649,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) unlock_page(page); goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memset(kaddr, 0, chunk_size); de = (struct ext2_dir_entry_2 *)kaddr; de->name_len = 1; @@ -661,7 +664,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) de->inode = cpu_to_le32(parent->i_ino); memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); - kunmap_atomic(kaddr); + kunmap_local(kaddr); ext2_commit_chunk(page, 0, chunk_size); err = ext2_handle_dirsync(inode); fail: diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 28de11a22e5f..cb78d7dcfb95 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -734,8 +734,9 @@ extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page, char 
*kaddr); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa); -extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *, - struct inode *, int); +int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, void *page_addr, struct inode *inode, + bool update_times); static inline void ext2_put_page(struct page *page, void *page_addr) { kunmap_local(page_addr); @@ -753,8 +754,8 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *); -extern int ext2_getattr (struct user_namespace *, const struct path *, +extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); +extern int ext2_getattr (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -762,7 +763,7 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* ioctl.c */ extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int ext2_fileattr_set(struct user_namespace *mnt_userns, +extern int ext2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern long ext2_ioctl(struct file *, unsigned int, unsigned long); extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 78b8686d9a4a..a4e1d7a9c544 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -545,7 +545,7 @@ got: inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; } else - 
inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 69aed9e2359e..26f135e7ffce 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1592,7 +1592,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ext2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -1614,28 +1614,28 @@ int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } -int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; - if (is_quota_modification(mnt_userns, inode, iattr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if (i_uid_needs_update(mnt_userns, iattr, inode) || - i_gid_needs_update(mnt_userns, iattr, inode)) { - error = dquot_transfer(mnt_userns, inode, iattr); + if (i_uid_needs_update(&nop_mnt_idmap, iattr, inode) || + i_gid_needs_update(&nop_mnt_idmap, iattr, inode)) { + error = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (error) return error; } @@ -1644,9 +1644,9 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; } - setattr_copy(&init_user_ns, inode, 
iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + error = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index e8340bf09b10..cc87d413eb43 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -27,7 +27,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ext2_fileattr_set(struct user_namespace *mnt_userns, +int ext2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -66,7 +66,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETVERSION: { __u32 generation; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) @@ -99,7 +99,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c056957221a2..7f5dfa87cc95 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -99,7 +99,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int ext2_create (struct user_namespace * mnt_userns, +static int ext2_create (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { @@ -119,7 +119,7 @@ static int ext2_create (struct user_namespace * mnt_userns, return ext2_add_nondir(dentry, inode); } -static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ext2_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); @@ -133,7 +133,7 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, 0); } -static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, +static int ext2_mknod (struct mnt_idmap * idmap, struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -154,7 +154,7 @@ static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, return err; } -static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, +static int ext2_symlink (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; @@ -225,7 +225,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct user_namespace * mnt_userns, +static int ext2_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; @@ -315,7 +315,7 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ext2_rename (struct user_namespace * mnt_userns, +static int ext2_rename (struct mnt_idmap * idmap, struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry, unsigned int flags) @@ -370,8 +370,11 @@ static int ext2_rename (struct user_namespace * mnt_userns, err = PTR_ERR(new_de); goto 
out_dir; } - ext2_set_link(new_dir, new_de, new_page, page_addr, old_inode, 1); + err = ext2_set_link(new_dir, new_de, new_page, page_addr, + old_inode, true); ext2_put_page(new_page, page_addr); + if (err) + goto out_dir; new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); @@ -394,24 +397,24 @@ static int ext2_rename (struct user_namespace * mnt_userns, ext2_delete_entry(old_de, old_page, old_page_addr); if (dir_de) { - if (old_dir != new_dir) - ext2_set_link(old_inode, dir_de, dir_page, - dir_page_addr, new_dir, 0); + if (old_dir != new_dir) { + err = ext2_set_link(old_inode, dir_de, dir_page, + dir_page_addr, new_dir, false); + } ext2_put_page(dir_page, dir_page_addr); inode_dec_link_count(old_dir); } +out_old: ext2_put_page(old_page, old_page_addr); - return 0; +out: + return err; out_dir: if (dir_de) ext2_put_page(dir_page, dir_page_addr); -out_old: - ext2_put_page(old_page, old_page_addr); -out: - return err; + goto out_old; } const struct inode_operations ext2_dir_inode_operations = { diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index ebade1f52451..db47b8ab153e 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -19,7 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 18a87d5dd1ab..995f931228ce 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -26,7 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int 
flags) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 58092449f8ff..dd1507231081 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -30,7 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a9f89539aeee..27fcbddfb148 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,7 +225,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; @@ -249,7 +249,7 @@ retry: return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { - error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); + error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 09c4a8a3b716..0c5a79c3b5d4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 140e1eb300d1..43e26e6f6e42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2845,7 +2845,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); -extern struct inode 
*__ext4_new_inode(struct user_namespace *, handle_t *, +extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -2853,11 +2853,11 @@ extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ - __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) -#define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \ +#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ - __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) @@ -2976,14 +2976,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); -extern int ext4_setattr(struct user_namespace *, struct dentry *, +extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); -extern int ext4_getattr(struct user_namespace *, const struct path *, +extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); -extern int ext4_file_getattr(struct user_namespace *, const struct path *, +extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); @@ -3024,7 +3024,7 @@ extern int ext4_ind_remove_space(handle_t *handle, struct 
inode *inode, /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); -int ext4_fileattr_set(struct user_namespace *mnt_userns, +int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 63f9bb6e8851..157663031f8c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -921,7 +921,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, +struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -972,10 +972,10 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9d9f414f99fe..b936ee3af51e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1136,7 +1136,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize, + err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); @@ -3858,7 +3859,8 @@ static int 
__ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); @@ -5434,7 +5436,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * * Called with inode->i_rwsem down. */ -int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -5454,7 +5456,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; - error = setattr_prepare(mnt_userns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -5466,14 +5468,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if (i_uid_needs_update(mnt_userns, attr, inode) || - i_gid_needs_update(mnt_userns, attr, inode)) { + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5490,7 +5492,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. 
*/ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(mnt_userns, inode, attr); + error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5499,8 +5501,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { @@ -5630,7 +5632,7 @@ out_mmap_sem: if (!error) { if (inc_ivers) inode_inc_iversion(inode); - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } @@ -5642,7 +5644,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) @@ -5668,7 +5670,7 @@ u32 ext4_dio_alignment(struct inode *inode) return 1; /* use the iomap defaults */ } -int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -5725,18 +5727,18 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } -int ext4_file_getattr(struct user_namespace *mnt_userns, +int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; - ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + 
ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8067ccda34e4..b0dc7212694e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -358,12 +358,12 @@ void ext4_reset_inode_seed(struct inode *inode) * important fields of the inodes. * * @sb: the super block of the filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode) { handle_t *handle; @@ -393,7 +393,7 @@ static long swap_inode_boot_loader(struct super_block *sb, } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(mnt_userns, inode) || + !inode_owner_or_capable(idmap, inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; @@ -979,7 +979,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ext4_fileattr_set(struct user_namespace *mnt_userns, +int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -1217,7 +1217,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); @@ -1234,7 +1234,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) __u32 generation; int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { @@ -1376,7 +1376,7 @@ 
mext_out: case EXT4_IOC_MIGRATE: { int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1398,7 +1398,7 @@ mext_out: case EXT4_IOC_ALLOC_DA_BLKS: { int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1417,7 +1417,7 @@ mext_out: err = mnt_want_write_file(filp); if (err) return err; - err = swap_inode_boot_loader(sb, mnt_userns, inode); + err = swap_inode_boot_loader(sb, idmap, inode); mnt_drop_write_file(filp); return err; } @@ -1542,7 +1542,7 @@ resizefs_out: case EXT4_IOC_CLEAR_ES_CACHE: { - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dd28453d6ea3..d10a508d95cd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2792,7 +2792,7 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; @@ -2806,7 +2806,7 @@ static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2827,7 +2827,7 @@ retry: return err; } -static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; @@ -2841,7 +2841,7 @@ static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2861,7 +2861,7 @@ retry: return err; } -static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { handle_t *handle; @@ -2873,7 +2873,7 @@ static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, + inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + @@ -2972,7 +2972,7 @@ out: return err; } -static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode 
*dir, +static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; @@ -2989,7 +2989,7 @@ static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode, + inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3339,7 +3339,7 @@ out: return err; } -static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; @@ -3370,7 +3370,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, + inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3720,7 +3720,7 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } -static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, +static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, struct ext4_renament *ent, int credits, handle_t **h) { @@ -3735,7 +3735,7 @@ static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: - wh = ext4_new_inode_start_handle(mnt_userns, ent->dir, + wh = ext4_new_inode_start_handle(idmap, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3763,7 +3763,7 @@ retry: * while new_{dentry,inode) refers to the destination 
dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ -static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -3851,7 +3851,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto release_bh; } } else { - whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); + whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; @@ -4158,7 +4158,7 @@ end_rename: return retval; } -static int ext4_rename2(struct user_namespace *mnt_userns, +static int ext4_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -4181,7 +4181,7 @@ static int ext4_rename2(struct user_namespace *mnt_userns, new_dir, new_dentry); } - return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); + return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index d5266932ce6c..c61dc8a7c014 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -211,8 +211,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio, static inline loff_t ext4_readpage_limit(struct inode *inode) { - if (IS_ENABLED(CONFIG_FS_VERITY) && - (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 260c1b3e3ef2..2ae46d11aa30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2635,7 +2635,6 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, { const struct ext4_fs_context *ctx = 
fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; @@ -2668,17 +2667,7 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, "Conflicting test_dummy_encryption options"); return -EINVAL; } - /* - * fscrypt_add_test_dummy_key() technically changes the super_block, so - * technically it should be delayed until ext4_apply_options() like the - * other changes. But since we never get here for remounts (see above), - * and this is the last chance to report errors, we do it here. - */ - err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); - if (err) - ext4_msg(NULL, KERN_WARNING, - "Error adding test dummy encryption key [%d]", err); - return err; + return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, @@ -5336,11 +5325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } } - if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); - goto failed_mount_wq; - } - /* * Get the # of file system overhead blocks from the * superblock if present. 
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 3d3ed3c38f56..75bf1f88843c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -55,12 +55,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, return paddr; } -static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, +static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + ext4_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 30e3b65798b5..e4da1704438e 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -381,11 +381,11 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + pos += ext4_verity_metadata_pos(inode); - return pagecache_write(inode, buf, 1 << log_blocksize, pos); + return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations ext4_verityops = { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 69a1b8c6a2ec..a2f04a3808db 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -482,11 +482,12 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode, */ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, &tmp_data, 1); - if (e_hash == entry->e_hash) - return 0; - /* Still no match - bad */ - return -EFSCORRUPTED; + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + + /* Let people know about old hash */ + pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } @@ -3096,7 +3097,7 @@ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, while (name_len--) { 
hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ - *name++; + (unsigned char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c index c78df5790377..8a5842e4cd95 100644 --- a/fs/ext4/xattr_hurd.c +++ b/fs/ext4/xattr_hurd.c @@ -32,7 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler, static int ext4_xattr_hurd_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 8213f66f7b2d..776cf11d24ca 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -23,7 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 7c21ffb26d25..9811eb0ab276 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -30,7 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 2fe7ff0a479c..4b70bf4e7626 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -31,7 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct 
inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c1c74aa658ae..ec2aeccb69a3 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -204,7 +204,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) return __f2fs_get_acl(inode, type, NULL); } -static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, +static int f2fs_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -219,14 +219,14 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; } -static int __f2fs_set_acl(struct user_namespace *mnt_userns, +static int __f2fs_set_acl(struct mnt_idmap *idmap, struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = f2fs_acl_update_mode(mnt_userns, inode, + error = f2fs_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; @@ -276,7 +276,7 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -284,7 +284,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); + return __f2fs_set_acl(idmap, 
inode, type, acl, NULL); } /* diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index ea2bbb3f264b..94ebfbfbdc6f 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct dentry *, +extern int f2fs_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 97e816590cd9..8630df80fedb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2053,8 +2053,7 @@ out: static inline loff_t f2fs_readpage_limit(struct inode *inode) { - if (IS_ENABLED(CONFIG_FS_VERITY) && - (IS_VERITY(inode) || f2fs_verity_in_progress(inode))) + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e8953c3dc81a..9a3ffa39ad30 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3469,15 +3469,15 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int 
f2fs_fileattr_set(struct user_namespace *mnt_userns, +int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3505,7 +3505,7 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); -int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ecbc8c135b49..b90617639743 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -837,7 +837,7 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) return false; } -int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -892,7 +892,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -903,13 +903,13 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct user_namespace *mnt_userns, +static void __setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + 
i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -918,10 +918,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -930,7 +930,7 @@ static void __setattr_copy(struct user_namespace *mnt_userns, #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -951,7 +951,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -963,15 +963,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(idmap, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if (i_uid_needs_update(mnt_userns, attr, inode) || - i_gid_needs_update(mnt_userns, attr, inode)) { + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(mnt_userns, inode, attr); + err = dquot_transfer(idmap, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -982,8 +982,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under 
lock_op(), so that dquot and inode can * be updated atomically. */ - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } @@ -1023,10 +1023,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(mnt_userns, inode, attr); + __setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(idmap, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -2038,14 +2038,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; loff_t isize; int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2095,7 +2095,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) goto out; } - ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode); iput(pinode); if (ret) { f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -2135,10 +2135,10 @@ out: static int f2fs_ioc_commit_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; 
ret = mnt_want_write_file(filp); @@ -2167,10 +2167,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) static int f2fs_ioc_abort_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -3090,7 +3090,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int f2fs_fileattr_set(struct user_namespace *mnt_userns, +int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6032589099ce..d8e01bbbf27f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -202,7 +202,7 @@ static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, file_set_hot(inode); } -static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, +static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, struct inode *dir, umode_t mode, const char *name) { @@ -225,7 +225,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, nid_free = true; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; @@ -246,7 +246,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else - F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); err = fscrypt_prepare_new_inode(dir, inode, &encrypt); @@ -333,7 +333,7 @@ fail_drop: return ERR_PTR(err); } -static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_create(struct 
mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -350,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); + inode = f2fs_new_inode(idmap, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -659,7 +659,7 @@ static const char *f2fs_get_link(struct dentry *dentry, return link; } -static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -682,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); + inode = f2fs_new_inode(idmap, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -739,7 +739,7 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -753,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); + inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -794,7 +794,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } -static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -810,7 +810,7 @@ static int 
f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -837,7 +837,7 @@ out: return err; } -static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { @@ -849,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -907,7 +907,7 @@ out: return err; } -static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -918,28 +918,28 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL); return finish_open_simple(file, err); } -static int f2fs_create_whiteout(struct user_namespace *mnt_userns, +static int f2fs_create_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct inode **whiteout) { if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) return -EIO; - return __f2fs_tmpfile(mnt_userns, dir, NULL, + return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE, true, whiteout); } -int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode) { - return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); + return __f2fs_tmpfile(idmap, dir, 
NULL, S_IFREG, false, new_inode); } -static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -979,7 +979,7 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); + err = f2fs_create_whiteout(idmap, old_dir, &whiteout); if (err) return err; } @@ -1295,7 +1295,7 @@ out: return err; } -static int f2fs_rename2(struct user_namespace *mnt_userns, +static int f2fs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1318,7 +1318,7 @@ static int f2fs_rename2(struct user_namespace *mnt_userns, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. 
*/ - return f2fs_rename(mnt_userns, old_dir, old_dentry, + return f2fs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } @@ -1342,12 +1342,12 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } -static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, +static int f2fs_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + f2fs_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 77fd453949b1..dfd41908b12d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -258,15 +258,15 @@ static int recover_quota_data(struct inode *inode, struct page *page) attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_UID; - if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(&init_user_ns, inode, &attr); + err = dquot_transfer(&nop_mnt_idmap, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f812b9ce985..64d3556d61a5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -540,12 +540,6 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, opt, err); return -EINVAL; } - err = fscrypt_add_test_dummy_key(sb, policy); - if (err) { - f2fs_warn(sbi, "Error adding test dummy encryption key [%d]", - err); - return err; 
- } f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index c352fff88a5e..f320ed8172ec 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -276,11 +276,11 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, } static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize); + pos += f2fs_verity_metadata_pos(inode); - return pagecache_write(inode, buf, 1 << log_blocksize, pos); + return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations f2fs_verityops = { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index dc2e8637189e..d92edbbdc30e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -65,7 +65,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -109,7 +109,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -117,7 +117,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, unsigned char old_advise = F2FS_I(inode)->i_advise; unsigned char new_advise; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; if (value == NULL) return -EINVAL; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a415c02ede39..e3b690b48e3e 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -398,10 +398,10 @@ extern long 
fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; -extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(struct user_namespace *mnt_userns, +extern int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, diff --git a/fs/fat/file.c b/fs/fat/file.c index 8a6b493b5b5f..795a4fad5c40 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,13 +90,13 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ - err = security_inode_setattr(file_mnt_user_ns(file), + err = security_inode_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... 
*/ - err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia); + err = fat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -395,13 +395,13 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, +int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { @@ -456,14 +456,14 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, return 0; } -static int fat_allow_set_time(struct user_namespace *mnt_userns, +static int fat_allow_set_time(struct mnt_idmap *idmap, struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { - if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) + if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; @@ -477,7 +477,7 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns, /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) -int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); @@ -488,11 +488,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* Check for setting the inode time. 
*/ ia_valid = attr->ia_valid; if (ia_valid & TIMES_SET_FLAGS) { - if (fat_allow_set_time(mnt_userns, sbi, inode)) + if (fat_allow_set_time(idmap, sbi, inode)) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = setattr_prepare(mnt_userns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) @@ -518,10 +518,10 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid), sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid), sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) @@ -564,7 +564,7 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); out: return error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index efba301d68ae..2116c486843b 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -261,7 +261,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir, +static int msdos_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -339,7 +339,7 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct 
super_block *sb = dir->i_sb; @@ -594,7 +594,7 @@ error_inode: } /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ -static int msdos_rename(struct user_namespace *mnt_userns, +static int msdos_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 21620054e1c4..fceda1de4805 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -756,7 +756,7 @@ error: return ERR_PTR(err); } -static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, +static int vfat_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -844,7 +844,7 @@ out: return err; } -static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -1158,7 +1158,7 @@ error_exchange: goto out; } -static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int vfat_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 146c9ab0cd4b..b622be119706 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/capability.h> @@ -47,7 +48,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* 
required for strict SunOS emulation */ diff --git a/fs/file_table.c b/fs/file_table.c index dd88701e54a9..372653b92617 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index c05c71d57291..0e2fc08f7de4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -8,7 +8,7 @@ config VXFS_FS of SCO UnixWare (and possibly others) and optionally available for Sunsoft Solaris, HP-UX and many other operating systems. However these particular OS implementations of vxfs may differ in on-disk - data endianess and/or superblock offset. The vxfs module has been + data endianness and/or superblock offset. The vxfs module has been tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) Currently only readonly access is supported and VxFX versions 2, 3 and 4. Tests were performed with HP-UX VxFS version 3. 
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index ab8ceddf9efa..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -141,13 +141,14 @@ static bool fscache_is_acquire_pending(struct fscache_volume *volume) static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, unsigned int collidee_debug_id) { - wait_var_event_timeout(&candidate->flags, - !fscache_is_acquire_pending(candidate), 20 * HZ); + wait_on_bit_timeout(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE, 20 * HZ); if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); - wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); + wait_on_bit(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE); } } @@ -279,8 +280,7 @@ static void fscache_create_volume_work(struct work_struct *work) fscache_end_cache_access(volume->cache, fscache_access_acquire_volume_end); - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); fscache_put_volume(volume, fscache_volume_put_create_work); } @@ -347,8 +347,8 @@ static void fscache_wake_pending_volume(struct fscache_volume *volume, hlist_bl_for_each_entry(cursor, p, h, hash_link) { if (fscache_volume_same(cursor, volume)) { fscache_see_volume(cursor, fscache_volume_see_hash_wake); - clear_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &cursor->flags); - wake_up_bit(&cursor->flags, FSCACHE_VOLUME_ACQUIRE_PENDING); + clear_and_wake_up_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, + &cursor->flags); return; } } diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index a4850aee2639..3d192b80a561 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -11,9 +11,10 @@ #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -struct posix_acl 
*fuse_get_acl(struct inode *inode, int type, bool rcu) +static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, + struct mnt_idmap *idmap, + struct inode *inode, int type, bool rcu) { - struct fuse_conn *fc = get_fuse_conn(inode); int size; const char *name; void *value = NULL; @@ -25,7 +26,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) if (fuse_is_bad(inode)) return ERR_PTR(-EIO); - if (!fc->posix_acl || fc->no_getxattr) + if (fc->no_getxattr) return NULL; if (type == ACL_TYPE_ACCESS) @@ -53,7 +54,47 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } -int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +static inline bool fuse_no_acl(const struct fuse_conn *fc, + const struct inode *inode) +{ + /* + * Refuse interacting with POSIX ACLs for daemons that + * don't support FUSE_POSIX_ACL and are not mounted on + * the host to retain backwards compatibility. + */ + return !fc->posix_acl && (i_user_ns(inode) != &init_user_ns); +} + +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) +{ + struct inode *inode = d_inode(dentry); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_no_acl(fc, inode)) + return ERR_PTR(-EOPNOTSUPP); + + return __fuse_get_acl(fc, idmap, inode, type, false); +} + +struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * FUSE daemons before FUSE_POSIX_ACL was introduced could get and set + * POSIX ACLs without them being used for permission checking by the + * vfs. Retain that behavior for backwards compatibility as there are + * filesystems that do all permission checking for acls in the daemon + * and not in the kernel. 
+ */ + if (!fc->posix_acl) + return NULL; + + return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); +} + +int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -64,7 +105,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (fuse_is_bad(inode)) return -EIO; - if (!fc->posix_acl || fc->no_setxattr) + if (fc->no_setxattr || fuse_no_acl(fc, inode)) return -EOPNOTSUPP; if (type == ACL_TYPE_ACCESS) @@ -99,8 +140,14 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } - if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + /* + * Fuse daemons without FUSE_POSIX_ACL never changed the passed + * through POSIX ACLs. Such daemons don't expect setgid bits to + * be stripped. + */ + if (fc->posix_acl && + !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && + !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); @@ -108,8 +155,15 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, } else { ret = fuse_removexattr(inode, name); } - forget_all_cached_acls(inode); - fuse_invalidate_attr(inode); + + if (fc->posix_acl) { + /* + * Fuse daemons without FUSE_POSIX_ACL never cached POSIX ACLs + * and didn't invalidate attributes. Retain that behavior. 
+ */ + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + } return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index cd1a071b625a..cd1eae61e84c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -645,7 +645,7 @@ out_err: return err; } -static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, +static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, @@ -686,7 +686,7 @@ out_dput: return err; mknod: - err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); + err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -773,7 +773,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return err; } -static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; @@ -796,13 +796,13 @@ static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, mode); } -static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&init_user_ns, dir, entry, mode, 0); + return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); } -static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct fuse_conn *fc = get_fuse_conn(dir); @@ -819,7 +819,7 @@ static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_mkdir(struct mnt_idmap *idmap, struct inode 
*dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; @@ -841,7 +841,7 @@ static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, S_IFDIR); } -static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); @@ -998,7 +998,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, +static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) { @@ -1156,7 +1156,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -1326,7 +1326,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct user_namespace *mnt_userns, +static int fuse_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -1358,7 +1358,7 @@ static int fuse_permission(struct user_namespace *mnt_userns, } if (fc->default_permissions) { - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); /* If permission is denied, try to refresh file attributes. 
This is also needed, because the root @@ -1366,7 +1366,7 @@ static int fuse_permission(struct user_namespace *mnt_userns, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&init_user_ns, + err = generic_permission(&nop_mnt_idmap, inode, mask); } @@ -1690,7 +1690,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -1837,7 +1837,7 @@ error: return err; } -static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, +static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); @@ -1900,7 +1900,7 @@ static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, return ret; } -static int fuse_getattr(struct user_namespace *mnt_userns, +static int fuse_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1942,7 +1942,8 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_inode_acl = fuse_get_acl, + .get_inode_acl = fuse_get_inode_acl, + .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1964,7 +1965,8 @@ static const struct inode_operations fuse_common_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_inode_acl = fuse_get_acl, + .get_inode_acl = fuse_get_inode_acl, + .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 875314ee6f59..82710d103556 100644 --- a/fs/fuse/file.c +++ 
b/fs/fuse/file.c @@ -18,6 +18,7 @@ #include <linux/falloc.h> #include <linux/uio.h> #include <linux/fs.h> +#include <linux/filelock.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -1313,7 +1314,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + setattr_should_drop_suidgid(&nop_mnt_idmap, + file_inode(file))) { goto writethrough; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c673faefdcb9..9b5058cf5bc3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1264,12 +1264,12 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; -extern const struct xattr_handler *fuse_acl_xattr_handlers[]; -extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; -struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type); +int fuse_set_acl(struct mnt_idmap *, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ @@ -1309,7 +1309,7 @@ long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int fuse_fileattr_set(struct user_namespace *mnt_userns, +int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* file.c */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c 
index 6b3beda16c1b..de9b9ec5ce81 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -311,7 +311,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } -static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, + struct fuse_conn *fc) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; @@ -333,6 +334,12 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) new_decode_dev(attr->rdev)); } else BUG(); + /* + * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL + * so they see the exact same behavior as before. + */ + if (!fc->posix_acl) + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; } static int fuse_inode_eq(struct inode *inode, void *_nodeidp) @@ -372,7 +379,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!inode) return NULL; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); get_fuse_inode(inode)->nodeid = nodeid; inode->i_flags |= S_AUTOMOUNT; goto done; @@ -388,7 +395,7 @@ retry: if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); unlock_new_inode(inode); } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ @@ -1174,7 +1181,6 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fm->sb->s_xattr = fuse_acl_xattr_handlers; } if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; @@ -1420,13 +1426,6 @@ static void fuse_sb_defaults(struct super_block *sb) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - - /* - * If we are not in the initial user 
namespace posix - * acls must be translated. - */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; } static int fuse_fill_super_submount(struct super_block *sb, diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fcce94ace2c2..e50a18ee6cc6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -467,7 +467,7 @@ cleanup: return err; } -int fuse_fileattr_set(struct user_namespace *mnt_userns, +int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 0d3e7177fce0..49c01559580f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -189,7 +189,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler, } static int fuse_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -203,27 +203,6 @@ static int fuse_xattr_set(const struct xattr_handler *handler, return fuse_setxattr(inode, name, value, size, flags, 0); } -static bool no_xattr_list(struct dentry *dentry) -{ - return false; -} - -static int no_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) -{ - return -EOPNOTSUPP; -} - -static int no_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *nodee, - const char *name, const void *value, - size_t size, int flags) -{ - return -EOPNOTSUPP; -} - static const struct xattr_handler fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, @@ -234,33 +213,3 @@ const struct xattr_handler *fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; - -const struct xattr_handler *fuse_acl_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - 
&fuse_xattr_handler, - NULL -}; - -static const struct xattr_handler fuse_no_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = no_xattr_list, - .get = no_xattr_get, - .set = no_xattr_set, -}; - -static const struct xattr_handler fuse_no_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_ACCESS, - .list = no_xattr_list, - .get = no_xattr_get, - .set = no_xattr_set, -}; - -const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { - &fuse_no_acl_access_xattr_handler, - &fuse_no_acl_default_xattr_handler, - &fuse_xattr_handler, - NULL -}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3dcde4912413..a392aa0f041d 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,7 +109,7 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -135,7 +135,7 @@ int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); + ret = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (ret) goto unlock; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b8de8c148f5c..d4deb2b19959 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index eea5be4fbf0e..300844f50dcd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -15,6 +15,7 @@ 
#include <linux/mm.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/gfs2_ondisk.h> #include <linux/falloc.h> #include <linux/swap.h> @@ -235,7 +236,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); + error = gfs2_permission(&nop_mnt_idmap, inode, MAY_WRITE); if (error) goto out; } @@ -273,7 +274,7 @@ out: return error; } -int gfs2_fileattr_set(struct user_namespace *mnt_userns, +int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 614db3055c02..713efa3bb732 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -320,7 +320,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(&init_user_ns, dir, MAY_EXEC); + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_EXEC); if (error) goto out; } @@ -350,7 +350,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&init_user_ns, &dip->i_inode, + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -843,7 +843,7 @@ fail: /** * gfs2_create - Create a file - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory in which to create the file * @dentry: The dentry of the new file * @mode: The mode of the new file @@ -852,7 +852,7 @@ fail: * Returns: errno */ -static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); @@ -960,7 +960,7 @@ static int 
gfs2_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) goto out_gunlock; - error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1078,7 +1078,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&init_user_ns, &dip->i_inode, + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1207,7 +1207,7 @@ out_inodes: /** * gfs2_symlink - Create a symlink - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory to create the symlink in * @dentry: The dentry to put the symlink in * @symname: The thing which the link points to @@ -1215,7 +1215,7 @@ out_inodes: * Returns: errno */ -static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { unsigned int size; @@ -1229,7 +1229,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mkdir - Make a directory - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The parent directory of the new one * @dentry: The dentry of the new directory * @mode: The mode of the new directory @@ -1237,7 +1237,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, * Returns: errno */ -static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); @@ -1246,7 +1246,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, 
struct inode *dir, /** * gfs2_mknod - Make a special file - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory in which the special file will reside * @dentry: The dentry of the special file * @mode: The mode of the special file @@ -1254,7 +1254,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, * */ -static int gfs2_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); @@ -1504,7 +1504,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(&init_user_ns, ndir, + error = gfs2_permission(&nop_mnt_idmap, ndir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1541,7 +1541,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(&init_user_ns, d_inode(odentry), + error = gfs2_permission(&nop_mnt_idmap, d_inode(odentry), MAY_WRITE); if (error) goto out_gunlock; @@ -1705,13 +1705,13 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(old_mode)) { - error = gfs2_permission(&init_user_ns, odentry->d_inode, + error = gfs2_permission(&nop_mnt_idmap, odentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; } if (S_ISDIR(new_mode)) { - error = gfs2_permission(&init_user_ns, ndentry->d_inode, + error = gfs2_permission(&nop_mnt_idmap, ndentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; @@ -1766,7 +1766,7 @@ out: return error; } -static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir, +static int gfs2_rename2(struct mnt_idmap *idmap, struct inode *odir, struct dentry *odentry, struct inode *ndir, struct dentry *ndentry, unsigned int 
flags) { @@ -1841,7 +1841,7 @@ out: /** * gfs2_permission - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: The inode * @mask: The mask to be tested * @@ -1852,7 +1852,7 @@ out: * Returns: errno */ -int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct gfs2_inode *ip; @@ -1872,7 +1872,7 @@ int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EPERM; else - error = generic_permission(&init_user_ns, inode, mask); + error = generic_permission(&nop_mnt_idmap, inode, mask); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); @@ -1881,7 +1881,7 @@ int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -1966,7 +1966,7 @@ out: /** * gfs2_setattr - Change attributes on an inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The dentry which is changing * @attr: The structure describing the change * @@ -1976,7 +1976,7 @@ out: * Returns: errno */ -static int gfs2_setattr(struct user_namespace *mnt_userns, +static int gfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -1992,11 +1992,11 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, if (error) goto out; - error = may_setattr(&init_user_ns, inode, attr->ia_valid); + error = may_setattr(&nop_mnt_idmap, inode, attr->ia_valid); if (error) goto error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, 
attr); if (error) goto error; @@ -2007,7 +2007,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, + error = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); } @@ -2022,7 +2022,7 @@ out: /** * gfs2_getattr - Read out an inode's attributes - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @path: Object to query * @stat: The inode's stats * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -2037,7 +2037,7 @@ out: * Returns: errno */ -static int gfs2_getattr(struct user_namespace *mnt_userns, +static int gfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -2066,7 +2066,7 @@ static int gfs2_getattr(struct user_namespace *mnt_userns, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 0264d514dda7..c8c5814e7295 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern int gfs2_permission(struct user_namespace *mnt_userns, +extern int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); @@ -111,7 +111,7 @@ extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int 
gfs2_fileattr_set(struct user_namespace *mnt_userns, +extern int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern void gfs2_set_inode_flags(struct inode *inode); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 723639376ae2..61323deb80bc 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -80,6 +80,15 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) brelse(bd->bd_bh); } +static int __gfs2_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + /** * gfs2_ail1_start_one - Start I/O on a transaction * @sdp: The superblock @@ -131,7 +140,7 @@ __acquires(&sdp->sd_ail_lock) if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); - ret = filemap_fdatawrite_wbc(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __gfs2_writepage, mapping); if (need_resched()) { blk_finish_plug(plug); cond_resched(); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 518c0677e12a..adf6d17cf033 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1225,7 +1225,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 2bd54efaf416..6341bb248247 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -121,7 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler, } static int hfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 527f6e46cbe8..3e1e3dcf0b48 100644 --- 
a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -189,7 +189,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -219,7 +219,7 @@ static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -280,7 +280,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) * new file/directory. * XXX: how do you handle must_be dir? 
*/ -static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 68d0305880f7..49d02524e667 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -206,7 +206,7 @@ int hfs_write_begin(struct file *file, struct address_space *mapping, extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); -extern int hfs_inode_setattr(struct user_namespace *, struct dentry *, +extern int hfs_inode_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3a155c1d810e..1f7bd068acf0 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -611,14 +611,14 @@ static int hfs_file_release(struct inode *inode, struct file *file) * correspond to the same HFS file. 
*/ -int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = setattr_prepare(&init_user_ns, dentry, + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* basic permission checks */ if (error) return error; @@ -658,7 +658,7 @@ int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, current_time(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 84714bbccc12..56fb5f1312e7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -434,7 +434,7 @@ out: return res; } -static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); @@ -476,7 +476,7 @@ out: return res; } -static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); @@ -517,19 +517,19 @@ out: return res; } -static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0); + return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(&init_user_ns, 
dir, dentry, mode | S_IFDIR, 0); + return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); } -static int hfsplus_rename(struct user_namespace *mnt_userns, +static int hfsplus_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 6aa919e59483..7ededcb720c1 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -481,13 +481,13 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); -int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, +int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int hfsplus_fileattr_set(struct user_namespace *mnt_userns, +int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* ioctl.c */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 840577a0c1e7..abb91f5fae92 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -246,13 +246,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -static int hfsplus_setattr(struct user_namespace *mnt_userns, +static int hfsplus_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -270,13 +270,13 @@ static int hfsplus_setattr(struct user_namespace *mnt_userns, inode->i_mtime = inode->i_ctime = 
current_time(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, +int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -298,7 +298,7 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } @@ -390,7 +390,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, return NULL; inode->i_ino = sbi->next_cnid++; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -655,7 +655,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int hfsplus_fileattr_set(struct user_namespace *mnt_userns, +int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 49891b12c415..5b476f57eb17 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -857,7 +857,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index c1c7a16cbf21..90f68ec119cd 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,7 +23,7 @@ static int 
hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index e150372ec564..fdbaebc1c49a 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -22,7 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index a6b60b153916..6464b6c3d58d 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -22,7 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 277468783fee..c18bb50c31b6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -559,7 +559,7 @@ static int read_name(struct inode *ino, char *name) return 0; } -static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hostfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -658,7 +658,7 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, +static int hostfs_symlink(struct 
mnt_idmap *idmap, struct inode *ino, struct dentry *dentry, const char *to) { char *file; @@ -671,7 +671,7 @@ static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, return err; } -static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino, +static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; @@ -696,7 +696,7 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hostfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; @@ -734,7 +734,7 @@ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int hostfs_rename2(struct user_namespace *mnt_userns, +static int hostfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -763,7 +763,7 @@ static int hostfs_rename2(struct user_namespace *mnt_userns, return err; } -static int hostfs_permission(struct user_namespace *mnt_userns, +static int hostfs_permission(struct mnt_idmap *idmap, struct inode *ino, int desired) { char *name; @@ -786,11 +786,11 @@ static int hostfs_permission(struct user_namespace *mnt_userns, err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(&init_user_ns, ino, desired); + err = generic_permission(&nop_mnt_idmap, ino, desired); return err; } -static int hostfs_setattr(struct user_namespace *mnt_userns, +static int hostfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -800,7 +800,7 @@ static int hostfs_setattr(struct user_namespace *mnt_userns, int fd = HOSTFS_I(inode)->fd; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = 
setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -857,7 +857,7 @@ static int hostfs_setattr(struct user_namespace *mnt_userns, attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 167ec6884642..f5a2476c47bf 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *); void hpfs_read_inode(struct inode *); void hpfs_write_inode(struct inode *); void hpfs_write_inode_nolock(struct inode *); -int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int hpfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); void hpfs_write_if_changed(struct inode *); void hpfs_evict_inode(struct inode *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 82208cc28ebd..e50e92a42432 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i) brelse(bh); } -int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int hpfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -275,7 +275,7 @@ int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto out_unlock; @@ -289,7 +289,7 @@ int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, hpfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); hpfs_write_inode(inode); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 15fc63276caa..69fb40b2c99a 100644 --- a/fs/hpfs/namei.c +++ 
b/fs/hpfs/namei.c @@ -20,7 +20,7 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; @@ -129,7 +129,7 @@ bail: return err; } -static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { const unsigned char *name = dentry->d_name.name; @@ -217,7 +217,7 @@ bail: return err; } -static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; @@ -292,7 +292,7 @@ bail: return err; } -static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symlink) { const unsigned char *name = dentry->d_name.name; @@ -512,7 +512,7 @@ const struct address_space_operations hpfs_symlink_aops = { .read_folio = hpfs_symlink_read_folio }; -static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int hpfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 790d2727141a..0ce1cc4c2add 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -898,7 +898,7 @@ out: return error; } -static int hugetlbfs_setattr(struct user_namespace *mnt_userns, +static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -907,7 +907,7 
@@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -924,7 +924,7 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, hugetlb_vmtruncate(inode, newsize); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -980,7 +980,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -1019,7 +1019,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. 
*/ -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; @@ -1033,24 +1033,24 @@ static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry, + int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int hugetlbfs_create(struct user_namespace *mnt_userns, +static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } -static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, +static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { @@ -1064,7 +1064,7 @@ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, return finish_open_simple(file, 0); } -static int hugetlbfs_symlink(struct user_namespace *mnt_userns, +static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { diff --git a/fs/init.c b/fs/init.c index 5c36adaa9b44..9684406a8416 100644 --- a/fs/init.c +++ b/fs/init.c @@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (!error) - error = vfs_mknod(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, 
new_decode_dev(dev)); done_path_create(&path, dentry); return error; @@ -167,7 +167,7 @@ int __init init_link(const char *oldname, const char *newname) { struct dentry *new_dentry; struct path old_path, new_path; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int error; error = kern_path(oldname, 0, &old_path); @@ -182,14 +182,14 @@ int __init init_link(const char *oldname, const char *newname) error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: done_path_create(&new_path, new_dentry); @@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) - error = vfs_symlink(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); done_path_create(&path, dentry); return error; @@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); return error; diff --git a/fs/inode.c b/fs/inode.c index f453eb58fd03..4558dc2f1355 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -5,6 +5,7 @@ */ #include <linux/export.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> 
@@ -1893,7 +1894,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ - if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode)) + if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) @@ -1953,7 +1954,7 @@ EXPORT_SYMBOL(touch_atime); * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ -int dentry_needs_remove_privs(struct user_namespace *mnt_userns, +int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); @@ -1963,7 +1964,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns, if (IS_NOSEC(inode)) return 0; - mask = setattr_should_drop_suidgid(mnt_userns, inode); + mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -1972,7 +1973,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns, return mask; } -static int __remove_privs(struct user_namespace *mnt_userns, +static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1982,7 +1983,7 @@ static int __remove_privs(struct user_namespace *mnt_userns, * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(mnt_userns, dentry, &newattrs, NULL); + return notify_change(idmap, dentry, &newattrs, NULL); } static int __file_remove_privs(struct file *file, unsigned int flags) @@ -1995,7 +1996,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); + kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; @@ -2003,7 +2004,7 @@ static 
int __file_remove_privs(struct file *file, unsigned int flags) if (flags & IOCB_NOWAIT) return -EAGAIN; - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) @@ -2279,21 +2280,21 @@ EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards - * @mnt_userns: User namespace of the mount the inode was created from + * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * - * If the inode has been created through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions + * If the inode has been created through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. On non-idmapped mounts or if permission - * checking is to be performed on the raw inode simply passs init_user_ns. + * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, +void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -2301,32 +2302,32 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, if (S_ISDIR(mode)) mode |= S_ISGID; } else - inode_fsgid_set(inode, mnt_userns); + inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. 
*/ -bool inode_owner_or_capable(struct user_namespace *mnt_userns, +bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; @@ -2458,7 +2459,7 @@ EXPORT_SYMBOL(current_time); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * @@ -2468,19 +2469,19 @@ EXPORT_SYMBOL(current_time); * * Return: true if the caller is sufficiently privileged, false if not. */ -bool in_group_or_capable(struct user_namespace *mnt_userns, +bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } /** * mode_strip_sgid - handle the sgid bit for non-directories - * @mnt_userns: User namespace of the mount the inode was created from + * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * @@ -2492,15 +2493,14 @@ bool in_group_or_capable(struct user_namespace *mnt_userns, * * Return: the new mode to use for the file */ -umode_t mode_strip_sgid(struct user_namespace *mnt_userns, +umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_or_capable(mnt_userns, dir, - i_gid_into_vfsgid(mnt_userns, dir))) + if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return 
mode; return mode & ~S_ISGID; } diff --git a/fs/internal.h b/fs/internal.h index cb0c0749661a..dc4eb91a577a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,9 +14,9 @@ struct path; struct mount; struct shrink_control; struct fs_context; -struct user_namespace; struct pipe_inode_info; struct iov_iter; +struct mnt_idmap; /* * block/bdev.c @@ -63,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct user_namespace *mnt_userns, const struct path *link); +int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); @@ -151,8 +151,8 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); -bool in_group_or_capable(struct user_namespace *mnt_userns, +int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); +bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* @@ -232,7 +232,7 @@ ssize_t do_getxattr(struct mnt_idmap *idmap, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); -int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); +int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, @@ -259,5 +259,8 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po /* * fs/attr.c */ -int setattr_should_drop_sgid(struct user_namespace 
*mnt_userns, +int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); +struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); +struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); +void mnt_idmap_put(struct mnt_idmap *idmap); diff --git a/fs/ioctl.c b/fs/ioctl.c index 80ac36aea913..5b2481cd4750 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -651,7 +651,7 @@ static int fileattr_set_prepare(struct inode *inode, /** * vfs_fileattr_set - change miscellaneous file attributes - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the object to change * @fa: fileattr pointer * @@ -665,7 +665,7 @@ static int fileattr_set_prepare(struct inode *inode, * * Return: 0 on success, or a negative error on failure. */ -int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -675,7 +675,7 @@ int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, if (!inode->i_op->fileattr_set) return -ENOIOCTLCMD; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; inode_lock(inode); @@ -693,7 +693,7 @@ int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, } err = fileattr_set_prepare(inode, &old_ma, fa); if (!err) - err = inode->i_op->fileattr_set(mnt_userns, dentry, fa); + err = inode->i_op->fileattr_set(idmap, dentry, fa); } inode_unlock(inode); @@ -714,7 +714,7 @@ static int ioctl_getflags(struct file *file, unsigned int __user *argp) static int ioctl_setflags(struct file *file, unsigned int __user *argp) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; unsigned int flags; @@ -725,7 +725,7 @@ static int ioctl_setflags(struct file 
*file, unsigned int __user *argp) err = mnt_want_write_file(file); if (!err) { fileattr_fill_flags(&fa, flags); - err = vfs_fileattr_set(mnt_userns, dentry, &fa); + err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } @@ -746,7 +746,7 @@ static int ioctl_fsgetxattr(struct file *file, void __user *argp) static int ioctl_fssetxattr(struct file *file, void __user *argp) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; int err; @@ -755,7 +755,7 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) if (!err) { err = mnt_want_write_file(file); if (!err) { - err = vfs_fileattr_set(mnt_userns, dentry, &fa); + err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9804714b1751..f771001574d0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - if (!(dio->flags & IOMAP_DIO_WRITE)) { - WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + if (!(dio->flags & IOMAP_DIO_WRITE)) return REQ_OP_READ; - } - - if (iomap->flags & IOMAP_F_ZONE_APPEND) - opflags |= REQ_OP_ZONE_APPEND; - else - opflags |= REQ_OP_WRITE; + opflags |= REQ_OP_WRITE; if (use_fua) opflags |= REQ_FUA; else diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8bb58ce5c06c..888a7ceb6479 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -229,7 +229,7 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; @@ -241,7 +241,7 @@ int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry 
*dentry, if (acl) { umode_t mode; - rc = posix_acl_update_mode(&init_user_ns, inode, &mode, + rc = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (rc) return rc; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index ca36a6eca594..e976b8cb82cf 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f399b390b5f6..5075a0a6d594 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -24,20 +24,20 @@ static int jffs2_readdir (struct file *, struct dir_context *); -static int jffs2_create (struct user_namespace *, struct inode *, +static int jffs2_create (struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); -static int jffs2_symlink (struct user_namespace *, struct inode *, +static int jffs2_symlink (struct mnt_idmap *, struct inode *, struct dentry *, const char *); -static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *, +static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *, +static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); -static int jffs2_rename (struct user_namespace *, struct inode *, +static int jffs2_rename 
(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); @@ -160,7 +160,7 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) /***********************************************************************/ -static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode, bool excl) { struct jffs2_raw_inode *ri; @@ -279,7 +279,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de /***********************************************************************/ -static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, const char *target) { struct jffs2_inode_info *f, *dir_f; @@ -442,7 +442,7 @@ static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i } -static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; @@ -614,7 +614,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; @@ -762,7 +762,7 @@ static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, return ret; } -static int jffs2_rename (struct user_namespace *mnt_userns, +static int jffs2_rename (struct mnt_idmap *idmap, struct inode *old_dir_i, struct dentry *old_dentry, struct inode *new_dir_i, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 
66af51c41619..09174898efd0 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,19 +190,19 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return 0; } -int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(&init_user_ns, dentry, iattr); + rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 921d782583d6..8da19766c101 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long); extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ -int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *); +int jffs2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index aef5522551db..437f3a2c1b54 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,7 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index cc3f24883e7d..b7c5da2d89bd 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c 
@@ -25,7 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index fb945977c013..f64edce4927b 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,7 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 3b667eccc73b..fb96f872d207 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,7 +94,7 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; @@ -106,7 +106,7 @@ int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); + rc = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (rc) goto end_tx; if (mode != inode->i_mode) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 88663465aecd..2ee35be49de1 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -85,24 +85,24 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } -int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = 
setattr_prepare(&init_user_ns, dentry, iattr); + rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; - if (is_quota_modification(mnt_userns, inode, iattr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(mnt_userns, inode, iattr); + rc = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (rc) return rc; } @@ -119,11 +119,11 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, jfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 1e7b177ece60..ed7989bc2db1 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -70,7 +70,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int jfs_fileattr_set(struct user_namespace *mnt_userns, +int jfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index f0704a25835f..f892e54d0fcd 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 59379089e939..9e1f02767201 100644 --- a/fs/jfs/jfs_inode.c +++ 
b/fs/jfs/jfs_inode.c @@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_put; } - inode_init_owner(&init_user_ns, inode, parent, mode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 7de961a81862..ea80661597ac 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -10,7 +10,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); extern int jfs_fsync(struct file *, loff_t, loff_t, int); extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int jfs_fileattr_set(struct user_namespace *mnt_userns, +extern int jfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); @@ -28,7 +28,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern int jfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a38d14eed047..b29d68b5eec5 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -59,7 +59,7 @@ static inline void free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode, bool excl) { int rc = 0; @@ -192,7 +192,7 @@ static int jfs_create(struct 
user_namespace *mnt_userns, struct inode *dip, * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode) { int rc = 0; @@ -869,7 +869,7 @@ static int jfs_link(struct dentry *old_dentry, * an intermediate result whose length exceeds PATH_MAX [XPG4.2] */ -static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, const char *name) { int rc; @@ -1059,7 +1059,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, * * FUNCTION: rename a file or directory */ -static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1345,7 +1345,7 @@ static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * * FUNCTION: Create a special file (device) */ -static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index f9273f6901c8..f817798fa1eb 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -932,7 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -951,7 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct 
xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 935ef8cb02b2..e3181c3e1988 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1200,7 +1200,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, +static int kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -1238,7 +1238,7 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -static int kernfs_iop_rename(struct user_namespace *mnt_userns, +static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index eac0f210299a..30494dcb0df3 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -107,7 +107,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) return ret; } -int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -120,7 +120,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, root = kernfs_root(kn); down_write(&root->kernfs_rwsem); - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; @@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; /* this ignores size changes */ - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_rwsem); @@ -181,7 
+181,7 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(struct user_namespace *mnt_userns, +int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -191,7 +191,7 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, down_read(&root->kernfs_rwsem); kernfs_refresh_inode(kn, inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); up_read(&root->kernfs_rwsem); return 0; @@ -272,7 +272,7 @@ void kernfs_evict_inode(struct inode *inode) kernfs_put(kn); } -int kernfs_iop_permission(struct user_namespace *mnt_userns, +int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; @@ -287,7 +287,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, down_read(&root->kernfs_rwsem); kernfs_refresh_inode(kn, inode); - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_rwsem); return ret; @@ -324,7 +324,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -391,7 +391,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 9046d9f39e63..236c3a6113f1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h 
@@ -127,11 +127,11 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; */ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); -int kernfs_iop_permission(struct user_namespace *mnt_userns, +int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); -int kernfs_iop_getattr(struct user_namespace *mnt_userns, +int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index fd0a288af299..56be077e5d8a 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -280,7 +280,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size; + unsigned int pdu_size, max_allowed_pdu_size; char hdr_buf[4] = {0,}; int size; @@ -305,13 +305,26 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); + if (conn->status == KSMBD_SESS_GOOD) + max_allowed_pdu_size = + SMB3_MAX_MSGSIZE + conn->vals->max_write_size; + else + max_allowed_pdu_size = SMB3_MAX_MSGSIZE; + + if (pdu_size > max_allowed_pdu_size) { + pr_err_ratelimited("PDU length(%u) excceed maximum allowed pdu size(%u) on connection(%d)\n", + pdu_size, max_allowed_pdu_size, + conn->status); + break; + } + /* * Check if pdu size is valid (min : smb header size, * max : 0x00FFFFFF). 
*/ if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE || pdu_size > MAX_STREAM_PROT_LEN) { - continue; + break; } /* 4 for rfc1002 length field */ diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index b6bd8311e6b4..fb8b2d566efb 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -106,7 +106,8 @@ struct ksmbd_startup_request { __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ __u32 smbd_max_io_size; /* smbd read write size */ - __u32 reserved[127]; /* Reserved room */ + __u32 max_connections; /* Number of maximum simultaneous connections */ + __u32 reserved[126]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 0ae8d08d85a8..3507d8f89074 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -242,7 +242,7 @@ int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) return ret; if (da->version != 3 && da->version != 4) { - pr_err("v%d version is not supported\n", da->version); + ksmbd_debug(VFS, "v%d version is not supported\n", da->version); return -EINVAL; } @@ -251,7 +251,7 @@ int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) return ret; if (da->version != version2) { - pr_err("ndr version mismatched(version: %d, version2: %d)\n", + ksmbd_debug(VFS, "ndr version mismatched(version: %d, version2: %d)\n", da->version, version2); return -EINVAL; } @@ -338,7 +338,7 @@ static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl) } int ndr_encode_posix_acl(struct ndr *n, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct inode *inode, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl) @@ -374,11 +374,11 @@ int ndr_encode_posix_acl(struct ndr *n, if (ret) return ret; - vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); if 
(ret) return ret; - vfsgid = i_gid_into_vfsgid(user_ns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); if (ret) return ret; @@ -457,7 +457,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (ret) return ret; if (acl->version != 4) { - pr_err("v%d version is not supported\n", acl->version); + ksmbd_debug(VFS, "v%d version is not supported\n", acl->version); return -EINVAL; } @@ -465,7 +465,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (ret) return ret; if (acl->version != version2) { - pr_err("ndr version mismatched(version: %d, version2: %d)\n", + ksmbd_debug(VFS, "ndr version mismatched(version: %d, version2: %d)\n", acl->version, version2); return -EINVAL; } diff --git a/fs/ksmbd/ndr.h b/fs/ksmbd/ndr.h index 60ca265d1bb0..f3c108c8cf4d 100644 --- a/fs/ksmbd/ndr.h +++ b/fs/ksmbd/ndr.h @@ -14,7 +14,7 @@ struct ndr { int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); -int ndr_encode_posix_acl(struct ndr *n, struct user_namespace *user_ns, +int ndr_encode_posix_acl(struct ndr *n, struct mnt_idmap *idmap, struct inode *inode, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl); int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index d7d47b82451d..2e54ded4d92c 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1608,9 +1608,9 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) { struct create_posix_rsp *buf; struct inode *inode = file_inode(fp->filp); - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, 
inode); buf = (struct create_posix_rsp *)cc; memset(buf, 0, sizeof(struct create_posix_rsp)); diff --git a/fs/ksmbd/server.h b/fs/ksmbd/server.h index ac9d932f8c8a..db7278181760 100644 --- a/fs/ksmbd/server.h +++ b/fs/ksmbd/server.h @@ -41,6 +41,7 @@ struct ksmbd_server_config { unsigned int share_fake_fscaps; struct smb_sid domain_sid; unsigned int auth_mechs; + unsigned int max_connections; char *conf[SERVER_CONF_WORK_GROUP + 1]; }; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 38fbda52e06f..4ef6e1e59a40 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -12,6 +12,7 @@ #include <linux/ethtool.h> #include <linux/falloc.h> #include <linux/mount.h> +#include <linux/filelock.h> #include "glob.h" #include "smbfsctl.h" @@ -2192,7 +2193,7 @@ out: static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, const struct path *path) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *attr_name = NULL, *value; int rc = 0; unsigned int next = 0; @@ -2228,7 +2229,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, value = (char *)&eabuf->name + eabuf->EaNameLength + 1; if (!eabuf->EaValueLength) { - rc = ksmbd_vfs_casexattr_len(user_ns, + rc = ksmbd_vfs_casexattr_len(idmap, path->dentry, attr_name, XATTR_USER_PREFIX_LEN + @@ -2236,7 +2237,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only when it exits */ if (rc > 0) { - rc = ksmbd_vfs_remove_xattr(user_ns, + rc = ksmbd_vfs_remove_xattr(idmap, path->dentry, attr_name); @@ -2251,7 +2252,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. 
*/ rc = 0; } else { - rc = ksmbd_vfs_setxattr(user_ns, + rc = ksmbd_vfs_setxattr(idmap, path->dentry, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { @@ -2281,7 +2282,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, struct ksmbd_file *fp, char *stream_name, int s_type) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); size_t xattr_stream_size; char *xattr_stream_name; int rc; @@ -2297,7 +2298,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, fp->stream.size = xattr_stream_size; /* Check if there is stream prefix in xattr space */ - rc = ksmbd_vfs_casexattr_len(user_ns, + rc = ksmbd_vfs_casexattr_len(idmap, path->dentry, xattr_stream_name, xattr_stream_size); @@ -2309,7 +2310,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(user_ns, path->dentry, + rc = ksmbd_vfs_setxattr(idmap, path->dentry, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); @@ -2318,7 +2319,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, static int smb2_remove_smb_xattrs(const struct path *path) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; @@ -2338,7 +2339,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, + err = ksmbd_vfs_remove_xattr(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2385,7 +2386,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | 
XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_user_ns(path->mnt), + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path->dentry, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); @@ -2404,7 +2405,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) return; - rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_user_ns(path->mnt), + rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_idmap(path->mnt), path->dentry, &da); if (rc > 0) { fp->f_ci->m_fattr = cpu_to_le32(da.attr); @@ -2479,11 +2480,11 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, } static void ksmbd_acls_fattr(struct smb_fattr *fattr, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); fattr->cf_uid = vfsuid_into_kuid(vfsuid); fattr->cf_gid = vfsgid_into_kgid(vfsgid); @@ -2515,7 +2516,7 @@ int smb2_open(struct ksmbd_work *work) struct ksmbd_share_config *share = tcon->share_conf; struct ksmbd_file *fp = NULL; struct file *filp = NULL; - struct user_namespace *user_ns = NULL; + struct mnt_idmap *idmap = NULL; struct kstat stat; struct create_context *context; struct lease_ctx_info *lc = NULL; @@ -2768,7 +2769,7 @@ int smb2_open(struct ksmbd_work *work) rc = 0; } else { file_present = true; - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); } if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { @@ -2831,7 +2832,7 @@ int smb2_open(struct ksmbd_work *work) if (!file_present) { daccess = cpu_to_le32(GENERIC_ALL_FLAGS); } else { - rc = ksmbd_vfs_query_maximal_access(user_ns, + rc = ksmbd_vfs_query_maximal_access(idmap, path.dentry, &daccess); if (rc) @@ -2867,7 +2868,7 @@ int smb2_open(struct ksmbd_work *work) } created = true; - 
user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); if (ea_buf) { if (le32_to_cpu(ea_buf->ccontext.DataLength) < sizeof(struct smb2_ea_info)) { @@ -2889,7 +2890,7 @@ int smb2_open(struct ksmbd_work *work) * is already granted. */ if (daccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_READ_CONTROL_LE)) { - rc = inode_permission(user_ns, + rc = inode_permission(idmap, d_inode(path.dentry), may_flags); if (rc) @@ -2897,7 +2898,7 @@ int smb2_open(struct ksmbd_work *work) if ((daccess & FILE_DELETE_LE) || (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { - rc = ksmbd_vfs_may_delete(user_ns, + rc = ksmbd_vfs_may_delete(idmap, path.dentry); if (rc) goto err_out; @@ -2960,7 +2961,7 @@ int smb2_open(struct ksmbd_work *work) int posix_acl_rc; struct inode *inode = d_inode(path.dentry); - posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, + posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) @@ -2976,7 +2977,7 @@ int smb2_open(struct ksmbd_work *work) rc = smb2_create_sd_buffer(work, req, &path); if (rc) { if (posix_acl_rc) - ksmbd_vfs_set_init_posix_acl(user_ns, + ksmbd_vfs_set_init_posix_acl(idmap, path.dentry); if (test_share_config_flag(work->tcon->share_conf, @@ -2985,7 +2986,7 @@ int smb2_open(struct ksmbd_work *work) struct smb_ntsd *pntsd; int pntsd_size, ace_num = 0; - ksmbd_acls_fattr(&fattr, user_ns, inode); + ksmbd_acls_fattr(&fattr, idmap, inode); if (fattr.cf_acls) ace_num = fattr.cf_acls->a_count; if (fattr.cf_dacls) @@ -2999,7 +3000,7 @@ int smb2_open(struct ksmbd_work *work) if (!pntsd) goto err_out; - rc = build_sec_desc(user_ns, + rc = build_sec_desc(idmap, pntsd, NULL, 0, OWNER_SECINFO | GROUP_SECINFO | @@ -3013,7 +3014,7 @@ int smb2_open(struct ksmbd_work *work) } rc = ksmbd_vfs_set_sd_xattr(conn, - user_ns, + idmap, path.dentry, pntsd, pntsd_size); @@ -3209,7 +3210,7 @@ int smb2_open(struct ksmbd_work *work) struct create_context *mxac_ccontext; if (maximal_access == 0) - 
ksmbd_vfs_query_maximal_access(user_ns, + ksmbd_vfs_query_maximal_access(idmap, path.dentry, &maximal_access); mxac_ccontext = (struct create_context *)(rsp->Buffer + @@ -3634,7 +3635,7 @@ static void unlock_dir(struct ksmbd_file *dir_fp) static int process_query_dir_entries(struct smb2_query_dir_private *priv) { - struct user_namespace *user_ns = file_mnt_user_ns(priv->dir_fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(priv->dir_fp->filp); struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; int rc; @@ -3647,7 +3648,7 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one(user_ns, priv->d_info->name, + dent = lookup_one(idmap, priv->d_info->name, priv->dir_fp->filp->f_path.dentry, priv->d_info->name_len); unlock_dir(priv->dir_fp); @@ -3668,7 +3669,7 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) ksmbd_kstat.kstat = &kstat; if (priv->info_level != FILE_NAMES_INFORMATION) ksmbd_vfs_fill_dentry_attrs(priv->work, - user_ns, + idmap, dent, &ksmbd_kstat); @@ -3898,7 +3899,7 @@ int smb2_query_dir(struct ksmbd_work *work) } if (!(dir_fp->daccess & FILE_LIST_DIRECTORY_LE) || - inode_permission(file_mnt_user_ns(dir_fp->filp), + inode_permission(file_mnt_idmap(dir_fp->filp), file_inode(dir_fp->filp), MAY_READ | MAY_EXEC)) { pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp); @@ -4164,7 +4165,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; struct smb2_ea_info_req *ea_req = NULL; const struct path *path; - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); if (!(fp->daccess & FILE_READ_EA_LE)) { pr_err("Not permitted to read ext attr : 0x%x\n", @@ -4244,7 +4245,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, buf_free_len -= (offsetof(struct smb2_ea_info, name) + 
name_len + 1); /* bailout if xattr can't fit in buf_free_len */ - value_len = ksmbd_vfs_getxattr(user_ns, path->dentry, + value_len = ksmbd_vfs_getxattr(idmap, path->dentry, name, &buf); if (value_len <= 0) { rc = -ENOENT; @@ -4334,7 +4335,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, } basic_info = (struct smb2_file_basic_info *)rsp->Buffer; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4375,7 +4376,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct kstat stat; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); @@ -4429,7 +4430,7 @@ static int get_file_all_info(struct ksmbd_work *work, return PTR_ERR(filename); inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); ksmbd_debug(SMB, "filename = %s\n", filename); delete_pending = ksmbd_inode_pending_delete(fp); @@ -4506,7 +4507,7 @@ static void get_file_stream_info(struct ksmbd_work *work, int buf_free_len; struct smb2_query_info_req *req = ksmbd_req_buf_next(work); - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; @@ -4597,7 +4598,7 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp, struct smb2_file_internal_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct 
smb2_file_internal_info *)rsp->Buffer; file_info->IndexNumber = cpu_to_le64(stat.ino); @@ -4623,7 +4624,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); file_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4684,7 +4685,7 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, struct smb2_file_comp_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_comp_info *)rsp->Buffer; @@ -4725,9 +4726,9 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, { struct smb311_posix_qinfo *file_info; struct inode *inode = file_inode(fp->filp); - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); u64 time; int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32; @@ -5127,7 +5128,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, struct smb2_query_info_rsp *rsp) { struct ksmbd_file *fp; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL; struct smb_fattr fattr = {{0}}; struct inode *inode; @@ -5174,19 +5175,19 @@ static int smb2_get_info_sec(struct ksmbd_work *work, if (!fp) return -ENOENT; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); inode = file_inode(fp->filp); - ksmbd_acls_fattr(&fattr, user_ns, inode); + 
ksmbd_acls_fattr(&fattr, idmap, inode); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) - ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, user_ns, + ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, idmap, fp->filp->f_path.dentry, &ppntsd); /* Check if sd buffer size exceeds response buffer size */ if (smb2_resp_buf_len(work, 8) > ppntsd_size) - rc = build_sec_desc(user_ns, pntsd, ppntsd, ppntsd_size, + rc = build_sec_desc(idmap, pntsd, ppntsd, ppntsd_size, addition_info, &secdesclen, &fattr); posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); @@ -5416,7 +5417,7 @@ int smb2_echo(struct ksmbd_work *work) static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct smb2_file_rename_info *file_info, struct nls_table *local_nls) { @@ -5479,7 +5480,7 @@ static int smb2_rename(struct ksmbd_work *work, if (rc) goto out; - rc = ksmbd_vfs_setxattr(user_ns, + rc = ksmbd_vfs_setxattr(idmap, fp->filp->f_path.dentry, xattr_stream_name, NULL, 0, 0); @@ -5618,7 +5619,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, struct iattr attrs; struct file *filp; struct inode *inode; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; int rc = 0; if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) @@ -5627,7 +5628,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid = 0; filp = fp->filp; inode = file_inode(filp); - user_ns = file_mnt_user_ns(filp); + idmap = file_mnt_idmap(filp); if (file_info->CreationTime) fp->create_time = le64_to_cpu(file_info->CreationTime); @@ -5671,7 +5672,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, filp->f_path.dentry, &da); if (rc) ksmbd_debug(SMB, @@ -5689,7 +5690,7 @@ static int set_file_basic_info(struct ksmbd_file 
*fp, inode_lock(inode); inode->i_ctime = attrs.ia_ctime; attrs.ia_valid &= ~ATTR_CTIME; - rc = notify_change(user_ns, dentry, &attrs, NULL); + rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); } return rc; @@ -5782,7 +5783,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, struct smb2_file_rename_info *rename_info, unsigned int buf_len) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct ksmbd_file *parent_fp; struct dentry *parent; struct dentry *dentry = fp->filp->f_path.dentry; @@ -5797,12 +5798,12 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(rename_info->FileNameLength)) return -EINVAL; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); if (ksmbd_stream_fd(fp)) goto next; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; @@ -5821,7 +5822,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, ksmbd_fd_put(work, parent_fp); } next: - return smb2_rename(work, fp, user_ns, rename_info, + return smb2_rename(work, fp, idmap, rename_info, work->conn->local_nls); } @@ -7530,14 +7531,14 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, struct file_sparse *sparse) { struct ksmbd_file *fp; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; int ret = 0; __le32 old_fattr; fp = ksmbd_lookup_fd_fast(work, id); if (!fp) return -ENOENT; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); old_fattr = fp->f_ci->m_fattr; if (sparse->SetSparse) @@ -7550,13 +7551,13 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { struct xattr_dos_attrib da; - ret = ksmbd_vfs_get_dos_attrib_xattr(user_ns, + ret = ksmbd_vfs_get_dos_attrib_xattr(idmap, fp->filp->f_path.dentry, &da); if (ret <= 0) goto out; 
da.attr = le32_to_cpu(fp->f_ci->m_fattr); - ret = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, fp->filp->f_path.dentry, &da); if (ret) fp->f_ci->m_fattr = old_fattr; @@ -8663,6 +8664,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; struct smb2_hdr *rsp = smb2_get_msg(work->response_buf); if (conn->dialect < SMB30_PROT_ID) @@ -8672,6 +8674,7 @@ bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) rsp = ksmbd_resp_buf_next(work); if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && + sess->user && !user_guest(sess->user) && rsp->Status == STATUS_SUCCESS) return true; return false; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index aa5dbe54f5a1..0c8a770fe318 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -24,8 +24,9 @@ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) -#define SMB3_MIN_IOSIZE (64 * 1024) -#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) +#define SMB3_MIN_IOSIZE (64 * 1024) +#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) +#define SMB3_MAX_MSGSIZE (4 * 4096) /* * Definitions for SMB2 Protocol Data Units (network frames) diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 2a4fbbd55b91..fa2b54df6ee6 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -307,7 +307,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, { int i, rc = 0; struct ksmbd_conn *conn = work->conn; - struct user_namespace *user_ns = file_mnt_user_ns(dir->filp); + struct mnt_idmap *idmap = file_mnt_idmap(dir->filp); for (i = 0; i < 2; i++) { struct kstat kstat; @@ -333,7 +333,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, - user_ns, + idmap, dentry, &ksmbd_kstat); rc = fn(conn, info_level, 
d_info, &ksmbd_kstat); diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index ab5c68cc0e13..6d6cfb6957a9 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -254,7 +254,7 @@ void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) ssid->num_subauth++; } -static int sid_to_id(struct user_namespace *user_ns, +static int sid_to_id(struct mnt_idmap *idmap, struct smb_sid *psid, uint sidtype, struct smb_fattr *fattr) { @@ -276,7 +276,7 @@ static int sid_to_id(struct user_namespace *user_ns, id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); uid = KUIDT_INIT(id); - uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid)); + uid = from_vfsuid(idmap, &init_user_ns, VFSUIDT_INIT(uid)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -287,7 +287,7 @@ static int sid_to_id(struct user_namespace *user_ns, id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); gid = KGIDT_INIT(id); - gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid)); + gid = from_vfsgid(idmap, &init_user_ns, VFSGIDT_INIT(gid)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; @@ -362,7 +362,7 @@ void free_acl_state(struct posix_acl_state *state) kfree(state->groups); } -static void parse_dacl(struct user_namespace *user_ns, +static void parse_dacl(struct mnt_idmap *idmap, struct smb_acl *pdacl, char *end_of_acl, struct smb_sid *pownersid, struct smb_sid *pgrpsid, struct smb_fattr *fattr) @@ -489,7 +489,7 @@ static void parse_dacl(struct user_namespace *user_ns, acl_mode = access_flags_to_mode(fattr, ppace[i]->access_req, ppace[i]->type); temp_fattr.cf_uid = INVALID_UID; - ret = sid_to_id(user_ns, &ppace[i]->sid, SIDOWNER, &temp_fattr); + ret = sid_to_id(idmap, &ppace[i]->sid, SIDOWNER, &temp_fattr); if (ret || uid_eq(temp_fattr.cf_uid, INVALID_UID)) { pr_err("%s: Error %d mapping Owner SID to uid\n", __func__, ret); @@ -575,7 +575,7 @@ static void parse_dacl(struct user_namespace *user_ns, free_acl_state(&default_acl_state); } -static void 
set_posix_acl_entries_dacl(struct user_namespace *user_ns, +static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, struct smb_ace *pndace, struct smb_fattr *fattr, u32 *num_aces, u16 *size, u32 nt_aces_num) @@ -600,14 +600,14 @@ static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, uid_t uid; unsigned int sid_type = SIDOWNER; - uid = posix_acl_uid_translate(user_ns, pace); + uid = posix_acl_uid_translate(idmap, pace); if (!uid) sid_type = SIDUNIX_USER; id_to_sid(uid, sid_type, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = posix_acl_gid_translate(user_ns, pace); + gid = posix_acl_gid_translate(idmap, pace); id_to_sid(gid, SIDUNIX_GROUP, sid); } else if (pace->e_tag == ACL_OTHER && !nt_aces_num) { smb_copy_sid(sid, &sid_everyone); @@ -666,12 +666,12 @@ posix_default_acl: if (pace->e_tag == ACL_USER) { uid_t uid; - uid = posix_acl_uid_translate(user_ns, pace); + uid = posix_acl_uid_translate(idmap, pace); id_to_sid(uid, SIDCREATOR_OWNER, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = posix_acl_gid_translate(user_ns, pace); + gid = posix_acl_gid_translate(idmap, pace); id_to_sid(gid, SIDCREATOR_GROUP, sid); } else { kfree(sid); @@ -689,7 +689,7 @@ posix_default_acl: } } -static void set_ntacl_dacl(struct user_namespace *user_ns, +static void set_ntacl_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_acl *nt_dacl, unsigned int aces_size, @@ -723,13 +723,13 @@ static void set_ntacl_dacl(struct user_namespace *user_ns, } } - set_posix_acl_entries_dacl(user_ns, pndace, fattr, + set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, nt_num_aces); pndacl->num_aces = cpu_to_le32(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } -static void set_mode_dacl(struct user_namespace *user_ns, +static void set_mode_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_fattr *fattr) { struct smb_ace *pace, *pndace; @@ -741,7 +741,7 @@ static void 
set_mode_dacl(struct user_namespace *user_ns, pace = pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl)); if (fattr->cf_acls) { - set_posix_acl_entries_dacl(user_ns, pndace, fattr, + set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, num_aces); goto out; } @@ -808,7 +808,7 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) } /* Convert CIFS ACL to POSIX form */ -int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int acl_len, struct smb_fattr *fattr) { int rc = 0; @@ -851,7 +851,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, return rc; } - rc = sid_to_id(user_ns, owner_sid_ptr, SIDOWNER, fattr); + rc = sid_to_id(idmap, owner_sid_ptr, SIDOWNER, fattr); if (rc) { pr_err("%s: Error %d mapping Owner SID to uid\n", __func__, rc); @@ -866,7 +866,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, __func__, rc); return rc; } - rc = sid_to_id(user_ns, group_sid_ptr, SIDUNIX_GROUP, fattr); + rc = sid_to_id(idmap, group_sid_ptr, SIDUNIX_GROUP, fattr); if (rc) { pr_err("%s: Error %d mapping Group SID to gid\n", __func__, rc); @@ -881,7 +881,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, pntsd->type |= cpu_to_le16(DACL_PROTECTED); if (dacloffset) { - parse_dacl(user_ns, dacl_ptr, end_of_acl, + parse_dacl(idmap, dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); } @@ -889,7 +889,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, } /* Convert permission bits from mode to equivalent CIFS ACL */ -int build_sec_desc(struct user_namespace *user_ns, +int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr) @@ -950,7 +950,7 @@ int build_sec_desc(struct user_namespace *user_ns, dacl_ptr->num_aces = 0; if 
(!ppntsd) { - set_mode_dacl(user_ns, dacl_ptr, fattr); + set_mode_dacl(idmap, dacl_ptr, fattr); } else { struct smb_acl *ppdacl_ptr; unsigned int dacl_offset = le32_to_cpu(ppntsd->dacloffset); @@ -966,7 +966,7 @@ int build_sec_desc(struct user_namespace *user_ns, ppdacl_size < sizeof(struct smb_acl)) goto out; - set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr, + set_ntacl_dacl(idmap, dacl_ptr, ppdacl_ptr, ntacl_size - sizeof(struct smb_acl), nowner_sid_ptr, ngroup_sid_ptr, fattr); @@ -1002,13 +1002,13 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct smb_ntsd *parent_pntsd = NULL; struct smb_sid owner_sid, group_sid; struct dentry *parent = path->dentry->d_parent; - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size; int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, parent, &parent_pntsd); if (pntsd_size <= 0) return -ENOENT; @@ -1162,7 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, user_ns, + ksmbd_vfs_set_sd_xattr(conn, idmap, path->dentry, pntsd, pntsd_size); kfree(pntsd); } @@ -1190,7 +1190,7 @@ bool smb_inherit_flags(int flags, bool is_dir) int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); struct smb_ntsd *pntsd = NULL; struct smb_acl *pdacl; struct posix_acl *posix_acls; @@ -1206,7 +1206,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, unsigned short ace_size; ksmbd_debug(SMB, "check permission using windows acl\n"); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + pntsd_size = 
ksmbd_vfs_get_sd_xattr(conn, idmap, path->dentry, &pntsd); if (pntsd_size <= 0 || !pntsd) goto err_out; @@ -1296,9 +1296,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, pa_entry = posix_acls->a_entries; for (i = 0; i < posix_acls->a_count; i++, pa_entry++) { if (pa_entry->e_tag == ACL_USER) - id = posix_acl_uid_translate(user_ns, pa_entry); + id = posix_acl_uid_translate(idmap, pa_entry); else if (pa_entry->e_tag == ACL_GROUP) - id = posix_acl_gid_translate(user_ns, pa_entry); + id = posix_acl_gid_translate(idmap, pa_entry); else continue; @@ -1360,14 +1360,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, int rc; struct smb_fattr fattr = {{0}}; struct inode *inode = d_inode(path->dentry); - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); struct iattr newattrs; fattr.cf_uid = INVALID_UID; fattr.cf_gid = INVALID_GID; fattr.cf_mode = inode->i_mode; - rc = parse_sec_desc(user_ns, pntsd, ntsd_len, &fattr); + rc = parse_sec_desc(idmap, pntsd, ntsd_len, &fattr); if (rc) goto out; @@ -1383,17 +1383,17 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, path->dentry, + rc = set_posix_acl(idmap, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, path->dentry, + rc = set_posix_acl(idmap, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, @@ -1403,7 +1403,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, } 
inode_lock(inode); - rc = notify_change(user_ns, path->dentry, &newattrs, NULL); + rc = notify_change(idmap, path->dentry, &newattrs, NULL); inode_unlock(inode); if (rc) goto out; @@ -1414,8 +1414,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(user_ns, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, user_ns, + ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); + ksmbd_vfs_set_sd_xattr(conn, idmap, path->dentry, pntsd, ntsd_len); } diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index 618f2e0236b3..49a8c292bd2e 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -190,9 +190,9 @@ struct posix_acl_state { struct posix_ace_state_array *groups; }; -int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int acl_len, struct smb_fattr *fattr); -int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr); int init_acl_state(struct posix_acl_state *state, int cnt); @@ -211,25 +211,25 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); -static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, +static inline uid_t posix_acl_uid_translate(struct mnt_idmap *idmap, struct posix_acl_entry *pace) { vfsuid_t vfsuid; /* If this is an idmapped mount, apply the idmapping. */ - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid); + vfsuid = make_vfsuid(idmap, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. 
*/ return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)); } -static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, +static inline gid_t posix_acl_gid_translate(struct mnt_idmap *idmap, struct posix_acl_entry *pace) { vfsgid_t vfsgid; /* If this is an idmapped mount, apply the idmapping. */ - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid); + vfsgid = make_vfsgid(idmap, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. */ return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)); diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index c9aca21637d5..40c721f9227e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -308,6 +308,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); + if (req->max_connections) + server_conf.max_connections = req->max_connections; + ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 4c6bd0b69979..603893fd87f5 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -15,6 +15,8 @@ #define IFACE_STATE_DOWN BIT(0) #define IFACE_STATE_CONFIGURED BIT(1) +static atomic_t active_num_conn; + struct interface { struct task_struct *ksmbd_kthread; struct socket *ksmbd_socket; @@ -185,8 +187,10 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct tcp_transport *t; t = alloc_transport(client_sk); - if (!t) + if (!t) { + sock_release(client_sk); return -ENOMEM; + } csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn); if (kernel_getpeername(client_sk, csin) < 0) { @@ -239,6 +243,15 @@ static int ksmbd_kthread_fn(void *p) continue; } + if (server_conf.max_connections && + atomic_inc_return(&active_num_conn) >= server_conf.max_connections) { + 
pr_info_ratelimited("Limit the maximum number of connections(%u)\n", + atomic_read(&active_num_conn)); + atomic_dec(&active_num_conn); + sock_release(client_sk); + continue; + } + ksmbd_debug(CONN, "connect success: accepted new connection\n"); client_sk->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT; client_sk->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT; @@ -368,6 +381,8 @@ static int ksmbd_tcp_writev(struct ksmbd_transport *t, struct kvec *iov, static void ksmbd_tcp_disconnect(struct ksmbd_transport *t) { free_transport(TCP_TRANS(t)); + if (server_conf.max_connections) + atomic_dec(&active_num_conn); } static void tcp_destroy_socket(struct socket *ksmbd_socket) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index ff0e7a4fcd4d..aa1300b7bfc2 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/backing-dev.h> #include <linux/writeback.h> @@ -69,14 +70,14 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, * * the reference count of @parent isn't incremented. 
*/ -int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, +int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, struct dentry *child) { struct dentry *dentry; int ret = 0; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - dentry = lookup_one(user_ns, child->d_name.name, parent, + dentry = lookup_one(idmap, child->d_name.name, parent, child->d_name.len); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); @@ -96,20 +97,20 @@ out_err: return ret; } -int ksmbd_vfs_may_delete(struct user_namespace *user_ns, +int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry) { struct dentry *parent; int ret; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; } - ret = inode_permission(user_ns, d_inode(parent), + ret = inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE); inode_unlock(d_inode(parent)); @@ -117,7 +118,7 @@ int ksmbd_vfs_may_delete(struct user_namespace *user_ns, return ret; } -int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, +int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess) { struct dentry *parent; @@ -125,26 +126,26 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_WRITE)) *daccess |= cpu_to_le32(WRITE_DAC | WRITE_OWNER | SYNCHRONIZE | FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES | FILE_DELETE_CHILD); - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_READ)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_READ)) *daccess |= FILE_READ_DATA_LE | FILE_READ_EA_LE; - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_EXEC)) + 
if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC)) *daccess |= FILE_EXECUTE_LE; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; } - if (!inode_permission(user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE)) *daccess |= FILE_DELETE_LE; inode_unlock(d_inode(parent)); @@ -177,7 +178,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_user_ns(path.mnt), d_inode(path.dentry), + err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), @@ -199,7 +200,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) */ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path path; struct dentry *dentry; int err; @@ -215,15 +216,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode); + err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); if (err) { goto out; } else if (d_unhashed(dentry)) { struct dentry *d; - d = lookup_one(user_ns, dentry->d_name.name, dentry->d_parent, + d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); @@ -245,7 +246,7 @@ out: return err; } -static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns, +static ssize_t ksmbd_vfs_getcasexattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len, char **attr_value) { @@ -262,7 +263,7 @@ static ssize_t ksmbd_vfs_getcasexattr(struct 
user_namespace *user_ns, if (strncasecmp(attr_name, name, attr_name_len)) continue; - value_len = ksmbd_vfs_getxattr(user_ns, + value_len = ksmbd_vfs_getxattr(idmap, dentry, name, attr_value); @@ -285,7 +286,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "read stream data pos : %llu, count : %zd\n", *pos, count); - v_len = ksmbd_vfs_getcasexattr(file_mnt_user_ns(fp->filp), + v_len = ksmbd_vfs_getcasexattr(file_mnt_idmap(fp->filp), fp->filp->f_path.dentry, fp->stream.name, fp->stream.size, @@ -409,7 +410,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, size_t count) { char *stream_buf = NULL, *wbuf; - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); size_t size, v_len; int err = 0; @@ -422,7 +423,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, count = (*pos + count) - XATTR_SIZE_MAX; } - v_len = ksmbd_vfs_getcasexattr(user_ns, + v_len = ksmbd_vfs_getcasexattr(idmap, fp->filp->f_path.dentry, fp->stream.name, fp->stream.size, @@ -448,7 +449,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); - err = ksmbd_vfs_setxattr(user_ns, + err = ksmbd_vfs_setxattr(idmap, fp->filp->f_path.dentry, fp->stream.name, (void *)stream_buf, @@ -583,7 +584,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) */ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path path; struct dentry *parent; int err; @@ -598,9 +599,9 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) return err; } - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); parent = dget_parent(path.dentry); - err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry); + err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry); if (err) { 
dput(parent); path_put(&path); @@ -614,12 +615,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) } if (S_ISDIR(d_inode(path.dentry)->i_mode)) { - err = vfs_rmdir(user_ns, d_inode(parent), path.dentry); + err = vfs_rmdir(idmap, d_inode(parent), path.dentry); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, err); } else { - err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL); + err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL); if (err) ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, err); @@ -672,7 +673,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } - err = vfs_link(oldpath.dentry, mnt_user_ns(newpath.mnt), + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) @@ -711,10 +712,10 @@ static int ksmbd_validate_entry_in_use(struct dentry *src_dent) } static int __ksmbd_vfs_rename(struct ksmbd_work *work, - struct user_namespace *src_user_ns, + struct mnt_idmap *src_idmap, struct dentry *src_dent_parent, struct dentry *src_dent, - struct user_namespace *dst_user_ns, + struct mnt_idmap *dst_idmap, struct dentry *dst_dent_parent, struct dentry *trap_dent, char *dst_name) @@ -740,8 +741,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, if (ksmbd_override_fsids(work)) return -ENOMEM; - dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent, - strlen(dst_name)); + dst_dent = lookup_one(dst_idmap, dst_name, + dst_dent_parent, strlen(dst_name)); err = PTR_ERR(dst_dent); if (IS_ERR(dst_dent)) { pr_err("lookup failed %s [%d]\n", dst_name, err); @@ -751,10 +752,10 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, err = -ENOTEMPTY; if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) { struct renamedata rd = { - .old_mnt_userns = src_user_ns, + .old_mnt_idmap = src_idmap, .old_dir = d_inode(src_dent_parent), .old_dentry = src_dent, - .new_mnt_userns = dst_user_ns, + .new_mnt_idmap = 
dst_idmap, .new_dir = d_inode(dst_dent_parent), .new_dentry = dst_dent, }; @@ -772,7 +773,7 @@ out: int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path dst_path; struct dentry *src_dent_parent, *dst_dent_parent; struct dentry *src_dent, *trap_dent, *src_child; @@ -800,8 +801,8 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, trap_dent = lock_rename(src_dent_parent, dst_dent_parent); dget(src_dent); dget(dst_dent_parent); - user_ns = file_mnt_user_ns(fp->filp); - src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent, + idmap = file_mnt_idmap(fp->filp); + src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent, src_dent->d_name.len); if (IS_ERR(src_child)) { err = PTR_ERR(src_child); @@ -816,10 +817,10 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, dput(src_child); err = __ksmbd_vfs_rename(work, - user_ns, + idmap, src_dent_parent, src_dent, - mnt_user_ns(dst_path.mnt), + mnt_idmap(dst_path.mnt), dst_dent_parent, trap_dent, dst_name); @@ -907,22 +908,22 @@ ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list) return size; } -static ssize_t ksmbd_vfs_xattr_len(struct user_namespace *user_ns, +static ssize_t ksmbd_vfs_xattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name) { - return vfs_getxattr(user_ns, dentry, xattr_name, NULL, 0); + return vfs_getxattr(idmap, dentry, xattr_name, NULL, 0); } /** * ksmbd_vfs_getxattr() - vfs helper for smb get extended attributes value - * @user_ns: user namespace + * @idmap: idmap * @dentry: dentry of file for getting xattrs * @xattr_name: name of xattr name to query * @xattr_buf: destination buffer xattr value * * Return: read xattr value length on success, otherwise error */ -ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, 
char *xattr_name, char **xattr_buf) { @@ -930,7 +931,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, char *buf; *xattr_buf = NULL; - xattr_len = ksmbd_vfs_xattr_len(user_ns, dentry, xattr_name); + xattr_len = ksmbd_vfs_xattr_len(idmap, dentry, xattr_name); if (xattr_len < 0) return xattr_len; @@ -938,7 +939,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, if (!buf) return -ENOMEM; - xattr_len = vfs_getxattr(user_ns, dentry, xattr_name, + xattr_len = vfs_getxattr(idmap, dentry, xattr_name, (void *)buf, xattr_len); if (xattr_len > 0) *xattr_buf = buf; @@ -949,7 +950,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, /** * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value - * @user_ns: user namespace + * @idmap: idmap of the relevant mount * @dentry: dentry to set XATTR at * @name: xattr name for setxattr * @value: xattr value to set @@ -958,13 +959,13 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_setxattr(struct user_namespace *user_ns, +int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; - err = vfs_setxattr(user_ns, + err = vfs_setxattr(idmap, dentry, attr_name, attr_value, @@ -1074,26 +1075,26 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, return ret; } -int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name) { - return vfs_removexattr(user_ns, dentry, attr_name); + return vfs_removexattr(idmap, dentry, attr_name); } -int ksmbd_vfs_unlink(struct user_namespace *user_ns, +int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, struct dentry *dentry) { int err = 0; - err = ksmbd_vfs_lock_parent(user_ns, dir, dentry); + err = ksmbd_vfs_lock_parent(idmap, dir, dentry); if (err) return err; 
dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) - err = vfs_rmdir(user_ns, d_inode(dir), dentry); + err = vfs_rmdir(idmap, d_inode(dir), dentry); else - err = vfs_unlink(user_ns, d_inode(dir), dentry, NULL); + err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); dput(dentry); inode_unlock(d_inode(dir)); @@ -1298,7 +1299,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, return dent; } -int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, struct dentry *dentry) { char *name, *xattr_list = NULL; @@ -1321,7 +1322,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(user_ns, dentry, name); + err = vfs_remove_acl(idmap, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); @@ -1332,7 +1333,7 @@ out: return err; } -int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, struct dentry *dentry) { char *name, *xattr_list = NULL; @@ -1352,7 +1353,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, dentry, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1362,7 +1363,7 @@ out: return err; } -static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespace *user_ns, +static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *idmap, struct inode *inode, int acl_type) { @@ -1392,14 +1393,14 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac switch (pa_entry->e_tag) { case ACL_USER: xa_entry->type = SMB_ACL_USER; - 
xa_entry->uid = posix_acl_uid_translate(user_ns, pa_entry); + xa_entry->uid = posix_acl_uid_translate(idmap, pa_entry); break; case ACL_USER_OBJ: xa_entry->type = SMB_ACL_USER_OBJ; break; case ACL_GROUP: xa_entry->type = SMB_ACL_GROUP; - xa_entry->gid = posix_acl_gid_translate(user_ns, pa_entry); + xa_entry->gid = posix_acl_gid_translate(idmap, pa_entry); break; case ACL_GROUP_OBJ: xa_entry->type = SMB_ACL_GROUP_OBJ; @@ -1428,7 +1429,7 @@ out: } int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd *pntsd, int len) { @@ -1461,13 +1462,13 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, return rc; } - smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_DEFAULT); - rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, + rc = ndr_encode_posix_acl(&acl_ndr, idmap, inode, smb_acl, def_smb_acl); if (rc) { pr_err("failed to encode ndr to posix acl\n"); @@ -1487,7 +1488,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(user_ns, dentry, + rc = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1502,7 +1503,7 @@ out: } int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd) { @@ -1514,7 +1515,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct xattr_smb_acl *smb_acl = NULL, *def_smb_acl = NULL; __u8 cmp_hash[XATTR_SD_HASH_SIZE] = {0}; - rc = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_SD, &n.data); + rc = ksmbd_vfs_getxattr(idmap, dentry, XATTR_NAME_SD, &n.data); if (rc <= 0) return rc; @@ -1523,13 +1524,13 @@ int 
ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, if (rc) goto free_n_data; - smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_DEFAULT); - rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, smb_acl, + rc = ndr_encode_posix_acl(&acl_ndr, idmap, inode, smb_acl, def_smb_acl); if (rc) { pr_err("failed to encode ndr to posix acl\n"); @@ -1576,7 +1577,7 @@ free_n_data: return rc; } -int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da) { @@ -1587,7 +1588,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, if (err) return err; - err = ksmbd_vfs_setxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1596,14 +1597,14 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, return err; } -int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da) { struct ndr n; int err; - err = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_getxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, (char **)&n.data); if (err > 0) { n.length = err; @@ -1650,14 +1651,14 @@ void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat) } int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat) { u64 time; int rc; - generic_fillattr(user_ns, d_inode(dentry), 
ksmbd_kstat->kstat); + generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat); time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); ksmbd_kstat->create_time = time; @@ -1675,7 +1676,7 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { struct xattr_dos_attrib da; - rc = ksmbd_vfs_get_dos_attrib_xattr(user_ns, dentry, &da); + rc = ksmbd_vfs_get_dos_attrib_xattr(idmap, dentry, &da); if (rc > 0) { ksmbd_kstat->file_attributes = cpu_to_le32(da.attr); ksmbd_kstat->create_time = da.create_time; @@ -1687,7 +1688,7 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, return 0; } -ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len) { @@ -1704,7 +1705,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, if (strncasecmp(attr_name, name, attr_name_len)) continue; - value_len = ksmbd_vfs_xattr_len(user_ns, dentry, name); + value_len = ksmbd_vfs_xattr_len(idmap, dentry, name); break; } @@ -1823,7 +1824,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) locks_delete_block(flock); } -int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry) { struct posix_acl_state acl_state; @@ -1857,13 +1858,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix 
acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1873,7 +1874,7 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return rc; } -int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; @@ -1896,12 +1897,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, + rc = set_posix_acl(idmap, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 0d73d735cc39..9d676ab0cd25 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -71,10 +71,10 @@ struct ksmbd_kstat { __le32 file_attributes; }; -int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, +int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, struct dentry *child); -int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry); -int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, +int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry); +int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); @@ -102,19 +102,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, unsigned int *chunk_size_written, loff_t *total_size_written); ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list); -ssize_t ksmbd_vfs_getxattr(struct user_namespace 
*user_ns, +ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name, char **xattr_buf); -ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); -int ksmbd_vfs_setxattr(struct user_namespace *user_ns, +int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); -int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name); int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, @@ -131,37 +131,37 @@ struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, unsigned int in_count, unsigned int *out_count); -int ksmbd_vfs_unlink(struct user_namespace *user_ns, - struct dentry *dir, struct dentry *dentry); +int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, + struct dentry *dentry); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat); void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); -int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_sd_xattrs(struct 
mnt_idmap *idmap, struct dentry *dentry); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); -int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); -int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); -int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry); -int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index da9163b00350..1d8126443a7f 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -5,6 +5,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -251,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) filp = fp->filp; if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; - err = ksmbd_vfs_remove_xattr(file_mnt_user_ns(filp), + err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), filp->f_path.dentry, fp->stream.name); if (err) @@ -266,7 +267,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) dir = dentry->d_parent; ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); write_unlock(&ci->m_lock); - ksmbd_vfs_unlink(file_mnt_user_ns(filp), dir, dentry); + ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry); 
write_lock(&ci->m_lock); } write_unlock(&ci->m_lock); diff --git a/fs/libfs.c b/fs/libfs.c index aada4e7c8713..4eda519c3002 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -28,12 +28,12 @@ #include "internal.h" -int simple_getattr(struct user_namespace *mnt_userns, const struct path *path, +int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -473,7 +473,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, } EXPORT_SYMBOL_GPL(simple_rename_exchange); -int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -509,7 +509,7 @@ EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem - * @mnt_userns: user namespace of the target mount + * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * @@ -522,19 +522,19 @@ EXPORT_SYMBOL(simple_rename); * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. 
*/ -int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(mnt_userns, dentry, iattr); + error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1315,16 +1315,16 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct user_namespace *mnt_userns, +static int empty_dir_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } -static int empty_dir_setattr(struct user_namespace *mnt_userns, +static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; @@ -1582,3 +1582,39 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); + +/** + * inode_query_iversion - read i_version for later use + * @inode: inode from which i_version should be read + * + * Read the inode i_version counter. This should be used by callers that wish + * to store the returned i_version for later comparison. This will guarantee + * that a later query of the i_version will result in a different value if + * anything has changed. + * + * In this implementation, we fetch the current value, set the QUERIED flag and + * then try to swap it into place with a cmpxchg, if it wasn't already set. If + * that fails, we try again with the newly fetched value from the cmpxchg. 
+ */ +u64 inode_query_iversion(struct inode *inode) +{ + u64 cur, new; + + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is already set, then no need to swap */ + if (cur & I_VERSION_QUERIED) { + /* + * This barrier (and the implicit barrier in the + * cmpxchg below) pairs with the barrier in + * inode_maybe_inc_iversion(). + */ + smp_mb(); + break; + } + + new = cur | I_VERSION_QUERIED; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return cur >> I_VERSION_QUERIED_SHIFT; +} +EXPORT_SYMBOL(inode_query_iversion); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index a5bb3f721a9d..82b19a30e0f0 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -188,7 +188,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(locks_inode(fl_blocked->fl_file)), fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0) continue; /* Alright, we found a lock. 
Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 99fffc9cb958..16b4de868cd2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/nfs_fs.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -130,7 +131,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(locks_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5bec78c8e431..17432c445fe6 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -3,6 +3,7 @@ #define __LOCKD_NETNS_H__ #include <linux/fs.h> +#include <linux/filelock.h> #include <net/netns/generic.h> struct lockd_net { diff --git a/fs/locks.c b/fs/locks.c index 8f01bee17715..624c6ac92ede 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -52,6 +52,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/filelock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/security.h> @@ -233,7 +234,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); list_for_each_entry(fl, list, fl_list) if (fl->fl_file == filp) @@ -887,7 +888,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); void *owner; void (*func)(void); @@ -1330,7 +1331,7 
@@ retry: int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(locks_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1629,7 +1630,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1667,7 +1668,7 @@ int fcntl_getlease(struct file *filp) static int check_conflicting_open(struct file *filp, const long arg, int flags) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; if (flags & FL_LAYOUT) @@ -1703,7 +1704,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1819,7 +1820,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1861,7 +1862,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -2350,7 +2351,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); + struct inode 
*inode = file_inode(filp); struct file *f; int error; @@ -2554,7 +2555,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2591,7 +2592,7 @@ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); if (list_empty(&flctx->flc_flock)) return; @@ -2636,7 +2637,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = locks_inode_context(locks_inode(filp)); + ctx = locks_inode_context(file_inode(filp)); if (!ctx) return; @@ -2720,7 +2721,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, */ if (fl->fl_file != NULL) - inode = locks_inode(fl->fl_file); + inode = file_inode(fl->fl_file); seq_printf(f, "%lld: ", id); @@ -2861,7 +2862,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int id = 0; diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 9115948c624e..724d8191a310 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -252,7 +252,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) iput(inode); return NULL; } - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/minix/file.c b/fs/minix/file.c index 6a7bd2d9eec0..0dd05d47724a 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -22,13 +22,13 @@ const struct file_operations minix_file_operations = { .splice_read = generic_file_splice_read, 
}; -static int minix_setattr(struct user_namespace *mnt_userns, +static int minix_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -42,7 +42,7 @@ static int minix_setattr(struct user_namespace *mnt_userns, minix_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index da8bdd1712a7..e9fbb5303a22 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -654,13 +654,13 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(struct user_namespace *mnt_userns, const struct path *path, +int minix_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 202173368025..e0b76defa85c 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -51,7 +51,7 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct user_namespace *, const struct path *, +extern int minix_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned 
len); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 8afdc408ca4f..39ebe10d6a8b 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -33,7 +33,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return d_splice_alias(inode, dentry); } -static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int error; @@ -52,7 +52,7 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { int error; @@ -65,13 +65,13 @@ static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, error); } -static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return minix_mknod(mnt_userns, dir, dentry, mode, 0); + return minix_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; @@ -111,7 +111,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; @@ -184,7 +184,7 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) return err; } -static int minix_rename(struct user_namespace *mnt_userns, +static int minix_rename(struct 
mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c new file mode 100644 index 000000000000..4905665c47d0 --- /dev/null +++ b/fs/mnt_idmapping.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ + +#include <linux/cred.h> +#include <linux/fs.h> +#include <linux/mnt_idmapping.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +#include "internal.h" + +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + +/** + * check_fsmapping - check whether an mount idmapping is allowed + * @idmap: idmap of the relevent mount + * @sb: super block of the filesystem + * + * Return: true if @idmap is allowed, false if not. + */ +bool check_fsmapping(const struct mnt_idmap *idmap, + const struct super_block *sb) +{ + return idmap->owner != sb->s_user_ns; +} + +/** + * initial_idmapping - check whether this is the initial mapping + * @ns: idmapping to check + * + * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, + * [...], 1000 to 1000 [...]. + * + * Return: true if this is the initial mapping, false if not. + */ +static inline bool initial_idmapping(const struct user_namespace *ns) +{ + return ns == &init_user_ns; +} + +/** + * no_idmapping - check whether we can skip remapping a kuid/gid + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * + * This function can be used to check whether a remapping between two + * idmappings is required. 
+ * An idmapped mount is a mount that has an idmapping attached to it that + * is different from the filsystem's idmapping and the initial idmapping. + * If the initial mapping is used or the idmapping of the mount and the + * filesystem are identical no remapping is required. + * + * Return: true if remapping can be skipped, false if not. + */ +static inline bool no_idmapping(const struct user_namespace *mnt_userns, + const struct user_namespace *fs_userns) +{ + return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; +} + +/** + * make_vfsuid - map a filesystem kuid according to an idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kuid : kuid to be mapped + * + * Take a @kuid and remap it from @fs_userns into @idmap. Use this + * function when preparing a @kuid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kuid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kuid won't change when calling + * from_kuid() so we can simply retrieve the value via __kuid_val() + * directly. + * + * Return: @kuid mapped according to @idmap. + * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is + * returned. 
+ */ + +vfsuid_t make_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, + kuid_t kuid) +{ + uid_t uid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return VFSUIDT_INIT(kuid); + if (initial_idmapping(fs_userns)) + uid = __kuid_val(kuid); + else + uid = from_kuid(fs_userns, kuid); + if (uid == (uid_t)-1) + return INVALID_VFSUID; + return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); +} +EXPORT_SYMBOL_GPL(make_vfsuid); + +/** + * make_vfsgid - map a filesystem kgid according to an idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kgid : kgid to be mapped + * + * Take a @kgid and remap it from @fs_userns into @idmap. Use this + * function when preparing a @kgid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kgid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kgid won't change when calling + * from_kgid() so we can simply retrieve the value via __kgid_val() + * directly. + * + * Return: @kgid mapped according to @idmap. + * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is + * returned. + */ +vfsgid_t make_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kgid_t kgid) +{ + gid_t gid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return VFSGIDT_INIT(kgid); + if (initial_idmapping(fs_userns)) + gid = __kgid_val(kgid); + else + gid = from_kgid(fs_userns, kgid); + if (gid == (gid_t)-1) + return INVALID_VFSGID; + return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); +} +EXPORT_SYMBOL_GPL(make_vfsgid); + +/** + * from_vfsuid - map a vfsuid into the filesystem idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsuid : vfsuid to be mapped + * + * Map @vfsuid into the filesystem idmapping. 
This function has to be used in + * order to e.g. write @vfsuid to inode->i_uid. + * + * Return: @vfsuid mapped into the filesystem idmapping + */ +kuid_t from_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsuid_t vfsuid) +{ + uid_t uid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KUIDT(vfsuid); + uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + if (uid == (uid_t)-1) + return INVALID_UID; + if (initial_idmapping(fs_userns)) + return KUIDT_INIT(uid); + return make_kuid(fs_userns, uid); +} +EXPORT_SYMBOL_GPL(from_vfsuid); + +/** + * from_vfsgid - map a vfsgid into the filesystem idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsgid : vfsgid to be mapped + * + * Map @vfsgid into the filesystem idmapping. This function has to be used in + * order to e.g. write @vfsgid to inode->i_gid. + * + * Return: @vfsgid mapped into the filesystem idmapping + */ +kgid_t from_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsgid_t vfsgid) +{ + gid_t gid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KGIDT(vfsgid); + gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + if (gid == (gid_t)-1) + return INVALID_GID; + if (initial_idmapping(fs_userns)) + return KGIDT_INIT(gid); + return make_kgid(fs_userns, gid); +} +EXPORT_SYMBOL_GPL(from_vfsgid); + +#ifdef CONFIG_MULTIUSER +/** + * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups + * @vfsgid: the mnt gid to match + * + * This function can be used to determine whether @vfsuid matches any of the + * caller's groups. + * + * Return: 1 if vfsuid matches caller's groups, 0 if not. 
+ */ +int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return in_group_p(AS_KGIDT(vfsgid)); +} +#else +int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return 1; +} +#endif +EXPORT_SYMBOL_GPL(vfsgid_in_group_p); + +struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. + */ +struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. 
+ */ +void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} diff --git a/fs/mpage.c b/fs/mpage.c index 0f8ae954a579..ce53179428db 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -532,6 +532,8 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; + if (!buffer_mapped(&map_bh)) + goto confused; if (buffer_new(&map_bh)) clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { diff --git a/fs/namei.c b/fs/namei.c index 309ae6fc8c99..5855dc6edbd5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> @@ -273,7 +274,7 @@ void putname(struct filename *name) /** * check_acl - perform ACL permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -281,13 +282,13 @@ void putname(struct filename *name) * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -static int check_acl(struct user_namespace *mnt_userns, +static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL @@ -300,14 +301,14 @@ static int check_acl(struct user_namespace *mnt_userns, /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(mnt_userns, inode, acl, mask); + return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(mnt_userns, inode, acl, mask); + int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } @@ -318,7 +319,7 @@ static int check_acl(struct user_namespace *mnt_userns, /** * acl_permission_check - perform basic UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -326,20 +327,20 @@ static int check_acl(struct user_namespace *mnt_userns, * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -static int acl_permission_check(struct user_namespace *mnt_userns, +static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; @@ -348,7 +349,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(mnt_userns, inode, mask); + int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } @@ -362,7 +363,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -373,7 +374,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /** * generic_permission - check for access rights on a Posix-like filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) @@ -387,13 +388,13 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. 
This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, +int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -401,17 +402,17 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, /* * Do the basic permission checks. */ - ret = acl_permission_check(mnt_userns, inode, mask); + ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; @@ -422,7 +423,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* @@ -431,7 +432,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, * at least one exec bit set. 
*/ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; @@ -441,7 +442,7 @@ EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -450,19 +451,19 @@ EXPORT_SYMBOL(generic_permission); * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct user_namespace *mnt_userns, +static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(mnt_userns, inode, mask); + return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /** @@ -487,7 +488,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @@ -497,7 +498,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. 
*/ -int inode_permission(struct user_namespace *mnt_userns, +int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; @@ -518,11 +519,11 @@ int inode_permission(struct user_namespace *mnt_userns, * written back improperly if their true value is unknown * to the vfs. */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } - retval = do_inode_permission(mnt_userns, inode, mask); + retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; @@ -1094,14 +1095,14 @@ fs_initcall(init_fs_namei_sysctls); */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; - mnt_userns = mnt_user_ns(nd->path.mnt); - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + idmap = mnt_idmap(nd->path.mnt); + vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; @@ -1124,7 +1125,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod /** * safe_hardlink_source - Check for safe hardlink conditions - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -1135,7 +1136,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod * * Otherwise returns true. */ -static bool safe_hardlink_source(struct user_namespace *mnt_userns, +static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -1153,7 +1154,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, return false; /* Hardlinking to unreadable or unwritable sources is dangerous. 
*/ - if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE)) + if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -1161,8 +1162,8 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, /** * may_linkat - Check permissions for creating a hardlink - * @mnt_userns: user namespace of the mount the inode was found from - * @link: the source to hardlink from + * @idmap: idmap of the mount the inode was found from + * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled @@ -1170,21 +1171,21 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct user_namespace *mnt_userns, const struct path *link) +int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. 
*/ - if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1193,8 +1194,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (safe_hardlink_source(mnt_userns, inode) || - inode_owner_or_capable(mnt_userns, inode)) + if (safe_hardlink_source(idmap, inode) || + inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); @@ -1205,7 +1206,7 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * @@ -1220,15 +1221,15 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) * the directory doesn't have to be world writable: being group writable will * be enough. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. 
* * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct user_namespace *mnt_userns, +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; @@ -1237,8 +1238,8 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || - vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) + vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -1704,15 +1705,15 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } -static inline int may_lookup(struct user_namespace *mnt_userns, +static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd)) return err; } - return inode_permission(mnt_userns, nd->inode, MAY_EXEC); + return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) @@ -2253,13 +2254,13 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. 
*/ for(;;) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; const char *link; u64 hash_len; int type; - mnt_userns = mnt_user_ns(nd->path.mnt); - err = may_lookup(mnt_userns, nd); + idmap = mnt_idmap(nd->path.mnt); + err = may_lookup(idmap, nd); if (err) return err; @@ -2307,7 +2308,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2622,7 +2623,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_common(struct user_namespace *mnt_userns, +static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { @@ -2652,7 +2653,7 @@ static int lookup_one_common(struct user_namespace *mnt_userns, return err; } - return inode_permission(mnt_userns, base->d_inode, MAY_EXEC); + return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** @@ -2676,7 +2677,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2703,7 +2704,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2714,7 +2715,7 @@ EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component - * @mnt_userns: user namespace of the mount the lookup is performed from + * @idmap: 
idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2724,7 +2725,7 @@ EXPORT_SYMBOL(lookup_one_len); * * The caller must hold base->i_mutex. */ -struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, +struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; @@ -2733,7 +2734,7 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2744,7 +2745,7 @@ EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component - * @mnt_userns: idmapping of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2755,7 +2756,7 @@ EXPORT_SYMBOL(lookup_one); * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. 
*/ -struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { @@ -2763,7 +2764,7 @@ struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, int err; struct dentry *ret; - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2777,7 +2778,7 @@ EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component - * @mnt_userns: idmapping of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2794,11 +2795,11 @@ EXPORT_SYMBOL(lookup_one_unlocked); * * The helper should be called without i_mutex held. */ -struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { - struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len); + struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); @@ -2823,7 +2824,7 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_unlocked(&init_user_ns, name, base, len); + return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2838,7 +2839,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_positive_unlocked(&init_user_ns, name, base, len); + return 
lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); @@ -2880,16 +2881,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; - return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); @@ -2913,7 +2914,7 @@ EXPORT_SYMBOL(__check_sticky); * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, +static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); @@ -2926,21 +2927,21 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. 
*/ - if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) || + if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || - HAS_UNMAPPED_ID(mnt_userns, inode)) + HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2965,7 +2966,7 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct user_namespace *mnt_userns, +static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); @@ -2973,10 +2974,10 @@ static inline int may_create(struct user_namespace *mnt_userns, return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* @@ -3044,7 +3045,7 @@ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) /** * vfs_prepare_mode - prepare the mode to be used for a new inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: 
allowed permission by the vfs @@ -3065,11 +3066,11 @@ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) * * Returns: mode to be passed to the filesystem */ -static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, +static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { - mode = mode_strip_sgid(mnt_userns, dir, mode); + mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* @@ -3084,7 +3085,7 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, /** * vfs_create - create new file - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new file @@ -3092,27 +3093,29 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, * * Create a new file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. 
*/ -int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { - int error = may_create(mnt_userns, dir, dentry); + int error; + + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ - mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IALLUGO, S_IFREG); + mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl); + error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -3124,7 +3127,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; - int error = may_create(&init_user_ns, dir, dentry); + int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; @@ -3146,7 +3149,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct user_namespace *mnt_userns, const struct path *path, +static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -3182,7 +3185,7 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, break; } - error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode); + error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -3197,13 +3200,13 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } -static int 
handle_truncate(struct user_namespace *mnt_userns, struct file *filp) +static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; @@ -3213,7 +3216,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) error = security_file_truncate(filp); if (!error) { - error = do_truncate(mnt_userns, path->dentry, 0, + error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -3228,7 +3231,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(struct user_namespace *mnt_userns, +static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { @@ -3236,10 +3239,10 @@ static int may_o_create(struct user_namespace *mnt_userns, if (error) return error; - if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; - error = inode_permission(mnt_userns, dir->dentry->d_inode, + error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3319,7 +3322,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; @@ -3367,13 +3370,13 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; - mode = vfs_prepare_mode(mnt_userns, dir->d_inode, mode, mode, mode); + mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) - create_error = 
may_o_create(mnt_userns, &nd->path, + create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; @@ -3410,7 +3413,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, goto out_dput; } - error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry, + error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; @@ -3513,7 +3516,7 @@ finish_lookup: static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3526,13 +3529,13 @@ static int do_open(struct nameidata *nd, } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; - error = may_create_in_sticky(mnt_userns, nd, + error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; @@ -3552,13 +3555,13 @@ static int do_open(struct nameidata *nd, return error; do_truncate = true; } - error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); + error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) - error = handle_truncate(mnt_userns, file); + error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3570,20 +3573,20 @@ static int do_open(struct nameidata *nd, /** * vfs_tmpfile - create tmpfile - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: pointer to dentry of the 
base directory * @mode: mode of the new tmpfile * @open_flag: flags * * Create a temporary file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -static int vfs_tmpfile(struct user_namespace *mnt_userns, +static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { @@ -3594,7 +3597,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, int open_flag = file->f_flags; /* we want directory to be writable */ - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) @@ -3604,13 +3607,13 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; - mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); + error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (error) return error; /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); @@ -3619,13 +3622,13 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, 
inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(mnt_userns, inode); + ima_post_create_tmpfile(idmap, inode); return 0; } /** * vfs_tmpfile_open - open a tmpfile for kernel internal use - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags @@ -3635,7 +3638,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ -struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, +struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { @@ -3644,7 +3647,7 @@ struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, file = alloc_empty_file_noaccount(open_flag, cred); if (!IS_ERR(file)) { - error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); @@ -3658,7 +3661,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { - struct user_namespace *mnt_userns; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); @@ -3667,8 +3669,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); @@ -3873,7 +3874,7 @@ EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode 
was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new device node or file @@ -3881,17 +3882,17 @@ EXPORT_SYMBOL(user_path_create); * * Create a device node or file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; - int error = may_create(mnt_userns, dir, dentry); + int error = may_create(idmap, dir, dentry); if (error) return error; @@ -3903,7 +3904,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (!dir->i_op->mknod) return -EPERM; - mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; @@ -3912,7 +3913,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev); + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -3939,7 +3940,7 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { - struct user_namespace 
*mnt_userns; + struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; @@ -3959,20 +3960,20 @@ retry: if (error) goto out2; - mnt_userns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(mnt_userns, path.dentry->d_inode, + error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) - ima_post_path_mknod(mnt_userns, dentry); + ima_post_path_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, + error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, + error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } @@ -4000,32 +4001,33 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new directory * * Create a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. 
*/ -int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int error = may_create(mnt_userns, dir, dentry); + int error; unsigned max_links = dir->i_sb->s_max_links; + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; - mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IRWXUGO | S_ISVTX, 0); + mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; @@ -4033,7 +4035,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (max_links && dir->i_nlink >= max_links) return -EMLINK; - error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); + error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; @@ -4056,10 +4058,8 @@ retry: error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - struct user_namespace *mnt_userns; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, - mode); + error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { @@ -4083,22 +4083,22 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * * Remove a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. 
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { - int error = may_delete(mnt_userns, dir, dentry, 1); + int error = may_delete(idmap, dir, dentry, 1); if (error) return error; @@ -4138,7 +4138,6 @@ EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { - struct user_namespace *mnt_userns; int error; struct dentry *dentry; struct path path; @@ -4178,8 +4177,7 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit4; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: @@ -4203,7 +4201,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) /** * vfs_unlink - unlink a filesystem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. @@ -4220,17 +4218,17 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. 
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(mnt_userns, dir, dentry, 0); + int error = may_delete(idmap, dir, dentry, 0); if (error) return error; @@ -4304,7 +4302,6 @@ retry_deleg: dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { - struct user_namespace *mnt_userns; /* Why not before? Because we want correct error value */ if (last.name[last.len]) @@ -4316,9 +4313,8 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit3; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, - &delegated_inode); + error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit3: dput(dentry); } @@ -4370,24 +4366,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) /** * vfs_symlink - create symlink - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @oldname: name of the file to link to * * Create a symlink. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. 
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(mnt_userns, dir, dentry); + int error; + error = may_create(idmap, dir, dentry); if (error) return error; @@ -4398,7 +4395,7 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -4423,13 +4420,9 @@ retry: goto out_putnames; error = security_path_symlink(&path, dentry, from->name); - if (!error) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry, - from->name); - } + if (!error) + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4455,7 +4448,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn /** * vfs_link - create a new link * @old_dentry: object to be linked - * @mnt_userns: the user namespace of the mount + * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break @@ -4472,13 +4465,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * be appropriate for callers that 
expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, +int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { @@ -4489,7 +4482,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, if (!inode) return -ENOENT; - error = may_create(mnt_userns, dir, new_dentry); + error = may_create(idmap, dir, new_dentry); if (error) return error; @@ -4506,7 +4499,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, * be writen back improperly if their true value is unknown to * the vfs. 
*/ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; @@ -4553,7 +4546,7 @@ EXPORT_SYMBOL(vfs_link); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; @@ -4590,14 +4583,14 @@ retry: error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); @@ -4697,20 +4690,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir); + error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_userns, new_dir, new_dentry); + error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -4725,13 +4718,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_userns, source, + error = inode_permission(rd->old_mnt_idmap, 
source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_userns, target, + error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -4776,7 +4769,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -4921,10 +4914,10 @@ retry_deleg: rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; - rd.old_mnt_userns = mnt_user_ns(old_path.mnt); + rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; - rd.new_mnt_userns = mnt_user_ns(new_path.mnt); + rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/namespace.c b/fs/namespace.c index ab467ee58341..5927d90e24a0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,22 +75,6 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -struct mnt_idmap { - struct user_namespace *owner; - refcount_t count; -}; - -/* - * Carries the initial idmapping of 0:0:4294967295 which is an identity - * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is - * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. 
- */ -struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, - .count = REFCOUNT_INIT(1), -}; -EXPORT_SYMBOL_GPL(nop_mnt_idmap); - struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -210,104 +194,6 @@ int mnt_get_count(struct mount *mnt) #endif } -/** - * mnt_idmap_owner - retrieve owner of the mount's idmapping - * @idmap: mount idmapping - * - * This helper will go away once the conversion to use struct mnt_idmap - * everywhere has finished at which point the helper will be unexported. - * - * Only code that needs to perform permission checks based on the owner of the - * idmapping will get access to it. All other code will solely rely on - * idmappings. This will get us type safety so it's impossible to conflate - * filesystems idmappings with mount idmappings. - * - * Return: The owner of the idmapping. - */ -struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) -{ - return idmap->owner; -} -EXPORT_SYMBOL_GPL(mnt_idmap_owner); - -/** - * mnt_user_ns - retrieve owner of an idmapped mount - * @mnt: the relevant vfsmount - * - * This helper will go away once the conversion to use struct mnt_idmap - * everywhere has finished at which point the helper will be unexported. - * - * Only code that needs to perform permission checks based on the owner of the - * idmapping will get access to it. All other code will solely rely on - * idmappings. This will get us type safety so it's impossible to conflate - * filesystems idmappings with mount idmappings. - * - * Return: The owner of the idmapped. - */ -struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) -{ - struct mnt_idmap *idmap = mnt_idmap(mnt); - - /* Return the actual owner of the filesystem instead of the nop. 
*/ - if (idmap == &nop_mnt_idmap && - !initial_idmapping(mnt->mnt_sb->s_user_ns)) - return mnt->mnt_sb->s_user_ns; - return mnt_idmap_owner(idmap); -} -EXPORT_SYMBOL_GPL(mnt_user_ns); - -/** - * alloc_mnt_idmap - allocate a new idmapping for the mount - * @mnt_userns: owning userns of the idmapping - * - * Allocate a new struct mnt_idmap which carries the idmapping of the mount. - * - * Return: On success a new idmap, on error an error pointer is returned. - */ -static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) -{ - struct mnt_idmap *idmap; - - idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); - if (!idmap) - return ERR_PTR(-ENOMEM); - - idmap->owner = get_user_ns(mnt_userns); - refcount_set(&idmap->count, 1); - return idmap; -} - -/** - * mnt_idmap_get - get a reference to an idmapping - * @idmap: the idmap to bump the reference on - * - * If @idmap is not the @nop_mnt_idmap bump the reference count. - * - * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. - */ -static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) -{ - if (idmap != &nop_mnt_idmap) - refcount_inc(&idmap->count); - - return idmap; -} - -/** - * mnt_idmap_put - put a reference to an idmapping - * @idmap: the idmap to put the reference on - * - * If this is a non-initial idmapping, put the reference count when a mount is - * released and free it if we're the last user. - */ -static inline void mnt_idmap_put(struct mnt_idmap *idmap) -{ - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } -} - static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -4094,7 +3980,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. 
*/ - if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) + if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) return -EINVAL; /* @@ -4340,7 +4226,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); - if (initial_idmapping(mnt_userns)) { + if (mnt_userns == &init_user_ns) { err = -EPERM; goto out_fput; } diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 1ead5bd740c2..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default y + default n help - Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS - attempts to improve read performance by compressing out sparse holes - in the file contents. + This is intended for developers only. The READ_PLUS operation has + been shown to have issues under specific conditions and should not + be used in production. diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f7e4a88d5d92..f8e420464b77 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2296,7 +2296,7 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct iattr attr; @@ -2325,7 +2325,7 @@ EXPORT_SYMBOL_GPL(nfs_create); * See comments for nfs_proc_create regarding failed operations. */ int -nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; @@ -2352,7 +2352,7 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. 
*/ -int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; @@ -2524,7 +2524,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink); * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct page *page; @@ -2642,7 +2642,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) * If these conditions are met, we can drop the dentries before doing * the rename. */ -int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -3262,7 +3262,7 @@ static int nfs_execute_ok(struct inode *inode, int mask) return ret; } -int nfs_permission(struct user_namespace *mnt_userns, +int nfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { @@ -3313,7 +3313,7 @@ out_notsup: res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER); if (res == 0) - res = generic_permission(&init_user_ns, inode, mask); + res = generic_permission(&nop_mnt_idmap, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 01596f2d0a1e..1a9d5aa51dfb 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -145,17 +145,10 @@ out: return parent; } -static u64 nfs_fetch_iversion(struct inode *inode) -{ - nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); - return inode_peek_iversion_raw(inode); -} - const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .fetch_iversion = nfs_fetch_iversion, 
.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d8ec889a4b3f..b0f3c9339e70 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -31,6 +31,7 @@ #include <linux/swap.h> #include <linux/uaccess.h> +#include <linux/filelock.h> #include "delegation.h" #include "internal.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e98ee7599eeb..222a28320e1c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -606,7 +606,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int -nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -825,10 +825,12 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } -int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -843,7 +845,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS; + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) @@ -851,8 +854,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, goto out_no_revalidate; } - 
/* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + /* Flush out writes to the server in order to update c/mtime/version. */ + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) filemap_write_and_wait(inode->i_mapping); @@ -872,7 +875,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS))) + STATX_SIZE|STATX_BLOCKS| + STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ @@ -908,8 +912,12 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + stat->change_cookie = inode_peek_iversion_raw(inode); + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ae7d4a8c728c..41468c21291d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -384,18 +384,18 @@ extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); -int nfs_create(struct user_namespace *, struct inode *, struct dentry *, +int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry 
*, +int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); -int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *, +int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); -int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, +int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, +int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b0ef7e7ddb30..19d51ebf842c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -208,23 +208,23 @@ out_fc: } static int -nfs_namespace_getattr(struct user_namespace *mnt_userns, +nfs_namespace_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { if (NFS_FH(d_inode(path->dentry))->size != 0) - return nfs_getattr(mnt_userns, path, stat, request_mask, + return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); return 0; } static int -nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_setattr(mnt_userns, dentry, attr); + return nfs_setattr(idmap, dentry, attr); return -EACCES; } diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index df9ca56db347..4fa37dc038b5 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_NFS_V3_ACL 
extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 74d11e3c4205..1247f544a440 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,7 +255,7 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5edd1704f735..4c9f8bd866ab 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -23,6 +23,7 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) #include <linux/seqlock.h> +#include <linux/filelock.h> struct idmap; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 40d749f29ed3..d9c332019d06 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7692,7 +7692,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7716,7 +7716,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7739,7 
+7739,7 @@ static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) #define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7764,7 +7764,7 @@ static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7815,7 +7815,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #ifdef CONFIG_NFS_V4_2 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 16be6dae524f..779bfc37233c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -21,6 +21,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_mount.h> #include <linux/export.h> +#include <linux/filelock.h> #include "internal.h" #include "pnfs.h" diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c240e50952..1a80d548253a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/wait.h> #include <linux/iversion.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 0a9b72685f98..1479583fbb62 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,6 +9,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> +#include <linux/filelock.h> static 
unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 0ef070349014..c0950edb26b0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -662,6 +662,39 @@ static struct shrinker nfsd_file_shrinker = { }; /** + * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file + * @nf: nfsd_file to attempt to queue + * @dispose: private list to queue successfully-put objects + * + * Unhash an nfsd_file, try to get a reference to it, and then put that + * reference. If it's the last reference, queue it to the dispose list. + */ +static void +nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) + __must_hold(RCU) +{ + int decrement = 1; + + /* If we raced with someone else unhashing, ignore it */ + if (!nfsd_file_unhash(nf)) + return; + + /* If we can't get a reference, ignore it */ + if (!nfsd_file_get(nf)) + return; + + /* Extra decrement if we remove from the LRU */ + if (nfsd_file_lru_remove(nf)) + ++decrement; + + /* If refcount goes to 0, then put on the dispose list */ + if (refcount_sub_and_test(decrement, &nf->nf_ref)) { + list_add(&nf->nf_lru, dispose); + trace_nfsd_file_closing(nf); + } +} + +/** * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode * @inode: inode on which to close out nfsd_files * @dispose: list on which to gather nfsd_files to close out @@ -688,30 +721,11 @@ nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) rcu_read_lock(); do { - int decrement = 1; - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params); if (!nf) break; - - /* If we raced with someone else unhashing, ignore it */ - if (!nfsd_file_unhash(nf)) - continue; - - /* If we can't get a reference, ignore it */ - if (!nfsd_file_get(nf)) - continue; - - /* Extra decrement if we remove from the LRU */ - if (nfsd_file_lru_remove(nf)) - ++decrement; - - /* If refcount goes to 0, then put on the dispose list */ - if 
(refcount_sub_and_test(decrement, &nf->nf_ref)) { - list_add(&nf->nf_lru, dispose); - trace_nfsd_file_closing(nf); - } + nfsd_file_cond_queue(nf, dispose); } while (1); rcu_read_unlock(); } @@ -928,11 +942,8 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { - if (!net || nf->nf_net == net) { - nfsd_file_unhash(nf); - nfsd_file_lru_remove(nf); - list_add(&nf->nf_lru, &dispose); - } + if (!net || nf->nf_net == net) + nfsd_file_cond_queue(nf, &dispose); nf = rhashtable_walk_next(&iter); } diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 51a4b7885cae..ec49b200b797 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/filelock.h> #include <linux/percpu_counter.h> #include <linux/siphash.h> diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1457f59f447a..995cb2c90b1a 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 647108138e8a..887803735e2a 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, + error = 
set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d01b29aba662..f41992ecd0d7 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -320,7 +320,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); fh_fill_pre_attrs(fhp); - host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); goto out; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 78b8cd9651d5..3509e73abe1f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU); + status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: @@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) status = -ENOENT; if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); out: dput(dentry); out_unlock: @@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&init_user_ns, d_inode(parent), child); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4ef529379065..c1684da6c01f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5356,7 +5356,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, { struct nfs4_ol_stateid *st; struct file *f = 
fp->fi_deleg_file->nf_file; - struct inode *ino = locks_inode(f); + struct inode *ino = file_inode(f); int writes; writes = atomic_read(&ino->i_writecount); @@ -7809,7 +7809,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } - inode = locks_inode(nf->nf_file); + inode = file_inode(nf->nf_file); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -8182,7 +8182,6 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif @@ -8192,6 +8191,7 @@ void nfs4_state_shutdown(void) { nfsd4_destroy_callback_queue(); + rhltable_destroy(&nfs4_file_rhltable); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 97edb32be77f..e12e5a4ad502 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2965,7 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if (!(stat.result_mask & STATX_BTIME)) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8c52b6c9d31a..ccd8485fee04 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -40,7 +40,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(&init_user_ns, + err = inode_permission(&nop_mnt_idmap, d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); @@ -628,6 +628,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) stat.mtime = inode->i_mtime; stat.ctime = inode->i_ctime; stat.size = inode->i_size; + if (v4 && IS_I_VERSION(inode)) { + stat.change_cookie = inode_query_iversion(inode); + stat.result_mask |= 
STATX_CHANGE_COOKIE; + } } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -659,6 +663,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) if (err) { fhp->fh_post_saved = false; fhp->fh_post_attr.ctime = inode->i_ctime; + if (v4 && IS_I_VERSION(inode)) { + fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); + fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; + } } else fhp->fh_post_saved = true; if (v4) @@ -748,3 +756,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) return FSIDSOURCE_UUID; return FSIDSOURCE_DEV; } + +/* + * We could use i_version alone as the change attribute. However, i_version + * can go backwards on a regular file after an unclean shutdown. On its own + * that doesn't necessarily cause a problem, but if i_version goes backwards + * and then is incremented again it could reuse a value that was previously + * used before boot, and a client who queried the two values might incorrectly + * assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as long as + * time doesn't go backwards we never reuse an old value. If the filesystem + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not + * needed. + * + * We only need to do this for regular files as well. For directories, we + * assume that the new change attr is always logged to stable storage in some + * fashion before the results can be seen. 
+ */ +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 513e028b0bbe..4e0ecf0ae2cf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,34 +293,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -/* - * We could use i_version alone as the change attribute. However, - * i_version can go backwards after a reboot. On its own that doesn't - * necessarily cause a problem, but if i_version goes backwards and then - * is incremented again it could reuse a value that was previously used - * before boot, and a client who queried the two values might - * incorrectly assume nothing changed. - * - * By using both ctime and the i_version counter we guarantee that as - * long as time doesn't go backwards we never reuse an old value. 
- */ -static inline u64 nfsd4_change_attribute(struct kstat *stat, - struct inode *inode) -{ - if (inode->i_sb->s_export_op->fetch_iversion) - return inode->i_sb->s_export_op->fetch_iversion(inode); - else if (IS_I_VERSION(inode)) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; - } else - return time_to_chattr(&stat->ctime); -} - +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); extern void fh_fill_pre_attrs(struct svc_fh *fhp); extern void fh_fill_post_attrs(struct svc_fh *fhp); extern void fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 9744443c3965..a82d91afdc9c 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -93,7 +93,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) { + setattr_prepare(&nop_mnt_idmap, fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. 
* This will cause notify_change to set these times diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4c3a0d84043c..ab4ee3509ce3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -426,7 +426,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) if (iap->ia_size < 0) return -EFBIG; - host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); + host_err = notify_change(&nop_mnt_idmap, dentry, &size_attr, NULL); if (host_err) return host_err; iap->ia_valid &= ~ATTR_SIZE; @@ -444,7 +444,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) return 0; iap->ia_valid |= ATTR_CTIME; - return notify_change(&init_user_ns, dentry, iap, NULL); + return notify_change(&nop_mnt_idmap, dentry, iap, NULL); } /** @@ -542,12 +542,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_labelerr = security_inode_setsecctx(dentry, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) - attr->na_aclerr = set_posix_acl(&init_user_ns, + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) - attr->na_aclerr = set_posix_acl(&init_user_ns, + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); @@ -583,7 +583,7 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME, + if (vfs_getxattr(&nop_mnt_idmap, dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) return 0; return 1; @@ -1363,12 +1363,13 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dirp, dchild, + iap->ia_mode, true); if (!host_err) 
nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode); + host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); if (!host_err && unlikely(d_unhashed(dchild))) { struct dentry *d; d = lookup_one_len(dchild->d_name.name, @@ -1396,7 +1397,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = vfs_mknod(&init_user_ns, dirp, dchild, + host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, rdev); break; default: @@ -1557,7 +1558,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_drop_write; } fh_fill_pre_attrs(fhp); - host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) @@ -1625,7 +1626,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (d_really_is_negative(dold)) goto out_dput; fh_fill_pre_attrs(ffhp); - host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); + host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); if (!host_err) { @@ -1745,10 +1746,10 @@ retry: goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_idmap = &nop_mnt_idmap, .old_dir = fdir, .old_dentry = odentry, - .new_mnt_userns = &init_user_ns, + .new_mnt_idmap = &nop_mnt_idmap, .new_dir = tdir, .new_dentry = ndentry, }; @@ -1850,14 +1851,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, nfsd_close_cached_files(rdentry); for (retries = 1;;) { - host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); + host_err = vfs_unlink(&nop_mnt_idmap, dirp, rdentry, NULL); if (host_err != -EAGAIN || !retries--) break; if (!nfsd_wait_for_delegreturn(rqstp, rinode)) break; } } else { - host_err = vfs_rmdir(&init_user_ns, dirp, 
rdentry); + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry); } fh_fill_post_attrs(fhp); @@ -2129,7 +2130,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock_shared(inode); - len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, NULL, 0); /* * Zero-length attribute, just return. @@ -2156,7 +2157,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - len = vfs_getxattr(&init_user_ns, dentry, name, buf, len); + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, buf, len); if (len <= 0) { kvfree(buf); buf = NULL; @@ -2267,7 +2268,7 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) inode_lock(fhp->fh_dentry->d_inode); fh_fill_pre_attrs(fhp); - ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry, + ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); fh_fill_post_attrs(fhp); @@ -2294,7 +2295,7 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock(fhp->fh_dentry->d_inode); fh_fill_pre_attrs(fhp); - ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf, + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); inode_unlock(fhp->fh_dentry->d_inode); @@ -2378,14 +2379,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = inode_permission(&init_user_ns, inode, + err = inode_permission(&nop_mnt_idmap, inode, acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) - err = inode_permission(&init_user_ns, inode, MAY_EXEC); + err = inode_permission(&nop_mnt_idmap, inode, MAY_EXEC); return err? 
nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index dbdfef7ae85b..43fb57a301d3 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -170,9 +170,14 @@ static inline void fh_drop_write(struct svc_fh *fh) static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { + u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + + if (fh->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, AT_STATX_SYNC_AS_STAT)); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 232dd7b6cca1..1310d2d5feb3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -364,7 +364,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_bh = bh; atomic64_inc(&root->inodes_count); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -949,7 +949,7 @@ void nilfs_evict_inode(struct inode *inode) */ } -int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int nilfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; @@ -957,7 +957,7 @@ int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct super_block *sb = inode->i_sb; int err; - err = setattr_prepare(&init_user_ns, dentry, iattr); + err = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (err) return err; @@ -972,7 +972,7 @@ int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, nilfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { @@ -988,7 +988,7 @@ out_err: return err; } -int 
nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ -997,7 +997,7 @@ int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 87e1004b606d..5ccc638ae92f 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -128,7 +128,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) /** * nilfs_fileattr_set - ioctl to support chattr */ -int nilfs_fileattr_set(struct user_namespace *mnt_userns, +int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -1114,7 +1114,14 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) minseg = range[0] + segbytes - 1; do_div(minseg, segbytes); + + if (range[1] < 4096) + goto out; + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + if (maxseg < segbytes) + goto out; + do_div(maxseg, segbytes); maxseg--; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 23899e0ae850..c7024da8f1e2 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -72,7 +72,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -100,7 +100,7 @@ static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, } static int -nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +nilfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -125,7 +125,7 @@ nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct nilfs_transaction_info ti; @@ -202,7 +202,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -340,7 +340,7 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) return err; } -static int nilfs_rename(struct user_namespace *mnt_userns, +static int nilfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index aecda4fc95f5..8046490cd7fe 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -242,7 +242,7 @@ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m); -int nilfs_fileattr_set(struct user_namespace *mnt_userns, +int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long nilfs_ioctl(struct file *, unsigned int, 
unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -271,10 +271,10 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); -extern int nilfs_setattr(struct user_namespace *, struct dentry *, +extern int nilfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nilfs_write_failed(struct address_space *mapping, loff_t to); -int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6edb6e0dd61f..1422b8ba24ed 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -409,6 +409,15 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) goto out; /* + * Prevent underflow in second superblock position calculation. + * The exact minimum size check is done in nilfs_sufile_resize(). + */ + if (newsize < 4096) { + ret = -ENOSPC; + goto out; + } + + /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, * and so forth. 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2064e6473d30..3a4c9c150cbf 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -544,9 +544,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); + u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0; + if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { + nilfs_err(sb, "device size too small"); + return -EINVAL; + } + sb2off = NILFS_SB2_OFFSET_BYTES(devsize); + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a2a15bc4df28..29bdd99b29fa 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -262,7 +262,7 @@ static int fanotify_get_response(struct fsnotify_group *group, } /* userspace responded, convert to something usable */ - switch (event->response & ~FAN_AUDIT) { + switch (event->response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: ret = 0; break; @@ -273,7 +273,8 @@ static int fanotify_get_response(struct fsnotify_group *group, /* Check if the response should be audited */ if (event->response & FAN_AUDIT) - audit_fanotify(event->response & ~FAN_AUDIT); + audit_fanotify(event->response & ~FAN_AUDIT, + &event->audit_rule); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -563,6 +564,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH_PERM; pevent->response = 0; + pevent->hdr.type = FAN_RESPONSE_INFO_NONE; + pevent->hdr.pad = 0; + pevent->hdr.len = 0; pevent->state = FAN_EVENT_INIT; pevent->path = *path; path_get(path); diff --git 
a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 57f51a9a3015..e8a3c28c5d12 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -425,9 +425,13 @@ FANOTIFY_PE(struct fanotify_event *event) struct fanotify_perm_event { struct fanotify_event fae; struct path path; - unsigned short response; /* userspace answer to the event */ + u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ + union { + struct fanotify_response_info_header hdr; + struct fanotify_response_info_audit_rule audit_rule; + }; }; static inline struct fanotify_perm_event * diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4546da4a54f9..8f430bfad487 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -283,19 +283,42 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, return client_fd; } +static int process_access_response_info(const char __user *info, + size_t info_len, + struct fanotify_response_info_audit_rule *friar) +{ + if (info_len != sizeof(*friar)) + return -EINVAL; + + if (copy_from_user(friar, info, sizeof(*friar))) + return -EFAULT; + + if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) + return -EINVAL; + if (friar->hdr.pad != 0) + return -EINVAL; + if (friar->hdr.len != sizeof(*friar)) + return -EINVAL; + + return info_len; +} + /* * Finish processing of permission event by setting it to ANSWERED state and * drop group->notification_lock. 
*/ static void finish_permission_event(struct fsnotify_group *group, - struct fanotify_perm_event *event, - unsigned int response) + struct fanotify_perm_event *event, u32 response, + struct fanotify_response_info_audit_rule *friar) __releases(&group->notification_lock) { bool destroy = false; assert_spin_locked(&group->notification_lock); - event->response = response; + event->response = response & ~FAN_INFO; + if (response & FAN_INFO) + memcpy(&event->audit_rule, friar, sizeof(*friar)); + if (event->state == FAN_EVENT_CANCELED) destroy = true; else @@ -306,20 +329,27 @@ static void finish_permission_event(struct fsnotify_group *group, } static int process_access_response(struct fsnotify_group *group, - struct fanotify_response *response_struct) + struct fanotify_response *response_struct, + const char __user *info, + size_t info_len) { struct fanotify_perm_event *event; int fd = response_struct->fd; - int response = response_struct->response; + u32 response = response_struct->response; + int ret = info_len; + struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, - fd, response); + pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, + group, fd, response, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout */ - switch (response & ~FAN_AUDIT) { + if (response & ~FANOTIFY_RESPONSE_VALID_MASK) + return -EINVAL; + + switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: case FAN_DENY: break; @@ -327,10 +357,20 @@ static int process_access_response(struct fsnotify_group *group, return -EINVAL; } - if (fd < 0) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; - if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) + if (response & FAN_INFO) { + ret = process_access_response_info(info, info_len, &friar); + if 
(ret < 0) + return ret; + if (fd == FAN_NOFD) + return ret; + } else { + ret = 0; + } + + if (fd < 0) return -EINVAL; spin_lock(&group->notification_lock); @@ -340,9 +380,9 @@ static int process_access_response(struct fsnotify_group *group, continue; list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, response); + finish_permission_event(group, event, response, &friar); wake_up(&group->fanotify_data.access_waitq); - return 0; + return ret; } spin_unlock(&group->notification_lock); @@ -804,7 +844,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (ret <= 0) { spin_lock(&group->notification_lock); finish_permission_event(group, - FANOTIFY_PERM(event), FAN_DENY); + FANOTIFY_PERM(event), FAN_DENY, NULL); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); @@ -827,28 +867,32 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fanotify_response response; struct fsnotify_group *group; int ret; + const char __user *info_buf = buf + sizeof(struct fanotify_response); + size_t info_len; if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) return -EINVAL; group = file->private_data; + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + if (count < sizeof(response)) return -EINVAL; - count = sizeof(response); - - pr_debug("%s: group=%p count=%zu\n", __func__, group, count); - - if (copy_from_user(&response, buf, count)) + if (copy_from_user(&response, buf, sizeof(response))) return -EFAULT; - ret = process_access_response(group, &response); + info_len = count - sizeof(response); + + ret = process_access_response(group, &response, info_buf, info_len); if (ret < 0) count = ret; + else + count = sizeof(response) + ret; return count; } @@ -876,7 +920,7 @@ static int fanotify_release(struct 
inode *ignored, struct file *file) event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, FAN_ALLOW); + finish_permission_event(group, event, FAN_ALLOW, NULL); spin_lock(&group->notification_lock); } @@ -893,7 +937,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), - FAN_ALLOW); + FAN_ALLOW, NULL); } spin_lock(&group->notification_lock); } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 08c659332e26..e6fc5f7cb1d7 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2865,7 +2865,7 @@ void ntfs_truncate_vfs(struct inode *vi) { /** * ntfs_setattr - called from notify_change() when an attribute is being changed - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry whose attributes to change * @attr: structure describing the attributes and the changes * @@ -2878,14 +2878,14 @@ void ntfs_truncate_vfs(struct inode *vi) { * * Called with ->i_mutex held. */ -int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. 
*/ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 6f78ee00f57f..147ef4ddb691 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -289,7 +289,7 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); extern int ntfs_truncate(struct inode *vi); extern void ntfs_truncate_vfs(struct inode *vi); -extern int ntfs_setattr(struct user_namespace *mnt_userns, +extern int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int __ntfs_write_inode(struct inode *vi, int sync); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index d294cd975688..e9bdc1ff08c9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -70,7 +70,7 @@ static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) /* * ntfs_getattr - inode_operations::getattr */ -int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags) { struct inode *inode = d_inode(path->dentry); @@ -84,7 +84,7 @@ int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; @@ -657,7 +657,7 @@ out: /* * ntfs3_setattr - inode_operations::setattr */ -int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct super_block *sb = dentry->d_sb; @@ -676,7 +676,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ia_valid = attr->ia_valid; } - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -704,10 +704,10 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode->i_size = 
newsize; } - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, dentry); + err = ntfs_acl_chmod(idmap, dentry); if (err) goto out; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 20b953871574..8ce2616b087f 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -1185,7 +1185,7 @@ out: * * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked */ -struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, @@ -1307,7 +1307,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, goto out3; } inode = &ni->vfs_inode; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); mode = inode->i_mode; inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime = @@ -1614,7 +1614,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { - err = ntfs_init_acl(mnt_userns, inode, dir); + err = ntfs_init_acl(idmap, inode, dir); if (err) goto out7; } else diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index c8db35e2ae17..407fe92394e2 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -94,12 +94,12 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, /* * ntfs_create - inode_operations::create */ -static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; @@ -110,12 +110,12 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, * * inode_operations::mknod */ -static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -183,13 +183,13 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) /* * ntfs_symlink - inode_operations::symlink */ -static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { u32 size = strlen(symname); struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -198,12 +198,12 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, /* * ntfs_mkdir- inode_operations::mkdir */ -static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; @@ -229,7 +229,7 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) /* * ntfs_rename - inode_operations::rename */ -static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode *new_dir, struct dentry *new_dentry, u32 flags) { @@ -415,13 +415,13 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, /* * Unfortunately I don't know how to get here correct 'struct nameidata *nd' - * or 'struct user_namespace *mnt_userns'. + * or 'struct mnt_idmap *idmap'. * See atomic_open in fs/namei.c. * This is why xfstest/633 failed. - * Looks like ntfs_atomic_open must accept 'struct user_namespace *mnt_userns' as argument. + * Looks like ntfs_atomic_open must accept 'struct mnt_idmap *idmap' as argument. */ - inode = ntfs_create_inode(&init_user_ns, dir, dentry, uni, mode, 0, + inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? 
PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 0e051c5595a2..80072e5f96f7 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -492,10 +492,12 @@ bool dir_is_empty(struct inode *dir); extern const struct file_operations ntfs_dir_operations; /* Globals from file.c */ -int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); -int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -706,7 +708,7 @@ int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); int inode_write_data(struct inode *inode, const void *data, size_t bytes); -struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, @@ -857,17 +859,17 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); -int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir); 
#else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); -int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); +int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *ntfs_xattr_handlers[]; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 616df209feea..ff64302e87e5 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -578,7 +578,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) return ntfs_get_acl_ex(inode, type, 0); } -static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, +static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, struct inode *inode, struct posix_acl *acl, int type, bool init_acl) { @@ -597,7 +597,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, case ACL_TYPE_ACCESS: /* Do not change i_mode if we are in init_acl */ if (acl && !init_acl) { - err = posix_acl_update_mode(mnt_userns, inode, &mode, + err = posix_acl_update_mode(idmap, inode, &mode, &acl); if (err) return err; @@ -652,10 +652,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); + return ntfs_set_acl_ex(idmap, d_inode(dentry), acl, type, false); } /* @@ -663,7 +663,7 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * * Called from ntfs_create_inode(). 
*/ -int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; @@ -674,7 +674,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, return err; if (default_acl) { - err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + err = ntfs_set_acl_ex(idmap, inode, default_acl, ACL_TYPE_DEFAULT, true); posix_acl_release(default_acl); } else { @@ -683,7 +683,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, if (acl) { if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, + err = ntfs_set_acl_ex(idmap, inode, acl, ACL_TYPE_ACCESS, true); posix_acl_release(acl); } else { @@ -697,7 +697,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). */ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) +int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; @@ -708,13 +708,13 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + return posix_acl_chmod(idmap, dentry, inode->i_mode); } /* * ntfs_permission - inode_operations::permission */ -int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (ntfs_sb(inode->i_sb)->options->noacsrules) { @@ -722,7 +722,7 @@ int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, return 0; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /* @@ -835,7 +835,7 @@ out: * ntfs_setxattr - inode_operations::setxattr */ static noinline int ntfs_setxattr(const struct 
xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *de, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9f19cf9a5a9f..9fd03eaf15f8 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,7 +260,7 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; @@ -274,7 +274,7 @@ int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; - status = posix_acl_update_mode(&init_user_ns, inode, &mode, + status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (status) goto unlock; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index a897c4e41b26..667c6f03fa60 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8b2020f92b5f..ba26c5567cff 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -188,18 +188,18 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. 
*/ -static int dlmfs_file_setattr(struct user_namespace *mnt_userns, +static int dlmfs_file_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -336,7 +336,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inc_nlink(inode); @@ -359,7 +359,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, parent, mode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); ip = DLMFS_I(inode); @@ -402,7 +402,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. 
*/ /* SMP-safe */ -static int dlmfs_mkdir(struct user_namespace * mnt_userns, +static int dlmfs_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) @@ -451,7 +451,7 @@ bail: return status; } -static int dlmfs_create(struct user_namespace *mnt_userns, +static int dlmfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5c60b6bc85bf..efb09de4343d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1111,7 +1111,7 @@ out: return ret; } -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; @@ -1142,11 +1142,11 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = setattr_prepare(&init_user_ns, dentry, attr); + status = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (status) return status; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { status = dquot_initialize(inode); if (status) return status; @@ -1265,7 +1265,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1302,7 +1302,7 @@ bail: return status; } -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -1317,7 +1317,7 @@ int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, goto bail; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, 
inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -1334,7 +1334,7 @@ bail: return err; } -int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret, had_lock; @@ -1360,7 +1360,7 @@ int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, dump_stack(); } - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. 
*/ - if (setattr_should_drop_suidgid(&init_user_ns, inode)) { + if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 71db8f3aa027..8e53e4ac1120 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -49,11 +49,11 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int ocfs2_permission(struct user_namespace *mnt_userns, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index afd54ec66103..811a6ea374bb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -82,7 +82,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return status; } -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, +int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 0297c8846945..48a5fdfe87a1 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -12,7 +12,7 @@ #define OCFS2_IOCTL_PROTO_H int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, +int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, 
unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 73a3854b2afb..f37174e79fad 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a8fd51afb794..9175dbc47201 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -197,8 +197,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. */ if (S_ISDIR(mode)) set_nlink(inode, 2); - mode = mode_strip_sgid(&init_user_ns, dir, mode); - inode_init_owner(&init_user_ns, inode, dir, mode); + mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); status = dquot_initialize(inode); if (status) return ERR_PTR(status); @@ -221,7 +221,7 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, iput(inode); } -static int ocfs2_mknod(struct user_namespace *mnt_userns, +static int ocfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -642,7 +642,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe_blkno, suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct user_namespace *mnt_userns, +static int ocfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -651,14 +651,14 @@ static int ocfs2_mkdir(struct user_namespace *mnt_userns, trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); return ret; } -static int ocfs2_create(struct user_namespace *mnt_userns, +static int ocfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -668,7 +668,7 @@ static int ocfs2_create(struct user_namespace 
*mnt_userns, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -1194,7 +1194,7 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int ocfs2_rename(struct user_namespace *mnt_userns, +static int ocfs2_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, @@ -1784,7 +1784,7 @@ bail: return status; } -static int ocfs2_symlink(struct user_namespace *mnt_userns, +static int ocfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 623db358b1ef..5a656dc683f1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4316,7 +4316,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4370,7 +4370,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. 
*/ if (!preserve) { - error = inode_permission(&init_user_ns, inode, MAY_READ); + error = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (error) return error; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 64e6ddcfe329..05d4414d0c33 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7..389308efe854 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7247,7 +7247,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7320,7 +7320,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7351,7 +7351,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c219f91f44e9..82cf7e9a665f 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,13 +279,13 @@ out_free_inode: return err; } -static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { 
return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int omfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return omfs_add_node(dir, dentry, mode | S_IFREG); @@ -370,7 +370,7 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, return true; } -static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 3a5b4b88a583..0101f1f87b56 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -337,13 +337,13 @@ const struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -static int omfs_setattr(struct user_namespace *mnt_userns, +static int omfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -356,7 +356,7 @@ static int omfs_setattr(struct user_namespace *mnt_userns, omfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 2a0e83236c01..c4c79e07efc7 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) goto fail; inode->i_ino = new_block; - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/open.c b/fs/open.c index 
82c1a28b3308..8038cf652583 100644 --- a/fs/open.c +++ b/fs/open.c @@ -33,10 +33,11 @@ #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" -int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; @@ -54,7 +55,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(mnt_userns, dentry); + ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) @@ -62,14 +63,14 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ - ret = notify_change(mnt_userns, dentry, &newattrs, NULL); + ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; long error; @@ -85,8 +86,8 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto out; - mnt_userns = mnt_user_ns(path->mnt); - error = inode_permission(mnt_userns, inode, MAY_WRITE); + idmap = mnt_idmap(path->mnt); + error = inode_permission(idmap, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -108,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = security_path_truncate(path); if (!error) - error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); + error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -190,7 +191,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) sb_start_write(inode->i_sb); error = 
security_file_truncate(f.file); if (!error) - error = do_truncate(file_mnt_user_ns(f.file), dentry, length, + error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: @@ -459,7 +460,7 @@ retry: goto out_path_release; } - res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); + res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -603,7 +604,7 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(mnt_user_ns(path->mnt), path->dentry, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); @@ -701,7 +702,8 @@ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) int chown_common(const struct path *path, uid_t user, gid_t group) { - struct user_namespace *mnt_userns, *fs_userns; + struct mnt_idmap *idmap; + struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -712,7 +714,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: @@ -726,14 +728,14 @@ retry_deleg: inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | - setattr_should_drop_sgid(mnt_userns, inode); + setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. 
*/ error = security_path_chown( path, - from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), - from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); + from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) - error = notify_change(mnt_userns, path->dentry, &newattrs, + error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -870,7 +872,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(locks_inode(f), f->f_flags); + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; @@ -1064,7 +1066,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_user_ns(path->mnt), + error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) @@ -1411,8 +1413,9 @@ int filp_close(struct file *filp, fl_owner_t id) { int retval = 0; - if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index c5da2091cefb..5aefb705bcc8 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -118,7 +118,7 @@ out: return error; } -int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int orangefs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; @@ -136,7 +136,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * and "mode" to the new desired value. It is up to * us to propagate the new mode back to the server... 
*/ - error = posix_acl_update_mode(&init_user_ns, inode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl); if (error) { gossip_err("%s: posix_acl_update_mode err: %d\n", diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9..4ecb91a9bbeb 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> static int flush_racache(struct inode *inode) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 4df560894386..11e21a0e65ce 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -822,7 +822,7 @@ again: ORANGEFS_I(inode)->attr_uid = current_fsuid(); ORANGEFS_I(inode)->attr_gid = current_fsgid(); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); spin_unlock(&inode->i_lock); mark_inode_dirty(inode); @@ -839,20 +839,20 @@ int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) ret = __orangefs_setattr(inode, iattr); /* change mode on a file that has ACLs */ if (!ret && (iattr->ia_valid & ATTR_MODE)) - ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + ret = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return ret; } /* * Change attributes of an object referenced by dentry. 
*/ -int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int orangefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int ret; gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", dentry); - ret = setattr_prepare(&init_user_ns, dentry, iattr); + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (ret) goto out; ret = __orangefs_setattr_mode(dentry, iattr); @@ -866,7 +866,7 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { int ret; @@ -879,7 +879,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -890,7 +890,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, return ret; } -int orangefs_permission(struct user_namespace *mnt_userns, +int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -905,7 +905,7 @@ int orangefs_permission(struct user_namespace *mnt_userns, if (ret < 0) return ret; - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) @@ -944,7 +944,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -static int orangefs_fileattr_set(struct user_namespace *mnt_userns, +static int orangefs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { u64 val = 0; diff 
--git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 75c1a3dcf68c..77518e248cf7 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -15,7 +15,7 @@ /* * Get a newly allocated inode to go with a negative dentry. */ -static int orangefs_create(struct user_namespace *mnt_userns, +static int orangefs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -216,7 +216,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) return ret; } -static int orangefs_symlink(struct user_namespace *mnt_userns, +static int orangefs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -305,7 +305,7 @@ out: return ret; } -static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); @@ -375,7 +375,7 @@ out: return ret; } -static int orangefs_rename(struct user_namespace *mnt_userns, +static int orangefs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 6e0cc01b3a14..ce20d3443869 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -106,7 +106,7 @@ enum orangefs_vfs_op_states { extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); -extern int orangefs_set_acl(struct user_namespace *mnt_userns, +extern int orangefs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); @@ -362,12 +362,12 @@ struct inode *orangefs_new_inode(struct super_block *sb, int __orangefs_setattr(struct inode *, struct iattr *); int __orangefs_setattr_mode(struct 
dentry *dentry, struct iattr *iattr); -int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int orangefs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); -int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int orangefs_permission(struct user_namespace *mnt_userns, +int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int orangefs_update_time(struct inode *, struct timespec64 *, int); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 9a5b757fbd2f..6ecad4f94ae6 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -526,7 +526,7 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 6e4e65ee050d..c14e90764e35 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -792,7 +792,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (!c->metacopy && c->stat.size) { err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); if (err) - return err; + goto out_fput; } err = ovl_copy_up_metadata(c, temp); @@ -1011,6 +1011,10 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (err) return err; + if (!kuid_has_mapping(current_user_ns(), ctx.stat.uid) || + !kgid_has_mapping(current_user_ns(), ctx.stat.gid)) + return -EOVERFLOW; + ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); if (parent) { diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f61e37f4c8ff..fc25fb95d5fc 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -641,7 +641,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, 
inode->i_state |= I_CREATING; spin_unlock(&inode->i_lock); - inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode); + inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; err = ovl_create_or_link(dentry, inode, &attr, false); @@ -655,19 +655,19 @@ out: return err; } -static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); } -static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { /* Don't allow creation of "whiteout" on overlay */ @@ -677,7 +677,7 @@ static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, return ovl_create_object(dentry, mode, rdev, NULL); } -static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *link) { return ovl_create_object(dentry, S_IFLNK, 0, link); @@ -1075,7 +1075,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, +static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *old, struct inode *newdir, struct dentry *new, unsigned int flags) { diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index a25bb3453dde..defd4e231ad2 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -392,8 +392,8 @@ 
static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); /* - * No mnt_userns handling here: it's an internal lookup. Could skip - * permission checking altogether, but for now just use non-mnt_userns + * No idmap handling here: it's an internal lookup. Could skip + * permission checking altogether, but for now just use non-idmap * transformed ids. */ this = lookup_one_len(name.name.name, connected, name.name.len); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index c9d0c362c7ef..7c04f033aadd 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -42,7 +42,7 @@ static struct file *ovl_open_realfile(const struct file *file, { struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); - struct user_namespace *real_mnt_userns; + struct mnt_idmap *real_idmap; struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; @@ -53,12 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - real_mnt_userns = mnt_user_ns(realpath->mnt); - err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode); + real_idmap = mnt_idmap(realpath->mnt); + err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(real_mnt_userns, realinode)) + if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ee6dfa577c93..541cf3717fc2 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,7 +19,7 @@ #include "overlayfs.h" -int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; @@ -28,7 +28,7 @@ int ovl_setattr(struct 
user_namespace *mnt_userns, struct dentry *dentry, struct dentry *upperdentry; const struct cred *old_cred; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -153,7 +153,7 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) } } -int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -278,7 +278,7 @@ out: return err; } -int ovl_permission(struct user_namespace *mnt_userns, +int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); @@ -298,7 +298,7 @@ int ovl_permission(struct user_namespace *mnt_userns, * Check overlay inode with the creds of task and underlying inode * with creds of mounter */ - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); if (err) return err; @@ -310,7 +310,7 @@ int ovl_permission(struct user_namespace *mnt_userns, /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask); + err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask); revert_creds(old_cred); return err; @@ -361,7 +361,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0); + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -403,7 +403,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ovl_i_path_real(inode, &realpath); old_cred = 
ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size); + res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); revert_creds(old_cred); return res; } @@ -463,7 +463,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * alter the POSIX ACLs for the underlying filesystem. */ static void ovl_idmap_posix_acl(const struct inode *realinode, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct posix_acl *acl) { struct user_namespace *fs_userns = i_user_ns(realinode); @@ -475,11 +475,11 @@ static void ovl_idmap_posix_acl(const struct inode *realinode, struct posix_acl_entry *e = &acl->a_entries[i]; switch (e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid); + vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid); e->e_uid = vfsuid_into_kuid(vfsuid); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid); + vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid); e->e_gid = vfsgid_into_kgid(vfsgid); break; } @@ -514,15 +514,15 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm) { struct posix_acl *real_acl, *clone; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *realinode = d_inode(path->dentry); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); if (noperm) real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); else - real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name); + real_acl = vfs_get_acl(idmap, path->dentry, acl_name); if (IS_ERR_OR_NULL(real_acl)) return real_acl; @@ -540,7 +540,7 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, if (!clone) return ERR_PTR(-ENOMEM); - ovl_idmap_posix_acl(realinode, mnt_userns, clone); + ovl_idmap_posix_acl(realinode, idmap, clone); return clone; } @@ -555,7 +555,7 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, * * This is 
obviously only relevant when idmapped layers are used. */ -struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm) { @@ -618,7 +618,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, + real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); revert_creds(old_cred); if (IS_ERR(real_acl)) { @@ -651,7 +651,7 @@ out_drop_write: return err; } -int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int err; @@ -665,7 +665,7 @@ int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? 
-EACCES : 0; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; /* @@ -674,10 +674,10 @@ int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, */ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) { struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - err = ovl_setattr(&init_user_ns, dentry, &iattr); + err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr); if (err) return err; } @@ -755,10 +755,10 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) if (err) return err; - return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa); + return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa); } -int ovl_fileattr_set(struct user_namespace *mnt_userns, +int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 46753134533a..cfb3420b7df0 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -204,7 +204,7 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -711,7 +711,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name, + index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, ofs->indexdir, name.len); if 
(IS_ERR(index)) { err = PTR_ERR(index); @@ -1182,7 +1182,7 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt), + this = lookup_one_positive_unlocked(mnt_idmap(poe->lowerstack[i].layer->mnt), name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 1df7f850ff3b..4d0b278f5630 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -141,13 +141,13 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -156,7 +156,7 @@ static inline int ovl_do_rmdir(struct ovl_fs *ofs, static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL); + int err = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; @@ -165,7 +165,8 @@ static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL); + int err = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir, + new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; @@ -175,7 +176,7 @@ 
static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -185,7 +186,7 @@ static inline int ovl_do_mkdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode); + int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } @@ -194,7 +195,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; @@ -204,7 +205,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -217,7 +218,7 @@ static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); - err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry, + err = vfs_getxattr(mnt_idmap(path->mnt), path->dentry, name, value, size); len = (value && err > 0) ? 
err : 0; @@ -251,7 +252,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + int err = vfs_setxattr(ovl_upper_mnt_idmap(ofs), dentry, name, value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", @@ -269,7 +270,7 @@ static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name) { - int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name); + int err = vfs_removexattr(ovl_upper_mnt_idmap(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } @@ -283,13 +284,13 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name, struct posix_acl *acl) { - return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl); + return vfs_set_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name, acl); } static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name) { - return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name); + return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, @@ -298,10 +299,10 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, { int err; struct renamedata rd = { - .old_mnt_userns = ovl_upper_mnt_userns(ofs), + .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_dir = olddir, .old_dentry = olddentry, - .new_mnt_userns = ovl_upper_mnt_userns(ofs), + .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_dir = newdir, .new_dentry = newdentry, .flags = flags, @@ -319,7 +320,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, static 
inline int ovl_do_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry); + int err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } @@ -328,7 +329,7 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; - struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, O_LARGEFILE | O_WRONLY, current_cred()); int err = PTR_ERR_OR_ZERO(file); @@ -340,7 +341,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { - return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len); + return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); } static inline bool ovl_open_flags_need_copy_up(int flags) @@ -596,11 +597,11 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); @@ -609,20 +610,20 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct 
dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm); static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, bool rcu) { - return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); + return do_ovl_get_acl(&nop_mnt_idmap, inode, type, rcu, true); } -static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, +static inline struct posix_acl *ovl_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { - return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); + return do_ovl_get_acl(idmap, d_inode(dentry), type, false, false); } -int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm); @@ -717,7 +718,7 @@ void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ovl_fileattr_set(struct user_namespace *mnt_userns, +int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* copy_up.c */ diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e1af8f660698..fd11fe6d6d45 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -90,9 +90,9 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) return ofs->layers[0].mnt; } -static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs) +static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs) { - return mnt_user_ns(ovl_upper_mnt(ofs)); + 
return mnt_idmap(ovl_upper_mnt(ofs)); } static inline struct ovl_fs *OVL_FS(struct super_block *sb) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 8cd2b9947de1..b6952b21a7ee 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -278,7 +278,7 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -480,7 +480,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry goto get; } } - this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 85b891152a2c..f1d9f75f8786 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1012,7 +1012,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler, } static int ovl_own_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1028,7 +1028,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, } static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index bde291623c8c..923d66d131c1 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -491,7 
+491,7 @@ bool ovl_is_whiteout(struct dentry *dentry) struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); - struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); + struct mnt_idmap *real_idmap = mnt_idmap(path->mnt); int err, acc_mode; if (flags & ~(O_ACCMODE | O_LARGEFILE)) @@ -508,12 +508,12 @@ struct file *ovl_path_open(const struct path *path, int flags) BUG(); } - err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN); + err = inode_permission(real_idmap, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(real_mnt_userns, inode)) + if (inode_owner_or_capable(real_idmap, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); @@ -1101,16 +1101,16 @@ void ovl_copyattr(struct inode *inode) { struct path realpath; struct inode *realinode; - struct user_namespace *real_mnt_userns; + struct mnt_idmap *real_idmap; vfsuid_t vfsuid; vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); - real_mnt_userns = mnt_user_ns(realpath.mnt); + real_idmap = mnt_idmap(realpath.mnt); - vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); - vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_idmap, realinode); + vfsgid = i_gid_into_vfsgid(real_idmap, realinode); inode->i_uid = vfsuid_into_kuid(vfsuid); inode->i_gid = vfsgid_into_kgid(vfsgid); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d7bc81fc0840..5a76fb35923a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -28,6 +28,7 @@ #include <linux/security.h> #include <linux/evm.h> #include <linux/fsnotify.h> +#include <linux/filelock.h> #include "internal.h" @@ -111,7 +112,7 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, +static struct 
posix_acl *__get_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, int type) { @@ -154,7 +155,7 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, * we'll just create the negative cache entry. */ if (dentry && inode->i_op->get_acl) { - acl = inode->i_op->get_acl(mnt_userns, dentry, type); + acl = inode->i_op->get_acl(idmap, dentry, type); } else if (inode->i_op->get_inode_acl) { acl = inode->i_op->get_inode_acl(inode, type, false); } else { @@ -174,14 +175,14 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, * Cache the result, but only if our sentinel is still in place. */ posix_acl_dup(acl); - if (unlikely(cmpxchg(p, sentinel, acl) != sentinel)) + if (unlikely(!try_cmpxchg(p, &sentinel, acl))) posix_acl_release(acl); return acl; } struct posix_acl *get_inode_acl(struct inode *inode, int type) { - return __get_acl(&init_user_ns, NULL, inode, type); + return __get_acl(&nop_mnt_idmap, NULL, inode, type); } EXPORT_SYMBOL(get_inode_acl); @@ -372,7 +373,7 @@ EXPORT_SYMBOL(posix_acl_from_mode); * by the acl. Returns -E... otherwise. 
*/ int -posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, +posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; @@ -387,18 +388,18 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, + vfsuid = make_vfsuid(idmap, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) @@ -406,7 +407,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, + vfsgid = make_vfsgid(idmap, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; @@ -591,18 +592,18 @@ EXPORT_SYMBOL(__posix_acl_chmod); /** * posix_acl_chmod - chmod a posix acl * - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @dentry: dentry to check permissions on * @mode: the new mode of @inode * - * If the dentry has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the dentry has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. 
On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. */ int - posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, + posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { struct inode *inode = d_inode(dentry); @@ -624,7 +625,7 @@ int ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -683,7 +684,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: target inode * @mode_p: mode (pointer) for update * @acl: acl pointer @@ -695,15 +696,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. * * Called from set_acl inode operations. 
*/ -int posix_acl_update_mode(struct user_namespace *mnt_userns, +int posix_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -715,8 +716,8 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -893,7 +894,6 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; - struct user_namespace *mnt_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -909,19 +909,18 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); - mnt_userns = mnt_idmap_owner(idmap); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid); + vfsuid = make_vfsuid(idmap, fs_userns, acl_e->e_uid); ext_entry->e_id = cpu_to_le32(from_kuid( caller_userns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid); + vfsgid = make_vfsgid(idmap, fs_userns, acl_e->e_gid); ext_entry->e_id = cpu_to_le32(from_kgid( caller_userns, vfsgid_into_kgid(vfsgid))); break; @@ -934,7 +933,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, } int -set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, int 
type, struct posix_acl *acl) { struct inode *inode = d_inode(dentry); @@ -946,7 +945,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (acl) { @@ -954,7 +953,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (ret) return ret; } - return inode->i_op->set_acl(mnt_userns, dentry, acl, type); + return inode->i_op->set_acl(idmap, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); @@ -978,14 +977,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(mnt_userns, inode, + error = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (error) return error; @@ -1017,7 +1016,7 @@ int simple_acl_create(struct inode *dir, struct inode *inode) return 0; } -static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, +static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap, struct user_namespace *fs_userns, struct posix_acl *acl) { @@ -1026,11 +1025,11 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns, + acl_e->e_uid = from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(acl_e->e_uid)); break; case ACL_GROUP: - acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns, + acl_e->e_gid = from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(acl_e->e_gid)); break; } @@ -1041,7 +1040,7 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, /** * 
vfs_set_acl - set posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to set the posix acls * @acl_name: the name of the posix acl * @kacl: the posix acls in the appropriate VFS format @@ -1051,7 +1050,7 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, * * Return: On success 0, on error negative errno. */ -int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int acl_type; @@ -1071,7 +1070,7 @@ int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * if this is a filesystem with a backing store - ultimately * translate them to backing store values. */ - error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl); + error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl); if (error) return error; } @@ -1083,11 +1082,11 @@ retry_deleg: * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
*/ - error = may_write_xattr(mnt_userns, inode); + error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; - error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + error = security_inode_set_acl(idmap, dentry, acl_name, kacl); if (error) goto out_inode_unlock; @@ -1096,7 +1095,7 @@ retry_deleg: goto out_inode_unlock; if (inode->i_opflags & IOP_XATTR) - error = set_posix_acl(mnt_userns, dentry, acl_type, kacl); + error = set_posix_acl(idmap, dentry, acl_type, kacl); else if (unlikely(is_bad_inode(inode))) error = -EIO; else @@ -1121,7 +1120,7 @@ EXPORT_SYMBOL_GPL(vfs_set_acl); /** * vfs_get_acl - get posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * @@ -1130,7 +1129,7 @@ EXPORT_SYMBOL_GPL(vfs_set_acl); * * Return: On success POSIX ACLs in VFS format, on error negative errno. */ -struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct inode *inode = d_inode(dentry); @@ -1145,7 +1144,7 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, * The VFS has no restrictions on reading POSIX ACLs so calling * something like xattr_permission() isn't needed. Only LSMs get a say. 
*/ - error = security_inode_get_acl(mnt_userns, dentry, acl_name); + error = security_inode_get_acl(idmap, dentry, acl_name); if (error) return ERR_PTR(error); @@ -1154,7 +1153,7 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, if (S_ISLNK(inode->i_mode)) return ERR_PTR(-EOPNOTSUPP); - acl = __get_acl(mnt_userns, dentry, inode, acl_type); + acl = __get_acl(idmap, dentry, inode, acl_type); if (IS_ERR(acl)) return acl; if (!acl) @@ -1166,7 +1165,7 @@ EXPORT_SYMBOL_GPL(vfs_get_acl); /** * vfs_remove_acl - remove posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * @@ -1174,7 +1173,7 @@ EXPORT_SYMBOL_GPL(vfs_get_acl); * * Return: On success 0, on error negative errno. */ -int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int acl_type; @@ -1193,11 +1192,11 @@ retry_deleg: * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
*/ - error = may_write_xattr(mnt_userns, inode); + error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; - error = security_inode_remove_acl(mnt_userns, dentry, acl_name); + error = security_inode_remove_acl(idmap, dentry, acl_name); if (error) goto out_inode_unlock; @@ -1206,14 +1205,14 @@ retry_deleg: goto out_inode_unlock; if (inode->i_opflags & IOP_XATTR) - error = set_posix_acl(mnt_userns, dentry, acl_type, NULL); + error = set_posix_acl(idmap, dentry, acl_type, NULL); else if (unlikely(is_bad_inode(inode))) error = -EIO; else error = -EOPNOTSUPP; if (!error) { fsnotify_xattr(dentry); - evm_inode_post_remove_acl(mnt_userns, dentry, acl_name); + evm_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: @@ -1245,7 +1244,7 @@ int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return PTR_ERR(acl); } - error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); + error = vfs_set_acl(idmap, dentry, acl_name, acl); posix_acl_release(acl); return error; } @@ -1256,7 +1255,7 @@ ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, ssize_t error; struct posix_acl *acl; - acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); + acl = vfs_get_acl(idmap, dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..5e0e0ccd47aa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -685,7 +685,7 @@ static bool proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; @@ -694,11 +694,11 @@ int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - 
setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -727,7 +727,7 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info, } -static int proc_pid_permission(struct user_namespace *mnt_userns, +static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); @@ -753,7 +753,7 @@ static int proc_pid_permission(struct user_namespace *mnt_userns, return -EPERM; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } @@ -1959,14 +1959,14 @@ static struct inode *proc_pid_make_base_inode(struct super_block *sb, return inode; } -int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, +int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3557,7 +3557,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. 
*/ -static int proc_tid_comm_permission(struct user_namespace *mnt_userns, +static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; @@ -3577,7 +3577,7 @@ static int proc_tid_comm_permission(struct user_namespace *mnt_userns, return 0; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { @@ -3891,13 +3891,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(struct user_namespace *mnt_userns, +static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index fc46d6fe080c..b3140deebbbf 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/proc_fs.h> @@ -325,13 +326,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
*/ -int proc_fd_permission(struct user_namespace *mnt_userns, +int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(&init_user_ns, inode, mask); + rv = generic_permission(&nop_mnt_idmap, inode, mask); if (rv == 0) return rv; @@ -344,14 +345,14 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } -static int proc_fd_getattr(struct user_namespace *mnt_userns, +static int proc_fd_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); int rv = 0; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* If it's a directory, put the number of open fds there */ if (S_ISDIR(inode->i_mode)) { diff --git a/fs/proc/fd.h b/fs/proc/fd.h index c5a921a06a0b..7e7265f7e06f 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -10,7 +10,7 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct user_namespace *mnt_userns, +extern int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 587b91d9d998..8379593fa4bb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -115,18 +115,18 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct user_namespace *mnt_userns, +static int proc_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); 
if (error) return error; - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); @@ -134,7 +134,7 @@ static int proc_notify_change(struct user_namespace *mnt_userns, return 0; } -static int proc_getattr(struct user_namespace *mnt_userns, +static int proc_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -147,7 +147,7 @@ static int proc_getattr(struct user_namespace *mnt_userns, } } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index b701d0207edf..9dda7e54b2d0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,9 +162,9 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(struct user_namespace *, const struct path *, +extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct user_namespace *, struct dentry *, +extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 856839b8ae8b..a0c0419872e3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -299,7 +299,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, +static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, net 
= get_proc_task_net(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 48f2d60bd78a..e89bd8f1368b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -798,7 +798,7 @@ out: return 0; } -static int proc_sys_permission(struct user_namespace *mnt_userns, +static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -827,7 +827,7 @@ static int proc_sys_permission(struct user_namespace *mnt_userns, return error; } -static int proc_sys_setattr(struct user_namespace *mnt_userns, +static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -836,16 +836,16 @@ static int proc_sys_setattr(struct user_namespace *mnt_userns, if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -static int proc_sys_getattr(struct user_namespace *mnt_userns, +static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -856,7 +856,7 @@ static int proc_sys_getattr(struct user_namespace *mnt_userns, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; diff --git a/fs/proc/root.c b/fs/proc/root.c index 3c2ee3eb1138..a86e65a608da 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -310,11 +310,11 @@ void __init proc_root_init(void) register_filesystem(&proc_fs_type); } -static int 
proc_root_getattr(struct user_namespace *mnt_userns, +static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e35a0398db63..af1c49ae11b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, page = pfn_swap_entry_to_page(swpent); } if (page) { - int mapcount = page_mapcount(page); - - if (mapcount >= 2) + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f27faf5db554..a6357f728034 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,7 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, +int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; @@ -2096,8 +2096,8 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, if (!dquot_active(inode)) return 0; - if (i_uid_needs_update(mnt_userns, iattr, inode)) { - kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + if (i_uid_needs_update(idmap, iattr, inode)) { + kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); @@ -2110,8 +2110,8 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, } transfer_to[USRQUOTA] = dquot; } - if (i_gid_needs_update(mnt_userns, iattr, inode)) { - 
kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + if (i_gid_needs_update(idmap, iattr, inode)) { + kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index cb240eac5036..5bf74c2f6042 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include "internal.h" -static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *); +static int ramfs_nommu_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, @@ -158,7 +158,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) * handle a change of attributes * - we're specifically interested in a change of size */ -static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, +static int ramfs_nommu_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { struct inode *inode = d_inode(dentry); @@ -166,7 +166,7 @@ static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, int ret = 0; /* POSIX UID/GID verification for setting inode attributes */ - ret = setattr_prepare(&init_user_ns, dentry, ia); + ret = setattr_prepare(&nop_mnt_idmap, dentry, ia); if (ret) return ret; @@ -186,7 +186,7 @@ static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, } } - setattr_copy(&init_user_ns, inode, ia); + setattr_copy(&nop_mnt_idmap, inode, ia); out: ia->ia_valid = old_ia_valid; return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b3257e852820..5ba580c78835 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); 
inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); @@ -95,7 +95,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, */ /* SMP-safe */ static int -ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); @@ -110,22 +110,22 @@ ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } -static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; @@ -145,7 +145,7 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int ramfs_tmpfile(struct user_namespace *mnt_userns, +static int ramfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index 29c503a06db4..2571b1a8be84 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,7 +49,7 @@ static inline int 
reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c7d1fa526dea..d54cab854f60 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3262,21 +3262,21 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3359,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(mnt_userns, inode, attr); + error = dquot_transfer(&nop_mnt_idmap, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); @@ -3398,7 +3398,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (!error) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/ioctl.c 
b/fs/reiserfs/ioctl.c index 4b86ecf5817e..6bf9b54e58ca 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -24,7 +24,7 @@ int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int reiserfs_fileattr_set(struct user_namespace *mnt_userns, +int reiserfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -96,7 +96,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!inode_owner_or_capable(&init_user_ns, inode)) { + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { err = -EPERM; break; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 0b8aa99749f1..42d2c20e1345 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -616,11 +616,11 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * the quota init calls have to know who to charge the quota to, so * we have to set uid and gid here */ - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return dquot_initialize(inode); } -static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int retval; @@ -700,7 +700,7 @@ out_failed: return retval; } -static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; @@ -784,7 +784,7 @@ out_failed: return retval; } -static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval; @@ -1099,7 +1099,7 @@ out_unlink: 
return retval; } -static int reiserfs_symlink(struct user_namespace *mnt_userns, +static int reiserfs_symlink(struct mnt_idmap *idmap, struct inode *parent_dir, struct dentry *dentry, const char *symname) { @@ -1311,7 +1311,7 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, * one path. If it holds 2 or more, it can get into endless waiting in * get_empty_nodes or its clones */ -static int reiserfs_rename(struct user_namespace *mnt_userns, +static int reiserfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 3aa928ec527a..98e6f53c2fe0 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3100,7 +3100,7 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); @@ -3407,7 +3407,7 @@ __u32 r5_hash(const signed char *msg, int len); /* prototypes from ioctl.c */ int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int reiserfs_fileattr_set(struct user_namespace *mnt_userns, +int reiserfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long reiserfs_compat_ioctl(struct file *filp, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8b2d52443f41..06d810c72c52 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,14 +66,14 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->create(&init_user_ns, dir, dentry, mode, 
true); + return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode); + return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode); } /* @@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) * ATTR_MODE is set. */ attrs->ia_valid &= (ATTR_UID|ATTR_GID); - err = reiserfs_setattr(&init_user_ns, dentry, attrs); + err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs); attrs->ia_valid = ia_valid; return err; @@ -597,7 +597,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); - err = reiserfs_setattr(&init_user_ns, dentry, &newattrs); + err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs); inode_unlock(d_inode(dentry)); } else update_ctime(inode); @@ -941,7 +941,7 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -951,7 +951,7 @@ int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, if (IS_PRIVATE(inode)) return 0; - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index e47fde1182de..5868a4e990e3 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -16,7 +16,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct user_namespace 
*mnt_userns, +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 93fe414fed18..138060452678 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -42,7 +42,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error == 0) { if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&init_user_ns, inode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) goto unlock; @@ -407,5 +407,5 @@ int reiserfs_acl_chmod(struct dentry *dentry) !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 857a65b05726..41c0ea84fbff 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -22,7 +22,7 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, static int security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index d853cea2afcd..0c0c74d8db0e 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -21,7 +21,7 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, static int trusted_set(const struct xattr_handler 
*handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 65d9cd10a5ea..88195181e1d7 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -18,7 +18,7 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, +user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/remap_range.c b/fs/remap_range.c index 41f60477bb41..1331a890f2f2 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -419,16 +419,16 @@ EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool allow_file_dedupe(struct file *file) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return true; - if (!inode_permission(mnt_userns, inode, MAY_WRITE)) + if (!inode_permission(idmap, inode, MAY_WRITE)) return true; return false; } diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b3fdc8212c5f..95f8e8901768 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) /* xattr id lookup table defines */ -#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct 
squashfs_xattr_id)) +#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id)) #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ SQUASHFS_METADATA_SIZE) diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 659082e9e51d..72f6f4b37863 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -63,7 +63,7 @@ struct squashfs_sb_info { long long bytes_used; unsigned int inodes; unsigned int fragments; - int xattr_ids; + unsigned int xattr_ids; unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index d8a270d3ac4c..f1a463d8bfa0 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -10,12 +10,12 @@ #ifdef CONFIG_SQUASHFS_XATTR extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, - u64 *, int *); + u64 *, unsigned int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, - u64 start, u64 *xattr_table_start, int *xattr_ids) + u64 start, u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_xattr_id_table *id_table; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index 087cab8c78f4..c8469c656e0d 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, - u64 *xattr_table_start, int *xattr_ids) + u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; diff --git a/fs/stat.c b/fs/stat.c index d6cc74ca8486..7c238da22ef0 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -18,6 +18,7 @@ #include 
<linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> +#include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -27,7 +28,7 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to use as the source * @stat: Where to fill in the attributes * @@ -35,17 +36,17 @@ * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before filling in the + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. 
*/ -void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, +void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, struct kstat *stat) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; @@ -97,7 +98,7 @@ EXPORT_SYMBOL(generic_fill_statx_attr); int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); @@ -122,12 +123,17 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); - mnt_userns = mnt_user_ns(path->mnt); + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + + idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) - return inode->i_op->getattr(mnt_userns, path, stat, + return inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -602,9 +608,11 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) memset(&tmp, 0, sizeof(tmp)); - tmp.stx_mask = stat->result_mask; + /* STATX_CHANGE_COOKIE is kernel-only for now */ + tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; - tmp.stx_attributes = stat->attributes; + /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ + tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = 
from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); @@ -643,6 +651,11 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; + /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + * from userland. + */ + mask &= ~STATX_CHANGE_COOKIE; + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; diff --git a/fs/super.c b/fs/super.c index 904adfbacdcf..84332d5cb817 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -491,10 +490,23 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb); - if (!list_empty(&sb->s_inodes)) { - printk("VFS: Busy inodes after unmount of %s. " - "Self-destruct in 5 seconds. Have a nice day...\n", - sb->s_id); + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + "VFS: Busy inodes after unmount of %s (%s)", + sb->s_id, sb->s_type->name)) { + /* + * Adding a proper bailout path here would be hard, but + * we can at least make it more likely that a later + * iput_final() or such crashes cleanly. 
+ */ + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + inode->i_op = VFS_PTR_POISON; + inode->i_sb = VFS_PTR_POISON; + inode->i_mapping = VFS_PTR_POISON; + } + spin_unlock(&sb->s_inode_list_lock); } } spin_lock(&sb_lock); diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 90e00124ea07..50eb92557a0f 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -29,13 +29,13 @@ const struct file_operations sysv_file_operations = { .splice_read = generic_file_splice_read, }; -static int sysv_setattr(struct user_namespace *mnt_userns, +static int sysv_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -48,7 +48,7 @@ static int sysv_setattr(struct user_namespace *mnt_userns, sysv_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 50df794a3c1f..e732879036ab 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 3b8567564e7e..b22764fe669c 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -441,11 +441,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) return res; } -int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, +int 
sysv_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b2e6abc06a2d..ecd424461511 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -41,7 +41,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un return d_splice_alias(inode, dentry); } -static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -61,13 +61,13 @@ static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return sysv_mknod(&init_user_ns, dir, dentry, mode, 0); + return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; @@ -110,7 +110,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; @@ -189,7 +189,7 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) * Anybody can rename anything with this: the 
permission checks are left to the * higher-level routines. */ -static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 99ddf033da4f..5e122a5673c1 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -141,7 +141,7 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct user_namespace *, const struct path *, +extern int sysv_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int sysv_init_icache(void); extern void sysv_destroy_icache(void); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index da85b3979195..57ac8aa4a724 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -67,7 +67,7 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns, +static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 0f29cf201136..1e92c1730c16 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -95,7 +95,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, */ inode->i_flags |= S_NOCMTIME; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mapping->nrpages = 0; @@ -283,7 +283,7 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry, return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm); } -static int 
ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -426,7 +426,7 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct dentry *dentry = file->f_path.dentry; @@ -979,7 +979,7 @@ out_fname: return err; } -static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -1052,7 +1052,7 @@ out_budg: return err; } -static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -1141,7 +1141,7 @@ out_budg: return err; } -static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; @@ -1606,7 +1606,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int ubifs_rename(struct user_namespace *mnt_userns, +static int ubifs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1631,7 +1631,7 @@ static int ubifs_rename(struct user_namespace *mnt_userns, return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, 
struct kstat *stat, u32 request_mask, unsigned int flags) { loff_t size; @@ -1654,7 +1654,7 @@ int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f2353dd676ef..8cb5d76b301c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1258,7 +1258,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, return err; } -int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; @@ -1267,7 +1267,7 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, dbg_gen("ino %lu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -1608,11 +1608,11 @@ static const char *ubifs_get_link(struct dentry *dentry, return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } -static int ubifs_symlink_getattr(struct user_namespace *mnt_userns, +static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - ubifs_getattr(mnt_userns, path, stat, request_mask, query_flags); + ubifs_getattr(idmap, path, stat, request_mask, query_flags); if (IS_ENCRYPTED(d_inode(path->dentry))) return fscrypt_symlink_getattr(path, stat); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 71bcebe45f9c..67c5108abd89 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -144,7 +144,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ubifs_fileattr_set(struct user_namespace *mnt_userns, +int 
ubifs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 478bbbb5382f..9063b73536f8 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2020,15 +2020,15 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode, bool is_xattr); -int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); +int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ @@ -2085,7 +2085,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ubifs_fileattr_set(struct user_namespace *mnt_userns, +int ubifs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void ubifs_set_inode_flags(struct inode *inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3db8486e3725..349228dd1191 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -699,7 +699,7 @@ static int xattr_get(const struct xattr_handler *handler, } static int xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) 
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 8e597db4d971..14b9db4c80f0 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -36,18 +36,41 @@ static int read_block_bitmap(struct super_block *sb, unsigned long bitmap_nr) { struct buffer_head *bh = NULL; - int retval = 0; + int i; + int max_bits, off, count; struct kernel_lb_addr loc; loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); + bh = sb_bread(sb, udf_get_lb_pblock(sb, &loc, block)); + bitmap->s_block_bitmap[bitmap_nr] = bh; if (!bh) - retval = -EIO; + return -EIO; - bitmap->s_block_bitmap[bitmap_nr] = bh; - return retval; + /* Check consistency of Space Bitmap buffer. */ + max_bits = sb->s_blocksize * 8; + if (!bitmap_nr) { + off = sizeof(struct spaceBitmapDesc) << 3; + count = min(max_bits - off, bitmap->s_nr_groups); + } else { + /* + * Rough check if bitmap number is too big to have any bitmap + * blocks reserved. + */ + if (bitmap_nr > + (bitmap->s_nr_groups >> (sb->s_blocksize_bits + 3)) + 2) + return 0; + off = 0; + count = bitmap->s_nr_groups - bitmap_nr * max_bits + + (sizeof(struct spaceBitmapDesc) << 3); + count = min(count, max_bits); + } + + for (i = 0; i < count; i++) + if (udf_test_bit(i + off, bh->b_data)) + return -EFSCORRUPTED; + return 0; } static int __load_block_bitmap(struct super_block *sb, diff --git a/fs/udf/dir.c b/fs/udf/dir.c index be640f4b2f2c..212393b12c22 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -39,26 +39,13 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); - struct udf_inode_info *iinfo = UDF_I(dir); - struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; - struct fileIdentDesc *fi = NULL; - struct fileIdentDesc cfi; - udf_pblk_t block, iblock; loff_t nf_pos, emit_pos = 0; int flen; - unsigned char *fname = NULL, *copy_name = NULL; - unsigned char *nameptr; - uint16_t liu; - uint8_t lfi; - 
loff_t size = udf_ext0_offset(dir) + dir->i_size; - struct buffer_head *tmp, *bha[16]; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - int i, num, ret = 0; - struct extent_position epos = { NULL, 0, {0, 0} }; + unsigned char *fname = NULL; + int ret = 0; struct super_block *sb = dir->i_sb; bool pos_valid = false; + struct udf_fileident_iter iter; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -66,7 +53,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 1; } nf_pos = (ctx->pos - 1) << 2; - if (nf_pos >= size) + if (nf_pos >= dir->i_size) goto out; /* @@ -90,138 +77,57 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) goto out; } - if (nf_pos == 0) - nf_pos = udf_ext0_offset(dir); - - fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) - != (EXT_RECORDED_ALLOCATED >> 30)) { - ret = -ENOENT; - goto out; - } - block = udf_get_lb_pblock(sb, &eloc, offset); - if ((++offset << sb->s_blocksize_bits) < elen) { - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (iinfo->i_alloc_type == - ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else { - offset = 0; - } - - if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { - ret = -EIO; - goto out; - } - - if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { - i = 16 >> (sb->s_blocksize_bits - 9); - if (i + offset > (elen >> sb->s_blocksize_bits)) - i = (elen >> sb->s_blocksize_bits) - offset; - for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(sb, &eloc, offset + i); - tmp = udf_tgetblk(sb, block); - if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse(tmp); - } - if (num) { - bh_readahead_batch(num, bha, REQ_RAHEAD); - for (i = 0; i < num; i++) - brelse(bha[i]); - } - } - } - - while 
(nf_pos < size) { + for (ret = udf_fiiter_init(&iter, dir, nf_pos); + !ret && iter.pos < dir->i_size; + ret = udf_fiiter_advance(&iter)) { struct kernel_lb_addr tloc; - loff_t cur_pos = nf_pos; + udf_pblk_t iblock; - /* Update file position only if we got past the current one */ - if (nf_pos >= emit_pos) { - ctx->pos = (nf_pos >> 2) + 1; - pos_valid = true; - } - - fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, - &elen, &offset); - if (!fi) - goto out; /* Still not at offset where user asked us to read from? */ - if (cur_pos < emit_pos) + if (iter.pos < emit_pos) continue; - liu = le16_to_cpu(cfi.lengthOfImpUse); - lfi = cfi.lengthFileIdent; - - if (fibh.sbh == fibh.ebh) { - nameptr = udf_get_fi_ident(fi); - } else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; - - if (poffset >= lfi) { - nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); - } else { - if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN, - GFP_NOFS); - if (!copy_name) { - ret = -ENOMEM; - goto out; - } - } - nameptr = copy_name; - memcpy(nameptr, udf_get_fi_ident(fi), - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh.ebh->b_data, poffset); - } - } + /* Update file position only if we got past the current one */ + pos_valid = true; + ctx->pos = (iter.pos >> 2) + 1; - if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } - if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } - if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_PARENT) { if (!dir_emit_dotdot(file, ctx)) - goto out; + goto out_iter; continue; } - flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + flen = udf_get_filename(sb, 
iter.name, + iter.fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) continue; - tloc = lelb_to_cpu(cfi.icb.extLocation); + tloc = lelb_to_cpu(iter.fi.icb.extLocation); iblock = udf_get_lb_pblock(sb, &tloc, 0); if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) - goto out; - } /* end while */ - - ctx->pos = (nf_pos >> 2) + 1; - pos_valid = true; + goto out_iter; + } + if (!ret) { + ctx->pos = (iter.pos >> 2) + 1; + pos_valid = true; + } +out_iter: + udf_fiiter_release(&iter); out: if (pos_valid) file->f_version = inode_query_iversion(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); kfree(fname); - kfree(copy_name); return ret; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 16bcf2c6b8b3..654536d2b609 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -17,183 +17,478 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/bio.h> +#include <linux/crc-itu-t.h> +#include <linux/iversion.h> -struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi, - struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t *elen, - sector_t *offset) +static int udf_verify_fi(struct udf_fileident_iter *iter) { - struct fileIdentDesc *fi; - int i, num; - udf_pblk_t block; - struct buffer_head *tmp, *bha[16]; - struct udf_inode_info *iinfo = UDF_I(dir); - - fibh->soffset = fibh->eoffset; + unsigned int len; + + if (iter->fi.descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry at pos %llu with incorrect tag %x\n", + iter->dir->i_ino, (unsigned long long)iter->pos, + le16_to_cpu(iter->fi.descTag.tagIdent)); + return -EFSCORRUPTED; + } + len = udf_dir_entry_len(&iter->fi); + if (le16_to_cpu(iter->fi.lengthOfImpUse) & 3) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry at pos %llu with unaligned length of impUse field\n", + iter->dir->i_ino, (unsigned 
long long)iter->pos); + return -EFSCORRUPTED; + } + /* + * This is in fact allowed by the spec due to long impUse field but + * we don't support it. If there is real media with this large impUse + * field, support can be added. + */ + if (len > 1 << iter->dir->i_blkbits) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has too big (%u) entry at pos %llu\n", + iter->dir->i_ino, len, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + if (iter->pos + len > iter->dir->i_size) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry past directory size at pos %llu\n", + iter->dir->i_ino, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + if (udf_dir_entry_len(&iter->fi) != + sizeof(struct tag) + le16_to_cpu(iter->fi.descTag.descCRCLength)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry where CRC length (%u) does not match entry length (%u)\n", + iter->dir->i_ino, + (unsigned)le16_to_cpu(iter->fi.descTag.descCRCLength), + (unsigned)(udf_dir_entry_len(&iter->fi) - + sizeof(struct tag))); + return -EFSCORRUPTED; + } + return 0; +} +static int udf_copy_fi(struct udf_fileident_iter *iter) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + u32 blksize = 1 << iter->dir->i_blkbits; + u32 off, len, nameoff; + int err; + + /* Skip copying when we are at EOF */ + if (iter->pos >= iter->dir->i_size) { + iter->name = NULL; + return 0; + } + if (iter->dir->i_size < iter->pos + sizeof(struct fileIdentDesc)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry straddling EOF\n", + iter->dir->i_ino); + return -EFSCORRUPTED; + } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - fi = udf_get_fileident(iinfo->i_data - - (iinfo->i_efe ? 
- sizeof(struct extendedFileEntry) : - sizeof(struct fileEntry)), - dir->i_sb->s_blocksize, - &(fibh->eoffset)); - if (!fi) - return NULL; - - *nf_pos += fibh->eoffset - fibh->soffset; - - memcpy((uint8_t *)cfi, (uint8_t *)fi, + memcpy(&iter->fi, iinfo->i_data + iinfo->i_lenEAttr + iter->pos, sizeof(struct fileIdentDesc)); - - return fi; + err = udf_verify_fi(iter); + if (err < 0) + return err; + iter->name = iinfo->i_data + iinfo->i_lenEAttr + iter->pos + + sizeof(struct fileIdentDesc) + + le16_to_cpu(iter->fi.lengthOfImpUse); + return 0; } - if (fibh->eoffset == dir->i_sb->s_blocksize) { - uint32_t lextoffset = epos->offset; - unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; - - if (udf_next_aext(dir, epos, eloc, elen, 1) != - (EXT_RECORDED_ALLOCATED >> 30)) - return NULL; + off = iter->pos & (blksize - 1); + len = min_t(int, sizeof(struct fileIdentDesc), blksize - off); + memcpy(&iter->fi, iter->bh[0]->b_data + off, len); + if (len < sizeof(struct fileIdentDesc)) + memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data, + sizeof(struct fileIdentDesc) - len); + err = udf_verify_fi(iter); + if (err < 0) + return err; + + /* Handle directory entry name */ + nameoff = off + sizeof(struct fileIdentDesc) + + le16_to_cpu(iter->fi.lengthOfImpUse); + if (off + udf_dir_entry_len(&iter->fi) <= blksize) { + iter->name = iter->bh[0]->b_data + nameoff; + } else if (nameoff >= blksize) { + iter->name = iter->bh[1]->b_data + (nameoff - blksize); + } else { + iter->name = iter->namebuf; + len = blksize - nameoff; + memcpy(iter->name, iter->bh[0]->b_data + nameoff, len); + memcpy(iter->name + len, iter->bh[1]->b_data, + iter->fi.lengthFileIdent - len); + } + return 0; +} - block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); +/* Readahead 8k once we are at 8k boundary */ +static void udf_readahead_dir(struct udf_fileident_iter *iter) +{ + unsigned int ralen = 16 >> (iter->dir->i_blkbits - 9); + struct buffer_head *tmp, *bha[16]; + int i, num; + udf_pblk_t blk; + + if 
(iter->loffset & (ralen - 1)) + return; + + if (iter->loffset + ralen > (iter->elen >> iter->dir->i_blkbits)) + ralen = (iter->elen >> iter->dir->i_blkbits) - iter->loffset; + num = 0; + for (i = 0; i < ralen; i++) { + blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc, + iter->loffset + i); + tmp = sb_getblk(iter->dir->i_sb, blk); + if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + bh_readahead_batch(num, bha, REQ_RAHEAD); + for (i = 0; i < num; i++) + brelse(bha[i]); + } +} - (*offset)++; +static struct buffer_head *udf_fiiter_bread_blk(struct udf_fileident_iter *iter) +{ + udf_pblk_t blk; - if ((*offset << blocksize_bits) >= *elen) - *offset = 0; - else - epos->offset = lextoffset; + udf_readahead_dir(iter); + blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc, iter->loffset); + return sb_bread(iter->dir->i_sb, blk); +} - brelse(fibh->sbh); - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) - return NULL; - fibh->soffset = fibh->eoffset = 0; - - if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) { - i = 16 >> (blocksize_bits - 9); - if (i + *offset > (*elen >> blocksize_bits)) - i = (*elen >> blocksize_bits)-*offset; - for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, eloc, - *offset + i); - tmp = udf_tgetblk(dir->i_sb, block); - if (tmp && !buffer_uptodate(tmp) && - !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse(tmp); - } - if (num) { - bh_readahead_batch(num, bha, REQ_RAHEAD); - for (i = 0; i < num; i++) - brelse(bha[i]); - } +/* + * Updates loffset to point to next directory block; eloc, elen & epos are + * updated if we need to traverse to the next extent as well. 
+ */ +static int udf_fiiter_advance_blk(struct udf_fileident_iter *iter) +{ + iter->loffset++; + if (iter->loffset < DIV_ROUND_UP(iter->elen, 1<<iter->dir->i_blkbits)) + return 0; + + iter->loffset = 0; + if (udf_next_aext(iter->dir, &iter->epos, &iter->eloc, &iter->elen, 1) + != (EXT_RECORDED_ALLOCATED >> 30)) { + if (iter->pos == iter->dir->i_size) { + iter->elen = 0; + return 0; } - } else if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; + udf_err(iter->dir->i_sb, + "extent after position %llu not allocated in directory (ino %lu)\n", + (unsigned long long)iter->pos, iter->dir->i_ino); + return -EFSCORRUPTED; } + return 0; +} - fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize, - &(fibh->eoffset)); - - if (!fi) - return NULL; +static int udf_fiiter_load_bhs(struct udf_fileident_iter *iter) +{ + int blksize = 1 << iter->dir->i_blkbits; + int off = iter->pos & (blksize - 1); + int err; + struct fileIdentDesc *fi; - *nf_pos += fibh->eoffset - fibh->soffset; + /* Is there any further extent we can map from? */ + if (!iter->bh[0] && iter->elen) { + iter->bh[0] = udf_fiiter_bread_blk(iter); + if (!iter->bh[0]) { + err = -ENOMEM; + goto out_brelse; + } + if (!buffer_uptodate(iter->bh[0])) { + err = -EIO; + goto out_brelse; + } + } + /* There's no next block so we are done */ + if (iter->pos >= iter->dir->i_size) + return 0; + /* Need to fetch next block as well? */ + if (off + sizeof(struct fileIdentDesc) > blksize) + goto fetch_next; + fi = (struct fileIdentDesc *)(iter->bh[0]->b_data + off); + /* Need to fetch next block to get name? 
*/ + if (off + udf_dir_entry_len(fi) > blksize) { +fetch_next: + err = udf_fiiter_advance_blk(iter); + if (err) + goto out_brelse; + iter->bh[1] = udf_fiiter_bread_blk(iter); + if (!iter->bh[1]) { + err = -ENOMEM; + goto out_brelse; + } + if (!buffer_uptodate(iter->bh[1])) { + err = -EIO; + goto out_brelse; + } + } + return 0; +out_brelse: + brelse(iter->bh[0]); + brelse(iter->bh[1]); + iter->bh[0] = iter->bh[1] = NULL; + return err; +} - if (fibh->eoffset <= dir->i_sb->s_blocksize) { - memcpy((uint8_t *)cfi, (uint8_t *)fi, - sizeof(struct fileIdentDesc)); - } else if (fibh->eoffset > dir->i_sb->s_blocksize) { - uint32_t lextoffset = epos->offset; +int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, + loff_t pos) +{ + struct udf_inode_info *iinfo = UDF_I(dir); + int err = 0; + + iter->dir = dir; + iter->bh[0] = iter->bh[1] = NULL; + iter->pos = pos; + iter->elen = 0; + iter->epos.bh = NULL; + iter->name = NULL; + /* + * When directory is verified, we don't expect directory iteration to + * fail and it can be difficult to undo without corrupting filesystem. + * So just do not allow memory allocation failures here. 
+ */ + iter->namebuf = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL | __GFP_NOFAIL); - if (udf_next_aext(dir, epos, eloc, elen, 1) != - (EXT_RECORDED_ALLOCATED >> 30)) - return NULL; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + err = udf_copy_fi(iter); + goto out; + } - block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + if (inode_bmap(dir, iter->pos >> dir->i_blkbits, &iter->epos, + &iter->eloc, &iter->elen, &iter->loffset) != + (EXT_RECORDED_ALLOCATED >> 30)) { + if (pos == dir->i_size) + return 0; + udf_err(dir->i_sb, + "position %llu not allocated in directory (ino %lu)\n", + (unsigned long long)pos, dir->i_ino); + err = -EFSCORRUPTED; + goto out; + } + err = udf_fiiter_load_bhs(iter); + if (err < 0) + goto out; + err = udf_copy_fi(iter); +out: + if (err < 0) + udf_fiiter_release(iter); + return err; +} - (*offset)++; +int udf_fiiter_advance(struct udf_fileident_iter *iter) +{ + unsigned int oldoff, len; + int blksize = 1 << iter->dir->i_blkbits; + int err; + + oldoff = iter->pos & (blksize - 1); + len = udf_dir_entry_len(&iter->fi); + iter->pos += len; + if (UDF_I(iter->dir)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (oldoff + len >= blksize) { + brelse(iter->bh[0]); + iter->bh[0] = NULL; + /* Next block already loaded? 
*/ + if (iter->bh[1]) { + iter->bh[0] = iter->bh[1]; + iter->bh[1] = NULL; + } else { + err = udf_fiiter_advance_blk(iter); + if (err < 0) + return err; + } + } + err = udf_fiiter_load_bhs(iter); + if (err < 0) + return err; + } + return udf_copy_fi(iter); +} - if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen) - *offset = 0; - else - epos->offset = lextoffset; +void udf_fiiter_release(struct udf_fileident_iter *iter) +{ + iter->dir = NULL; + brelse(iter->bh[0]); + brelse(iter->bh[1]); + iter->bh[0] = iter->bh[1] = NULL; + kfree(iter->namebuf); + iter->namebuf = NULL; +} - fibh->soffset -= dir->i_sb->s_blocksize; - fibh->eoffset -= dir->i_sb->s_blocksize; +static void udf_copy_to_bufs(void *buf1, int len1, void *buf2, int len2, + int off, void *src, int len) +{ + int copy; + + if (off >= len1) { + off -= len1; + } else { + copy = min(off + len, len1) - off; + memcpy(buf1 + off, src, copy); + src += copy; + len -= copy; + off = 0; + } + if (len > 0) { + if (WARN_ON_ONCE(off + len > len2 || !buf2)) + return; + memcpy(buf2 + off, src, len); + } +} - fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->ebh) - return NULL; +static uint16_t udf_crc_fi_bufs(void *buf1, int len1, void *buf2, int len2, + int off, int len) +{ + int copy; + uint16_t crc = 0; + + if (off >= len1) { + off -= len1; + } else { + copy = min(off + len, len1) - off; + crc = crc_itu_t(crc, buf1 + off, copy); + len -= copy; + off = 0; + } + if (len > 0) { + if (WARN_ON_ONCE(off + len > len2 || !buf2)) + return 0; + crc = crc_itu_t(crc, buf2 + off, len); + } + return crc; +} - if (sizeof(struct fileIdentDesc) > -fibh->soffset) { - int fi_len; +static void udf_copy_fi_to_bufs(char *buf1, int len1, char *buf2, int len2, + int off, struct fileIdentDesc *fi, + uint8_t *impuse, uint8_t *name) +{ + uint16_t crc; + int fioff = off; + int crcoff = off + sizeof(struct tag); + unsigned int crclen = udf_dir_entry_len(fi) - sizeof(struct tag); + char zeros[UDF_NAME_PAD] = {}; + int endoff = off + 
udf_dir_entry_len(fi); + + udf_copy_to_bufs(buf1, len1, buf2, len2, off, fi, + sizeof(struct fileIdentDesc)); + off += sizeof(struct fileIdentDesc); + if (impuse) + udf_copy_to_bufs(buf1, len1, buf2, len2, off, impuse, + le16_to_cpu(fi->lengthOfImpUse)); + off += le16_to_cpu(fi->lengthOfImpUse); + if (name) { + udf_copy_to_bufs(buf1, len1, buf2, len2, off, name, + fi->lengthFileIdent); + off += fi->lengthFileIdent; + udf_copy_to_bufs(buf1, len1, buf2, len2, off, zeros, + endoff - off); + } - memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset); - memcpy((uint8_t *)cfi - fibh->soffset, - fibh->ebh->b_data, - sizeof(struct fileIdentDesc) + fibh->soffset); + crc = udf_crc_fi_bufs(buf1, len1, buf2, len2, crcoff, crclen); + fi->descTag.descCRC = cpu_to_le16(crc); + fi->descTag.descCRCLength = cpu_to_le16(crclen); + fi->descTag.tagChecksum = udf_tag_checksum(&fi->descTag); - fi_len = udf_dir_entry_len(cfi); - *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); - fibh->eoffset = fibh->soffset + fi_len; - } else { - memcpy((uint8_t *)cfi, (uint8_t *)fi, - sizeof(struct fileIdentDesc)); - } - } - /* Got last entry outside of dir size - fs is corrupted! 
*/ - if (*nf_pos > dir->i_size) - return NULL; - return fi; + udf_copy_to_bufs(buf1, len1, buf2, len2, fioff, fi, sizeof(struct tag)); } -struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) +void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse) { - struct fileIdentDesc *fi; - int lengthThisIdent; - uint8_t *ptr; - int padlen; + struct udf_inode_info *iinfo = UDF_I(iter->dir); + void *buf1, *buf2 = NULL; + int len1, len2 = 0, off; + int blksize = 1 << iter->dir->i_blkbits; - if ((!buffer) || (!offset)) { - udf_debug("invalidparms, buffer=%p, offset=%p\n", - buffer, offset); - return NULL; + off = iter->pos & (blksize - 1); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + buf1 = iinfo->i_data + iinfo->i_lenEAttr; + len1 = iter->dir->i_size; + } else { + buf1 = iter->bh[0]->b_data; + len1 = blksize; + if (iter->bh[1]) { + buf2 = iter->bh[1]->b_data; + len2 = blksize; + } } - ptr = buffer; + udf_copy_fi_to_bufs(buf1, len1, buf2, len2, off, &iter->fi, impuse, + iter->name == iter->namebuf ? iter->name : NULL); - if ((*offset > 0) && (*offset < bufsize)) - ptr += *offset; - fi = (struct fileIdentDesc *)ptr; - if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { - udf_debug("0x%x != TAG_IDENT_FID\n", - le16_to_cpu(fi->descTag.tagIdent)); - udf_debug("offset: %d sizeof: %lu bufsize: %d\n", - *offset, (unsigned long)sizeof(struct fileIdentDesc), - bufsize); - return NULL; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + mark_inode_dirty(iter->dir); + } else { + mark_buffer_dirty_inode(iter->bh[0], iter->dir); + if (iter->bh[1]) + mark_buffer_dirty_inode(iter->bh[1], iter->dir); } - if ((*offset + sizeof(struct fileIdentDesc)) > bufsize) - lengthThisIdent = sizeof(struct fileIdentDesc); - else - lengthThisIdent = sizeof(struct fileIdentDesc) + - fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse); + inode_inc_iversion(iter->dir); +} - /* we need to figure padding, too! 
*/ - padlen = lengthThisIdent % UDF_NAME_PAD; - if (padlen) - lengthThisIdent += (UDF_NAME_PAD - padlen); - *offset = *offset + lengthThisIdent; +void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + int diff = new_elen - iter->elen; + + /* Skip update when we already went past the last extent */ + if (!iter->elen) + return; + iter->elen = new_elen; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + iter->epos.offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + iter->epos.offset -= sizeof(struct long_ad); + udf_write_aext(iter->dir, &iter->epos, &iter->eloc, iter->elen, 1); + iinfo->i_lenExtents += diff; + mark_inode_dirty(iter->dir); +} - return fi; +/* Append new block to directory. @iter is expected to point at EOF */ +int udf_fiiter_append_blk(struct udf_fileident_iter *iter) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + int blksize = 1 << iter->dir->i_blkbits; + struct buffer_head *bh; + sector_t block; + uint32_t old_elen = iter->elen; + int err; + + if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) + return -EINVAL; + + /* Round up last extent in the file */ + udf_fiiter_update_elen(iter, ALIGN(iter->elen, blksize)); + + /* Allocate new block and refresh mapping information */ + block = iinfo->i_lenExtents >> iter->dir->i_blkbits; + bh = udf_bread(iter->dir, block, 1, &err); + if (!bh) { + udf_fiiter_update_elen(iter, old_elen); + return err; + } + if (inode_bmap(iter->dir, block, &iter->epos, &iter->eloc, &iter->elen, + &iter->loffset) != (EXT_RECORDED_ALLOCATED >> 30)) { + udf_err(iter->dir->i_sb, + "block %llu not allocated in directory (ino %lu)\n", + (unsigned long long)block, iter->dir->i_ino); + return -EFSCORRUPTED; + } + if (!(iter->pos & (blksize - 1))) { + brelse(iter->bh[0]); + iter->bh[0] = bh; + } else { + iter->bh[1] = bh; + } + return 0; } struct short_ad *udf_get_fileshortad(uint8_t *ptr, int 
maxoffset, uint32_t *offset, diff --git a/fs/udf/file.c b/fs/udf/file.c index 5c659e23e578..8238f742377b 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -38,100 +38,55 @@ #include "udf_i.h" #include "udf_sb.h" -static void __udf_adinicb_readpage(struct page *page) +static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) { - struct inode *inode = page->mapping->host; - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - loff_t isize = i_size_read(inode); - - /* - * We have to be careful here as truncate can change i_size under us. - * So just sample it once and use the same value everywhere. - */ - kaddr = kmap_atomic(page); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); - memset(kaddr + isize, 0, PAGE_SIZE - isize); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_atomic(kaddr); -} - -static int udf_adinicb_read_folio(struct file *file, struct folio *folio) -{ - BUG_ON(!folio_test_locked(folio)); - __udf_adinicb_readpage(&folio->page); - folio_unlock(folio); - - return 0; -} - -static int udf_adinicb_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - - BUG_ON(!PageLocked(page)); - - kaddr = kmap_atomic(page); - memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, i_size_read(inode)); - SetPageUptodate(page); - kunmap_atomic(kaddr); - mark_inode_dirty(inode); - unlock_page(page); - - return 0; -} - -static int udf_adinicb_write_begin(struct file *file, - struct address_space *mapping, loff_t pos, - unsigned len, struct page **pagep, - void **fsdata) -{ - struct page *page; - - if (WARN_ON_ONCE(pos >= PAGE_SIZE)) - return -EIO; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) - return -ENOMEM; - *pagep = page; - - if (!PageUptodate(page)) - __udf_adinicb_readpage(page); - return 0; -} - -static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - /* Fallback to buffered I/O. 
*/ - return 0; -} + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + struct page *page = vmf->page; + loff_t size; + unsigned int end; + vm_fault_t ret = VM_FAULT_LOCKED; + int err; -static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - loff_t last_pos = pos + copied; - if (last_pos > inode->i_size) - i_size_write(inode, last_pos); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + filemap_invalidate_lock_shared(mapping); + lock_page(page); + size = i_size_read(inode); + if (page->mapping != inode->i_mapping || page_offset(page) >= size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out_unlock; + } + /* Space is already allocated for in-ICB file */ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + goto out_dirty; + if (page->index == size >> PAGE_SHIFT) + end = size & ~PAGE_MASK; + else + end = PAGE_SIZE; + err = __block_write_begin(page, 0, end, udf_get_block); + if (!err) + err = block_commit_write(page, 0, end); + if (err < 0) { + unlock_page(page); + ret = block_page_mkwrite_return(err); + goto out_unlock; + } +out_dirty: set_page_dirty(page); - unlock_page(page); - put_page(page); - return copied; + wait_for_stable_page(page); +out_unlock: + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(inode->i_sb); + return ret; } -const struct address_space_operations udf_adinicb_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = udf_adinicb_read_folio, - .writepage = udf_adinicb_writepage, - .write_begin = udf_adinicb_write_begin, - .write_end = udf_adinicb_write_end, - .direct_IO = udf_adinicb_direct_IO, +static const struct vm_operations_struct udf_file_vm_ops = { + .fault = filemap_fault, + .map_pages = 
filemap_map_pages, + .page_mkwrite = udf_page_mkwrite, }; static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -140,7 +95,6 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct udf_inode_info *iinfo = UDF_I(inode); - int err; inode_lock(inode); @@ -148,27 +102,23 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (retval <= 0) goto out; - down_write(&iinfo->i_data_sem); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - loff_t end = iocb->ki_pos + iov_iter_count(from); - - if (inode->i_sb->s_blocksize < - (udf_file_entry_alloc_offset(inode) + end)) { - err = udf_expand_file_adinicb(inode); - if (err) { - inode_unlock(inode); - udf_debug("udf_expand_adinicb: err=%d\n", err); - return err; - } - } else { - iinfo->i_lenAlloc = max(end, inode->i_size); - up_write(&iinfo->i_data_sem); - } - } else - up_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && + inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + + iocb->ki_pos + iov_iter_count(from))) { + filemap_invalidate_lock(inode->i_mapping); + retval = udf_expand_file_adinicb(inode); + filemap_invalidate_unlock(inode->i_mapping); + if (retval) + goto out; + } retval = __generic_file_write_iter(iocb, from); out: + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && retval > 0) { + down_write(&iinfo->i_data_sem); + iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); + } inode_unlock(inode); if (retval > 0) { @@ -243,11 +193,19 @@ static int udf_release_file(struct inode *inode, struct file *filp) return 0; } +static int udf_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &udf_file_vm_ops; + + return 0; +} + const struct file_operations udf_file_operations = { .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = 
generic_file_open, - .mmap = generic_file_mmap, + .mmap = udf_file_mmap, .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, @@ -256,14 +214,14 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; -static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int udf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -286,7 +244,7 @@ static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) udf_update_extra_perms(inode, attr->ia_mode); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index b5d611cee749..8d50121778a5 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -28,21 +28,7 @@ void udf_free_inode(struct inode *inode) { - struct super_block *sb = inode->i_sb; - struct udf_sb_info *sbi = UDF_SB(sb); - struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); - - if (lvidiu) { - mutex_lock(&sbi->s_alloc_mutex); - if (S_ISDIR(inode->i_mode)) - le32_add_cpu(&lvidiu->numDirs, -1); - else - le32_add_cpu(&lvidiu->numFiles, -1); - udf_updated_lvid(sb); - mutex_unlock(&sbi->s_alloc_mutex); - } - - udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); + udf_free_blocks(inode->i_sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode *dir, umode_t mode) @@ -54,7 +40,6 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); - struct 
logicalVolIntegrityDescImpUse *lvidiu; int err; inode = new_inode(sb); @@ -92,20 +77,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(err); } - lvidiu = udf_sb_lvidiu(sb); - if (lvidiu) { - iinfo->i_unique = lvid_get_unique_id(sb); - inode->i_generation = iinfo->i_unique; - mutex_lock(&sbi->s_alloc_mutex); - if (S_ISDIR(mode)) - le32_add_cpu(&lvidiu->numDirs, 1); - else - le32_add_cpu(&lvidiu->numFiles, 1); - udf_updated_lvid(sb); - mutex_unlock(&sbi->s_alloc_mutex); - } + iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 34e416327dd4..3b2adf4cbc57 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -52,21 +52,24 @@ #define FE_DELETE_PERMS (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \ FE_PERM_O_DELETE) +struct udf_map_rq; + static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); -static sector_t inode_getblk(struct inode *, sector_t, int *, int *); -static int8_t udf_insert_aext(struct inode *, struct extent_position, - struct kernel_lb_addr, uint32_t); +static int inode_getblk(struct inode *inode, struct udf_map_rq *map); +static int udf_insert_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); -static void udf_update_extents(struct inode *, struct kernel_long_ad *, int, - int, 
struct extent_position *); -static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); +static int udf_update_extents(struct inode *, struct kernel_long_ad *, int, + int, struct extent_position *); +static int udf_get_block_wb(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create); static void __udf_clear_extent_cache(struct inode *inode) { @@ -182,14 +185,56 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } +static int udf_adinicb_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct inode *inode = page->mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + i_size_read(inode)); + unlock_page(page); + mark_inode_dirty(inode); + + return 0; +} + static int udf_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, udf_get_block); + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return mpage_writepages(mapping, wbc, udf_get_block_wb); + return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); +} + +static void udf_adinicb_readpage(struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = i_size_read(inode); + + kaddr = kmap_local_page(page); + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); + memset(kaddr + isize, 0, PAGE_SIZE - isize); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap_local(kaddr); } static int udf_read_folio(struct file *file, struct folio *folio) { + struct udf_inode_info *iinfo = UDF_I(file_inode(file)); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + udf_adinicb_readpage(&folio->page); + folio_unlock(folio); + return 0; + } 
return mpage_read_folio(folio, udf_get_block); } @@ -199,15 +244,49 @@ static void udf_readahead(struct readahead_control *rac) } static int udf_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) { + struct udf_inode_info *iinfo = UDF_I(file_inode(file)); + struct page *page; int ret; - ret = block_write_begin(mapping, pos, len, pagep, udf_get_block); - if (unlikely(ret)) - udf_write_failed(mapping, pos + len); - return ret; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = block_write_begin(mapping, pos, len, pagep, + udf_get_block); + if (unlikely(ret)) + udf_write_failed(mapping, pos + len); + return ret; + } + if (WARN_ON_ONCE(pos >= PAGE_SIZE)) + return -EIO; + page = grab_cache_page_write_begin(mapping, 0); + if (!page) + return -ENOMEM; + *pagep = page; + if (!PageUptodate(page)) + udf_adinicb_readpage(page); + return 0; +} + +static int udf_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file_inode(file); + loff_t last_pos; + + if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + last_pos = pos + copied; + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); + unlock_page(page); + put_page(page); + + return copied; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) @@ -218,6 +297,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter); ssize_t ret; + /* Fallback to buffered IO for in-ICB files */ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return 0; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) 
udf_write_failed(mapping, iocb->ki_pos + count); @@ -226,6 +308,10 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static sector_t udf_bmap(struct address_space *mapping, sector_t block) { + struct udf_inode_info *iinfo = UDF_I(mapping->host); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return -EINVAL; return generic_block_bmap(mapping, block, udf_get_block); } @@ -236,7 +322,7 @@ const struct address_space_operations udf_aops = { .readahead = udf_readahead, .writepages = udf_writepages, .write_begin = udf_write_begin, - .write_end = generic_write_end, + .write_end = udf_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, .migrate_folio = buffer_migrate_folio, @@ -245,18 +331,17 @@ const struct address_space_operations udf_aops = { /* * Expand file stored in ICB to a normal one-block-file * - * This function requires i_data_sem for writing and releases it. * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; - char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { + down_write(&iinfo->i_data_sem); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else @@ -267,26 +352,13 @@ int udf_expand_file_adinicb(struct inode *inode) mark_inode_dirty(inode); return 0; } - /* - * Release i_data_sem so that we can lock a page - page lock ranks - * above i_data_sem. i_mutex still protects us against file changes. 
- */ - up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) return -ENOMEM; - if (!PageUptodate(page)) { - kaddr = kmap_atomic(page); - memset(kaddr + iinfo->i_lenAlloc, 0x00, - PAGE_SIZE - iinfo->i_lenAlloc); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, - iinfo->i_lenAlloc); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_atomic(kaddr); - } + if (!PageUptodate(page)) + udf_adinicb_readpage(page); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); @@ -295,8 +367,6 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; set_page_dirty(page); unlock_page(page); up_write(&iinfo->i_data_sem); @@ -305,12 +375,10 @@ int udf_expand_file_adinicb(struct inode *inode) /* Restore everything back so that we don't lose data... 
*/ lock_page(page); down_write(&iinfo->i_data_sem); - kaddr = kmap_atomic(page); - memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - kunmap_atomic(kaddr); + memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + inode->i_size); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - inode->i_data.a_ops = &udf_adinicb_aops; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } @@ -320,162 +388,103 @@ int udf_expand_file_adinicb(struct inode *inode) return err; } -struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, - udf_pblk_t *block, int *err) -{ - udf_pblk_t newblock; - struct buffer_head *dbh = NULL; - struct kernel_lb_addr eloc; - uint8_t alloctype; - struct extent_position epos; +#define UDF_MAP_CREATE 0x01 /* Mapping can allocate new blocks */ +#define UDF_MAP_NOPREALLOC 0x02 /* Do not preallocate blocks */ - struct udf_fileident_bh sfibh, dfibh; - loff_t f_pos = udf_ext0_offset(inode); - int size = udf_ext0_offset(inode) + inode->i_size; - struct fileIdentDesc cfi, *sfi, *dfi; - struct udf_inode_info *iinfo = UDF_I(inode); +#define UDF_BLK_MAPPED 0x01 /* Block was successfully mapped */ +#define UDF_BLK_NEW 0x02 /* Block was freshly allocated */ - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) - alloctype = ICBTAG_FLAG_AD_SHORT; - else - alloctype = ICBTAG_FLAG_AD_LONG; +struct udf_map_rq { + sector_t lblk; + udf_pblk_t pblk; + int iflags; /* UDF_MAP_ flags determining behavior */ + int oflags; /* UDF_BLK_ flags reporting results */ +}; - if (!inode->i_size) { - iinfo->i_alloc_type = alloctype; - mark_inode_dirty(inode); - return NULL; - } +static int udf_map_block(struct inode *inode, struct udf_map_rq *map) +{ + int err; + struct udf_inode_info *iinfo = UDF_I(inode); - /* alloc block, and copy data to it */ - *block = udf_new_block(inode->i_sb, inode, - iinfo->i_location.partitionReferenceNum, - iinfo->i_location.logicalBlockNum, err); - if (!(*block)) - return NULL; - newblock = 
udf_get_pblock(inode->i_sb, *block, - iinfo->i_location.partitionReferenceNum, - 0); - if (!newblock) - return NULL; - dbh = udf_tgetblk(inode->i_sb, newblock); - if (!dbh) - return NULL; - lock_buffer(dbh); - memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(dbh); - unlock_buffer(dbh); - mark_buffer_dirty_inode(dbh, inode); - - sfibh.soffset = sfibh.eoffset = - f_pos & (inode->i_sb->s_blocksize - 1); - sfibh.sbh = sfibh.ebh = NULL; - dfibh.soffset = dfibh.eoffset = 0; - dfibh.sbh = dfibh.ebh = dbh; - while (f_pos < size) { - iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, - NULL, NULL, NULL); - if (!sfi) { - brelse(dbh); - return NULL; - } - iinfo->i_alloc_type = alloctype; - sfi->descTag.tagLocation = cpu_to_le32(*block); - dfibh.soffset = dfibh.eoffset; - dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); - dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); - if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, - udf_get_fi_ident(sfi))) { - iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - brelse(dbh); - return NULL; + map->oflags = 0; + if (!(map->iflags & UDF_MAP_CREATE)) { + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + + down_read(&iinfo->i_data_sem); + if (inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset) + == (EXT_RECORDED_ALLOCATED >> 30)) { + map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, + offset); + map->oflags |= UDF_BLK_MAPPED; } - } - mark_buffer_dirty_inode(dbh, inode); + up_read(&iinfo->i_data_sem); + brelse(epos.bh); - memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); - iinfo->i_lenAlloc = 0; - eloc.logicalBlockNum = *block; - eloc.partitionReferenceNum = - iinfo->i_location.partitionReferenceNum; - iinfo->i_lenExtents = inode->i_size; - epos.bh = NULL; - epos.block = iinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, &eloc, 
inode->i_size, 0); - /* UniqueID stuff */ - - brelse(epos.bh); - mark_inode_dirty(inode); - return dbh; -} - -static int udf_get_block(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int create) -{ - int err, new; - sector_t phys = 0; - struct udf_inode_info *iinfo; - - if (!create) { - phys = udf_block_map(inode, block); - if (phys) - map_bh(bh_result, inode->i_sb, phys); return 0; } - err = -EIO; - new = 0; - iinfo = UDF_I(inode); - down_write(&iinfo->i_data_sem); - if (block == iinfo->i_next_alloc_block + 1) { - iinfo->i_next_alloc_block++; - iinfo->i_next_alloc_goal++; - } - /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. */ - if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); - phys = inode_getblk(inode, block, &err, &new); - if (!phys) - goto abort; - - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, phys); - -abort: + err = inode_getblk(inode, map); up_write(&iinfo->i_data_sem); return err; } -static struct buffer_head *udf_getblk(struct inode *inode, udf_pblk_t block, - int create, int *err) +static int __udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int flags) { - struct buffer_head *bh; - struct buffer_head dummy; - - dummy.b_state = 0; - dummy.b_blocknr = -1000; - *err = udf_get_block(inode, block, &dummy, create); - if (!*err && buffer_mapped(&dummy)) { - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (buffer_new(&dummy)) { - lock_buffer(bh); - memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - mark_buffer_dirty_inode(bh, inode); - } - return bh; + int err; + struct udf_map_rq map = { + .lblk = block, + .iflags = flags, + }; + + err = udf_map_block(inode, &map); + if (err < 0) + return err; + if (map.oflags & 
UDF_BLK_MAPPED) { + map_bh(bh_result, inode->i_sb, map.pblk); + if (map.oflags & UDF_BLK_NEW) + set_buffer_new(bh_result); } + return 0; +} - return NULL; +int udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int flags = create ? UDF_MAP_CREATE : 0; + + /* + * We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. + */ + if (!S_ISREG(inode->i_mode)) + flags |= UDF_MAP_NOPREALLOC; + return __udf_get_block(inode, block, bh_result, flags); +} + +/* + * We shouldn't be allocating blocks on page writeback since we allocate them + * on page fault. We can spot dirty buffers without allocated blocks though + * when truncate expands file. These however don't have valid data so we can + * safely ignore them. So never allocate blocks from page writeback. + */ +static int udf_get_block_wb(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + return __udf_get_block(inode, block, bh_result, 0); } /* Extend the file with new blocks totaling 'new_block_bytes', @@ -509,6 +518,7 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } + add = 0; /* Can we merge with the previous extent? 
*/ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -521,8 +531,10 @@ static int udf_do_extend_file(struct inode *inode, } if (fake) { - udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1); + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err < 0) + goto out_err; count++; } else { struct kernel_lb_addr tmploc; @@ -539,6 +551,7 @@ static int udf_do_extend_file(struct inode *inode, if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } + iinfo->i_lenExtents += add; /* Managed to do everything necessary? */ if (!new_block_bytes) @@ -556,7 +569,8 @@ static int udf_do_extend_file(struct inode *inode, err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) - return err; + goto out_err; + iinfo->i_lenExtents += add; count++; } if (new_block_bytes) { @@ -565,7 +579,8 @@ static int udf_do_extend_file(struct inode *inode, err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) - return err; + goto out_err; + iinfo->i_lenExtents += new_block_bytes; count++; } @@ -579,6 +594,11 @@ out: return -EIO; return count; +out_err: + /* Remove extents we've created so far */ + udf_clear_extent_cache(inode); + udf_truncate_extents(inode); + return err; } /* Extend the final block of the file to final_block_len bytes */ @@ -626,6 +646,7 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + down_write(&iinfo->i_data_sem); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. 
@@ -668,14 +689,13 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) if (err < 0) goto out; err = 0; - iinfo->i_lenExtents = newsize; out: brelse(epos.bh); + up_write(&iinfo->i_data_sem); return err; } -static sector_t inode_getblk(struct inode *inode, sector_t block, - int *err, int *new) +static int inode_getblk(struct inode *inode, struct udf_map_rq *map) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; @@ -684,21 +704,20 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; - udf_pblk_t newblocknum, newblock = 0; + udf_pblk_t newblocknum; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF; + int ret = 0; - *err = 0; - *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; - b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; + b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the @@ -757,15 +776,18 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); + iinfo->i_lenExtents = + ALIGN(iinfo->i_lenExtents, + inode->i_sb->s_blocksize); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } - newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + map->oflags = UDF_BLK_MAPPED; + map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? 
*/ if (etype == -1) { - int ret; loff_t hole_len; isBeyondEOF = true; @@ -785,26 +807,22 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); - if (ret < 0) { - *err = ret; + if (ret < 0) goto out_free; - } c = 0; offset = 0; count += ret; - /* We are not covered by a preallocated extent? */ - if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != - EXT_NOT_RECORDED_ALLOCATED) { - /* Is there any real extent? - otherwise we overwrite - * the fake one... */ - if (count) - c = !c; - laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - inode->i_sb->s_blocksize; - memset(&laarr[c].extLocation, 0x00, - sizeof(struct kernel_lb_addr)); - count++; - } + /* + * Is there any real extent? - otherwise we overwrite the fake + * one... + */ + if (count) + c = !c; + laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + inode->i_sb->s_blocksize; + memset(&laarr[c].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + count++; endnum = c + 1; lastblock = 1; } else { @@ -838,7 +856,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ - if (iinfo->i_next_alloc_block == block) + if (iinfo->i_next_alloc_block == map->lblk) goal = iinfo->i_next_alloc_goal; if (!goal) { @@ -848,11 +866,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, - goal, err); - if (!newblocknum) { - *err = -ENOSPC; + goal, &ret); + if (!newblocknum) goto out_free; - } if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } @@ -863,11 +879,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, * block */ 
udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); - /* We preallocate blocks only for regular files. It also makes sense - * for directories but there's a problem when to drop the - * preallocation. We might use some delayed work for that but I feel - * it's overengineering for a filesystem like UDF. */ - if (S_ISREG(inode->i_mode)) + if (!(map->iflags & UDF_MAP_NOPREALLOC)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ @@ -876,28 +888,31 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ - udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + if (ret < 0) + goto out_free; - newblock = udf_get_pblock(inode->i_sb, newblocknum, + map->pblk = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); - if (!newblock) { - *err = -EIO; + if (!map->pblk) { + ret = -EFSCORRUPTED; goto out_free; } - *new = 1; - iinfo->i_next_alloc_block = block; - iinfo->i_next_alloc_goal = newblocknum; + map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; + iinfo->i_next_alloc_block = map->lblk + 1; + iinfo->i_next_alloc_goal = newblocknum + 1; inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); + ret = 0; out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - return newblock; + return ret; } static void udf_split_extents(struct inode *inode, int *c, int offset, @@ -1080,23 +1095,8 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + - (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + - blocksize - 1) & 
~UDF_EXTENT_LENGTH_MASK) { - lip1->extLength = (lip1->extLength - - (li->extLength & - UDF_EXTENT_LENGTH_MASK) + - UDF_EXTENT_LENGTH_MASK) & - ~(blocksize - 1); - li->extLength = (li->extLength & - UDF_EXTENT_FLAG_MASK) + - (UDF_EXTENT_LENGTH_MASK + 1) - - blocksize; - lip1->extLocation.logicalBlockNum = - li->extLocation.logicalBlockNum + - ((li->extLength & - UDF_EXTENT_LENGTH_MASK) >> - blocksize_bits); - } else { + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + @@ -1159,21 +1159,30 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, } } -static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, - int startnum, int endnum, - struct extent_position *epos) +static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, + int startnum, int endnum, + struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; + int err; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { - udf_insert_aext(inode, *epos, laarr[i].extLocation, - laarr[i].extLength); + err = udf_insert_aext(inode, *epos, + laarr[i].extLocation, + laarr[i].extLength); + /* + * If we fail here, we are likely corrupting the extent + * list and leaking blocks. At least stop early to + * limit the damage. 
+ */ + if (err < 0) + return err; udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, 1); start++; @@ -1185,17 +1194,36 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } + return 0; } struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; + struct udf_map_rq map = { + .lblk = block, + .iflags = UDF_MAP_NOPREALLOC | (create ? UDF_MAP_CREATE : 0), + }; - bh = udf_getblk(inode, block, create, err); - if (!bh) + *err = udf_map_block(inode, &map); + if (*err || !(map.oflags & UDF_BLK_MAPPED)) return NULL; + bh = sb_getblk(inode->i_sb, map.pblk); + if (!bh) { + *err = -ENOMEM; + return NULL; + } + if (map.oflags & UDF_BLK_NEW) { + lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + return bh; + } + if (bh_read(bh, 0) >= 0) return bh; @@ -1206,7 +1234,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int udf_setsize(struct inode *inode, loff_t newsize) { - int err; + int err = 0; struct udf_inode_info *iinfo; unsigned int bsize = i_blocksize(inode); @@ -1216,28 +1244,25 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + filemap_invalidate_lock(inode->i_mapping); iinfo = UDF_I(inode); if (newsize > inode->i_size) { - down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - if (bsize < + if (bsize >= (udf_file_entry_alloc_offset(inode) + newsize)) { - err = udf_expand_file_adinicb(inode); - if (err) - return err; down_write(&iinfo->i_data_sem); - } else { iinfo->i_lenAlloc = newsize; + up_write(&iinfo->i_data_sem); goto set_size; } + err = udf_expand_file_adinicb(inode); + if (err) + goto out_unlock; } err = udf_extend_file(inode, newsize); - if (err) { - 
up_write(&iinfo->i_data_sem); - return err; - } + if (err) + goto out_unlock; set_size: - up_write(&iinfo->i_data_sem); truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { @@ -1254,14 +1279,14 @@ set_size: err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) - return err; + goto out_unlock; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); if (err) - return err; + goto out_unlock; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); @@ -1269,7 +1294,9 @@ update_time: udf_sync_inode(inode); else mark_inode_dirty(inode); - return 0; +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return err; } /* @@ -1381,6 +1408,7 @@ reread: ret = -EIO; goto out; } + iinfo->i_hidden = hidden_inode; iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; @@ -1537,10 +1565,7 @@ reread: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; @@ -1671,7 +1696,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); - bh = udf_tgetblk(inode->i_sb, + bh = sb_getblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); @@ -1716,8 +1741,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); - else - fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + else { + if (iinfo->i_hidden) + 
fe->fileLinkCount = cpu_to_le16(0); + else + fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + } fe->informationLength = cpu_to_le64(inode->i_size); @@ -1888,8 +1917,13 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->i_state & I_NEW)) { + if (UDF_I(inode)->i_hidden != hidden_inode) { + iput(inode); + return ERR_PTR(-EFSCORRUPTED); + } return inode; + } memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); @@ -1922,7 +1956,7 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; - bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + bh = sb_getblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); @@ -2139,7 +2173,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); - epos->bh = udf_tread(inode->i_sb, block); + epos->bh = sb_bread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %u failed!\n", block); return -1; @@ -2203,12 +2237,13 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, return etype; } -static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, - struct kernel_lb_addr neloc, uint32_t nelen) +static int udf_insert_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; + int err; if (epos.bh) get_bh(epos.bh); @@ -2218,10 +2253,10 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, neloc = oeloc; nelen = (etype << 30) | oelen; } - udf_add_aext(inode, &epos, &neloc, nelen, 1); + err = udf_add_aext(inode, 
&epos, &neloc, nelen, 1); brelse(epos.bh); - return (nelen >> 30); + return err; } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) @@ -2339,28 +2374,3 @@ int8_t inode_bmap(struct inode *inode, sector_t block, return etype; } - -udf_pblk_t udf_block_map(struct inode *inode, sector_t block) -{ - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - udf_pblk_t ret; - - down_read(&UDF_I(inode)->i_data_sem); - - if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) - ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - else - ret = 0; - - up_read(&UDF_I(inode)->i_data_sem); - brelse(epos.bh); - - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) - return udf_fixed_to_variable(ret); - else - return ret; -} diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 46d697172197..c87ed942d076 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -45,7 +45,7 @@ unsigned int udf_get_last_session(struct super_block *sb) return 0; } -unsigned long udf_get_last_block(struct super_block *sb) +udf_pblk_t udf_get_last_block(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; @@ -54,8 +54,11 @@ unsigned long udf_get_last_block(struct super_block *sb) * The cdrom layer call failed or returned obviously bogus value? * Try using the device size... 
*/ - if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) + if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) { + if (sb_bdev_nr_blocks(sb) > ~(udf_pblk_t)0) + return 0; lblock = sb_bdev_nr_blocks(sb); + } if (lblock) return lblock - 1; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 1614d308d0f0..3777468d06ce 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -28,22 +28,6 @@ #include "udf_i.h" #include "udf_sb.h" -struct buffer_head *udf_tgetblk(struct super_block *sb, udf_pblk_t block) -{ - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) - return sb_getblk(sb, udf_fixed_to_variable(block)); - else - return sb_getblk(sb, block); -} - -struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block) -{ - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) - return sb_bread(sb, udf_fixed_to_variable(block)); - else - return sb_bread(sb, block); -} - struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint32_t type, uint8_t loc) { @@ -216,7 +200,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, if (block == 0xFFFFFFFF) return NULL; - bh = udf_tread(sb, block); + bh = sb_bread(sb, block); if (!bh) { udf_err(sb, "read failed, block=%u, location=%u\n", block, location); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c95c549dd64..fd20423d3ed2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -41,283 +41,93 @@ static inline int udf_match(int len1, const unsigned char *name1, int len2, return !memcmp(name1, name2, len1); } -int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, - struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, - uint8_t *impuse, uint8_t *fileident) -{ - uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); - uint16_t crc; - int offset; - uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); - uint8_t lfi = cfi->lengthFileIdent; - int padlen = fibh->eoffset - fibh->soffset - liu - lfi - - sizeof(struct fileIdentDesc); - int adinicb = 0; - - if 
(UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - adinicb = 1; - - offset = fibh->soffset + sizeof(struct fileIdentDesc); - - if (impuse) { - if (adinicb || (offset + liu < 0)) { - memcpy((uint8_t *)sfi->impUse, impuse, liu); - } else if (offset >= 0) { - memcpy(fibh->ebh->b_data + offset, impuse, liu); - } else { - memcpy((uint8_t *)sfi->impUse, impuse, -offset); - memcpy(fibh->ebh->b_data, impuse - offset, - liu + offset); - } - } - - offset += liu; - - if (fileident) { - if (adinicb || (offset + lfi < 0)) { - memcpy(sfi->impUse + liu, fileident, lfi); - } else if (offset >= 0) { - memcpy(fibh->ebh->b_data + offset, fileident, lfi); - } else { - memcpy(sfi->impUse + liu, fileident, -offset); - memcpy(fibh->ebh->b_data, fileident - offset, - lfi + offset); - } - } - - offset += lfi; - - if (adinicb || (offset + padlen < 0)) { - memset(sfi->impUse + liu + lfi, 0x00, padlen); - } else if (offset >= 0) { - memset(fibh->ebh->b_data + offset, 0x00, padlen); - } else { - memset(sfi->impUse + liu + lfi, 0x00, -offset); - memset(fibh->ebh->b_data, 0x00, padlen + offset); - } - - crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), - sizeof(struct fileIdentDesc) - sizeof(struct tag)); - - if (fibh->sbh == fibh->ebh) { - crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - crclen + sizeof(struct tag) - - sizeof(struct fileIdentDesc)); - } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { - crc = crc_itu_t(crc, fibh->ebh->b_data + - sizeof(struct fileIdentDesc) + - fibh->soffset, - crclen + sizeof(struct tag) - - sizeof(struct fileIdentDesc)); - } else { - crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - -fibh->soffset - sizeof(struct fileIdentDesc)); - crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); - } - - cfi->descTag.descCRC = cpu_to_le16(crc); - cfi->descTag.descCRCLength = cpu_to_le16(crclen); - cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag); - - if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) { - memcpy((uint8_t *)sfi, 
(uint8_t *)cfi, - sizeof(struct fileIdentDesc)); - } else { - memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset); - memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset, - sizeof(struct fileIdentDesc) + fibh->soffset); - } - - if (adinicb) { - mark_inode_dirty(inode); - } else { - if (fibh->sbh != fibh->ebh) - mark_buffer_dirty_inode(fibh->ebh, inode); - mark_buffer_dirty_inode(fibh->sbh, inode); - } - inode_inc_iversion(inode); - - return 0; -} - /** - * udf_find_entry - find entry in given directory. + * udf_fiiter_find_entry - find entry in given directory. * * @dir: directory inode to search in * @child: qstr of the name - * @fibh: buffer head / inode with file identifier descriptor we found - * @cfi: found file identifier descriptor with given name + * @iter: iter to use for searching * * This function searches in the directory @dir for a file name @child. When - * found, @fibh points to the buffer head(s) (bh is NULL for in ICB - * directories) containing the file identifier descriptor (FID). In that case - * the function returns pointer to the FID in the buffer or inode - but note - * that FID may be split among two buffers (blocks) so accessing it via that - * pointer isn't easily possible. This pointer can be used only as an iterator - * for other directory manipulation functions. For inspection of the FID @cfi - * can be used - the found FID is copied there. + * found, @iter points to the position in the directory with given entry. * - * Returns pointer to FID, NULL when nothing found, or error code. + * Returns 0 on success, < 0 on error (including -ENOENT). 
*/ -static struct fileIdentDesc *udf_find_entry(struct inode *dir, - const struct qstr *child, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi) +static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child, + struct udf_fileident_iter *iter) { - struct fileIdentDesc *fi = NULL; - loff_t f_pos; - udf_pblk_t block; int flen; - unsigned char *fname = NULL, *copy_name = NULL; - unsigned char *nameptr; - uint8_t lfi; - uint16_t liu; - loff_t size; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo = UDF_I(dir); + unsigned char *fname = NULL; + struct super_block *sb = dir->i_sb; int isdotdot = child->len == 2 && child->name[0] == '.' && child->name[1] == '.'; - struct super_block *sb = dir->i_sb; - - size = udf_ext0_offset(dir) + dir->i_size; - f_pos = udf_ext0_offset(dir); - - fibh->sbh = fibh->ebh = NULL; - fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { - fi = ERR_PTR(-EIO); - goto out_err; - } - - block = udf_get_lb_pblock(sb, &eloc, offset); - if ((++offset << sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh->sbh = fibh->ebh = udf_tread(sb, block); - if (!fibh->sbh) { - fi = ERR_PTR(-EIO); - goto out_err; - } - } + int ret; fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); - if (!fname) { - fi = ERR_PTR(-ENOMEM); - goto out_err; - } - - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, - &elen, &offset); - if (!fi) { - fi = ERR_PTR(-EIO); - goto out_err; - } - - liu = le16_to_cpu(cfi->lengthOfImpUse); - lfi = cfi->lengthFileIdent; - - if 
(fibh->sbh == fibh->ebh) { - nameptr = udf_get_fi_ident(fi); - } else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh->soffset + sizeof(struct fileIdentDesc) + - liu + lfi; - - if (poffset >= lfi) - nameptr = (uint8_t *)(fibh->ebh->b_data + - poffset - lfi); - else { - if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN_CS0, - GFP_NOFS); - if (!copy_name) { - fi = ERR_PTR(-ENOMEM); - goto out_err; - } - } - nameptr = copy_name; - memcpy(nameptr, udf_get_fi_ident(fi), - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh->ebh->b_data, poffset); - } - } + if (!fname) + return -ENOMEM; - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + for (ret = udf_fiiter_init(iter, dir, 0); + !ret && iter->pos < dir->i_size; + ret = udf_fiiter_advance(iter)) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + if ((iter->fi.fileCharacteristics & FID_FILE_CHAR_PARENT) && isdotdot) goto out_ok; - if (!lfi) + if (!iter->fi.lengthFileIdent) continue; - flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + flen = udf_get_filename(sb, iter->name, + iter->fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) { - fi = ERR_PTR(flen); + ret = flen; goto out_err; } if (udf_match(flen, fname, child->len, child->name)) goto out_ok; } + if (!ret) + ret = -ENOENT; - fi = NULL; out_err: - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); + udf_fiiter_release(iter); out_ok: - brelse(epos.bh); kfree(fname); - kfree(copy_name); - return fi; + return ret; } static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; - struct fileIdentDesc cfi; - struct 
udf_fileident_bh fibh; - struct fileIdentDesc *fi; + struct udf_fileident_iter iter; + int err; if (dentry->d_name.len > UDF_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (IS_ERR(fi)) - return ERR_CAST(fi); + err = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (err < 0 && err != -ENOENT) + return ERR_PTR(err); - if (fi) { + if (err == 0) { struct kernel_lb_addr loc; - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + loc = lelb_to_cpu(iter.fi.icb.extLocation); + udf_fiiter_release(&iter); - loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -326,287 +136,249 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -static struct fileIdentDesc *udf_add_entry(struct inode *dir, - struct dentry *dentry, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi, int *err) +static int udf_expand_dir_adinicb(struct inode *inode, udf_pblk_t *block) { - struct super_block *sb = dir->i_sb; - struct fileIdentDesc *fi = NULL; - unsigned char *name = NULL; - int namelen; - loff_t f_pos; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - int nfidlen; - udf_pblk_t block; + udf_pblk_t newblock; + struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; - uint32_t elen = 0; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo; + struct extent_position epos; + uint8_t alloctype; + struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_fileident_iter iter; + uint8_t *impuse; + int ret; - fibh->sbh = fibh->ebh = NULL; - name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS); - if (!name) { - *err = -ENOMEM; - goto out_err; - } + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + alloctype = ICBTAG_FLAG_AD_SHORT; + else + alloctype = ICBTAG_FLAG_AD_LONG; - if (dentry) { - if (!dentry->d_name.len) { - *err = -EINVAL; - goto out_err; 
- } - namelen = udf_put_filename(sb, dentry->d_name.name, - dentry->d_name.len, - name, UDF_NAME_LEN_CS0); - if (!namelen) { - *err = -ENAMETOOLONG; - goto out_err; - } - } else { - namelen = 0; + if (!inode->i_size) { + iinfo->i_alloc_type = alloctype; + mark_inode_dirty(inode); + return 0; } - nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); - - f_pos = udf_ext0_offset(dir); - - fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - dinfo = UDF_I(dir); - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, - &dinfo->i_location, 0); - fibh->soffset = fibh->eoffset = sb->s_blocksize; - goto add; - } - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) { - *err = -EIO; - goto out_err; - } + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, &ret); + if (!(*block)) + return ret; + newblock = udf_get_pblock(inode->i_sb, *block, + iinfo->i_location.partitionReferenceNum, + 0); + if (newblock == 0xffffffff) + return -EFSCORRUPTED; + dbh = sb_getblk(inode->i_sb, newblock); + if (!dbh) + return -ENOMEM; + lock_buffer(dbh); + memcpy(dbh->b_data, iinfo->i_data, inode->i_size); + memset(dbh->b_data + inode->i_size, 0, + inode->i_sb->s_blocksize - inode->i_size); + set_buffer_uptodate(dbh); + unlock_buffer(dbh); + + /* Drop inline data, add block instead */ + iinfo->i_alloc_type = alloctype; + memset(iinfo->i_data + 
iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + iinfo->i_lenExtents = inode->i_size; + epos.bh = NULL; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + ret = udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); + brelse(epos.bh); + if (ret < 0) { + brelse(dbh); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, 1); + return ret; + } + mark_inode_dirty(inode); - block = dinfo->i_location.logicalBlockNum; + /* Now fixup tags in moved directory entries */ + for (ret = udf_fiiter_init(&iter, inode, 0); + !ret && iter.pos < inode->i_size; + ret = udf_fiiter_advance(&iter)) { + iter.fi.descTag.tagLocation = cpu_to_le32(*block); + if (iter.fi.lengthOfImpUse != cpu_to_le16(0)) + impuse = dbh->b_data + iter.pos + + sizeof(struct fileIdentDesc); + else + impuse = NULL; + udf_fiiter_write_fi(&iter, impuse); } + brelse(dbh); + /* + * We don't expect the iteration to fail as the directory has been + * already verified to be correct + */ + WARN_ON_ONCE(ret); + udf_fiiter_release(&iter); - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, - &elen, &offset); + return 0; +} - if (!fi) { - *err = -EIO; - goto out_err; - } +static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry, + struct udf_fileident_iter *iter) +{ + struct udf_inode_info *dinfo = UDF_I(dir); + int nfidlen, namelen = 0; + int ret; + int off, blksize = 1 << dir->i_blkbits; + udf_pblk_t block; + char name[UDF_NAME_LEN_CS0]; + + if (dentry) { + if (!dentry->d_name.len) + return -EINVAL; + namelen = udf_put_filename(dir->i_sb, dentry->d_name.name, + dentry->d_name.len, + name, UDF_NAME_LEN_CS0); + if (!namelen) + return -ENAMETOOLONG; + } + nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (udf_dir_entry_len(cfi) == 
nfidlen) { - cfi->descTag.tagSerialNum = cpu_to_le16(1); - cfi->fileVersionNum = cpu_to_le16(1); - cfi->fileCharacteristics = 0; - cfi->lengthFileIdent = namelen; - cfi->lengthOfImpUse = cpu_to_le16(0); - if (!udf_write_fi(dir, cfi, fi, fibh, NULL, - name)) - goto out_ok; - else { - *err = -EIO; - goto out_err; - } + for (ret = udf_fiiter_init(iter, dir, 0); + !ret && iter->pos < dir->i_size; + ret = udf_fiiter_advance(iter)) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { + if (udf_dir_entry_len(&iter->fi) == nfidlen) { + iter->fi.descTag.tagSerialNum = cpu_to_le16(1); + iter->fi.fileVersionNum = cpu_to_le16(1); + iter->fi.fileCharacteristics = 0; + iter->fi.lengthFileIdent = namelen; + iter->fi.lengthOfImpUse = cpu_to_le16(0); + memcpy(iter->namebuf, name, namelen); + iter->name = iter->namebuf; + return 0; } } } - -add: - f_pos += nfidlen; - + if (ret) { + udf_fiiter_release(iter); + return ret; + } if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && - sb->s_blocksize - fibh->eoffset < nfidlen) { - brelse(epos.bh); - epos.bh = NULL; - fibh->soffset -= udf_ext0_offset(dir); - fibh->eoffset -= udf_ext0_offset(dir); - f_pos -= udf_ext0_offset(dir); - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); - fibh->sbh = fibh->ebh = - udf_expand_dir_adinicb(dir, &block, err); - if (!fibh->sbh) - goto out_err; - epos.block = dinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(dir); - /* Load extent udf_expand_dir_adinicb() has created */ - udf_current_aext(dir, &epos, &eloc, &elen, 1); + blksize - udf_ext0_offset(dir) - iter->pos < nfidlen) { + udf_fiiter_release(iter); + ret = udf_expand_dir_adinicb(dir, &block); + if (ret) + return ret; + ret = udf_fiiter_init(iter, dir, dir->i_size); + if (ret < 0) + return ret; } - /* Entry fits into current block? 
*/ - if (sb->s_blocksize - fibh->eoffset >= nfidlen) { - fibh->soffset = fibh->eoffset; - fibh->eoffset += nfidlen; - if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - } - - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - block = dinfo->i_location.logicalBlockNum; - fi = (struct fileIdentDesc *) - (dinfo->i_data + fibh->soffset - - udf_ext0_offset(dir) + - dinfo->i_lenEAttr); - } else { - block = eloc.logicalBlockNum + - ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - fi = (struct fileIdentDesc *) - (fibh->sbh->b_data + fibh->soffset); - } + /* Get blocknumber to use for entry tag */ + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + block = dinfo->i_location.logicalBlockNum; } else { - /* Round up last extent in the file */ - elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize - - 1) & ~(sb->s_blocksize - 1); - - fibh->soffset = fibh->eoffset - sb->s_blocksize; - fibh->eoffset += nfidlen - sb->s_blocksize; - if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - } + block = iter->eloc.logicalBlockNum + + ((iter->elen - 1) >> dir->i_blkbits); + } + off = iter->pos & (blksize - 1); + if (!off) + off = blksize; + /* Entry fits into current block? 
*/ + if (blksize - udf_ext0_offset(dir) - off >= nfidlen) + goto store_fi; - block = eloc.logicalBlockNum + ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - fibh->ebh = udf_bread(dir, - f_pos >> dir->i_sb->s_blocksize_bits, 1, err); - if (!fibh->ebh) - goto out_err; - /* Extents could have been merged, invalidate our position */ - brelse(epos.bh); - epos.bh = NULL; - epos.block = dinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(dir); - - if (!fibh->soffset) { - /* Find the freshly allocated block */ - while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) - ; - block = eloc.logicalBlockNum + ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - fi = (struct fileIdentDesc *)(fibh->sbh->b_data); - } else { - fi = (struct fileIdentDesc *) - (fibh->sbh->b_data + sb->s_blocksize + - fibh->soffset); - } + ret = udf_fiiter_append_blk(iter); + if (ret) { + udf_fiiter_release(iter); + return ret; } - memset(cfi, 0, sizeof(struct fileIdentDesc)); - if (UDF_SB(sb)->s_udfrev >= 0x0200) - udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, + /* Entry will be completely in the new block? Update tag location... 
*/ + if (!(iter->pos & (blksize - 1))) + block = iter->eloc.logicalBlockNum + + ((iter->elen - 1) >> dir->i_blkbits); +store_fi: + memset(&iter->fi, 0, sizeof(struct fileIdentDesc)); + if (UDF_SB(dir->i_sb)->s_udfrev >= 0x0200) + udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 3, 1, block, sizeof(struct tag)); else - udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, + udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 2, 1, block, sizeof(struct tag)); - cfi->fileVersionNum = cpu_to_le16(1); - cfi->lengthFileIdent = namelen; - cfi->lengthOfImpUse = cpu_to_le16(0); - if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { - dir->i_size += nfidlen; - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - dinfo->i_lenAlloc += nfidlen; - else { - /* Find the last extent and truncate it to proper size */ - while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) - ; - elen -= dinfo->i_lenExtents - dir->i_size; - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - dinfo->i_lenExtents = dir->i_size; - } - - mark_inode_dirty(dir); - goto out_ok; + iter->fi.fileVersionNum = cpu_to_le16(1); + iter->fi.lengthFileIdent = namelen; + iter->fi.lengthOfImpUse = cpu_to_le16(0); + memcpy(iter->namebuf, name, namelen); + iter->name = iter->namebuf; + + dir->i_size += nfidlen; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + dinfo->i_lenAlloc += nfidlen; } else { - *err = -EIO; - goto out_err; + /* Truncate last extent to proper size */ + udf_fiiter_update_elen(iter, iter->elen - + (dinfo->i_lenExtents - dir->i_size)); } + mark_inode_dirty(dir); -out_err: - fi = NULL; - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); -out_ok: - brelse(epos.bh); - kfree(name); - return fi; + return 0; } -static int udf_delete_entry(struct inode *inode, struct 
fileIdentDesc *fi, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi) +static void udf_fiiter_delete_entry(struct udf_fileident_iter *iter) { - cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; + iter->fi.fileCharacteristics |= FID_FILE_CHAR_DELETED; + + if (UDF_QUERY_FLAG(iter->dir->i_sb, UDF_FLAG_STRICT)) + memset(&iter->fi.icb, 0x00, sizeof(struct long_ad)); + + udf_fiiter_write_fi(iter, NULL); +} - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); +static void udf_add_fid_counter(struct super_block *sb, bool dir, int val) +{ + struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); - return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); + if (!lvidiu) + return; + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); + if (dir) + le32_add_cpu(&lvidiu->numDirs, val); + else + le32_add_cpu(&lvidiu->numFiles, val); + udf_updated_lvid(sb); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); } static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); struct inode *dir = d_inode(dentry->d_parent); - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (unlikely(!fi)) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) { inode_dec_link_count(inode); discard_new_inode(inode); return err; } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + udf_fiiter_write_fi(&iter, NULL); dir->i_ctime = dir->i_mtime = 
current_time(dir); mark_inode_dirty(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + udf_fiiter_release(&iter); + udf_add_fid_counter(dir->i_sb, false, 1); d_instantiate_new(dentry, inode); return 0; } -static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = udf_new_inode(dir, mode); @@ -614,10 +386,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); @@ -625,7 +394,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -633,10 +402,7 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); @@ -645,7 +411,7 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, 0); } -static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, 
dev_t rdev) { struct inode *inode; @@ -661,12 +427,11 @@ static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; @@ -678,183 +443,113 @@ static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; - fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); - if (!fi) { - inode_dec_link_count(inode); + err = udf_fiiter_add_entry(inode, NULL, &iter); + if (err) { + clear_nlink(inode); discard_new_inode(inode); - goto out; + return err; } set_nlink(inode, 2); - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(dinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); - cfi.fileCharacteristics = + iter.fi.fileCharacteristics = FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; - udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); - brelse(fibh.sbh); + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); mark_inode_dirty(inode); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) { clear_nlink(inode); - mark_inode_dirty(inode); discard_new_inode(inode); - goto out; + return err; } - cfi.icb.extLength = 
cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + iter.fi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); + udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - err = 0; -out: - return err; + return 0; } static int empty_dir(struct inode *dir) { - struct fileIdentDesc *fi, cfi; - struct udf_fileident_bh fibh; - loff_t f_pos; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - udf_pblk_t block; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo = UDF_I(dir); - - f_pos = udf_ext0_offset(dir); - fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - fibh.sbh = fibh.ebh = NULL; - else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block); - if (!fibh.sbh) { - brelse(epos.bh); + 
struct udf_fileident_iter iter; + int ret; + + for (ret = udf_fiiter_init(&iter, dir, 0); + !ret && iter.pos < dir->i_size; + ret = udf_fiiter_advance(&iter)) { + if (iter.fi.lengthFileIdent && + !(iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED)) { + udf_fiiter_release(&iter); return 0; } - } else { - brelse(epos.bh); - return 0; } - - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, - &elen, &offset); - if (!fi) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } - - if (cfi.lengthFileIdent && - (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } - } - - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); + udf_fiiter_release(&iter); return 1; } static int udf_rmdir(struct inode *dir, struct dentry *dentry) { - int retval; + int ret; struct inode *inode = d_inode(dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi, cfi; + struct udf_fileident_iter iter; struct kernel_lb_addr tloc; - retval = -ENOENT; - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (IS_ERR_OR_NULL(fi)) { - if (fi) - retval = PTR_ERR(fi); + ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (ret) goto out; - } - retval = -EIO; - tloc = lelb_to_cpu(cfi.icb.extLocation); + ret = -EFSCORRUPTED; + tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_rmdir; - retval = -ENOTEMPTY; + ret = -ENOTEMPTY; if (!empty_dir(inode)) goto end_rmdir; - retval = udf_delete_entry(dir, fi, &fibh, &cfi); - if (retval) - goto end_rmdir; + udf_fiiter_delete_entry(&iter); if (inode->i_nlink != 2) udf_warn(inode->i_sb, "empty directory has nlink != 2 (%u)\n", inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); + udf_add_fid_counter(dir->i_sb, true, -1); 
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); mark_inode_dirty(dir); - + ret = 0; end_rmdir: - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - + udf_fiiter_release(&iter); out: - return retval; + return ret; } static int udf_unlink(struct inode *dir, struct dentry *dentry) { - int retval; + int ret; struct inode *inode = d_inode(dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi; - struct fileIdentDesc cfi; + struct udf_fileident_iter iter; struct kernel_lb_addr tloc; - retval = -ENOENT; - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - - if (IS_ERR_OR_NULL(fi)) { - if (fi) - retval = PTR_ERR(fi); + ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (ret) goto out; - } - retval = -EIO; - tloc = lelb_to_cpu(cfi.icb.extLocation); + ret = -EFSCORRUPTED; + tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_unlink; @@ -863,25 +558,20 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } - retval = udf_delete_entry(dir, fi, &fibh, &cfi); - if (retval) - goto end_unlink; + udf_fiiter_delete_entry(&iter); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); inode_dec_link_count(inode); + udf_add_fid_counter(dir->i_sb, false, -1); inode->i_ctime = dir->i_ctime; - retval = 0; - + ret = 0; end_unlink: - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - + udf_fiiter_release(&iter); out: - return retval; + return ret; } -static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); @@ -929,15 +619,20 @@ static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, iinfo->i_location.partitionReferenceNum; bsize = sb->s_blocksize; 
iinfo->i_lenExtents = bsize; - udf_add_aext(inode, &epos, &eloc, bsize, 0); + err = udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); + if (err < 0) { + udf_free_blocks(sb, inode, &eloc, 0, 1); + goto out_no_entry; + } block = udf_get_pblock(sb, block, iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tgetblk(sb, block); + epos.bh = sb_getblk(sb, block); if (unlikely(!epos.bh)) { err = -ENOMEM; + udf_free_blocks(sb, inode, &eloc, 0, 1); goto out_no_entry; } lock_buffer(epos.bh); @@ -1038,28 +733,23 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) return err; - } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); if (UDF_SB(inode->i_sb)->s_lvid_bh) { - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(lvid_get_unique_id(inode->i_sb)); } - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); inc_nlink(inode); + udf_add_fid_counter(dir->i_sb, false, 1); inode->i_ctime = current_time(inode); mark_inode_dirty(inode); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -1073,84 +763,81 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, /* Anybody can rename anything with this: the permission checks are left to 
the * higher-level routines. */ -static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct udf_fileident_bh ofibh, nfibh; - struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; - struct fileIdentDesc ocfi, ncfi; - struct buffer_head *dir_bh = NULL; - int retval = -ENOENT; + struct udf_fileident_iter oiter, niter, diriter; + bool has_diriter = false; + int retval; struct kernel_lb_addr tloc; - struct udf_inode_info *old_iinfo = UDF_I(old_inode); if (flags & ~RENAME_NOREPLACE) return -EINVAL; - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (!ofi || IS_ERR(ofi)) { - if (IS_ERR(ofi)) - retval = PTR_ERR(ofi); - goto end_rename; - } - - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - - brelse(ofibh.sbh); - tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) - goto end_rename; + retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); + if (retval) + return retval; - nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); - if (IS_ERR(nfi)) { - retval = PTR_ERR(nfi); - goto end_rename; - } - if (nfi && !new_inode) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); - nfi = NULL; + tloc = lelb_to_cpu(oiter.fi.icb.extLocation); + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) { + retval = -ENOENT; + goto out_oiter; } - if (S_ISDIR(old_inode->i_mode)) { - int offset = udf_ext0_offset(old_inode); + if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; if (!empty_dir(new_inode)) - goto end_rename; + goto out_oiter; + } + /* + * We need to protect against old_inode getting converted from + * ICB to normal 
directory. + */ + inode_lock_nested(old_inode, I_MUTEX_NONDIR2); + retval = udf_fiiter_find_entry(old_inode, &dotdot_name, + &diriter); + if (retval == -ENOENT) { + udf_err(old_inode->i_sb, + "directory (ino %lu) has no '..' entry\n", + old_inode->i_ino); + retval = -EFSCORRUPTED; } - retval = -EIO; - if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - dir_fi = udf_get_fileident( - old_iinfo->i_data - - (old_iinfo->i_efe ? - sizeof(struct extendedFileEntry) : - sizeof(struct fileEntry)), - old_inode->i_sb->s_blocksize, &offset); - } else { - dir_bh = udf_bread(old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - dir_fi = udf_get_fileident(dir_bh->b_data, - old_inode->i_sb->s_blocksize, &offset); + if (retval) { + inode_unlock(old_inode); + goto out_oiter; } - if (!dir_fi) - goto end_rename; - tloc = lelb_to_cpu(dir_fi->icb.extLocation); + has_diriter = true; + tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != - old_dir->i_ino) - goto end_rename; + old_dir->i_ino) { + retval = -EFSCORRUPTED; + udf_err(old_inode->i_sb, + "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n", + old_inode->i_ino, old_dir->i_ino, + udf_get_lb_pblock(old_inode->i_sb, &tloc, 0)); + goto out_oiter; + } + } + + retval = udf_fiiter_find_entry(new_dir, &new_dentry->d_name, &niter); + if (retval && retval != -ENOENT) + goto out_oiter; + /* Entry found but not passed by VFS? */ + if (!retval && !new_inode) { + retval = -EFSCORRUPTED; + udf_fiiter_release(&niter); + goto out_oiter; } - if (!nfi) { - nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, - &retval); - if (!nfi) - goto end_rename; + /* Entry not found? Need to add one... 
*/ + if (retval) { + udf_fiiter_release(&niter); + retval = udf_fiiter_add_entry(new_dir, new_dentry, &niter); + if (retval) + goto out_oiter; } /* @@ -1163,31 +850,46 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, /* * ok, that's it */ - ncfi.fileVersionNum = ocfi.fileVersionNum; - ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(ocfi.icb)); - udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); + niter.fi.fileVersionNum = oiter.fi.fileVersionNum; + niter.fi.fileCharacteristics = oiter.fi.fileCharacteristics; + memcpy(&(niter.fi.icb), &(oiter.fi.icb), sizeof(oiter.fi.icb)); + udf_fiiter_write_fi(&niter, NULL); + udf_fiiter_release(&niter); - /* The old fid may have moved - find it again */ - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); + /* + * The old entry may have moved due to new entry allocation. Find it + * again. + */ + udf_fiiter_release(&oiter); + retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); + if (retval) { + udf_err(old_dir->i_sb, + "failed to find renamed entry again in directory (ino %lu)\n", + old_dir->i_ino); + } else { + udf_fiiter_delete_entry(&oiter); + udf_fiiter_release(&oiter); + } if (new_inode) { new_inode->i_ctime = current_time(new_inode); inode_dec_link_count(new_inode); + udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), + -1); } old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); - if (dir_fi) { - dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); - udf_update_tag((char *)dir_fi, udf_dir_entry_len(dir_fi)); - if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(old_inode); - else - mark_buffer_dirty_inode(dir_bh, old_inode); + if (has_diriter) { + diriter.fi.icb.extLocation = + 
cpu_to_lelb(UDF_I(new_dir)->i_location); + udf_update_tag((char *)&diriter.fi, + udf_dir_entry_len(&diriter.fi)); + udf_fiiter_write_fi(&diriter, NULL); + udf_fiiter_release(&diriter); + inode_unlock(old_inode); inode_dec_link_count(old_dir); if (new_inode) @@ -1197,22 +899,13 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, mark_inode_dirty(new_dir); } } - - if (ofi) { - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - brelse(ofibh.sbh); - } - - retval = 0; - -end_rename: - brelse(dir_bh); - if (nfi) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); + return 0; +out_oiter: + if (has_diriter) { + udf_fiiter_release(&diriter); + inode_unlock(old_inode); } + udf_fiiter_release(&oiter); return retval; } @@ -1221,17 +914,15 @@ static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; struct inode *inode = NULL; - struct fileIdentDesc cfi; - struct udf_fileident_bh fibh; - - if (!udf_find_entry(d_inode(child), &dotdot_name, &fibh, &cfi)) - return ERR_PTR(-EACCES); + struct udf_fileident_iter iter; + int err; - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + err = udf_fiiter_find_entry(d_inode(child), &dotdot_name, &iter); + if (err) + return ERR_PTR(err); - tloc = lelb_to_cpu(cfi.icb.extLocation); + tloc = lelb_to_cpu(iter.fi.icb.extLocation); + udf_fiiter_release(&iter); inode = udf_iget(child->d_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 4cbf40575965..5bcfe78d5cab 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -54,6 +54,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, struct udf_part_map *map; struct udf_virtual_data *vdata; struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); + int err; map = &sbi->s_partmaps[partition]; vdata = &map->s_type_specific.s_virtual; @@ -79,12 +80,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t 
block, index = vdata->s_start_offset / sizeof(uint32_t) + block; } - loc = udf_block_map(sbi->s_vat_inode, newblock); - - bh = sb_bread(sb, loc); + bh = udf_bread(sbi->s_vat_inode, newblock, 0, &err); if (!bh) { - udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u) VAT: %u[%u]\n", - sb, block, partition, loc, index); + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u)\n", + sb, block, partition); return 0xFFFFFFFF; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 06eda8177b5f..6304e3c5c3d9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -86,6 +86,13 @@ enum { #define UDF_MAX_LVID_NESTING 1000 enum { UDF_MAX_LINKS = 0xffff }; +/* + * We limit filesize to 4TB. This is arbitrary as the on-disk format supports + * more but because the file space is described by a linked list of extents, + * each of which can have at most 1GB, the creation and handling of extents + * gets unusably slow beyond certain point... + */ +#define UDF_MAX_FILESIZE (1ULL << 42) /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); @@ -147,6 +154,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; ei->i_streamdir = 0; + ei->i_hidden = 0; init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); @@ -733,7 +741,7 @@ static int udf_check_vsd(struct super_block *sb) * added */ for (; !nsr && sector < VSD_MAX_SECTOR_OFFSET; sector += sectorsize) { /* Read a block */ - bh = udf_tread(sb, sector >> sb->s_blocksize_bits); + bh = sb_bread(sb, sector >> sb->s_blocksize_bits); if (!bh) break; @@ -1175,7 +1183,6 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; - uint32_t pos; struct virtualAllocationTable20 *vat20; sector_t blocks = sb_bdev_nr_blocks(sb); @@ -1197,10 +1204,14 @@ static int 
udf_load_vat(struct super_block *sb, int p_index, int type1_index) } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { vati = UDF_I(sbi->s_vat_inode); if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - pos = udf_block_map(sbi->s_vat_inode, 0); - bh = sb_bread(sb, pos); - if (!bh) - return -EIO; + int err = 0; + + bh = udf_bread(sbi->s_vat_inode, 0, 0, &err); + if (!bh) { + if (!err) + err = -EFSCORRUPTED; + return err; + } vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) @@ -1838,10 +1849,6 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, uint16_t ident; int ret; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb)) - return -EAGAIN; - bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return -EAGAIN; @@ -1860,10 +1867,10 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set * of anchors. */ -static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, +static int udf_scan_anchors(struct super_block *sb, udf_pblk_t *lastblock, struct kernel_lb_addr *fileset) { - sector_t last[6]; + udf_pblk_t last[6]; int i; struct udf_sb_info *sbi = UDF_SB(sb); int last_count = 0; @@ -1924,46 +1931,6 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, } /* - * Find an anchor volume descriptor and load Volume Descriptor Sequence from - * area specified by it. The function expects sbi->s_lastblock to be the last - * block on the media. - * - * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor - * was not found. 
- */ -static int udf_find_anchor(struct super_block *sb, - struct kernel_lb_addr *fileset) -{ - struct udf_sb_info *sbi = UDF_SB(sb); - sector_t lastblock = sbi->s_last_block; - int ret; - - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret != -EAGAIN) - goto out; - - /* No anchor found? Try VARCONV conversion of block numbers */ - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); - lastblock = udf_variable_to_fixed(sbi->s_last_block); - /* Firstly, we try to not convert number of the last block */ - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret != -EAGAIN) - goto out; - - lastblock = sbi->s_last_block; - /* Secondly, we try with converted number of the last block */ - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret < 0) { - /* VARCONV didn't help. Clear it. */ - UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - } -out: - if (ret == 0) - sbi->s_last_block = lastblock; - return ret; -} - -/* * Check Volume Structure Descriptor, find Anchor block and load Volume * Descriptor Sequence. 
* @@ -2003,7 +1970,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, /* Look for anchor block and load Volume Descriptor Sequence */ sbi->s_anchor = uopt->anchor; - ret = udf_find_anchor(sb, fileset); + ret = udf_scan_anchors(sb, &sbi->s_last_block, fileset); if (ret < 0) { if (!silent && ret == -EAGAIN) udf_warn(sb, "No anchor found\n"); @@ -2297,7 +2264,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = -ENOMEM; goto error_out; } - sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_maxbytes = UDF_MAX_FILESIZE; sb->s_max_links = UDF_MAX_LINKS; return 0; @@ -2454,7 +2421,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, if (bytes) { brelse(bh); newblock = udf_get_lb_pblock(sb, &loc, ++block); - bh = udf_tread(sb, newblock); + bh = sb_bread(sb, newblock); if (!bh) { udf_debug("read failed\n"); goto out; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index f3642f9c23f8..a34c8c4e6d21 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -107,53 +107,45 @@ static int udf_symlink_filler(struct file *file, struct folio *folio) struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; - int err; + int err = 0; unsigned char *p = page_address(page); - struct udf_inode_info *iinfo; - uint32_t pos; + struct udf_inode_info *iinfo = UDF_I(inode); /* We don't support symlinks longer than one block */ if (inode->i_size > inode->i_sb->s_blocksize) { err = -ENAMETOOLONG; - goto out_unmap; + goto out_unlock; } - iinfo = UDF_I(inode); - pos = udf_block_map(inode, 0); - - down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { symlink = iinfo->i_data + iinfo->i_lenEAttr; } else { - bh = sb_bread(inode->i_sb, pos); - + bh = udf_bread(inode, 0, 0, &err); if (!bh) { - err = -EIO; - goto out_unlock_inode; + if (!err) + err = -EFSCORRUPTED; + goto out_err; } - symlink = bh->b_data; } err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, 
p, PAGE_SIZE); brelse(bh); if (err) - goto out_unlock_inode; + goto out_err; - up_read(&iinfo->i_data_sem); SetPageUptodate(page); unlock_page(page); return 0; -out_unlock_inode: - up_read(&iinfo->i_data_sem); +out_err: SetPageError(page); -out_unmap: +out_unlock: unlock_page(page); return err; } -static int udf_symlink_getattr(struct user_namespace *mnt_userns, +static int udf_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -161,7 +153,7 @@ static int udf_symlink_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 036ebd892b85..871856c69df5 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -125,7 +125,7 @@ void udf_discard_prealloc(struct inode *inode) struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; - int8_t etype = -1, netype; + int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = 1 << inode->i_blkbits; @@ -136,7 +136,7 @@ void udf_discard_prealloc(struct inode *inode) epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + while (udf_next_aext(inode, &epos, &eloc, &elen, 0) != -1) { brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) @@ -240,7 +240,7 @@ int udf_truncate_extents(struct inode *inode) brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; - epos.bh = udf_tread(sb, + epos.bh = sb_bread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? 
*/ if (!epos.bh) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 06ff7006b822..312b7c9ef10e 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -44,7 +44,8 @@ struct udf_inode_info { unsigned i_use : 1; /* unallocSpaceEntry */ unsigned i_strat4096 : 1; unsigned i_streamdir : 1; - unsigned reserved : 25; + unsigned i_hidden : 1; /* hidden system inode */ + unsigned reserved : 24; __u8 *i_data; struct kernel_lb_addr i_locStreamdir; __u64 i_lenStreams; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 291b56dd011e..9af6ff7f9747 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -23,7 +23,6 @@ #define UDF_FLAG_STRICT 5 #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 -#define UDF_FLAG_VARCONV 8 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 @@ -55,6 +54,8 @@ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 +#define EFSCORRUPTED EUCLEAN + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 7e258f15b8ef..88692512a466 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -34,9 +34,6 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb, #define udf_debug(fmt, ...) 
\ pr_debug("%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__) -#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) -#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) - #define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF #define UDF_EXTENT_FLAG_MASK 0xC0000000 @@ -83,14 +80,24 @@ extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; -extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; -struct udf_fileident_bh { - struct buffer_head *sbh; - struct buffer_head *ebh; - int soffset; - int eoffset; +struct udf_fileident_iter { + struct inode *dir; /* Directory we are working with */ + loff_t pos; /* Logical position in a dir */ + struct buffer_head *bh[2]; /* Buffer containing 'pos' and possibly + * next buffer if entry straddles + * blocks */ + struct kernel_lb_addr eloc; /* Start of extent containing 'pos' */ + uint32_t elen; /* Length of extent containing 'pos' */ + sector_t loffset; /* Block offset of 'pos' within above + * extent */ + struct extent_position epos; /* Position after the above extent */ + struct fileIdentDesc fi; /* Copied directory entry */ + uint8_t *name; /* Pointer to entry name */ + uint8_t *namebuf; /* Storage for entry name in case + * the name is split between two blocks + */ }; struct udf_vds_record { @@ -121,22 +128,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, u32 meta_file_loc, u32 partition_num); /* namei.c */ -extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, - struct fileIdentDesc *, struct udf_fileident_bh *, - uint8_t *, uint8_t *); static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) { return ALIGN(sizeof(struct fileIdentDesc) + 
le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } -static inline uint8_t *udf_get_fi_ident(struct fileIdentDesc *fi) -{ - return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse); -} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); + /* inode.c */ extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, bool hidden_inode); @@ -151,16 +152,14 @@ static inline struct inode *udf_iget(struct super_block *sb, return __udf_iget(sb, ino, false); } extern int udf_expand_file_adinicb(struct inode *); -extern struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, - udf_pblk_t *block, int *err); extern struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err); extern int udf_setsize(struct inode *, loff_t); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); -extern udf_pblk_t udf_block_map(struct inode *inode, sector_t block); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos); extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, @@ -177,9 +176,6 @@ extern int8_t udf_current_aext(struct inode *, struct extent_position *, extern void udf_update_extra_perms(struct inode *inode, umode_t mode); /* misc.c */ -extern struct buffer_head *udf_tgetblk(struct super_block *sb, - udf_pblk_t block); -extern struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block); extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, uint32_t, uint8_t); extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, @@ -194,7 +190,7 @@ extern void udf_new_tag(char *, uint16_t, uint16_t, 
uint16_t, uint32_t, int); /* lowlevel.c */ extern unsigned int udf_get_last_session(struct super_block *); -extern unsigned long udf_get_last_block(struct super_block *); +udf_pblk_t udf_get_last_block(struct super_block *); /* partition.c */ extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, @@ -243,14 +239,13 @@ extern udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode, uint16_t partition, uint32_t goal, int *err); /* directory.c */ -extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, - struct udf_fileident_bh *, - struct fileIdentDesc *, - struct extent_position *, - struct kernel_lb_addr *, uint32_t *, - sector_t *); -extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, - int *offset); +int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, + loff_t pos); +int udf_fiiter_advance(struct udf_fileident_iter *iter); +void udf_fiiter_release(struct udf_fileident_iter *iter); +void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse); +void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen); +int udf_fiiter_append_blk(struct udf_fileident_iter *iter); extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7e3e08c0166f..06bd84d555bd 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -289,7 +289,7 @@ cg_found: ufs_mark_sb_dirty(sb); inode->i_ino = cg * uspi->s_ipg + bit; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a873de7dec1c..a4246c83a8cd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1212,14 +1212,14 @@ out: return err; } -int 
ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -1229,7 +1229,7 @@ int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return error; } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 29d5a0e0c8f0..36154b5aca6d 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -69,7 +69,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct user_namespace * mnt_userns, +static int ufs_create (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { @@ -86,7 +86,7 @@ static int ufs_create (struct user_namespace * mnt_userns, return ufs_add_nondir(dentry, inode); } -static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ufs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -106,7 +106,7 @@ static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir, +static int ufs_symlink (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; @@ -166,7 +166,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir, +static int 
ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; @@ -243,7 +243,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 550f7c5a3636..6b499180643b 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -123,7 +123,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int ufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); /* namei.c */ diff --git a/fs/utimes.c b/fs/utimes.c index 39f356017635..3701b3946f88 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -7,6 +7,7 @@ #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> +#include <linux/filelock.h> static bool nsec_valid(long nsec) { @@ -62,7 +63,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) } retry_deleg: inode_lock(inode); - error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index c4769a9396c5..075f15c43c78 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -294,14 +294,14 @@ out: return err; } -static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns, +static int vboxsf_dir_mkfile(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, umode_t mode, bool excl) { return 
vboxsf_dir_create(parent, dentry, mode, false, excl, NULL); } -static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns, +static int vboxsf_dir_mkdir(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, umode_t mode) { @@ -387,7 +387,7 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) return 0; } -static int vboxsf_dir_rename(struct user_namespace *mnt_userns, +static int vboxsf_dir_rename(struct mnt_idmap *idmap, struct inode *old_parent, struct dentry *old_dentry, struct inode *new_parent, @@ -430,7 +430,7 @@ err_put_old_path: return err; } -static int vboxsf_dir_symlink(struct user_namespace *mnt_userns, +static int vboxsf_dir_symlink(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, const char *symname) { diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index e1db0f3f7e5e..dd0ae1188e87 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -231,7 +231,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry) return 0; } -int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, +int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *kstat, u32 request_mask, unsigned int flags) { int err; @@ -252,11 +252,11 @@ int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, if (err) return err; - generic_fillattr(&init_user_ns, d_inode(dentry), kstat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat); return 0; } -int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int vboxsf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry)); diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h index 9047befa66c5..05973eb89d52 100644 --- a/fs/vboxsf/vfsmod.h +++ b/fs/vboxsf/vfsmod.h @@ -97,10 +97,10 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, struct shfl_fsobjinfo *info); int vboxsf_stat_dentry(struct 
dentry *dentry, struct shfl_fsobjinfo *info); int vboxsf_inode_revalidate(struct dentry *dentry); -int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, +int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *kstat, u32 request_mask, unsigned int query_flags); -int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int vboxsf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, struct dentry *dentry); diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index aad1f1d998b9..a7ffd718f171 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -34,14 +34,6 @@ config FS_VERITY If unsure, say N. -config FS_VERITY_DEBUG - bool "FS Verity debugging" - depends on FS_VERITY - help - Enable debugging messages related to fs-verity by default. - - Say N unless you are an fs-verity developer. - config FS_VERITY_BUILTIN_SIGNATURES bool "FS Verity builtin signature support" depends on FS_VERITY diff --git a/fs/verity/enable.c b/fs/verity/enable.c index df6b499bf6a1..e13db6507b38 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,136 +7,50 @@ #include "fsverity_private.h" -#include <crypto/hash.h> -#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> -/* - * Read a file data page for Merkle tree construction. Do aggressive readahead, - * since we're sequentially reading the entire file. 
- */ -static struct page *read_file_data_page(struct file *file, pgoff_t index, - struct file_ra_state *ra, - unsigned long remaining_pages) -{ - DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, index); - struct folio *folio; - - folio = __filemap_get_folio(ractl.mapping, index, FGP_ACCESSED, 0); - if (!folio || !folio_test_uptodate(folio)) { - if (folio) - folio_put(folio); - else - page_cache_sync_ra(&ractl, remaining_pages); - folio = read_cache_folio(ractl.mapping, index, NULL, file); - if (IS_ERR(folio)) - return &folio->page; - } - if (folio_test_readahead(folio)) - page_cache_async_ra(&ractl, folio, remaining_pages); - return folio_file_page(folio, index); -} +struct block_buffer { + u32 filled; + u8 *data; +}; -static int build_merkle_tree_level(struct file *filp, unsigned int level, - u64 num_blocks_to_hash, - const struct merkle_tree_params *params, - u8 *pending_hashes, - struct ahash_request *req) +/* Hash a block, writing the result to the next level's pending block buffer. */ +static int hash_one_block(struct inode *inode, + const struct merkle_tree_params *params, + struct ahash_request *req, struct block_buffer *cur) { - struct inode *inode = file_inode(filp); - const struct fsverity_operations *vops = inode->i_sb->s_vop; - struct file_ra_state ra = { 0 }; - unsigned int pending_size = 0; - u64 dst_block_num; - u64 i; + struct block_buffer *next = cur + 1; int err; - if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */ - return -EINVAL; - - if (level < params->num_levels) { - dst_block_num = params->level_start[level]; - } else { - if (WARN_ON(num_blocks_to_hash != 1)) - return -EINVAL; - dst_block_num = 0; /* unused */ - } + /* Zero-pad the block if it's shorter than the block size. 
*/ + memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - file_ra_state_init(&ra, filp->f_mapping); - - for (i = 0; i < num_blocks_to_hash; i++) { - struct page *src_page; - - if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash) - pr_debug("Hashing block %llu of %llu for level %u\n", - i + 1, num_blocks_to_hash, level); - - if (level == 0) { - /* Leaf: hashing a data block */ - src_page = read_file_data_page(filp, i, &ra, - num_blocks_to_hash - i); - if (IS_ERR(src_page)) { - err = PTR_ERR(src_page); - fsverity_err(inode, - "Error %d reading data page %llu", - err, i); - return err; - } - } else { - unsigned long num_ra_pages = - min_t(unsigned long, num_blocks_to_hash - i, - inode->i_sb->s_bdi->io_pages); - - /* Non-leaf: hashing hash block from level below */ - src_page = vops->read_merkle_tree_page(inode, - params->level_start[level - 1] + i, - num_ra_pages); - if (IS_ERR(src_page)) { - err = PTR_ERR(src_page); - fsverity_err(inode, - "Error %d reading Merkle tree page %llu", - err, params->level_start[level - 1] + i); - return err; - } - } + err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data), + offset_in_page(cur->data), + &next->data[next->filled]); + if (err) + return err; + next->filled += params->digest_size; + cur->filled = 0; + return 0; +} - err = fsverity_hash_page(params, inode, req, src_page, - &pending_hashes[pending_size]); - put_page(src_page); - if (err) - return err; - pending_size += params->digest_size; - - if (level == params->num_levels) /* Root hash? 
*/ - return 0; - - if (pending_size + params->digest_size > params->block_size || - i + 1 == num_blocks_to_hash) { - /* Flush the pending hash block */ - memset(&pending_hashes[pending_size], 0, - params->block_size - pending_size); - err = vops->write_merkle_tree_block(inode, - pending_hashes, - dst_block_num, - params->log_blocksize); - if (err) { - fsverity_err(inode, - "Error %d writing Merkle tree block %llu", - err, dst_block_num); - return err; - } - dst_block_num++; - pending_size = 0; - } +static int write_merkle_tree_block(struct inode *inode, const u8 *buf, + unsigned long index, + const struct merkle_tree_params *params) +{ + u64 pos = (u64)index << params->log_blocksize; + int err; - if (fatal_signal_pending(current)) - return -EINTR; - cond_resched(); - } - return 0; + err = inode->i_sb->s_vop->write_merkle_tree_block(inode, buf, pos, + params->block_size); + if (err) + fsverity_err(inode, "Error %d writing Merkle tree block %lu", + err, index); + return err; } /* @@ -152,13 +66,17 @@ static int build_merkle_tree(struct file *filp, u8 *root_hash) { struct inode *inode = file_inode(filp); - u8 *pending_hashes; + const u64 data_size = inode->i_size; + const int num_levels = params->num_levels; struct ahash_request *req; - u64 blocks; - unsigned int level; - int err = -ENOMEM; + struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {}; + struct block_buffer *buffers = &_buffers[1]; + unsigned long level_offset[FS_VERITY_MAX_LEVELS]; + int level; + u64 offset; + int err; - if (inode->i_size == 0) { + if (data_size == 0) { /* Empty file is a special case; root hash is all 0's */ memset(root_hash, 0, params->digest_size); return 0; @@ -167,29 +85,95 @@ static int build_merkle_tree(struct file *filp, /* This allocation never fails, since it's mempool-backed. 
*/ req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); - pending_hashes = kmalloc(params->block_size, GFP_KERNEL); - if (!pending_hashes) - goto out; - /* - * Build each level of the Merkle tree, starting at the leaf level - * (level 0) and ascending to the root node (level 'num_levels - 1'). - * Then at the end (level 'num_levels'), calculate the root hash. + * Allocate the block buffers. Buffer "-1" is for data blocks. + * Buffers 0 <= level < num_levels are for the actual tree levels. + * Buffer 'num_levels' is for the root hash. */ - blocks = ((u64)inode->i_size + params->block_size - 1) >> - params->log_blocksize; - for (level = 0; level <= params->num_levels; level++) { - err = build_merkle_tree_level(filp, level, blocks, params, - pending_hashes, req); + for (level = -1; level < num_levels; level++) { + buffers[level].data = kzalloc(params->block_size, GFP_KERNEL); + if (!buffers[level].data) { + err = -ENOMEM; + goto out; + } + } + buffers[num_levels].data = root_hash; + + BUILD_BUG_ON(sizeof(level_offset) != sizeof(params->level_start)); + memcpy(level_offset, params->level_start, sizeof(level_offset)); + + /* Hash each data block, also hashing the tree blocks as they fill up */ + for (offset = 0; offset < data_size; offset += params->block_size) { + ssize_t bytes_read; + loff_t pos = offset; + + buffers[-1].filled = min_t(u64, params->block_size, + data_size - offset); + bytes_read = __kernel_read(filp, buffers[-1].data, + buffers[-1].filled, &pos); + if (bytes_read < 0) { + err = bytes_read; + fsverity_err(inode, "Error %d reading file data", err); + goto out; + } + if (bytes_read != buffers[-1].filled) { + err = -EINVAL; + fsverity_err(inode, "Short read of file data"); + goto out; + } + err = hash_one_block(inode, params, req, &buffers[-1]); if (err) goto out; - blocks = (blocks + params->hashes_per_block - 1) >> - params->log_arity; + for (level = 0; level < num_levels; level++) { + if (buffers[level].filled + params->digest_size <= + 
params->block_size) { + /* Next block at @level isn't full yet */ + break; + } + /* Next block at @level is full */ + + err = hash_one_block(inode, params, req, + &buffers[level]); + if (err) + goto out; + err = write_merkle_tree_block(inode, + buffers[level].data, + level_offset[level], + params); + if (err) + goto out; + level_offset[level]++; + } + if (fatal_signal_pending(current)) { + err = -EINTR; + goto out; + } + cond_resched(); + } + /* Finish all nonempty pending tree blocks. */ + for (level = 0; level < num_levels; level++) { + if (buffers[level].filled != 0) { + err = hash_one_block(inode, params, req, + &buffers[level]); + if (err) + goto out; + err = write_merkle_tree_block(inode, + buffers[level].data, + level_offset[level], + params); + if (err) + goto out; + } + } + /* The root hash was filled by the last call to hash_one_block(). */ + if (WARN_ON(buffers[num_levels].filled != params->digest_size)) { + err = -EINVAL; + goto out; } - memcpy(root_hash, pending_hashes, params->digest_size); err = 0; out: - kfree(pending_hashes); + for (level = -1; level < num_levels; level++) + kfree(buffers[level].data); fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -263,15 +247,12 @@ static int enable_verity(struct file *filp, * ->begin_enable_verity() and ->end_enable_verity() using the inode * lock and only allow one process to be here at a time on a given file. */ - pr_debug("Building Merkle tree...\n"); BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE); err = build_merkle_tree(filp, &params, desc->root_hash); if (err) { fsverity_err(inode, "Error %d building Merkle tree", err); goto rollback; } - pr_debug("Done building Merkle tree. Root hash is %s:%*phN\n", - params.hash_alg->name, params.digest_size, desc->root_hash); /* * Create the fsverity_info. 
Don't bother trying to save work by @@ -286,10 +267,6 @@ static int enable_verity(struct file *filp, goto rollback; } - if (arg->sig_size) - pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n", - arg->sig_size); - /* * Tell the filesystem to finish enabling verity on the file. * Serialized with ->begin_enable_verity() by the inode lock. @@ -352,7 +329,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2))) return -EINVAL; - if (arg.block_size != PAGE_SIZE) + if (!is_power_of_2(arg.block_size)) return -EINVAL; if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index c7fcb855e068..d34dcc033d72 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -8,10 +8,6 @@ #ifndef _FSVERITY_PRIVATE_H #define _FSVERITY_PRIVATE_H -#ifdef CONFIG_FS_VERITY_DEBUG -#define DEBUG -#endif - #define pr_fmt(fmt) "fs-verity: " fmt #include <linux/fsverity.h> @@ -46,17 +42,20 @@ struct merkle_tree_params { unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ unsigned int hashes_per_block; /* number of hashes per tree block */ - unsigned int log_blocksize; /* log2(block_size) */ - unsigned int log_arity; /* log2(hashes_per_block) */ + unsigned int blocks_per_page; /* PAGE_SIZE / block_size */ + u8 log_digestsize; /* log2(digest_size) */ + u8 log_blocksize; /* log2(block_size) */ + u8 log_arity; /* log2(hashes_per_block) */ + u8 log_blocks_per_page; /* log2(blocks_per_page) */ unsigned int num_levels; /* number of levels in Merkle tree */ u64 tree_size; /* Merkle tree size in bytes */ - unsigned long level0_blocks; /* number of blocks in tree level 0 */ + unsigned long tree_pages; /* Merkle tree size in pages */ /* * Starting block index for each tree level, ordered from leaf level (0) * to root level ('num_levels - 1') */ 
- u64 level_start[FS_VERITY_MAX_LEVELS]; + unsigned long level_start[FS_VERITY_MAX_LEVELS]; }; /* @@ -73,9 +72,10 @@ struct fsverity_info { u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE]; const struct inode *inode; + unsigned long *hash_block_verified; + spinlock_t hash_page_init_lock; }; - #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) @@ -91,9 +91,9 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg, struct ahash_request *req); const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); -int fsverity_hash_page(const struct merkle_tree_params *params, - const struct inode *inode, - struct ahash_request *req, struct page *page, u8 *out); +int fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, struct ahash_request *req, + struct page *page, unsigned int offset, u8 *out); int fsverity_hash_buffer(struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 6f8170cf4ae7..13fcf31be844 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -220,35 +220,33 @@ err_free: } /** - * fsverity_hash_page() - hash a single data or hash page + * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done * @req: preallocated hash request - * @page: the page to hash + * @page: the page containing the block to hash + * @offset: the offset of the block within @page * @out: output digest, size 'params->digest_size' bytes * - * Hash a single data or hash block, assuming block_size == PAGE_SIZE. - * The hash is salted if a salt is specified in the Merkle tree parameters. + * Hash a single data or hash block. 
The hash is salted if a salt is specified + * in the Merkle tree parameters. * * Return: 0 on success, -errno on failure */ -int fsverity_hash_page(const struct merkle_tree_params *params, - const struct inode *inode, - struct ahash_request *req, struct page *page, u8 *out) +int fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, struct ahash_request *req, + struct page *page, unsigned int offset, u8 *out) { struct scatterlist sg; DECLARE_CRYPTO_WAIT(wait); int err; - if (WARN_ON(params->block_size != PAGE_SIZE)) - return -EINVAL; - sg_init_table(&sg, 1); - sg_set_page(&sg, page, PAGE_SIZE, 0); + sg_set_page(&sg, page, params->block_size, offset); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, PAGE_SIZE); + ahash_request_set_crypt(req, &sg, out, params->block_size); if (params->hashstate) { err = crypto_ahash_import(req, params->hashstate); @@ -264,7 +262,7 @@ int fsverity_hash_page(const struct merkle_tree_params *params, err = crypto_wait_req(err, &wait); if (err) - fsverity_err(inode, "Error %d computing page hash", err); + fsverity_err(inode, "Error %d computing block hash", err); return err; } diff --git a/fs/verity/init.c b/fs/verity/init.c index c98b7016f446..023905151035 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -49,7 +49,6 @@ static int __init fsverity_init(void) if (err) goto err_exit_workqueue; - pr_debug("Initialized fs-verity\n"); return 0; err_exit_workqueue: diff --git a/fs/verity/open.c b/fs/verity/open.c index 81ff94442f7b..9366b441d01c 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <linux/mm.h> #include <linux/slab.h> static struct kmem_cache *fsverity_info_cachep; @@ -34,6 +35,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, struct fsverity_hash_alg *hash_alg; int err; u64 blocks; + u64 
blocks_in_level[FS_VERITY_MAX_LEVELS]; u64 offset; int level; @@ -54,7 +56,23 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, goto out_err; } - if (log_blocksize != PAGE_SHIFT) { + /* + * fs/verity/ directly assumes that the Merkle tree block size is a + * power of 2 less than or equal to PAGE_SIZE. Another restriction + * arises from the interaction between fs/verity/ and the filesystems + * themselves: filesystems expect to be able to verify a single + * filesystem block of data at a time. Therefore, the Merkle tree block + * size must also be less than or equal to the filesystem block size. + * + * The above are the only hard limitations, so in theory the Merkle tree + * block size could be as small as twice the digest size. However, + * that's not useful, and it would result in some unusually deep and + * large Merkle trees. So we currently require that the Merkle tree + * block size be at least 1024 bytes. That's small enough to test the + * sub-page block case on systems with 4K pages, but not too small. 
+ */ + if (log_blocksize < 10 || log_blocksize > PAGE_SHIFT || + log_blocksize > inode->i_blkbits) { fsverity_warn(inode, "Unsupported log_blocksize: %u", log_blocksize); err = -EINVAL; @@ -62,6 +80,8 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, } params->log_blocksize = log_blocksize; params->block_size = 1 << log_blocksize; + params->log_blocks_per_page = PAGE_SHIFT - log_blocksize; + params->blocks_per_page = 1 << params->log_blocks_per_page; if (WARN_ON(!is_power_of_2(params->digest_size))) { err = -EINVAL; @@ -74,13 +94,10 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, err = -EINVAL; goto out_err; } - params->log_arity = params->log_blocksize - ilog2(params->digest_size); + params->log_digestsize = ilog2(params->digest_size); + params->log_arity = log_blocksize - params->log_digestsize; params->hashes_per_block = 1 << params->log_arity; - pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n", - hash_alg->name, params->block_size, params->hashes_per_block, - (int)salt_size, salt); - /* * Compute the number of levels in the Merkle tree and create a map from * level to the starting block of that level. 
Level 'num_levels - 1' is @@ -90,31 +107,45 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, /* Compute number of levels and the number of blocks in each level */ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; - pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { fsverity_err(inode, "Too many levels in Merkle tree"); - err = -EINVAL; + err = -EFBIG; goto out_err; } blocks = (blocks + params->hashes_per_block - 1) >> params->log_arity; - /* temporarily using level_start[] to store blocks in level */ - params->level_start[params->num_levels++] = blocks; + blocks_in_level[params->num_levels++] = blocks; } - params->level0_blocks = params->level_start[0]; /* Compute the starting block of each level */ offset = 0; for (level = (int)params->num_levels - 1; level >= 0; level--) { - blocks = params->level_start[level]; params->level_start[level] = offset; - pr_debug("Level %d is %llu blocks starting at index %llu\n", - level, blocks, offset); - offset += blocks; + offset += blocks_in_level[level]; + } + + /* + * With block_size != PAGE_SIZE, an in-memory bitmap will need to be + * allocated to track the "verified" status of hash blocks. Don't allow + * this bitmap to get too large. For now, limit it to 1 MiB, which + * limits the file size to about 4.4 TB with SHA-256 and 4K blocks. + * + * Together with the fact that the data, and thus also the Merkle tree, + * cannot have more than ULONG_MAX pages, this implies that hash block + * indices can always fit in an 'unsigned long'. But to be safe, we + * explicitly check for that too. Note, this is only for hash block + * indices; data block indices might not fit in an 'unsigned long'. 
+ */ + if ((params->block_size != PAGE_SIZE && offset > 1 << 23) || + offset > ULONG_MAX) { + fsverity_err(inode, "Too many blocks in Merkle tree"); + err = -EFBIG; + goto out_err; } params->tree_size = offset << log_blocksize; + params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT; return 0; out_err: @@ -165,7 +196,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, fsverity_err(inode, "Error %d initializing Merkle tree parameters", err); - goto out; + goto fail; } memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); @@ -174,20 +205,48 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->file_digest); if (err) { fsverity_err(inode, "Error %d computing file digest", err); - goto out; + goto fail; } - pr_debug("Computed file digest: %s:%*phN\n", - vi->tree_params.hash_alg->name, - vi->tree_params.digest_size, vi->file_digest); err = fsverity_verify_signature(vi, desc->signature, le32_to_cpu(desc->sig_size)); -out: - if (err) { - fsverity_free_info(vi); - vi = ERR_PTR(err); + if (err) + goto fail; + + if (vi->tree_params.block_size != PAGE_SIZE) { + /* + * When the Merkle tree block size and page size differ, we use + * a bitmap to keep track of which hash blocks have been + * verified. This bitmap must contain one bit per hash block, + * including alignment to a page boundary at the end. + * + * Eventually, to support extremely large files in an efficient + * way, it might be necessary to make pages of this bitmap + * reclaimable. But for now, simply allocating the whole bitmap + * is a simple solution that works well on the files on which + * fsverity is realistically used. E.g., with SHA-256 and 4K + * blocks, a 100MB file only needs a 24-byte bitmap, and the + * bitmap for any file under 17GB fits in a 4K page. 
+ */ + unsigned long num_bits = + vi->tree_params.tree_pages << + vi->tree_params.log_blocks_per_page; + + vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits), + sizeof(unsigned long), + GFP_KERNEL); + if (!vi->hash_block_verified) { + err = -ENOMEM; + goto fail; + } + spin_lock_init(&vi->hash_page_init_lock); } + return vi; + +fail: + fsverity_free_info(vi); + return ERR_PTR(err); } void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) @@ -214,6 +273,7 @@ void fsverity_free_info(struct fsverity_info *vi) if (!vi) return; kfree(vi->tree_params.hashstate); + kvfree(vi->hash_block_verified); kmem_cache_free(fsverity_info_cachep, vi); } @@ -325,67 +385,28 @@ out_free_desc: return err; } -/** - * fsverity_file_open() - prepare to open a verity file - * @inode: the inode being opened - * @filp: the struct file being set up - * - * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. - * - * When combined with fscrypt, this must be called after fscrypt_file_open(). - * Otherwise, we won't have the key set up to decrypt the verity metadata. - * - * Return: 0 on success, -errno on failure - */ -int fsverity_file_open(struct inode *inode, struct file *filp) +int __fsverity_file_open(struct inode *inode, struct file *filp) { - if (!IS_VERITY(inode)) - return 0; - - if (filp->f_mode & FMODE_WRITE) { - pr_debug("Denying opening verity file (ino %lu) for write\n", - inode->i_ino); + if (filp->f_mode & FMODE_WRITE) return -EPERM; - } - return ensure_verity_info(inode); } -EXPORT_SYMBOL_GPL(fsverity_file_open); +EXPORT_SYMBOL_GPL(__fsverity_file_open); -/** - * fsverity_prepare_setattr() - prepare to change a verity inode's attributes - * @dentry: dentry through which the inode is being changed - * @attr: attributes to change - * - * Verity files are immutable, so deny truncates. 
This isn't covered by the - * open-time check because sys_truncate() takes a path, not a file descriptor. - * - * Return: 0 on success, -errno on failure - */ -int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) +int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { - if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) { - pr_debug("Denying truncate of verity file (ino %lu)\n", - d_inode(dentry)->i_ino); + if (attr->ia_valid & ATTR_SIZE) return -EPERM; - } return 0; } -EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); +EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); -/** - * fsverity_cleanup_inode() - free the inode's verity info, if present - * @inode: an inode being evicted - * - * Filesystems must call this on inode eviction to free ->i_verity_info. - */ -void fsverity_cleanup_inode(struct inode *inode) +void __fsverity_cleanup_inode(struct inode *inode) { fsverity_free_info(inode->i_verity_info); inode->i_verity_info = NULL; } -EXPORT_SYMBOL_GPL(fsverity_cleanup_inode); +EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); int __init fsverity_init_info_cache(void) { diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 143a530a8008..e7d3ca919a1e 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -82,8 +82,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return err; } - pr_debug("Valid signature for file digest %s:%*phN\n", - hash_alg->name, hash_alg->digest_size, vi->file_digest); return 0; } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 961ba248021f..f50e3b5b52c9 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -9,39 +9,12 @@ #include <crypto/hash.h> #include <linux/bio.h> -#include <linux/ratelimit.h> static struct workqueue_struct *fsverity_read_workqueue; -/** - * hash_at_level() - compute the location of the block's hash at the given level - * - * @params: (in) the Merkle tree parameters - * @dindex: (in) the index of the data block being verified - 
* @level: (in) the level of hash we want (0 is leaf level) - * @hindex: (out) the index of the hash block containing the wanted hash - * @hoffset: (out) the byte offset to the wanted hash within the hash block - */ -static void hash_at_level(const struct merkle_tree_params *params, - pgoff_t dindex, unsigned int level, pgoff_t *hindex, - unsigned int *hoffset) -{ - pgoff_t position; - - /* Offset of the hash within the level's region, in hashes */ - position = dindex >> (level * params->log_arity); - - /* Index of the hash block in the tree overall */ - *hindex = params->level_start[level] + (position >> params->log_arity); - - /* Offset of the wanted hash (in bytes) within the hash block */ - *hoffset = (position & ((1 << params->log_arity) - 1)) << - (params->log_blocksize - params->log_arity); -} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, - pgoff_t index, int level) + u64 data_pos, int level) { const unsigned int hsize = vi->tree_params.digest_size; @@ -49,159 +22,312 @@ static inline int cmp_hashes(const struct fsverity_info *vi, return 0; fsverity_err(vi->inode, - "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - index, level, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level, vi->tree_params.hash_alg->name, hsize, want_hash, vi->tree_params.hash_alg->name, hsize, real_hash); return -EBADMSG; } +static bool data_is_zeroed(struct inode *inode, struct page *page, + unsigned int len, unsigned int offset) +{ + void *virt = kmap_local_page(page); + + if (memchr_inv(virt + offset, 0, len)) { + kunmap_local(virt); + fsverity_err(inode, + "FILE CORRUPTED! Data past EOF is not zeroed"); + return false; + } + kunmap_local(virt); + return true; +} + +/* + * Returns true if the hash block with index @hblock_idx in the tree, located in + * @hpage, has already been verified. 
+ */ +static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, + unsigned long hblock_idx) +{ + bool verified; + unsigned int blocks_per_page; + unsigned int i; + + /* + * When the Merkle tree block size and page size are the same, then the + * ->hash_block_verified bitmap isn't allocated, and we use PG_checked + * to directly indicate whether the page's block has been verified. + * + * Using PG_checked also guarantees that we re-verify hash pages that + * get evicted and re-instantiated from the backing storage, as new + * pages always start out with PG_checked cleared. + */ + if (!vi->hash_block_verified) + return PageChecked(hpage); + + /* + * When the Merkle tree block size and page size differ, we use a bitmap + * to indicate whether each hash block has been verified. + * + * However, we still need to ensure that hash pages that get evicted and + * re-instantiated from the backing storage are re-verified. To do + * this, we use PG_checked again, but now it doesn't really mean + * "checked". Instead, now it just serves as an indicator for whether + * the hash page is newly instantiated or not. + * + * The first thread that sees PG_checked=0 must clear the corresponding + * bitmap bits, then set PG_checked=1. This requires a spinlock. To + * avoid having to take this spinlock in the common case of + * PG_checked=1, we start with an opportunistic lockless read. + */ + if (PageChecked(hpage)) { + /* + * A read memory barrier is needed here to give ACQUIRE + * semantics to the above PageChecked() test. 
+ */ + smp_rmb(); + return test_bit(hblock_idx, vi->hash_block_verified); + } + spin_lock(&vi->hash_page_init_lock); + if (PageChecked(hpage)) { + verified = test_bit(hblock_idx, vi->hash_block_verified); + } else { + blocks_per_page = vi->tree_params.blocks_per_page; + hblock_idx = round_down(hblock_idx, blocks_per_page); + for (i = 0; i < blocks_per_page; i++) + clear_bit(hblock_idx + i, vi->hash_block_verified); + /* + * A write memory barrier is needed here to give RELEASE + * semantics to the below SetPageChecked() operation. + */ + smp_wmb(); + SetPageChecked(hpage); + verified = false; + } + spin_unlock(&vi->hash_page_init_lock); + return verified; +} + /* - * Verify a single data page against the file's Merkle tree. + * Verify a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. However, - * for efficiency the filesystem may cache the hash pages. Therefore we need - * only ascend the tree until an already-verified page is seen, as indicated by - * the PageChecked bit being set; then verify the path to that page. - * - * This code currently only supports the case where the verity block size is - * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we - * wouldn't be able to use the PageChecked bit. - * - * Note that multiple processes may race to verify a hash page and mark it - * Checked, but it doesn't matter; the result will be the same either way. + * for efficiency the filesystem may cache the hash blocks. Therefore we need + * only ascend the tree until an already-verified hash block is seen, and then + * verify the path to that block. * - * Return: true if the page is valid, else false. + * Return: %true if the data block is valid, else %false. 
*/ -static bool verify_page(struct inode *inode, const struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page, - unsigned long level0_ra_pages) +static bool +verify_data_block(struct inode *inode, struct fsverity_info *vi, + struct ahash_request *req, struct page *data_page, + u64 data_pos, unsigned int dblock_offset_in_page, + unsigned long max_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; - const pgoff_t index = data_page->index; int level; u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; const u8 *want_hash; u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; - struct page *hpages[FS_VERITY_MAX_LEVELS]; - unsigned int hoffsets[FS_VERITY_MAX_LEVELS]; + /* The hash blocks that are traversed, indexed by level */ + struct { + /* Page containing the hash block */ + struct page *page; + /* Index of the hash block in the tree overall */ + unsigned long index; + /* Byte offset of the hash block within @page */ + unsigned int offset_in_page; + /* Byte offset of the wanted hash within @page */ + unsigned int hoffset; + } hblocks[FS_VERITY_MAX_LEVELS]; + /* + * The index of the previous level's block within that level; also the + * index of that block's hash within the current level. + */ + u64 hidx = data_pos >> params->log_blocksize; int err; - if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page))) - return false; - - pr_debug_ratelimited("Verifying data page %lu...\n", index); + if (unlikely(data_pos >= inode->i_size)) { + /* + * This can happen in the data page spanning EOF when the Merkle + * tree block size is less than the page size. The Merkle tree + * doesn't cover data blocks fully past EOF. But the entire + * page spanning EOF can be visible to userspace via a mmap, and + * any part past EOF should be all zeroes. Therefore, we need + * to verify that any data blocks fully past EOF are all zeroes. 
+ */ + return data_is_zeroed(inode, data_page, params->block_size, + dblock_offset_in_page); + } /* - * Starting at the leaf level, ascend the tree saving hash pages along - * the way until we find a verified hash page, indicated by PageChecked; - * or until we reach the root. + * Starting at the leaf level, ascend the tree saving hash blocks along + * the way until we find a hash block that has already been verified, or + * until we reach the root. */ for (level = 0; level < params->num_levels; level++) { - pgoff_t hindex; + unsigned long next_hidx; + unsigned long hblock_idx; + pgoff_t hpage_idx; + unsigned int hblock_offset_in_page; unsigned int hoffset; struct page *hpage; - hash_at_level(params, index, level, &hindex, &hoffset); + /* + * The index of the block in the current level; also the index + * of that block's hash within the next level. + */ + next_hidx = hidx >> params->log_arity; + + /* Index of the hash block in the tree overall */ + hblock_idx = params->level_start[level] + next_hidx; + + /* Index of the hash page in the tree overall */ + hpage_idx = hblock_idx >> params->log_blocks_per_page; - pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", - level, hindex, hoffset); + /* Byte offset of the hash block within the page */ + hblock_offset_in_page = + (hblock_idx << params->log_blocksize) & ~PAGE_MASK; - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex, - level == 0 ? level0_ra_pages : 0); + /* Byte offset of the hash within the page */ + hoffset = hblock_offset_in_page + + ((hidx << params->log_digestsize) & + (params->block_size - 1)); + + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, + hpage_idx, level == 0 ? 
min(max_ra_pages, + params->tree_pages - hpage_idx) : 0); if (IS_ERR(hpage)) { err = PTR_ERR(hpage); fsverity_err(inode, "Error %d reading Merkle tree page %lu", - err, hindex); + err, hpage_idx); goto out; } - - if (PageChecked(hpage)) { + if (is_hash_block_verified(vi, hpage, hblock_idx)) { memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); - pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", - params->hash_alg->name, - hsize, want_hash); goto descend; } - pr_debug_ratelimited("Hash page not yet checked\n"); - hpages[level] = hpage; - hoffsets[level] = hoffset; + hblocks[level].page = hpage; + hblocks[level].index = hblock_idx; + hblocks[level].offset_in_page = hblock_offset_in_page; + hblocks[level].hoffset = hoffset; + hidx = next_hidx; } want_hash = vi->root_hash; - pr_debug("Want root hash: %s:%*phN\n", - params->hash_alg->name, hsize, want_hash); descend: - /* Descend the tree verifying hash pages */ + /* Descend the tree verifying hash blocks. */ for (; level > 0; level--) { - struct page *hpage = hpages[level - 1]; - unsigned int hoffset = hoffsets[level - 1]; - - err = fsverity_hash_page(params, inode, req, hpage, real_hash); + struct page *hpage = hblocks[level - 1].page; + unsigned long hblock_idx = hblocks[level - 1].index; + unsigned int hblock_offset_in_page = + hblocks[level - 1].offset_in_page; + unsigned int hoffset = hblocks[level - 1].hoffset; + + err = fsverity_hash_block(params, inode, req, hpage, + hblock_offset_in_page, real_hash); if (err) goto out; - err = cmp_hashes(vi, want_hash, real_hash, index, level - 1); + err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); if (err) goto out; - SetPageChecked(hpage); + /* + * Mark the hash block as verified. This must be atomic and + * idempotent, as the same hash block might be verified by + * multiple threads concurrently. 
+ */ + if (vi->hash_block_verified) + set_bit(hblock_idx, vi->hash_block_verified); + else + SetPageChecked(hpage); memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); - pr_debug("Verified hash page at level %d, now want %s:%*phN\n", - level - 1, params->hash_alg->name, hsize, want_hash); } - /* Finally, verify the data page */ - err = fsverity_hash_page(params, inode, req, data_page, real_hash); + /* Finally, verify the data block. */ + err = fsverity_hash_block(params, inode, req, data_page, + dblock_offset_in_page, real_hash); if (err) goto out; - err = cmp_hashes(vi, want_hash, real_hash, index, -1); + err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); out: for (; level > 0; level--) - put_page(hpages[level - 1]); + put_page(hblocks[level - 1].page); return err == 0; } +static bool +verify_data_blocks(struct inode *inode, struct fsverity_info *vi, + struct ahash_request *req, struct folio *data_folio, + size_t len, size_t offset, unsigned long max_ra_pages) +{ + const unsigned int block_size = vi->tree_params.block_size; + u64 pos = (u64)data_folio->index << PAGE_SHIFT; + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) + return false; + if (WARN_ON_ONCE(!folio_test_locked(data_folio) || + folio_test_uptodate(data_folio))) + return false; + do { + struct page *data_page = + folio_page(data_folio, offset >> PAGE_SHIFT); + + if (!verify_data_block(inode, vi, req, data_page, pos + offset, + offset & ~PAGE_MASK, max_ra_pages)) + return false; + offset += block_size; + len -= block_size; + } while (len); + return true; +} + /** - * fsverity_verify_page() - verify a data page - * @page: the page to verity + * fsverity_verify_blocks() - verify data in a folio + * @folio: the folio containing the data to verify + * @len: the length of the data to verify in the folio + * @offset: the offset of the data to verify in the folio * - * Verify a page that has just been read from a verity file. 
The page must be a - * pagecache page that is still locked and not yet uptodate. + * Verify data that has just been read from a verity file. The data must be + * located in a pagecache folio that is still locked and not yet uptodate. The + * length and offset of the data must be Merkle tree block size aligned. * - * Return: true if the page is valid, else false. + * Return: %true if the data is valid, else %false. */ -bool fsverity_verify_page(struct page *page) +bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - struct inode *inode = page->mapping->host; - const struct fsverity_info *vi = inode->i_verity_info; + struct inode *inode = folio->mapping->host; + struct fsverity_info *vi = inode->i_verity_info; struct ahash_request *req; bool valid; /* This allocation never fails, since it's mempool-backed. */ req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - valid = verify_page(inode, vi, req, page, 0); + valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); fsverity_free_hash_request(vi->tree_params.hash_alg, req); return valid; } -EXPORT_SYMBOL_GPL(fsverity_verify_page); +EXPORT_SYMBOL_GPL(fsverity_verify_blocks); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed * @bio: the bio to verify * - * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. If a - * page fails verification, then bio->bi_status is set to an error status. + * Verify the bio's data against the file's Merkle tree. All bio data segments + * must be aligned to the file's Merkle tree block size. If any data fails + * verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. 
Filesystems that @@ -212,15 +338,13 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); void fsverity_verify_bio(struct bio *bio) { struct inode *inode = bio_first_page_all(bio)->mapping->host; - const struct fsverity_info *vi = inode->i_verity_info; - const struct merkle_tree_params *params = &vi->tree_params; + struct fsverity_info *vi = inode->i_verity_info; struct ahash_request *req; - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; unsigned long max_ra_pages = 0; /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(params->hash_alg, GFP_NOFS); + req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); if (bio->bi_opf & REQ_RAHEAD) { /* @@ -232,24 +356,18 @@ void fsverity_verify_bio(struct bio *bio) * This improves sequential read performance, as it greatly * reduces the number of I/O requests made to the Merkle tree. */ - bio_for_each_segment_all(bv, bio, iter_all) - max_ra_pages++; - max_ra_pages /= 4; + max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - unsigned long level0_index = page->index >> params->log_arity; - unsigned long level0_ra_pages = - min(max_ra_pages, params->level0_blocks - level0_index); - - if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio_for_each_folio_all(fi, bio) { + if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, + fi.offset, max_ra_pages)) { bio->bi_status = BLK_STS_IOERR; break; } } - fsverity_free_hash_request(params->hash_alg, req); + fsverity_free_hash_request(vi->tree_params.hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/xattr.c b/fs/xattr.c index adab9a70b536..14a7eb3c8fa8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -9,6 +9,7 @@ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> 
#include <linux/file.h> #include <linux/xattr.h> @@ -82,7 +83,7 @@ xattr_resolve_name(struct inode *inode, const char **name) /** * may_write_xattr - check whether inode allows writing xattr - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never @@ -94,13 +95,13 @@ xattr_resolve_name(struct inode *inode, const char **name) * * Return: On success zero is returned. On error a negative errno is returned. */ -int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } @@ -110,13 +111,13 @@ int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) * because different namespaces have very different rules. */ static int -xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, +xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; - ret = may_write_xattr(mnt_userns, inode); + ret = may_write_xattr(idmap, inode); if (ret) return ret; } @@ -148,11 +149,11 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, return (mask & MAY_WRITE) ? 
-EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && - !inode_owner_or_capable(mnt_userns, inode)) + !inode_owner_or_capable(idmap, inode)) return -EPERM; } - return inode_permission(mnt_userns, inode, mask); + return inode_permission(idmap, inode, mask); } /* @@ -183,7 +184,7 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) EXPORT_SYMBOL(xattr_supported_namespace); int -__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { @@ -199,7 +200,7 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ - return handler->set(handler, mnt_userns, dentry, inode, name, value, + return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); @@ -208,7 +209,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -221,7 +222,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * is executed. It also assumes that the caller will make the appropriate * permission checks. 
*/ -int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, +int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -233,7 +234,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { - error = __vfs_setxattr(mnt_userns, dentry, inode, name, value, + error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); @@ -264,7 +265,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @mnt_userns: user namespace of the mount of the target inode + * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -274,18 +275,18 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * a delegation was broken on, NULL if none. 
*/ int -__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); + error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_setxattr(mnt_userns, dentry, name, value, size, + error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; @@ -294,7 +295,7 @@ __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) goto out; - error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value, + error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: @@ -303,7 +304,7 @@ out: EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int -vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; @@ -312,7 +313,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; @@ -320,7 +321,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, retry_deleg: inode_lock(inode); - error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, + error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); @@ -337,19 +338,19 @@ retry_deleg: EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t -xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, 
+xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { - len = security_inode_getsecurity(mnt_userns, inode, name, + len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } - len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, + len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; @@ -374,7 +375,7 @@ out_noalloc: * Returns the result of alloc, if failed, or the getxattr operation. */ int -vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { @@ -383,7 +384,7 @@ vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, char *value = *xattr_value; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_READ); + error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; @@ -427,13 +428,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, EXPORT_SYMBOL(__vfs_getxattr); ssize_t -vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_READ); + error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; @@ -444,7 +445,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = xattr_getsecurity(mnt_userns, inode, suffix, value, + int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a 
security module @@ -480,7 +481,7 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); @@ -494,7 +495,7 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; - return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0, + return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); @@ -503,25 +504,25 @@ EXPORT_SYMBOL(__vfs_removexattr); * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * - * @mnt_userns: user namespace of the mount of the target inode + * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. 
*/ int -__vfs_removexattr_locked(struct user_namespace *mnt_userns, +__vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); + error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_removexattr(mnt_userns, dentry, name); + error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; @@ -529,7 +530,7 @@ __vfs_removexattr_locked(struct user_namespace *mnt_userns, if (error) goto out; - error = __vfs_removexattr(mnt_userns, dentry, name); + error = __vfs_removexattr(idmap, dentry, name); if (!error) { fsnotify_xattr(dentry); @@ -542,7 +543,7 @@ out: EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int -vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; @@ -551,7 +552,7 @@ vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, retry_deleg: inode_lock(inode); - error = __vfs_removexattr_locked(mnt_userns, dentry, + error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); @@ -605,7 +606,7 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); - return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, + return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } @@ -714,8 +715,7 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d, if (is_posix_acl_xattr(ctx->kname->name)) error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); else - error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, - ctx->kvalue, ctx->size); + error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size); if (error > 
0) { if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; @@ -892,9 +892,9 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, return error; if (is_posix_acl_xattr(kname)) - return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); + return vfs_remove_acl(idmap, d, kname); - return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); + return vfs_removexattr(idmap, d, kname); } static int path_removexattr(const char __user *pathname, diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index a05f44eb8178..791db7d9c849 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,7 +242,7 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; @@ -258,7 +258,7 @@ xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); + error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; set_mode = true; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index dcd176149c7a..bf7f960997d3 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 595a5bcf46b9..d06c0cc62f61 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1047,7 +1047,7 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = 
xfs_vn_setattr_size(file_mnt_user_ns(file), + error = xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), &iattr); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d354ea2b74f9..7f1d715faab5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -777,7 +777,7 @@ xfs_inode_inherit_flags2( */ int xfs_init_new_inode( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, @@ -823,11 +823,11 @@ xfs_init_new_inode( ip->i_projid = prid; if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; inode->i_mode = mode; } else { - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); } /* @@ -836,7 +836,7 @@ xfs_init_new_inode( * (and only if the irix_sgid_inherit compatibility variable is set). */ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) + !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -946,7 +946,7 @@ xfs_bumplink( int xfs_create( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, xfs_inode_t *dp, struct xfs_name *name, umode_t mode, @@ -978,8 +978,8 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1020,7 +1020,7 @@ xfs_create( */ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + error = xfs_init_new_inode(idmap, tp, dp, ino, mode, is_dir ? 
2 : 1, rdev, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; @@ -1102,7 +1102,7 @@ xfs_create( int xfs_create_tmpfile( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp) @@ -1127,8 +1127,8 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1144,7 +1144,7 @@ xfs_create_tmpfile( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + error = xfs_init_new_inode(idmap, tp, dp, ino, mode, 0, 0, prid, false, &ip); if (error) goto out_trans_cancel; @@ -2709,7 +2709,7 @@ out_trans_abort: */ static int xfs_rename_alloc_whiteout( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) @@ -2718,7 +2718,7 @@ xfs_rename_alloc_whiteout( struct qstr name; int error; - error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, + error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error; @@ -2750,7 +2750,7 @@ xfs_rename_alloc_whiteout( */ int xfs_rename( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, @@ -2782,7 +2782,7 @@ xfs_rename( * appropriately. 
*/ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(mnt_userns, src_name, + error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fa780f08dc89..69d21e42c10a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -473,18 +473,18 @@ int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct user_namespace *mnt_userns, +int xfs_create(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, bool need_xattr, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct user_namespace *mnt_userns, +int xfs_create_tmpfile(struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_rename(struct user_namespace *mnt_userns, +int xfs_rename(struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, @@ -515,7 +515,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp, +int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, struct xfs_inode **ipp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 736510bc241b..55bb01173cde 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -665,7 +665,7 @@ xfs_ioc_fsbulkstat( 
struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; @@ -844,7 +844,7 @@ xfs_ioc_bulkstat( struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), }; int error; @@ -1297,7 +1297,7 @@ xfs_ioctl_setattr_check_projid( int xfs_fileattr_set( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { @@ -1371,7 +1371,7 @@ xfs_fileattr_set( */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && - !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d4abba2c13c1..38be600b5e1e 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -49,7 +49,7 @@ xfs_fileattr_get( extern int xfs_fileattr_set( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 2f54b701eead..ee35eea1ecce 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -204,7 +204,7 @@ xfs_compat_ioc_fsbulkstat( struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 515318dfbc38..24718adb3c16 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -162,12 +162,12 @@ xfs_create_need_xattr( STATIC int xfs_generic_create( - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t rdev, - struct file *tmpfile) /* unnamed file */ + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry 
*dentry, + umode_t mode, + dev_t rdev, + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -196,11 +196,11 @@ xfs_generic_create( goto out_free_acl; if (!tmpfile) { - error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev, + error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev, xfs_create_need_xattr(dir, default_acl, acl), &ip); } else { - error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip); + error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; @@ -255,35 +255,34 @@ xfs_generic_create( STATIC int xfs_vn_mknod( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); + return xfs_generic_create(idmap, dir, dentry, mode, rdev, NULL); } STATIC int xfs_vn_create( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); + return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } STATIC int xfs_vn_mkdir( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - NULL); + return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); } STATIC struct dentry * @@ -400,7 +399,7 @@ xfs_vn_unlink( STATIC int xfs_vn_symlink( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -417,7 +416,7 @@ xfs_vn_symlink( if (unlikely(error)) goto out; - error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip); + error = xfs_symlink(idmap, XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; @@ -443,7 +442,7 @@ xfs_vn_symlink( 
STATIC int xfs_vn_rename( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *odir, struct dentry *odentry, struct inode *ndir, @@ -472,7 +471,7 @@ xfs_vn_rename( if (unlikely(error)) return error; - return xfs_rename(mnt_userns, XFS_I(odir), &oname, + return xfs_rename(idmap, XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL, flags); } @@ -549,7 +548,7 @@ xfs_stat_blksize( STATIC int xfs_vn_getattr( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, @@ -558,8 +557,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); trace_xfs_getattr(ip); @@ -627,7 +626,7 @@ xfs_vn_getattr( static int xfs_vn_change_ok( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -639,7 +638,7 @@ xfs_vn_change_ok( if (xfs_is_shutdown(mp)) return -EIO; - return setattr_prepare(mnt_userns, dentry, iattr); + return setattr_prepare(idmap, dentry, iattr); } /* @@ -650,7 +649,7 @@ xfs_vn_change_ok( */ static int xfs_setattr_nonsize( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) @@ -679,14 +678,14 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = from_vfsuid(mnt_userns, i_user_ns(inode), + uid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = from_vfsgid(mnt_userns, i_user_ns(inode), + gid = from_vfsgid(idmap, i_user_ns(inode), 
iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { @@ -719,18 +718,18 @@ xfs_setattr_nonsize( * also. */ if (XFS_IS_UQUOTA_ON(mp) && - i_uid_needs_update(mnt_userns, iattr, inode)) { + i_uid_needs_update(idmap, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } if (XFS_IS_GQUOTA_ON(mp) && - i_gid_needs_update(mnt_userns, iattr, inode)) { + i_gid_needs_update(idmap, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -758,7 +757,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (error) return error; } @@ -779,7 +778,7 @@ out_dqrele: */ STATIC int xfs_setattr_size( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) @@ -812,7 +811,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. 
*/ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); + return xfs_setattr_nonsize(idmap, dentry, ip, iattr); } /* @@ -956,7 +955,7 @@ xfs_setattr_size( } ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -977,7 +976,7 @@ out_trans_cancel: int xfs_vn_setattr_size( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -986,15 +985,15 @@ xfs_vn_setattr_size( trace_xfs_setattr(ip); - error = xfs_vn_change_ok(mnt_userns, dentry, iattr); + error = xfs_vn_change_ok(idmap, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, dentry, ip, iattr); + return xfs_setattr_size(idmap, dentry, ip, iattr); } STATIC int xfs_vn_setattr( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -1014,14 +1013,14 @@ xfs_vn_setattr( return error; } - error = xfs_vn_setattr_size(mnt_userns, dentry, iattr); + error = xfs_vn_setattr_size(idmap, dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { trace_xfs_setattr(ip); - error = xfs_vn_change_ok(mnt_userns, dentry, iattr); + error = xfs_vn_change_ok(idmap, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); + error = xfs_setattr_nonsize(idmap, dentry, ip, iattr); } return error; @@ -1092,12 +1091,12 @@ xfs_vn_fiemap( STATIC int xfs_vn_tmpfile( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + int err = xfs_generic_create(idmap, dir, file->f_path.dentry, mode, 0, file); return finish_open_simple(file, err); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index e570dcb5df8d..7f84a0843b24 100644 --- 
a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,7 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -int xfs_vn_setattr_size(struct user_namespace *mnt_userns, +int xfs_vn_setattr_size(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *vap); int xfs_inode_init_security(struct inode *inode, struct inode *dir, diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index a1c2bcf65d37..f225413a993c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -55,7 +55,7 @@ struct xfs_bstat_chunk { STATIC int xfs_bulkstat_one_int( struct xfs_mount *mp, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_trans *tp, xfs_ino_t ino, struct xfs_bstat_chunk *bc) @@ -83,8 +83,8 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); /* xfs_iget returns the following without needing * further change. 
@@ -178,7 +178,7 @@ xfs_bulkstat_one( struct xfs_trans *tp; int error; - if (breq->mnt_userns != &init_user_ns) { + if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; @@ -199,7 +199,7 @@ xfs_bulkstat_one( if (error) goto out; - error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp, + error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp, breq->startino, &bc); xfs_trans_cancel(tp); out: @@ -225,7 +225,7 @@ xfs_bulkstat_iwalk( struct xfs_bstat_chunk *bc = data; int error; - error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data); + error = xfs_bulkstat_one_int(mp, bc->breq->idmap, tp, ino, data); /* bulkstat just skips over missing inodes */ if (error == -ENOENT || error == -EINVAL) return 0; @@ -270,7 +270,7 @@ xfs_bulkstat( unsigned int iwalk_flags = 0; int error; - if (breq->mnt_userns != &init_user_ns) { + if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index e2d0eba43f35..1659f13f17a8 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -8,7 +8,7 @@ /* In-memory representation of a userspace request for batch inode data. 
*/ struct xfs_ibulk { struct xfs_mount *mp; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; void __user *ubuffer; /* user output buffer */ xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f9878021e7d0..e88f18f85e4b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -34,6 +34,7 @@ typedef __u32 xfs_nlink_t; #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/filelock.h> #include <linux/swap.h> #include <linux/errno.h> #include <linux/sched/signal.h> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 38d23f0e703a..23d16186e1a3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -322,7 +322,7 @@ xfs_fs_commit_blocks( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e2c542f6dcd4..7dc0db7f5a76 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -787,7 +787,7 @@ xfs_qm_qino_alloc( error = xfs_dialloc(&tp, 0, S_IFREG, &ino); if (!error) - error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino, + error = xfs_init_new_inode(&nop_mnt_idmap, tp, NULL, ino, S_IFREG, 1, 0, 0, false, ipp); if (error) { xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 8389f3ef88ef..85e433df6a3f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -144,7 +144,7 @@ xfs_readlink( int xfs_symlink( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, @@ -193,8 +193,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. 
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -231,7 +231,7 @@ xfs_symlink( */ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, + error = xfs_init_new_inode(idmap, tp, dp, ino, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, false, &ip); if (error) diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 2586b7e393f3..d1ca1ce62a93 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -7,7 +7,7 @@ /* Kernel only symlink definitions */ -int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp, +int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 10aa1fd39d2b..7b9a0ed1b11f 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -133,7 +133,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, static int xfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a9c5c3f720ad..72ef97320b99 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -604,7 +604,7 @@ unlock: return ret; } -static int zonefs_inode_setattr(struct user_namespace *mnt_userns, +static int zonefs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -613,7 +613,7 @@ static int 
zonefs_inode_setattr(struct user_namespace *mnt_userns, if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; - ret = setattr_prepare(&init_user_ns, dentry, iattr); + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (ret) return ret; @@ -630,7 +630,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(mnt_userns, inode, iattr); + ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (ret) return ret; } @@ -641,7 +641,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, return ret; } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); return 0; } @@ -1427,7 +1427,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, struct super_block *sb = parent->i_sb; inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; - inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); + inode_init_owner(&nop_mnt_idmap, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h index 7b9571e00cc4..832c6464f063 100644 --- a/include/acpi/actbl3.h +++ b/include/acpi/actbl3.h @@ -443,6 +443,7 @@ struct acpi_tpm2_phy { #define ACPI_TPM2_RESERVED10 10 #define ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC 11 /* V1.2 Rev 8 */ #define ACPI_TPM2_RESERVED 12 +#define ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON 13 /* Optional trailer appears after any start_method subtables */ diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h index 4fc8018eddda..1220d185c776 100644 --- a/include/drm/drm_client.h +++ b/include/drm/drm_client.h @@ -127,11 +127,6 @@ struct drm_client_buffer { struct drm_client_dev *client; /** - * @handle: Buffer handle - */ - u32 handle; - - /** * @pitch: Buffer pitch */ u32 pitch; diff --git 
a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index b111dc7ada78..095370ef029d 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -208,6 +208,18 @@ struct drm_fb_helper { * the smem_start field should always be cleared to zero. */ bool hint_leak_smem_start; + +#ifdef CONFIG_FB_DEFERRED_IO + /** + * @fbdefio: + * + * Temporary storage for the driver's FB deferred I/O handler. If the + * driver uses the DRM fbdev emulation layer, this is set by the core + * to a generic deferred I/O handler if a driver is preferring to use + * a shadow buffer. + */ + struct fb_deferred_io fbdefio; +#endif }; static inline struct drm_fb_helper * diff --git a/include/drm/drm_vma_manager.h b/include/drm/drm_vma_manager.h index 4f8c35206f7c..6c2a2f21dbf0 100644 --- a/include/drm/drm_vma_manager.h +++ b/include/drm/drm_vma_manager.h @@ -74,6 +74,7 @@ void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr, struct drm_vma_offset_node *node); int drm_vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag); +int drm_vma_node_allow_once(struct drm_vma_offset_node *node, struct drm_file *tag); void drm_vma_node_revoke(struct drm_vma_offset_node *node, struct drm_file *tag); bool drm_vma_node_is_allowed(struct drm_vma_offset_node *node, diff --git a/include/kunit/test.h b/include/kunit/test.h index 87ea90576b50..08d3559dd703 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -303,7 +303,6 @@ static inline int kunit_run_all_tests(void) */ #define kunit_test_init_section_suites(__suites...) 
\ __kunit_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \ - CONCATENATE(__UNIQUE_ID(suites), _probe), \ ##__suites) #define kunit_test_init_section_suite(suite) \ @@ -683,8 +682,9 @@ do { \ .right_text = #right, \ }; \ \ - if (likely(memcmp(__left, __right, __size) op 0)) \ - break; \ + if (likely(__left && __right)) \ + if (likely(memcmp(__left, __right, __size) op 0)) \ + break; \ \ _KUNIT_FAILED(test, \ assert_type, \ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9270cd87da3f..6470f67e63c4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,7 +263,7 @@ struct vgic_dist { struct vgic_io_device dist_iodev; bool has_its; - bool save_its_tables_in_progress; + bool table_write_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. diff --git a/include/linux/apple-gmux.h b/include/linux/apple-gmux.h index ddb10aa67b14..1f68b49bcd68 100644 --- a/include/linux/apple-gmux.h +++ b/include/linux/apple-gmux.h @@ -8,18 +8,118 @@ #define LINUX_APPLE_GMUX_H #include <linux/acpi.h> +#include <linux/io.h> +#include <linux/pnp.h> #define GMUX_ACPI_HID "APP000B" +/* + * gmux port offsets. Many of these are not yet used, but may be in the + * future, and it's useful to have them documented here anyhow. 
+ */ +#define GMUX_PORT_VERSION_MAJOR 0x04 +#define GMUX_PORT_VERSION_MINOR 0x05 +#define GMUX_PORT_VERSION_RELEASE 0x06 +#define GMUX_PORT_SWITCH_DISPLAY 0x10 +#define GMUX_PORT_SWITCH_GET_DISPLAY 0x11 +#define GMUX_PORT_INTERRUPT_ENABLE 0x14 +#define GMUX_PORT_INTERRUPT_STATUS 0x16 +#define GMUX_PORT_SWITCH_DDC 0x28 +#define GMUX_PORT_SWITCH_EXTERNAL 0x40 +#define GMUX_PORT_SWITCH_GET_EXTERNAL 0x41 +#define GMUX_PORT_DISCRETE_POWER 0x50 +#define GMUX_PORT_MAX_BRIGHTNESS 0x70 +#define GMUX_PORT_BRIGHTNESS 0x74 +#define GMUX_PORT_VALUE 0xc2 +#define GMUX_PORT_READ 0xd0 +#define GMUX_PORT_WRITE 0xd4 + +#define GMUX_MIN_IO_LEN (GMUX_PORT_BRIGHTNESS + 4) + #if IS_ENABLED(CONFIG_APPLE_GMUX) +static inline bool apple_gmux_is_indexed(unsigned long iostart) +{ + u16 val; + + outb(0xaa, iostart + 0xcc); + outb(0x55, iostart + 0xcd); + outb(0x00, iostart + 0xce); + + val = inb(iostart + 0xcc) | (inb(iostart + 0xcd) << 8); + if (val == 0x55aa) + return true; + + return false; +} /** - * apple_gmux_present() - detect if gmux is built into the machine + * apple_gmux_detect() - detect if gmux is built into the machine + * + * @pnp_dev: Device to probe or NULL to use the first matching device + * @indexed_ret: Returns (by reference) if the gmux is indexed or not + * + * Detect if a supported gmux device is present by actually probing it. + * This avoids the false positives returned on some models by + * apple_gmux_present(). + * + * Return: %true if a supported gmux ACPI device is detected and the kernel + * was configured with CONFIG_APPLE_GMUX, %false otherwise. 
+ */ +static inline bool apple_gmux_detect(struct pnp_dev *pnp_dev, bool *indexed_ret) +{ + u8 ver_major, ver_minor, ver_release; + struct device *dev = NULL; + struct acpi_device *adev; + struct resource *res; + bool indexed = false; + bool ret = false; + + if (!pnp_dev) { + adev = acpi_dev_get_first_match_dev(GMUX_ACPI_HID, NULL, -1); + if (!adev) + return false; + + dev = get_device(acpi_get_first_physical_node(adev)); + acpi_dev_put(adev); + if (!dev) + return false; + + pnp_dev = to_pnp_dev(dev); + } + + res = pnp_get_resource(pnp_dev, IORESOURCE_IO, 0); + if (!res || resource_size(res) < GMUX_MIN_IO_LEN) + goto out; + + /* + * Invalid version information may indicate either that the gmux + * device isn't present or that it's a new one that uses indexed io. + */ + ver_major = inb(res->start + GMUX_PORT_VERSION_MAJOR); + ver_minor = inb(res->start + GMUX_PORT_VERSION_MINOR); + ver_release = inb(res->start + GMUX_PORT_VERSION_RELEASE); + if (ver_major == 0xff && ver_minor == 0xff && ver_release == 0xff) { + indexed = apple_gmux_is_indexed(res->start); + if (!indexed) + goto out; + } + + if (indexed_ret) + *indexed_ret = indexed; + + ret = true; +out: + put_device(dev); + return ret; +} + +/** + * apple_gmux_present() - check if gmux ACPI device is present * * Drivers may use this to activate quirks specific to dual GPU MacBook Pros * and Mac Pros, e.g. for deferred probing, runtime pm and backlight. * - * Return: %true if gmux is present and the kernel was configured + * Return: %true if gmux ACPI device is present and the kernel was configured * with CONFIG_APPLE_GMUX, %false otherwise. 
*/ static inline bool apple_gmux_present(void) @@ -34,6 +134,11 @@ static inline bool apple_gmux_present(void) return false; } +static inline bool apple_gmux_detect(struct pnp_dev *pnp_dev, bool *indexed_ret) +{ + return false; +} + #endif /* !CONFIG_APPLE_GMUX */ #endif /* LINUX_APPLE_GMUX_H */ diff --git a/include/linux/audit.h b/include/linux/audit.h index 3608992848d3..31086a72e32a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -14,6 +14,7 @@ #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/netfilter/nf_tables.h> +#include <uapi/linux/fanotify.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) @@ -416,7 +417,7 @@ extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(char *name); -extern void __audit_fanotify(unsigned int response); +extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, @@ -523,10 +524,10 @@ static inline void audit_log_kern_module(char *name) __audit_log_kern_module(name); } -static inline void audit_fanotify(unsigned int response) +static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (!audit_dummy_context()) - __audit_fanotify(response); + __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) @@ -679,7 +680,7 @@ static inline void audit_log_kern_module(char *name) { } -static inline void audit_fanotify(unsigned int response) +static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct 
timespec64 offset) diff --git a/include/linux/bio.h b/include/linux/bio.h index c1da63f6c808..d766be7152e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -12,6 +12,8 @@ #define BIO_MAX_VECS 256U +struct queue_limits; + static inline unsigned int bio_max_segs(unsigned int nr_segs) { return min(nr_segs, BIO_MAX_VECS); @@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/capability.h b/include/linux/capability.h index 65efb74c3585..03c2a613ad40 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -42,6 +42,7 @@ struct inode; struct dentry; struct task_struct; struct user_namespace; +struct mnt_idmap; extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_init_eff_set; @@ -248,9 +249,9 @@ static inline bool ns_capable_setid(struct user_namespace *ns, int cap) } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct inode *inode); -bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, +bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); @@ -271,11 +272,11 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) } /* audit system wants to get cap info from files as well */ -int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, +int 
get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); -int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, +int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 00af2c98da75..4497d0a6772c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -99,16 +99,6 @@ struct ceph_options { #define CEPH_AUTH_NAME_DEFAULT "guest" -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, - CEPH_MOUNT_RECOVER, -}; - static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) { return timeout ?: MAX_SCHEDULE_TIMEOUT; diff --git a/include/linux/efi.h b/include/linux/efi.h index 4b27519143f5..98598bd1d2fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -668,7 +668,8 @@ extern struct efi { #define EFI_RT_SUPPORTED_ALL 0x3fff -#define EFI_RT_SUPPORTED_TIME_SERVICES 0x000f +#define EFI_RT_SUPPORTED_TIME_SERVICES 0x0003 +#define EFI_RT_SUPPORTED_WAKEUP_SERVICES 0x000c #define EFI_RT_SUPPORTED_VARIABLE_SERVICES 0x0070 extern struct mm_struct efi_mm; diff --git a/include/linux/evm.h b/include/linux/evm.h index 7a9ee2157f69..7dc1ee74169f 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -21,34 +21,34 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, void *xattr_value, size_t xattr_value_len, struct integrity_iint_cache *iint); -extern int evm_inode_setattr(struct user_namespace *mnt_userns, +extern int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid); -extern int evm_inode_setxattr(struct user_namespace *mnt_userns, +extern int evm_inode_setxattr(struct mnt_idmap 
*idmap, struct dentry *dentry, const char *name, const void *value, size_t size); extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); -extern int evm_inode_removexattr(struct user_namespace *mnt_userns, +extern int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name); -static inline void evm_inode_post_remove_acl(struct user_namespace *mnt_userns, +static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { evm_inode_post_removexattr(dentry, acl_name); } -extern int evm_inode_set_acl(struct user_namespace *mnt_userns, +extern int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { - return evm_inode_set_acl(mnt_userns, dentry, acl_name, NULL); + return evm_inode_set_acl(idmap, dentry, acl_name, NULL); } static inline void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, @@ -90,7 +90,7 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, } #endif -static inline int evm_inode_setattr(struct user_namespace *mnt_userns, +static inline int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return 0; @@ -101,7 +101,7 @@ static inline void evm_inode_post_setattr(struct dentry *dentry, int ia_valid) return; } -static inline int evm_inode_setxattr(struct user_namespace *mnt_userns, +static inline int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size) { @@ -116,7 +116,7 @@ static inline void evm_inode_post_setxattr(struct 
dentry *dentry, return; } -static inline int evm_inode_removexattr(struct user_namespace *mnt_userns, +static inline int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { @@ -129,21 +129,21 @@ static inline void evm_inode_post_removexattr(struct dentry *dentry, return; } -static inline void evm_inode_post_remove_acl(struct user_namespace *mnt_userns, +static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return; } -static inline int evm_inode_set_acl(struct user_namespace *mnt_userns, +static inline int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return 0; } -static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fe848901fcc3..9f4d4bcbf251 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,7 +213,6 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); - u64 (*fetch_iversion)(struct inode *); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 8ad743def6f3..4f1c4f603118 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -122,6 +122,11 @@ #define ALL_FANOTIFY_EVENT_BITS (FANOTIFY_OUTGOING_EVENTS | \ FANOTIFY_EVENT_FLAGS) +/* These masks check for invalid bits in permission responses. 
*/ +#define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) +#define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) +#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) + /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS #undef FAN_ALL_INIT_FLAGS diff --git a/include/linux/fb.h b/include/linux/fb.h index 96b96323e9cb..73eb1f85ea8e 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -662,6 +662,7 @@ extern int fb_deferred_io_init(struct fb_info *info); extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file); +extern void fb_deferred_io_release(struct fb_info *info); extern void fb_deferred_io_cleanup(struct fb_info *info); extern int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h index 9e37e063ac69..47c05a9851d0 100644 --- a/include/linux/fileattr.h +++ b/include/linux/fileattr.h @@ -53,7 +53,7 @@ static inline bool fileattr_has_fsx(const struct fileattr *fa) } int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); #endif /* _LINUX_FILEATTR_H */ diff --git a/include/linux/filelock.h b/include/linux/filelock.h new file mode 100644 index 000000000000..efcdd1631d9b --- /dev/null +++ b/include/linux/filelock.h @@ -0,0 +1,439 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FILELOCK_H +#define _LINUX_FILELOCK_H + +#include <linux/fs.h> + +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_DELEG 4 /* NFSv4 delegation */ +#define FL_ACCESS 8 /* not trying to lock, just looking */ +#define FL_EXISTS 16 /* when unlocking, test for existence */ +#define FL_LEASE 32 /* lease held on this file */ +#define FL_CLOSE 64 /* unlock on close */ +#define FL_SLEEP 128 /* A blocking lock 
*/ +#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ +#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ +#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ +#define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ + +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + +/* + * Special return value from posix_lock_file() and vfs_lock_file() for + * asynchronous locking. + */ +#define FILE_LOCK_DEFERRED 1 + +struct file_lock; + +struct file_lock_operations { + void (*fl_copy_lock)(struct file_lock *, struct file_lock *); + void (*fl_release_private)(struct file_lock *); +}; + +struct lock_manager_operations { + void *lm_mod_owner; + fl_owner_t (*lm_get_owner)(fl_owner_t); + void (*lm_put_owner)(fl_owner_t); + void (*lm_notify)(struct file_lock *); /* unblock callback */ + int (*lm_grant)(struct file_lock *, int); + bool (*lm_break)(struct file_lock *); + int (*lm_change)(struct file_lock *, int, struct list_head *); + void (*lm_setup)(struct file_lock *, void **); + bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *cfl); + void (*lm_expire_lock)(void); +}; + +struct lock_manager { + struct list_head list; + /* + * NFSv4 and up also want opens blocked during the grace period; + * NLM doesn't care: + */ + bool block_opens; +}; + +struct net; +void locks_start_grace(struct net *, struct lock_manager *); +void locks_end_grace(struct lock_manager *); +bool locks_in_grace(struct net *); +bool opens_in_grace(struct net *); + +/* + * struct file_lock has a union that some filesystems use to track + * their own private info. The NFS side of things is defined here: + */ +#include <linux/nfs_fs_i.h> + +/* + * struct file_lock represents a generic "file lock". It's used to represent + * POSIX byte range locks, BSD (flock) locks, and leases. 
It's important to + * note that the same struct is used to represent both a request for a lock and + * the lock itself, but the same object is never used for both. + * + * FIXME: should we create a separate "struct lock_request" to help distinguish + * these two uses? + * + * The varous i_flctx lists are ordered by: + * + * 1) lock owner + * 2) lock range start + * 3) lock range end + * + * Obviously, the last two criteria only matter for POSIX locks. + */ +struct file_lock { + struct file_lock *fl_blocker; /* The lock, that is blocking us */ + struct list_head fl_list; /* link into file_lock_context */ + struct hlist_node fl_link; /* node in global lists */ + struct list_head fl_blocked_requests; /* list of requests with + * ->fl_blocker pointing here + */ + struct list_head fl_blocked_member; /* node in + * ->fl_blocker->fl_blocked_requests + */ + fl_owner_t fl_owner; + unsigned int fl_flags; + unsigned char fl_type; + unsigned int fl_pid; + int fl_link_cpu; /* what cpu's list is this on? 
*/ + wait_queue_head_t fl_wait; + struct file *fl_file; + loff_t fl_start; + loff_t fl_end; + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + /* for lease breaks: */ + unsigned long fl_break_time; + unsigned long fl_downgrade_time; + + const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ + const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ + union { + struct nfs_lock_info nfs_fl; + struct nfs4_lock_info nfs4_fl; + struct { + struct list_head link; /* link in AFS vnode's pending_locks list */ + int state; /* state of grant or error if -ve */ + unsigned int debug_id; + } afs; + struct { + struct inode *inode; + } ceph; + } fl_u; +} __randomize_layout; + +struct file_lock_context { + spinlock_t flc_lock; + struct list_head flc_flock; + struct list_head flc_posix; + struct list_head flc_lease; +}; + +#ifdef CONFIG_FILE_LOCKING +int fcntl_getlk(struct file *, unsigned int, struct flock *); +int fcntl_setlk(unsigned int, struct file *, unsigned int, + struct flock *); + +#if BITS_PER_LONG == 32 +int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); +int fcntl_setlk64(unsigned int, struct file *, unsigned int, + struct flock64 *); +#endif + +int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +int fcntl_getlease(struct file *filp); + +/* fs/locks.c */ +void locks_free_lock_context(struct inode *inode); +void locks_free_lock(struct file_lock *fl); +void locks_init_lock(struct file_lock *); +struct file_lock * locks_alloc_lock(void); +void locks_copy_lock(struct file_lock *, struct file_lock *); +void locks_copy_conflock(struct file_lock *, struct file_lock *); +void locks_remove_posix(struct file *, fl_owner_t); +void locks_remove_file(struct file *); +void locks_release_private(struct file_lock *); +void posix_test_lock(struct file *, struct file_lock *); +int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); +int locks_delete_block(struct 
file_lock *); +int vfs_test_lock(struct file *, struct file_lock *); +int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); +int vfs_cancel_lock(struct file *filp, struct file_lock *fl); +bool vfs_inode_has_locks(struct inode *inode); +int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); +int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); +void lease_get_mtime(struct inode *, struct timespec64 *time); +int generic_setlease(struct file *, long, struct file_lock **, void **priv); +int vfs_setlease(struct file *, long, struct file_lock **, void **); +int lease_modify(struct file_lock *, int, struct list_head *); + +struct notifier_block; +int lease_register_notifier(struct notifier_block *); +void lease_unregister_notifier(struct notifier_block *); + +struct files_struct; +void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files); +bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner); + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return smp_load_acquire(&inode->i_flctx); +} + +#else /* !CONFIG_FILE_LOCKING */ +static inline int fcntl_getlk(struct file *file, unsigned int cmd, + struct flock __user *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk(unsigned int fd, struct file *file, + unsigned int cmd, struct flock __user *user) +{ + return -EACCES; +} + +#if BITS_PER_LONG == 32 +static inline int fcntl_getlk64(struct file *file, unsigned int cmd, + struct flock64 *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk64(unsigned int fd, struct file *file, + unsigned int cmd, struct flock64 *user) +{ + return -EACCES; +} +#endif +static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + return -EINVAL; +} + +static inline int fcntl_getlease(struct file *filp) +{ + return F_UNLCK; +} + +static inline void +locks_free_lock_context(struct 
inode *inode) +{ +} + +static inline void locks_init_lock(struct file_lock *fl) +{ + return; +} + +static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + return; +} + +static inline void locks_remove_file(struct file *filp) +{ + return; +} + +static inline void posix_test_lock(struct file *filp, struct file_lock *fl) +{ + return; +} + +static inline int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return -ENOLCK; +} + +static inline int locks_delete_block(struct file_lock *waiter) +{ + return -ENOENT; +} + +static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline int vfs_lock_file(struct file *filp, unsigned int cmd, + struct file_lock *fl, struct file_lock *conf) +{ + return -ENOLCK; +} + +static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline bool vfs_inode_has_locks(struct inode *inode) +{ + return false; +} + +static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) +{ + return -ENOLCK; +} + +static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +{ + return 0; +} + +static inline void lease_get_mtime(struct inode *inode, + struct timespec64 *time) +{ + return; +} + +static inline int generic_setlease(struct file *filp, long arg, + struct file_lock **flp, void **priv) +{ + return -EINVAL; +} + +static inline int vfs_setlease(struct file *filp, long arg, + struct file_lock **lease, void **priv) +{ + return -EINVAL; +} + +static inline int lease_modify(struct file_lock *fl, int arg, + struct list_head *dispose) +{ + return -EINVAL; +} + +struct files_struct; +static inline void show_fd_locks(struct seq_file *f, + struct file 
*filp, struct files_struct *files) {} +static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + return false; +} + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return NULL; +} + +#endif /* !CONFIG_FILE_LOCKING */ + +static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + return locks_lock_inode_wait(file_inode(filp), fl); +} + +#ifdef CONFIG_FILE_LOCKING +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + /* + * Since this check is lockless, we must ensure that any refcounts + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. + */ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, mode, FL_LEASE); + return 0; +} + +static inline int break_deleg(struct inode *inode, unsigned int mode) +{ + /* + * Since this check is lockless, we must ensure that any refcounts + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. 
+ */ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, mode, FL_DELEG); + return 0; +} + +static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +{ + int ret; + + ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); + if (ret == -EWOULDBLOCK && delegated_inode) { + *delegated_inode = inode; + ihold(inode); + } + return ret; +} + +static inline int break_deleg_wait(struct inode **delegated_inode) +{ + int ret; + + ret = break_deleg(*delegated_inode, O_WRONLY); + iput(*delegated_inode); + *delegated_inode = NULL; + return ret; +} + +static inline int break_layout(struct inode *inode, bool wait) +{ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, + wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, + FL_LAYOUT); + return 0; +} + +#else /* !CONFIG_FILE_LOCKING */ +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + return 0; +} + +static inline int break_deleg(struct inode *inode, unsigned int mode) +{ + return 0; +} + +static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +{ + return 0; +} + +static inline int break_deleg_wait(struct inode **delegated_inode) +{ + BUG(); + return 0; +} + +static inline int break_layout(struct inode *inode, bool wait) +{ + return 0; +} + +#endif /* CONFIG_FILE_LOCKING */ + +#endif /* _LINUX_FILELOCK_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..2acc46fb5f97 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1003,135 +1003,11 @@ static inline struct file *get_file(struct file *f) #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif -#define FL_POSIX 1 -#define FL_FLOCK 2 -#define FL_DELEG 4 /* NFSv4 delegation */ -#define FL_ACCESS 8 /* not trying to lock, just looking */ -#define FL_EXISTS 16 /* when unlocking, test for existence */ -#define FL_LEASE 32 /* lease held on this file */ -#define 
FL_CLOSE 64 /* unlock on close */ -#define FL_SLEEP 128 /* A blocking lock */ -#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ -#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ -#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ -#define FL_LAYOUT 2048 /* outstanding pNFS layout */ -#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ - -#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) - -/* - * Special return value from posix_lock_file() and vfs_lock_file() for - * asynchronous locking. - */ -#define FILE_LOCK_DEFERRED 1 - /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; -struct file_lock_operations { - void (*fl_copy_lock)(struct file_lock *, struct file_lock *); - void (*fl_release_private)(struct file_lock *); -}; - -struct lock_manager_operations { - void *lm_mod_owner; - fl_owner_t (*lm_get_owner)(fl_owner_t); - void (*lm_put_owner)(fl_owner_t); - void (*lm_notify)(struct file_lock *); /* unblock callback */ - int (*lm_grant)(struct file_lock *, int); - bool (*lm_break)(struct file_lock *); - int (*lm_change)(struct file_lock *, int, struct list_head *); - void (*lm_setup)(struct file_lock *, void **); - bool (*lm_breaker_owns_lease)(struct file_lock *); - bool (*lm_lock_expirable)(struct file_lock *cfl); - void (*lm_expire_lock)(void); -}; - -struct lock_manager { - struct list_head list; - /* - * NFSv4 and up also want opens blocked during the grace period; - * NLM doesn't care: - */ - bool block_opens; -}; - -struct net; -void locks_start_grace(struct net *, struct lock_manager *); -void locks_end_grace(struct lock_manager *); -bool locks_in_grace(struct net *); -bool opens_in_grace(struct net *); - -/* that will die - we need it for nfs_lock_info */ -#include <linux/nfs_fs_i.h> - -/* - * struct file_lock represents a generic "file lock". It's used to represent - * POSIX byte range locks, BSD (flock) locks, and leases. 
It's important to - * note that the same struct is used to represent both a request for a lock and - * the lock itself, but the same object is never used for both. - * - * FIXME: should we create a separate "struct lock_request" to help distinguish - * these two uses? - * - * The varous i_flctx lists are ordered by: - * - * 1) lock owner - * 2) lock range start - * 3) lock range end - * - * Obviously, the last two criteria only matter for POSIX locks. - */ -struct file_lock { - struct file_lock *fl_blocker; /* The lock, that is blocking us */ - struct list_head fl_list; /* link into file_lock_context */ - struct hlist_node fl_link; /* node in global lists */ - struct list_head fl_blocked_requests; /* list of requests with - * ->fl_blocker pointing here - */ - struct list_head fl_blocked_member; /* node in - * ->fl_blocker->fl_blocked_requests - */ - fl_owner_t fl_owner; - unsigned int fl_flags; - unsigned char fl_type; - unsigned int fl_pid; - int fl_link_cpu; /* what cpu's list is this on? 
*/ - wait_queue_head_t fl_wait; - struct file *fl_file; - loff_t fl_start; - loff_t fl_end; - - struct fasync_struct * fl_fasync; /* for lease break notifications */ - /* for lease breaks: */ - unsigned long fl_break_time; - unsigned long fl_downgrade_time; - - const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ - const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ - union { - struct nfs_lock_info nfs_fl; - struct nfs4_lock_info nfs4_fl; - struct { - struct list_head link; /* link in AFS vnode's pending_locks list */ - int state; /* state of grant or error if -ve */ - unsigned int debug_id; - } afs; - struct { - struct inode *inode; - } ceph; - } fl_u; -} __randomize_layout; - -struct file_lock_context { - spinlock_t flc_lock; - struct list_head flc_flock; - struct list_head flc_posix; - struct list_head flc_lease; -}; - /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) @@ -1140,216 +1016,6 @@ struct file_lock_context { extern void send_sigio(struct fown_struct *fown, int fd, int band); -#define locks_inode(f) file_inode(f) - -#ifdef CONFIG_FILE_LOCKING -extern int fcntl_getlk(struct file *, unsigned int, struct flock *); -extern int fcntl_setlk(unsigned int, struct file *, unsigned int, - struct flock *); - -#if BITS_PER_LONG == 32 -extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); -extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, - struct flock64 *); -#endif - -extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); -extern int fcntl_getlease(struct file *filp); - -/* fs/locks.c */ -void locks_free_lock_context(struct inode *inode); -void locks_free_lock(struct file_lock *fl); -extern void locks_init_lock(struct file_lock *); -extern struct file_lock * locks_alloc_lock(void); -extern void locks_copy_lock(struct file_lock *, struct file_lock *); -extern void 
locks_copy_conflock(struct file_lock *, struct file_lock *); -extern void locks_remove_posix(struct file *, fl_owner_t); -extern void locks_remove_file(struct file *); -extern void locks_release_private(struct file_lock *); -extern void posix_test_lock(struct file *, struct file_lock *); -extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); -extern int locks_delete_block(struct file_lock *); -extern int vfs_test_lock(struct file *, struct file_lock *); -extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); -extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); -bool vfs_inode_has_locks(struct inode *inode); -extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); -extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); -extern void lease_get_mtime(struct inode *, struct timespec64 *time); -extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); -extern int vfs_setlease(struct file *, long, struct file_lock **, void **); -extern int lease_modify(struct file_lock *, int, struct list_head *); - -struct notifier_block; -extern int lease_register_notifier(struct notifier_block *); -extern void lease_unregister_notifier(struct notifier_block *); - -struct files_struct; -extern void show_fd_locks(struct seq_file *f, - struct file *filp, struct files_struct *files); -extern bool locks_owner_has_blockers(struct file_lock_context *flctx, - fl_owner_t owner); - -static inline struct file_lock_context * -locks_inode_context(const struct inode *inode) -{ - return smp_load_acquire(&inode->i_flctx); -} - -#else /* !CONFIG_FILE_LOCKING */ -static inline int fcntl_getlk(struct file *file, unsigned int cmd, - struct flock __user *user) -{ - return -EINVAL; -} - -static inline int fcntl_setlk(unsigned int fd, struct file *file, - unsigned int cmd, struct flock __user *user) -{ - return -EACCES; -} - -#if BITS_PER_LONG == 
32 -static inline int fcntl_getlk64(struct file *file, unsigned int cmd, - struct flock64 *user) -{ - return -EINVAL; -} - -static inline int fcntl_setlk64(unsigned int fd, struct file *file, - unsigned int cmd, struct flock64 *user) -{ - return -EACCES; -} -#endif -static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) -{ - return -EINVAL; -} - -static inline int fcntl_getlease(struct file *filp) -{ - return F_UNLCK; -} - -static inline void -locks_free_lock_context(struct inode *inode) -{ -} - -static inline void locks_init_lock(struct file_lock *fl) -{ - return; -} - -static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) -{ - return; -} - -static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - return; -} - -static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) -{ - return; -} - -static inline void locks_remove_file(struct file *filp) -{ - return; -} - -static inline void posix_test_lock(struct file *filp, struct file_lock *fl) -{ - return; -} - -static inline int posix_lock_file(struct file *filp, struct file_lock *fl, - struct file_lock *conflock) -{ - return -ENOLCK; -} - -static inline int locks_delete_block(struct file_lock *waiter) -{ - return -ENOENT; -} - -static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) -{ - return 0; -} - -static inline int vfs_lock_file(struct file *filp, unsigned int cmd, - struct file_lock *fl, struct file_lock *conf) -{ - return -ENOLCK; -} - -static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) -{ - return 0; -} - -static inline bool vfs_inode_has_locks(struct inode *inode) -{ - return false; -} - -static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) -{ - return -ENOLCK; -} - -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) -{ - return 0; -} - -static inline void lease_get_mtime(struct inode *inode, - struct 
timespec64 *time) -{ - return; -} - -static inline int generic_setlease(struct file *filp, long arg, - struct file_lock **flp, void **priv) -{ - return -EINVAL; -} - -static inline int vfs_setlease(struct file *filp, long arg, - struct file_lock **lease, void **priv) -{ - return -EINVAL; -} - -static inline int lease_modify(struct file_lock *fl, int arg, - struct list_head *dispose) -{ - return -EINVAL; -} - -struct files_struct; -static inline void show_fd_locks(struct seq_file *f, - struct file *filp, struct files_struct *files) {} -static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, - fl_owner_t owner) -{ - return false; -} - -static inline struct file_lock_context * -locks_inode_context(const struct inode *inode) -{ - return NULL; -} - -#endif /* !CONFIG_FILE_LOCKING */ - static inline struct inode *file_inode(const struct file *f) { return f->f_inode; @@ -1360,11 +1026,6 @@ static inline struct dentry *file_dentry(const struct file *file) return d_real(file->f_path.dentry, file_inode(file)); } -static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) -{ - return locks_lock_inode_wait(locks_inode(filp), fl); -} - struct fasync_struct { rwlock_t fa_lock; int magic; @@ -1635,22 +1296,22 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) } /** - * i_uid_into_vfsuid - map an inode's i_uid down into a mnt_userns - * @mnt_userns: user namespace of the mount the inode was found from + * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping + * @idmap: idmap of the mount the inode was found from * @inode: inode to map * - * Return: whe inode's i_uid mapped down according to @mnt_userns. + * Return: whe inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. 
*/ -static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns, +static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { - return make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid); + return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * @@ -1659,50 +1320,50 @@ static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns, * * Return: true if @inode's i_uid field needs to be updated, false if not. */ -static inline bool i_uid_needs_update(struct user_namespace *mnt_userns, +static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, - i_uid_into_vfsuid(mnt_userns, inode))); + i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. 
*/ -static inline void i_uid_update(struct user_namespace *mnt_userns, +static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) - inode->i_uid = from_vfsuid(mnt_userns, i_user_ns(inode), + inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** - * i_gid_into_vfsgid - map an inode's i_gid down into a mnt_userns - * @mnt_userns: user namespace of the mount the inode was found from + * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping + * @idmap: idmap of the mount the inode was found from * @inode: inode to map * - * Return: the inode's i_gid mapped down according to @mnt_userns. + * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ -static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns, +static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { - return make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid); + return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * @@ -1711,83 +1372,83 @@ static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns, * * Return: true if @inode's i_gid field needs to be updated, false if not. 
*/ -static inline bool i_gid_needs_update(struct user_namespace *mnt_userns, +static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, - i_gid_into_vfsgid(mnt_userns, inode))); + i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ -static inline void i_gid_update(struct user_namespace *mnt_userns, +static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) - inode->i_gid = from_vfsgid(mnt_userns, i_user_ns(inode), + inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via - * an idmapped mount map the caller's fsuid according to @mnt_users. + * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, - struct user_namespace *mnt_userns) + struct mnt_idmap *idmap) { - inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode)); + inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with callers fsgid * @inode: inode to initialize - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. 
If the inode was found/created via - * an idmapped mount map the caller's fsgid according to @mnt_users. + * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, - struct user_namespace *mnt_userns) + struct mnt_idmap *idmap) { - inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode)); + inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in - * @mnt_userns: user namespace of the relevant mount + * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map - * the caller's fsuid and fsgid according to the @mnt_userns first. + * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, - struct user_namespace *mnt_userns) + struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; - kuid = mapped_fsuid(mnt_userns, fs_userns); + kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; - kgid = mapped_fsgid(mnt_userns, fs_userns); + kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && @@ -1941,42 +1602,42 @@ static inline bool sb_start_intwrite_trylock(struct super_block *sb) return __sb_start_write_trylock(sb, SB_FREEZE_FS); } -bool inode_owner_or_capable(struct user_namespace *mnt_userns, +bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. 
*/ -int vfs_create(struct user_namespace *, struct inode *, +int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int vfs_mkdir(struct user_namespace *, struct inode *, +int vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); -int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *, +int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -int vfs_symlink(struct user_namespace *, struct inode *, +int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); -int vfs_link(struct dentry *, struct user_namespace *, struct inode *, +int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); -int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *); -int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *, +int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); +int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); /** * struct renamedata - contains all information required for renaming - * @old_mnt_userns: old user namespace of the mount the inode was found from + * @old_mnt_idmap: idmap of the old mount the inode was found from * @old_dir: parent of source * @old_dentry: source - * @new_mnt_userns: new user namespace of the mount the inode was found from + * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { - struct user_namespace *old_mnt_userns; + struct mnt_idmap *old_mnt_idmap; struct inode *old_dir; struct dentry *old_dentry; - struct user_namespace *new_mnt_userns; + struct mnt_idmap *new_mnt_idmap; struct inode *new_dir; struct dentry *new_dentry; struct inode **delegated_inode; @@ -1985,14 +1646,14 @@ struct renamedata { int vfs_rename(struct 
renamedata *); -static inline int vfs_whiteout(struct user_namespace *mnt_userns, +static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { - return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE, + return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } -struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, +struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); @@ -2016,10 +1677,10 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, /* * VFS file helper functions. */ -void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, +void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -umode_t mode_strip_sgid(struct user_namespace *mnt_userns, +umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); /* @@ -2137,27 +1798,26 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); - int (*permission) (struct user_namespace *, struct inode *, int); + int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); - int (*create) (struct user_namespace *, struct inode *,struct dentry *, + int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct user_namespace *, struct inode *,struct dentry *, + int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); - int (*mkdir) (struct 
user_namespace *, struct inode *,struct dentry *, + int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct user_namespace *, struct inode *,struct dentry *, + int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); - int (*rename) (struct user_namespace *, struct inode *, struct dentry *, + int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); - int (*setattr) (struct user_namespace *, struct dentry *, - struct iattr *); - int (*getattr) (struct user_namespace *, const struct path *, + int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); + int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, @@ -2166,13 +1826,13 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, + int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); - struct posix_acl *(*get_acl)(struct user_namespace *, struct dentry *, + struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); - int (*set_acl)(struct user_namespace *, struct dentry *, + int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); - int (*fileattr_set)(struct user_namespace *mnt_userns, + int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); } ____cacheline_aligned; @@ -2326,11 +1986,11 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) -static inline bool 
HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, +static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { - return !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)); + return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) @@ -2624,96 +2284,6 @@ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) -#ifdef CONFIG_FILE_LOCKING -static inline int break_lease(struct inode *inode, unsigned int mode) -{ - /* - * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking i_flctx->flc_lease. Otherwise, we - * could end up racing with tasks trying to set a new lease on this - * file. - */ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, mode, FL_LEASE); - return 0; -} - -static inline int break_deleg(struct inode *inode, unsigned int mode) -{ - /* - * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking i_flctx->flc_lease. Otherwise, we - * could end up racing with tasks trying to set a new lease on this - * file. 
- */ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, mode, FL_DELEG); - return 0; -} - -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) -{ - int ret; - - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); - if (ret == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = inode; - ihold(inode); - } - return ret; -} - -static inline int break_deleg_wait(struct inode **delegated_inode) -{ - int ret; - - ret = break_deleg(*delegated_inode, O_WRONLY); - iput(*delegated_inode); - *delegated_inode = NULL; - return ret; -} - -static inline int break_layout(struct inode *inode, bool wait) -{ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, - wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, - FL_LAYOUT); - return 0; -} - -#else /* !CONFIG_FILE_LOCKING */ -static inline int break_lease(struct inode *inode, unsigned int mode) -{ - return 0; -} - -static inline int break_deleg(struct inode *inode, unsigned int mode) -{ - return 0; -} - -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) -{ - return 0; -} - -static inline int break_deleg_wait(struct inode **delegated_inode) -{ - BUG(); - return 0; -} - -static inline int break_layout(struct inode *inode, bool wait) -{ - return 0; -} - -#endif /* CONFIG_FILE_LOCKING */ - /* fs/open.c */ struct audit_names; struct filename { @@ -2725,11 +2295,6 @@ struct filename { }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); -static inline struct user_namespace *file_mnt_user_ns(struct file *file) -{ - return mnt_user_ns(file->f_path.mnt); -} - static inline struct mnt_idmap *file_mnt_idmap(struct file *file) { return mnt_idmap(file->f_path.mnt); @@ -2749,7 +2314,7 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) } extern long vfs_truncate(const struct path *, loff_t); -int do_truncate(struct user_namespace 
*, struct dentry *, loff_t start, +int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); @@ -2904,21 +2469,21 @@ static inline int bmap(struct inode *inode, sector_t *block) } #endif -int notify_change(struct user_namespace *, struct dentry *, +int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct inode **); -int inode_permission(struct user_namespace *, struct inode *, int); -int generic_permission(struct user_namespace *, struct inode *, int); +int inode_permission(struct mnt_idmap *, struct inode *, int); +int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { - return inode_permission(file_mnt_user_ns(file), + return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { - return inode_permission(mnt_user_ns(path->mnt), + return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } -int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) @@ -3106,7 +2671,7 @@ extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); -extern int setattr_should_drop_suidgid(struct user_namespace *, struct inode *); +extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs(struct file *); /* @@ -3265,7 +2830,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern 
void kfree_link(void *); -void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *); +void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -3316,9 +2881,9 @@ extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); -extern int simple_setattr(struct user_namespace *, struct dentry *, +extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); -extern int simple_getattr(struct user_namespace *, const struct path *, +extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); @@ -3327,7 +2892,7 @@ extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); -extern int simple_rename(struct user_namespace *, struct inode *, +extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, @@ -3369,11 +2934,11 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); -int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, +int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); -int setattr_prepare(struct 
user_namespace *, struct dentry *, struct iattr *); +int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); -void setattr_copy(struct user_namespace *, struct inode *inode, +void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); @@ -3540,13 +3105,13 @@ static inline bool is_sxid(umode_t mode) return mode & (S_ISUID | S_ISGID); } -static inline int check_sticky(struct user_namespace *mnt_userns, +static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; - return __check_sticky(mnt_userns, dir, inode); + return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 4f5f8a651213..e0a49c3125eb 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -257,8 +257,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); -int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs); +int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, + size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); @@ -309,8 +309,6 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); -int fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int 
fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); @@ -422,9 +420,8 @@ static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, return -EOPNOTSUPP; } -static inline int fscrypt_decrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs) +static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs) { return -EOPNOTSUPP; } @@ -530,13 +527,6 @@ static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) return -EOPNOTSUPP; } -static inline int -fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy) -{ - return 0; -} - static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 40f14e5fed9d..119a3266791f 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -12,6 +12,7 @@ #define _LINUX_FSVERITY_H #include <linux/fs.h> +#include <linux/mm.h> #include <crypto/hash_info.h> #include <crypto/sha2.h> #include <uapi/linux/fsverity.h> @@ -93,8 +94,7 @@ struct fsverity_operations { * isn't already cached. Implementations may ignore this * argument; it's only a performance optimization. * - * This can be called at any time on an open verity file, as well as - * between ->begin_enable_verity() and ->end_enable_verity(). It may be + * This can be called at any time on an open verity file. It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. @@ -109,9 +109,9 @@ struct fsverity_operations { * Write a Merkle tree block to the given inode. 
* * @inode: the inode for which the Merkle tree is being built - * @buf: block to write - * @index: 0-based index of the block within the Merkle tree - * @log_blocksize: log base 2 of the Merkle tree block size + * @buf: the Merkle tree block to write + * @pos: the position of the block in the Merkle tree (in bytes) + * @size: the Merkle tree block size (in bytes) * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). @@ -119,7 +119,7 @@ struct fsverity_operations { * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, - u64 index, int log_blocksize); + u64 pos, unsigned int size); }; #ifdef CONFIG_FS_VERITY @@ -148,9 +148,21 @@ int fsverity_get_digest(struct inode *inode, /* open.c */ -int fsverity_file_open(struct inode *inode, struct file *filp); -int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); -void fsverity_cleanup_inode(struct inode *inode); +int __fsverity_file_open(struct inode *inode, struct file *filp); +int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); +void __fsverity_cleanup_inode(struct inode *inode); + +/** + * fsverity_cleanup_inode() - free the inode's verity info, if present + * @inode: an inode being evicted + * + * Filesystems must call this on inode eviction to free ->i_verity_info. 
+ */ +static inline void fsverity_cleanup_inode(struct inode *inode) +{ + if (inode->i_verity_info) + __fsverity_cleanup_inode(inode); +} /* read_metadata.c */ @@ -158,7 +170,7 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); /* verify.c */ -bool fsverity_verify_page(struct page *page); +bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); @@ -193,15 +205,15 @@ static inline int fsverity_get_digest(struct inode *inode, /* open.c */ -static inline int fsverity_file_open(struct inode *inode, struct file *filp) +static inline int __fsverity_file_open(struct inode *inode, struct file *filp) { - return IS_VERITY(inode) ? -EOPNOTSUPP : 0; + return -EOPNOTSUPP; } -static inline int fsverity_prepare_setattr(struct dentry *dentry, - struct iattr *attr) +static inline int __fsverity_prepare_setattr(struct dentry *dentry, + struct iattr *attr) { - return IS_VERITY(d_inode(dentry)) ? -EOPNOTSUPP : 0; + return -EOPNOTSUPP; } static inline void fsverity_cleanup_inode(struct inode *inode) @@ -218,7 +230,8 @@ static inline int fsverity_ioctl_read_metadata(struct file *filp, /* verify.c */ -static inline bool fsverity_verify_page(struct page *page) +static inline bool fsverity_verify_blocks(struct folio *folio, size_t len, + size_t offset) { WARN_ON(1); return false; @@ -236,6 +249,16 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) #endif /* !CONFIG_FS_VERITY */ +static inline bool fsverity_verify_folio(struct folio *folio) +{ + return fsverity_verify_blocks(folio, folio_size(folio), 0); +} + +static inline bool fsverity_verify_page(struct page *page) +{ + return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0); +} + /** * fsverity_active() - do reads from the inode need to go through fs-verity? 
* @inode: inode to check @@ -254,4 +277,42 @@ static inline bool fsverity_active(const struct inode *inode) return fsverity_get_info(inode) != NULL; } +/** + * fsverity_file_open() - prepare to open a verity file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * When opening a verity file, deny the open if it is for writing. Otherwise, + * set up the inode's ->i_verity_info if not already done. + * + * When combined with fscrypt, this must be called after fscrypt_file_open(). + * Otherwise, we won't have the key set up to decrypt the verity metadata. + * + * Return: 0 on success, -errno on failure + */ +static inline int fsverity_file_open(struct inode *inode, struct file *filp) +{ + if (IS_VERITY(inode)) + return __fsverity_file_open(inode, filp); + return 0; +} + +/** + * fsverity_prepare_setattr() - prepare to change a verity inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Verity files are immutable, so deny truncates. This isn't covered by the + * open-time check because sys_truncate() takes a path, not a file descriptor. 
+ * + * Return: 0 on success, -errno on failure + */ +static inline int fsverity_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + if (IS_VERITY(d_inode(dentry))) + return __fsverity_prepare_setattr(dentry, attr); + return 0; +} + #endif /* _LINUX_FSVERITY_H */ diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 034b1106d022..a3028e400a9c 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -152,7 +152,10 @@ static inline void totalhigh_pages_add(long count) static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; - return addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP); + + return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || + (addr >= __fix_to_virt(FIX_KMAP_END) && + addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ @@ -200,7 +203,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } @@ -227,7 +230,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..9ab9d3105d5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> +#include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> @@ -742,7 +743,10 @@ static inline struct hstate *hstate_sizelog(int page_size_log) if (!page_size_log) return &default_hstate; - return size_to_hstate(1UL << page_size_log); + 
if (page_size_log < BITS_PER_LONG) + return size_to_hstate(1UL << page_size_log); + + return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) @@ -1187,6 +1191,18 @@ static inline __init void hugetlb_cma_reserve(int order) } #endif +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return page_count(virt_to_page(pte)) > 1; +} +#else +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return false; +} +#endif + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE diff --git a/include/linux/ima.h b/include/linux/ima.h index 5a0b2a285a18..172b113a9864 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -18,7 +18,7 @@ struct linux_binprm; extern enum hash_algo ima_get_current_hash_algo(void); extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask); -extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns, +extern void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); @@ -30,7 +30,7 @@ extern int ima_read_file(struct file *file, enum kernel_read_file_id id, bool contents); extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); -extern void ima_post_path_mknod(struct user_namespace *mnt_userns, +extern void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); @@ -66,7 +66,7 @@ static inline int ima_file_check(struct file *file, int mask) return 0; } -static inline void ima_post_create_tmpfile(struct user_namespace *mnt_userns, +static inline void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { } @@ 
-111,7 +111,7 @@ static inline int ima_post_read_file(struct file *file, void *buf, loff_t size, return 0; } -static inline void ima_post_path_mknod(struct user_namespace *mnt_userns, +static inline void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { return; @@ -183,18 +183,18 @@ static inline void ima_post_key_create_or_update(struct key *keyring, #ifdef CONFIG_IMA_APPRAISE extern bool is_ima_appraise_enabled(void); -extern void ima_inode_post_setattr(struct user_namespace *mnt_userns, +extern void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); -extern int ima_inode_set_acl(struct user_namespace *mnt_userns, +extern int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -static inline int ima_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { - return ima_inode_set_acl(mnt_userns, dentry, acl_name, NULL); + return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name); #else @@ -203,7 +203,7 @@ static inline bool is_ima_appraise_enabled(void) return 0; } -static inline void ima_inode_post_setattr(struct user_namespace *mnt_userns, +static inline void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry) { return; @@ -217,7 +217,7 @@ static inline int ima_inode_setxattr(struct dentry *dentry, return 0; } -static inline int ima_inode_set_acl(struct user_namespace *mnt_userns, +static inline int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { @@ -231,7 +231,7 @@ static inline int ima_inode_removexattr(struct dentry *dentry, return 0; } -static inline int 
ima_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 128a67a40065..0efe4d784358 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -195,21 +195,23 @@ struct io_alloc_cache { struct io_ring_ctx { /* const or read-mostly hot data */ struct { - struct percpu_ref refs; - - struct io_rings *rings; unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; - unsigned int drain_disabled: 1; unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + unsigned int syscall_iopoll: 1; + unsigned int poll_activated: 1; + unsigned int drain_disabled: 1; + unsigned int compat: 1; + + enum task_work_notify_mode notify_method; + struct io_rings *rings; + struct task_struct *submitter_task; + struct percpu_ref refs; } ____cacheline_aligned_in_smp; /* submission data */ @@ -293,6 +295,7 @@ struct io_ring_ctx { spinlock_t completion_lock; bool poll_multi_queue; + bool cq_waiting; /* * ->iopoll_list is protected by the ctx->uring_lock for @@ -318,9 +321,8 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* Keep this last, we don't need it for the fast path */ - + struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ struct io_rsrc_node *rsrc_backup_node; @@ -357,6 +359,7 @@ struct io_ring_ctx { u32 iowq_limits[2]; bool iowq_limits_set; + struct callback_head poll_wq_task_work; struct list_head defer_list; unsigned sq_thread_idle; /* protected by ->completion_lock */ diff --git a/include/linux/iomap.h 
b/include/linux/iomap.h index 0983dfc9a203..fca43a4bd96b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,8 +58,7 @@ struct vm_fault; #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #define IOMAP_F_BUFFER_HEAD (1U << 4) -#define IOMAP_F_ZONE_APPEND (1U << 5) -#define IOMAP_F_XATTR (1U << 6) +#define IOMAP_F_XATTR (1U << 5) /* * Flags set by the core iomap code during operations: diff --git a/include/linux/iversion.h b/include/linux/iversion.h index e27bd4f55d84..f174ff1b59ee 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -9,8 +9,26 @@ * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must - * appear different to observers if there was a change to the inode's data or - * metadata since it was last queried. + * appear larger to observers if there was an explicit change to the inode's + * data or metadata since it was last queried. + * + * An explicit change is one that would ordinarily result in a change to the + * inode status change time (aka ctime). i_version must appear to change, even + * if the ctime does not (since the whole point is to avoid missing updates due + * to timestamp granularity). If POSIX or other relevant spec mandates that the + * ctime must change due to an operation, then the i_version counter must be + * incremented as well. + * + * Making the i_version update completely atomic with the operation itself would + * be prohibitively expensive. Traditionally the kernel has updated the times on + * directories after an operation that changes its contents. For regular files, + * the ctime is usually updated before the data is copied into the cache for a + * write. This means that there is a window of time when an observer can + * associate a new timestamp with old file contents. 
Since the purpose of the + * i_version is to allow for better cache coherency, the i_version must always + * be updated after the results of the operation are visible. Updating it before + * and after a change is also permitted. (Note that no filesystems currently do + * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the @@ -234,42 +252,6 @@ inode_peek_iversion(const struct inode *inode) return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } -/** - * inode_query_iversion - read i_version for later use - * @inode: inode from which i_version should be read - * - * Read the inode i_version counter. This should be used by callers that wish - * to store the returned i_version for later comparison. This will guarantee - * that a later query of the i_version will result in a different value if - * anything has changed. - * - * In this implementation, we fetch the current value, set the QUERIED flag and - * then try to swap it into place with a cmpxchg, if it wasn't already set. If - * that fails, we try again with the newly fetched value from the cmpxchg. - */ -static inline u64 -inode_query_iversion(struct inode *inode) -{ - u64 cur, new; - - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is already set, then no need to swap */ - if (cur & I_VERSION_QUERIED) { - /* - * This barrier (and the implicit barrier in the - * cmpxchg below) pairs with the barrier in - * inode_maybe_inc_iversion(). 
- */ - smp_mb(); - break; - } - - new = cur | I_VERSION_QUERIED; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return cur >> I_VERSION_QUERIED_SHIFT; -} - /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: @@ -283,6 +265,8 @@ static inline u64 time_to_chattr(struct timespec64 *t) return chattr; } +u64 inode_query_iversion(struct inode *inode); + /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check diff --git a/include/linux/key.h b/include/linux/key.h index d27477faf00d..8dc7f7c3088b 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -386,6 +386,14 @@ extern int wait_for_key_construction(struct key *key, bool intr); extern int key_validate(const struct key *key); +extern key_ref_t key_create(key_ref_t keyring, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags); + extern key_ref_t key_create_or_update(key_ref_t keyring, const char *type, const char *description, diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 70ce419e2709..2b7f067af3c4 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -312,7 +312,7 @@ static inline struct file *nlmsvc_file_file(struct nlm_file *file) static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(nlmsvc_file_file(file)); + return file_inode(nlmsvc_file_file(file)); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) @@ -372,7 +372,7 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) static inline int nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { - return locks_inode(fl1->fl_file) == locks_inode(fl2->fl_file) + return file_inode(fl1->fl_file) == file_inode(fl2->fl_file) && fl1->fl_pid == fl2->fl_pid && fl1->fl_owner == fl2->fl_owner && fl1->fl_start == 
fl2->fl_start diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 67e4a2c5500b..b60fbcd8cdfa 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -11,6 +11,7 @@ #define LOCKD_XDR_H #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/nfs.h> #include <linux/sunrpc/xdr.h> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ed6cb2ac55fa..094b76dc7164 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -136,25 +136,25 @@ LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) LSM_HOOK(int, 0, inode_getattr, const struct path *path) -LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) -LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name) -LSM_HOOK(int, 0, inode_set_acl, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) -LSM_HOOK(int, 0, inode_get_acl, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) -LSM_HOOK(int, 0, inode_remove_acl, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char 
*acl_name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) -LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap, struct dentry *dentry) -LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns, +LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0a5ba81f7367..6e156d2acffc 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -475,7 +475,7 @@ * @inode_killpriv: * The setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. - * @mnt_userns: user namespace of the mount. + * @idmap: idmap of the mount. * @dentry is the dentry being changed. * Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..85dc9b88ea37 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1666,10 +1666,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { + struct mem_cgroup *memcg; + if (mem_cgroup_disabled()) return; - if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + memcg = folio_memcg(folio); + if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76ef2e4fde38..333c1fec72f8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -573,6 +573,14 @@ struct mlx5_debugfs_entries { struct dentry *lag_debugfs; }; +enum mlx5_func_type { + MLX5_PF, + MLX5_VF, + MLX5_SF, + MLX5_HOST_PF, + MLX5_FUNC_TYPE_NUM, +}; + struct mlx5_ft_pool; struct mlx5_priv { /* IRQ table valid only for real pci devices PF or VF */ @@ -583,11 +591,10 @@ struct mlx5_priv { struct mlx5_nb pg_nb; struct workqueue_struct *pg_wq; struct xarray page_root_xa; - u32 fw_pages; atomic_t reg_pages; struct list_head free_list; - u32 vfs_pages; - u32 host_pf_pages; + u32 fw_pages; + u32 page_counters[MLX5_FUNC_TYPE_NUM]; u32 fw_pages_alloc_failed; u32 give_pages_dropped; u32 reclaim_pages_discard; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f857163ac89..bd3197748562 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -137,7 +137,7 @@ extern int mmap_rnd_compat_bits __read_mostly; * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 -/* This function must be updated when the size of struct page grows above 80 +/* This function must be updated when the size of struct page grows above 96 * or reduces below 56. 
The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, @@ -148,12 +148,18 @@ static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; - /* Check that struct page is either 56, 64, 72, or 80 bytes */ + /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); - BUILD_BUG_ON(sizeof(struct page) > 80); + BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { + case 96: + _pp[11] = 0; + fallthrough; + case 88: + _pp[10] = 0; + fallthrough; case 80: _pp[9] = 0; fallthrough; @@ -2095,8 +2101,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; -int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, - struct page **pages); struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 0ccca33a7a6d..057c89867aa2 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -113,167 +113,23 @@ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) #define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) } #define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) } -#ifdef CONFIG_MULTIUSER -/** - * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups - * @vfsgid: the mnt gid to match - * - * This function can be used to determine whether @vfsuid matches any of the - * caller's groups. - * - * Return: 1 if vfsuid matches caller's groups, 0 if not. 
- */ -static inline int vfsgid_in_group_p(vfsgid_t vfsgid) -{ - return in_group_p(AS_KGIDT(vfsgid)); -} -#else -static inline int vfsgid_in_group_p(vfsgid_t vfsgid) -{ - return 1; -} -#endif +int vfsgid_in_group_p(vfsgid_t vfsgid); -/** - * initial_idmapping - check whether this is the initial mapping - * @ns: idmapping to check - * - * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, - * [...], 1000 to 1000 [...]. - * - * Return: true if this is the initial mapping, false if not. - */ -static inline bool initial_idmapping(const struct user_namespace *ns) -{ - return ns == &init_user_ns; -} +vfsuid_t make_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kuid_t kuid); -/** - * no_idmapping - check whether we can skip remapping a kuid/gid - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * - * This function can be used to check whether a remapping between two - * idmappings is required. - * An idmapped mount is a mount that has an idmapping attached to it that - * is different from the filsystem's idmapping and the initial idmapping. - * If the initial mapping is used or the idmapping of the mount and the - * filesystem are identical no remapping is required. - * - * Return: true if remapping can be skipped, false if not. - */ -static inline bool no_idmapping(const struct user_namespace *mnt_userns, - const struct user_namespace *fs_userns) -{ - return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; -} +vfsgid_t make_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kgid_t kgid); -/** - * make_vfsuid - map a filesystem kuid into a mnt_userns - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @kuid : kuid to be mapped - * - * Take a @kuid and remap it from @fs_userns into @mnt_userns. Use this - * function when preparing a @kuid to be reported to userspace. 
- * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. - * If initial_idmapping() tells us that the filesystem is not mounted with an - * idmapping we know the value of @kuid won't change when calling - * from_kuid() so we can simply retrieve the value via __kuid_val() - * directly. - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is - * returned. - */ +kuid_t from_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsuid_t vfsuid); -static inline vfsuid_t make_vfsuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kuid_t kuid) -{ - uid_t uid; - - if (no_idmapping(mnt_userns, fs_userns)) - return VFSUIDT_INIT(kuid); - if (initial_idmapping(fs_userns)) - uid = __kuid_val(kuid); - else - uid = from_kuid(fs_userns, kuid); - if (uid == (uid_t)-1) - return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); -} - -/** - * make_vfsgid - map a filesystem kgid into a mnt_userns - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @kgid : kgid to be mapped - * - * Take a @kgid and remap it from @fs_userns into @mnt_userns. Use this - * function when preparing a @kgid to be reported to userspace. - * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. - * If initial_idmapping() tells us that the filesystem is not mounted with an - * idmapping we know the value of @kgid won't change when calling - * from_kgid() so we can simply retrieve the value via __kgid_val() - * directly. - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is - * returned. 
- */ - -static inline vfsgid_t make_vfsgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kgid_t kgid) -{ - gid_t gid; - - if (no_idmapping(mnt_userns, fs_userns)) - return VFSGIDT_INIT(kgid); - if (initial_idmapping(fs_userns)) - gid = __kgid_val(kgid); - else - gid = from_kgid(fs_userns, kgid); - if (gid == (gid_t)-1) - return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); -} - -/** - * from_vfsuid - map a vfsuid into the filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @vfsuid : vfsuid to be mapped - * - * Map @vfsuid into the filesystem idmapping. This function has to be used in - * order to e.g. write @vfsuid to inode->i_uid. - * - * Return: @vfsuid mapped into the filesystem idmapping - */ -static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - vfsuid_t vfsuid) -{ - uid_t uid; - - if (no_idmapping(mnt_userns, fs_userns)) - return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); - if (uid == (uid_t)-1) - return INVALID_UID; - if (initial_idmapping(fs_userns)) - return KUIDT_INIT(uid); - return make_kuid(fs_userns, uid); -} +kgid_t from_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsgid_t vfsgid); /** * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem - * @mnt_userns: the mount's idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid: vfsuid to be mapped * @@ -283,11 +139,11 @@ static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns, * * Return: true if @vfsuid has a mapping in the filesystem, false if not. 
*/ -static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns, +static inline bool vfsuid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { - return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid)); + return uid_valid(from_vfsuid(idmap, fs_userns, vfsuid)); } static inline bool vfsuid_has_mapping(struct user_namespace *userns, @@ -310,35 +166,8 @@ static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid) } /** - * from_vfsgid - map a vfsgid into the filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @vfsgid : vfsgid to be mapped - * - * Map @vfsgid into the filesystem idmapping. This function has to be used in - * order to e.g. write @vfsgid to inode->i_gid. - * - * Return: @vfsgid mapped into the filesystem idmapping - */ -static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - vfsgid_t vfsgid) -{ - gid_t gid; - - if (no_idmapping(mnt_userns, fs_userns)) - return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); - if (gid == (gid_t)-1) - return INVALID_GID; - if (initial_idmapping(fs_userns)) - return KGIDT_INIT(gid); - return make_kgid(fs_userns, gid); -} - -/** * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem - * @mnt_userns: the mount's idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid: vfsgid to be mapped * @@ -348,11 +177,11 @@ static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns, * * Return: true if @vfsgid has a mapping in the filesystem, false if not. 
*/ -static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns, +static inline bool vfsgid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { - return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid)); + return gid_valid(from_vfsgid(idmap, fs_userns, vfsgid)); } static inline bool vfsgid_has_mapping(struct user_namespace *userns, @@ -375,8 +204,8 @@ static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) } /** - * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns - * @mnt_userns: the mount's idmapping + * mapped_fsuid - return caller's fsuid mapped according to an idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on @@ -385,18 +214,17 @@ static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) * O_CREAT. Other examples include the allocation of quotas for a specific * user. * - * Return: the caller's current fsuid mapped up according to @mnt_userns. + * Return: the caller's current fsuid mapped up according to @idmap. */ -static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, +static inline kuid_t mapped_fsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { - return from_vfsuid(mnt_userns, fs_userns, - VFSUIDT_INIT(current_fsuid())); + return from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(current_fsuid())); } /** - * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns - * @mnt_userns: the mount's idmapping + * mapped_fsgid - return caller's fsgid mapped according to an idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on @@ -405,13 +233,15 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, * O_CREAT. Other examples include the allocation of quotas for a specific * user. 
* - * Return: the caller's current fsgid mapped up according to @mnt_userns. + * Return: the caller's current fsgid mapped up according to @idmap. */ -static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns, +static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { - return from_vfsgid(mnt_userns, fs_userns, - VFSGIDT_INIT(current_fsgid())); + return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } +bool check_fsmapping(const struct mnt_idmap *idmap, + const struct super_block *sb); + #endif /* _LINUX_MNT_IDMAPPING_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 62475996fac6..52f452b2259a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -74,8 +74,6 @@ struct vfsmount { struct mnt_idmap *mnt_idmap; } __randomize_layout; -struct user_namespace *mnt_user_ns(const struct vfsmount *mnt); -struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap); static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). 
*/ diff --git a/include/linux/namei.h b/include/linux/namei.h index 00fee52df842..0d4531fd46e7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -68,11 +68,11 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); -struct dentry *lookup_one(struct user_namespace *, const char *, struct dentry *, int); -struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); -struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aad12a179e54..e6e02184c25a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2839,8 +2839,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d92fdfd2444c..d6c119e31d7a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -392,11 +392,11 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode 
*inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); -extern int nfs_getattr(struct user_namespace *, const struct path *, +extern int nfs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); -extern int nfs_permission(struct user_namespace *, struct inode *, int); +extern int nfs_permission(struct mnt_idmap *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); @@ -405,7 +405,7 @@ extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); -extern int nfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern int nfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 50caa117cb62..bb15c9234e21 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -70,7 +70,6 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. 
- * @wp-gpio: Write protect pin * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem<id>" name will be assigned to the device if @@ -85,7 +84,6 @@ struct nvmem_config { const char *name; int id; struct module *owner; - struct gpio_desc *wp_gpio; const struct nvmem_cell_info *cells; int ncells; const struct nvmem_keepout *keepout; diff --git a/include/linux/pci.h b/include/linux/pci.h index adffd65e84b4..254c8a4126a8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1621,6 +1621,18 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, flags, NULL); } +static inline struct msi_map pci_msix_alloc_irq_at(struct pci_dev *dev, unsigned int index, + const struct irq_affinity_desc *affdesc) +{ + struct msi_map map = { .index = -ENOSYS, }; + + return map; +} + +static inline void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map) +{ +} + static inline void pci_free_irq_vectors(struct pci_dev *dev) { } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index ef914a600087..525b5d64e394 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -100,7 +100,6 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); - bool (*filter)(struct pmu *pmu, int cpu); int num_events; bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 diff --git a/include/linux/poison.h b/include/linux/poison.h index 2d3249eb0e62..0e8a1f2ceb2f 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -84,4 +84,7 @@ /********** kernel/bpf/ **********/ #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) +/********** VFS **********/ +#define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) + #endif diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index ee608d22ecb9..21cc29b8a9e8 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ 
-69,20 +69,20 @@ extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); -int set_posix_acl(struct user_namespace *, struct dentry *, int, +int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL -int posix_acl_chmod(struct user_namespace *, struct dentry *, umode_t); +int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); -int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *, +int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); -int simple_set_acl(struct user_namespace *, struct dentry *, +int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); @@ -91,7 +91,7 @@ void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); -int posix_acl_permission(struct user_namespace *, struct inode *, +int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) @@ -100,14 +100,14 @@ static inline void cache_no_acl(struct inode *inode) inode->i_default_acl = NULL; } -int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, 
+struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); -int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); #else -static inline int posix_acl_chmod(struct user_namespace *mnt_userns, +static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; @@ -134,21 +134,21 @@ static inline void forget_all_cached_acls(struct inode *inode) { } -static inline int vfs_set_acl(struct user_namespace *mnt_userns, +static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } -static inline struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, +static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } -static inline int vfs_remove_acl(struct user_namespace *mnt_userns, +static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0d8625d71733..11a4becff3a9 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,12 +20,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) } /* i_mutex must being held */ -static inline bool is_quota_modification(struct user_namespace *mnt_userns, +static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || - i_uid_needs_update(mnt_userns, ia, inode) || - i_gid_needs_update(mnt_userns, ia, inode)); + i_uid_needs_update(idmap, ia, inode) || + i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) @@ -116,7 +116,7 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int 
__dquot_transfer(struct inode *inode, struct dquot **transfer_to); -int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, +int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) @@ -236,7 +236,7 @@ static inline void dquot_free_inode(struct inode *inode) { } -static inline int dquot_transfer(struct user_namespace *mnt_userns, +static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; diff --git a/include/linux/security.h b/include/linux/security.h index 5b67f208f7de..5984d0d550b4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -153,12 +153,11 @@ extern int cap_capset(struct cred *new, const struct cred *old, extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file); int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int cap_inode_removexattr(struct user_namespace *mnt_userns, +int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name); int cap_inode_need_killpriv(struct dentry *dentry); -int cap_inode_killpriv(struct user_namespace *mnt_userns, - struct dentry *dentry); -int cap_inode_getsecurity(struct user_namespace *mnt_userns, +int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry); +int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc); extern int cap_mmap_addr(unsigned long addr); @@ -356,29 +355,28 @@ int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu); int security_inode_permission(struct inode *inode, int mask); -int security_inode_setattr(struct user_namespace *mnt_userns, +int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int 
security_inode_getattr(const struct path *path); -int security_inode_setxattr(struct user_namespace *mnt_userns, +int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int security_inode_set_acl(struct user_namespace *mnt_userns, +int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -int security_inode_get_acl(struct user_namespace *mnt_userns, +int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); -int security_inode_remove_acl(struct user_namespace *mnt_userns, +int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int security_inode_getxattr(struct dentry *dentry, const char *name); int security_inode_listxattr(struct dentry *dentry); -int security_inode_removexattr(struct user_namespace *mnt_userns, +int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name); int security_inode_need_killpriv(struct dentry *dentry); -int security_inode_killpriv(struct user_namespace *mnt_userns, - struct dentry *dentry); -int security_inode_getsecurity(struct user_namespace *mnt_userns, +int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry); +int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); @@ -862,7 +860,7 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } -static inline int security_inode_setattr(struct user_namespace *mnt_userns, +static inline int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) 
{ @@ -874,14 +872,14 @@ static inline int security_inode_getattr(const struct path *path) return 0; } -static inline int security_inode_setxattr(struct user_namespace *mnt_userns, +static inline int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { return cap_inode_setxattr(dentry, name, value, size, flags); } -static inline int security_inode_set_acl(struct user_namespace *mnt_userns, +static inline int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) @@ -889,14 +887,14 @@ static inline int security_inode_set_acl(struct user_namespace *mnt_userns, return 0; } -static inline int security_inode_get_acl(struct user_namespace *mnt_userns, +static inline int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return 0; } -static inline int security_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { @@ -918,11 +916,11 @@ static inline int security_inode_listxattr(struct dentry *dentry) return 0; } -static inline int security_inode_removexattr(struct user_namespace *mnt_userns, +static inline int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { - return cap_inode_removexattr(mnt_userns, dentry, name); + return cap_inode_removexattr(idmap, dentry, name); } static inline int security_inode_need_killpriv(struct dentry *dentry) @@ -930,18 +928,18 @@ static inline int security_inode_need_killpriv(struct dentry *dentry) return cap_inode_need_killpriv(dentry); } -static inline int security_inode_killpriv(struct user_namespace *mnt_userns, +static inline int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { - return cap_inode_killpriv(mnt_userns, dentry); + return cap_inode_killpriv(idmap, dentry); } 
-static inline int security_inode_getsecurity(struct user_namespace *mnt_userns, +static inline int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { - return cap_inode_getsecurity(mnt_userns, inode, name, buffer, alloc); + return cap_inode_getsecurity(idmap, inode, name, buffer, alloc); } static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 71310efe2fab..7bde8e1c228a 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -107,7 +107,7 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern void shrinker_debugfs_remove(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker); extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ @@ -115,8 +115,9 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } -static inline void shrinker_debugfs_remove(struct shrinker *shrinker) +static inline struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) { + return NULL; } static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 1341f7d62da4..be48f1cb1878 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -476,6 +476,15 @@ extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) +extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock); +#define atomic_dec_and_raw_lock(atomic, lock) \ + __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock)) + +extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags); +#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \ + __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))) + int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, diff --git a/include/linux/stat.h b/include/linux/stat.h index ff277ced50e9..52150570d37a 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -52,6 +52,15 @@ struct kstat { u64 mnt_id; u32 dio_mem_align; u32 dio_offset_align; + u64 change_cookie; }; +/* These definitions are internal to the kernel for now. Mainly used by nfsd. 
*/ + +/* mask values */ +#define STATX_CHANGE_COOKIE 0x40000000U /* Want/got stx_change_attr */ + +/* file attribute values */ +#define STATX_ATTR_CHANGE_MONOTONIC 0x8000000000000000ULL /* version monotonically increases */ + #endif diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 83ca2e8eb6b5..a152678b82b7 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -252,6 +252,7 @@ struct plat_stmmacenet_data { int rss_en; int mac_port_sel_speed; bool en_tx_lpi_clockgating; + bool rx_clk_runs_in_lpi; int has_xgmac; bool vlan_fail_q_en; u8 vlan_fail_q; diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/include/linux/tpm.h b/include/linux/tpm.h index dfeb25a0362d..4dc97b9f65fb 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -265,6 +265,7 @@ enum tpm2_startup_types { enum tpm2_cc_attrs { TPM2_CC_ATTR_CHANDLES = 25, TPM2_CC_ATTR_RHANDLE = 28, + TPM2_CC_ATTR_VENDOR = 29, }; #define TPM_VID_INTEL 0x8086 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4342e996bcdb..0e373222a6df 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -270,6 +270,7 @@ struct trace_event_fields { const int align; const int is_signed; const int filter_type; + const int len; }; int (*define_fields)(struct trace_event_call *); }; diff --git a/include/linux/uio.h b/include/linux/uio.h index 9f158238edba..73b1d5d1e4f1 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ 
-346,6 +346,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); +int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 72299f261b25..43db6e47503c 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -38,4 +38,16 @@ */ #define find_closest_descending(x, a, as) __find_closest(x, a, as, >=) +/** + * is_insidevar - check if the @ptr points inside the @var memory range. + * @ptr: the pointer to a memory address. + * @var: the variable which address and size identify the memory range. + * + * Evaluates to true if the address in @ptr lies within the memory + * range allocated to @var. + */ +#define is_insidevar(ptr, var) \ + ((uintptr_t)(ptr) >= (uintptr_t)(var) && \ + (uintptr_t)(ptr) < (uintptr_t)(var) + sizeof(var)) + #endif diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 2e7dd44926e4..6af72461397d 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -42,7 +42,7 @@ struct xattr_handler { struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, - struct user_namespace *mnt_userns, struct dentry *dentry, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; @@ -56,25 +56,25 @@ struct xattr { }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); -ssize_t vfs_getxattr(struct user_namespace *, struct dentry *, const char *, +ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); -int 
__vfs_setxattr(struct user_namespace *, struct dentry *, struct inode *, +int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); -int __vfs_setxattr_noperm(struct user_namespace *, struct dentry *, +int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); -int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, +int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); -int vfs_setxattr(struct user_namespace *, struct dentry *, const char *, +int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); -int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); -int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, +int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); +int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); -int vfs_removexattr(struct user_namespace *, struct dentry *, const char *); +int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -int vfs_getxattr_alloc(struct user_namespace *mnt_userns, +int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index b3ba04615caa..56189e4252da 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -336,9 +336,12 @@ struct gdma_queue_spec { }; }; +#define MANA_IRQ_NAME_SZ 32 + struct gdma_irq_context { void (*handler)(void *arg); void *arg; + char name[MANA_IRQ_NAME_SZ]; }; struct gdma_context { diff --git a/include/net/sock.h b/include/net/sock.h index dcd72e6285b2..556209727633 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ 
-2434,6 +2434,19 @@ static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struc return false; } +static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) +{ + skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + if (skb) { + if (sk_rmem_schedule(sk, skb, skb->truesize)) { + skb_set_owner_r(skb, sk); + return skb; + } + __kfree_skb(skb); + } + return NULL; +} + static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 695eebc6f2c8..e39fb0736ade 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -422,6 +422,8 @@ extern int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost, extern struct iscsi_cls_session * iscsi_session_setup(struct iscsi_transport *, struct Scsi_Host *shost, uint16_t, int, int, uint32_t, unsigned int); +void iscsi_session_remove(struct iscsi_cls_session *cls_session); +void iscsi_session_free(struct iscsi_cls_session *cls_session); extern void iscsi_session_teardown(struct iscsi_cls_session *); extern void iscsi_session_recovery_timedout(struct iscsi_cls_session *); extern int iscsi_set_param(struct iscsi_cls_conn *cls_conn, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 6548b5b5aa60..75d7d22c3a27 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -32,6 +32,7 @@ struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; +struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_root *root, u64 num_bytes, - u64 empty_size, u64 data), + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(root, num_bytes, empty_size, data), + TP_ARGS(root, ffe_ctl), 
TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) - __field( u64, data ) + __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS)) +); + +TRACE_EVENT(find_free_extent_search_loop, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), + + TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop) +); + +TRACE_EVENT(find_free_extent_have_block_group, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *block_group), + + TP_ARGS(root, ffe_ctl, block_group), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, 
num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + __field( bool, hinted ) + __field( u64, bg_start ) + __field( u64, bg_flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->bg_start = block_group->start; + __entry->bg_flags = block_group->flags; + ), + + TP_printk_btrfs( +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop, __entry->hinted, + __entry->bg_start, __entry->bg_flags, + __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len), + TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) + __field( u64, loop ) + __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; + __entry->bg_size_class = block_group->size_class; + __entry->start = ffe_ctl->search_start; + __entry->len = ffe_ctl->num_bytes; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), - TP_printk_btrfs("root=%llu(%s) block_group=%llu 
flags=%llu(%s) " - "start=%llu len=%llu", + TP_printk_btrfs( +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster, diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 4f4c44ea3a65..cf4a0d28b178 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -19,12 +19,17 @@ struct erofs_map_blocks; { 1, "DIR" }) #define show_map_flags(flags) __print_flags(flags, "|", \ - { EROFS_GET_BLOCKS_RAW, "RAW" }) + { EROFS_GET_BLOCKS_FIEMAP, "FIEMAP" }, \ + { EROFS_GET_BLOCKS_READMORE, "READMORE" }, \ + { EROFS_GET_BLOCKS_FINDTAIL, "FINDTAIL" }) #define show_mflags(flags) __print_flags(flags, "", \ - { EROFS_MAP_MAPPED, "M" }, \ - { EROFS_MAP_META, "I" }, \ - { EROFS_MAP_ENCODED, "E" }) + { EROFS_MAP_MAPPED, "M" }, \ + { EROFS_MAP_META, "I" }, \ + { EROFS_MAP_ENCODED, "E" }, \ + { EROFS_MAP_FULL_MAPPED, "F" }, \ + { EROFS_MAP_FRAGMENT, "R" }, \ + { EROFS_MAP_PARTIAL_REF, "P" }) TRACE_EVENT(erofs_lookup, @@ -66,8 +71,8 @@ 
TRACE_EVENT(erofs_fill_inode, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; - __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); - __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); + __entry->blkaddr = erofs_blknr(erofs_iloc(inode)); + __entry->ofs = erofs_blkoff(erofs_iloc(inode)); ), TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u", diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h index affd541fd25e..b6f679ae21aa 100644 --- a/include/trace/stages/stage4_event_fields.h +++ b/include/trace/stages/stage4_event_fields.h @@ -26,7 +26,8 @@ #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ .size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type), \ - .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER,\ + .len = _len }, #undef __dynamic_array #define __dynamic_array(_type, _item, _len) { \ diff --git a/include/uapi/drm/virtgpu_drm.h b/include/uapi/drm/virtgpu_drm.h index 0512fde5e697..7b158fcb02b4 100644 --- a/include/uapi/drm/virtgpu_drm.h +++ b/include/uapi/drm/virtgpu_drm.h @@ -64,6 +64,7 @@ struct drm_virtgpu_map { __u32 pad; }; +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */ struct drm_virtgpu_execbuffer { __u32 flags; __u32 size; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 436258214bb0..cd14c94e9a1e 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -188,15 +188,43 @@ struct fanotify_event_info_error { __u32 error_count; }; +/* + * User space may need to record additional information about its decision. + * The extra information type records what kind of information is included. + * The default is none. 
We also define an extra information buffer whose + * size is determined by the extra information type. + * + * If the information type is Audit Rule, then the information following + * is the rule number that triggered the user space decision that + * requires auditing. + */ + +#define FAN_RESPONSE_INFO_NONE 0 +#define FAN_RESPONSE_INFO_AUDIT_RULE 1 + struct fanotify_response { __s32 fd; __u32 response; }; +struct fanotify_response_info_header { + __u8 type; + __u8 pad; + __u16 len; +}; + +struct fanotify_response_info_audit_rule { + struct fanotify_response_info_header hdr; + __u32 rule_number; + __u32 subj_trust; + __u32 obj_trust; +}; + /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 -#define FAN_AUDIT 0x10 /* Bit mask to create audit record for result */ +#define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ +#define FAN_INFO 0x20 /* Bitmask to indicate additional information */ /* No fd set in event */ #define FAN_NOFD -1 diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2780bce62faf..97661a60b28c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -347,6 +347,8 @@ enum { * applicable for IORING_MSG_DATA, obviously. 
*/ #define IORING_MSG_RING_CQE_SKIP (1U << 0) +/* Pass through the flags from sqe->file_index to cqe->flags */ +#define IORING_MSG_RING_FLAGS_PASS (1U << 1) /* * IO completion data structure (Completion Queue Entry) @@ -470,6 +472,7 @@ struct io_uring_params { #define IORING_FEAT_RSRC_TAGS (1U << 10) #define IORING_FEAT_CQE_SKIP (1U << 11) #define IORING_FEAT_LINKED_FILE (1U << 12) +#define IORING_FEAT_REG_REG_RING (1U << 13) /* * io_uring_register(2) opcodes and arguments @@ -517,7 +520,10 @@ enum { IORING_REGISTER_FILE_ALLOC_RANGE = 25, /* this goes last */ - IORING_REGISTER_LAST + IORING_REGISTER_LAST, + + /* flag added to the opcode to use a registered ring fd */ + IORING_REGISTER_USE_REGISTERED_RING = 1U << 31 }; /* io-wq worker categories */ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 874a92349bf5..283dec7e3645 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -18,6 +18,7 @@ #ifndef _UAPI_LINUX_IP_H #define _UAPI_LINUX_IP_H #include <linux/types.h> +#include <linux/stddef.h> #include <asm/byteorder.h> #define IPTOS_TOS_MASK 0x1E diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 81f4243bebb1..53326dfc59ec 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -4,6 +4,7 @@ #include <linux/libc-compat.h> #include <linux/types.h> +#include <linux/stddef.h> #include <linux/in6.h> #include <asm/byteorder.h> diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index c742469afe21..2d6f80d75ae7 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -15,8 +15,7 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_RECD, SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, SCTP_CONNTRACK_HEARTBEAT_SENT, - SCTP_CONNTRACK_HEARTBEAT_ACKED, - SCTP_CONNTRACK_DATA_SENT, + SCTP_CONNTRACK_HEARTBEAT_ACKED, /* no longer used */ SCTP_CONNTRACK_MAX }; diff --git 
a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 94e74034706d..aa805e6d4e28 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -94,8 +94,7 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_RECD, CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, - CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, - CTA_TIMEOUT_SCTP_DATA_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, /* no longer used */ __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 5cf81dff60aa..727084cd79be 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -808,6 +808,7 @@ struct ufs_hba_monitor { * @urgent_bkops_lvl: keeps track of urgent bkops level for device * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for * device is known or not. + * @wb_mutex: used to serialize devfreq and sysfs write booster toggling * @clk_scaling_lock: used to serialize device commands and clock scaling * @desc_size: descriptor sizes reported by device * @scsi_block_reqs_cnt: reference counting for scsi block requests @@ -951,6 +952,7 @@ struct ufs_hba { enum bkops_status urgent_bkops_lvl; bool is_urgent_bkops_lvl_checked; + struct mutex wb_mutex; struct rw_semaphore clk_scaling_lock; unsigned char desc_size[QUERY_DESC_IDN_MAX]; atomic_t scsi_block_reqs_cnt; diff --git a/io_uring/advise.c b/io_uring/advise.c index 449c6f14649f..7085804c513c 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); + req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; @@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) struct io_madvise *ma = 
io_kiocb_to_cmd(req, struct io_madvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); @@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) #endif } +static bool io_fadvise_force_async(struct io_fadvise *fa) +{ + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + return false; + default: + return true; + } +} + int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); @@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) fa->offset = READ_ONCE(sqe->off); fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); + if (io_fadvise_force_async(fa)) + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa)); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) diff --git a/io_uring/fs.c b/io_uring/fs.c index 7100c293c13a..f6a69a549fd4 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return 
-EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, ren->newpath, ren->flags); @@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(un->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) ret = do_rmdir(un->dfd, un->filename); @@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(mkd->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); @@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); @@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *lnk = 
io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, lnk->newpath, lnk->flags); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a4efada9b3c..3b915deb4d08 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -151,7 +151,7 @@ static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static __cold void io_fallback_tw(struct io_uring_task *tctx); -static struct kmem_cache *req_cachep; +struct kmem_cache *req_cachep; struct sock *io_uring_get_socket(struct file *file) { @@ -230,6 +230,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + kasan_poison_object_data(req_cachep, req); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -245,17 +246,15 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = false; + bool locked = true; - percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } - percpu_ref_put(&ctx->refs); + if (WARN_ON_ONCE(!locked)) + return; + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) @@ -316,6 +315,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); 
init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); @@ -407,7 +407,7 @@ static inline void io_arm_ltimeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { @@ -572,6 +572,8 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { + if (ctx->poll_activated) + io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->drain_active) { @@ -618,6 +620,25 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + __io_cq_unlock(ctx); + io_commit_cqring_flush(ctx); + + /* + * As ->task_complete implies that the ring is single tasked, cq_wait + * may only be waited on by the current in io_cqring_wait(), but since + * it will re-check the wakeup conditions once we return we can safely + * skip waking it up. 
+ */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { + smp_mb(); + __io_cqring_wake(ctx); + } +} + void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { @@ -645,7 +666,6 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) } } -/* Returns true if there are no backlogged entries after the flush */ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { size_t cqe_size = sizeof(struct io_uring_cqe); @@ -693,7 +713,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cqring_do_overflow_flush(ctx); } -void __io_put_task(struct task_struct *task, int nr) +/* can be called by any task */ +static void io_put_task_remote(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -703,6 +724,21 @@ void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } +/* used by a task to put its own references */ +static void io_put_task_local(struct task_struct *task, int nr) +{ + task->io_uring->cached_refs += nr; +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + io_put_task_local(task, nr); + else + io_put_task_remote(task, nr); +} + void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; @@ -945,15 +981,15 @@ static void __io_req_complete_post(struct io_kiocb *req) req->link = NULL; } } + io_put_kbuf_comp(req); + io_dismantle_req(req); io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. 
*/ - io_put_kbuf_comp(req); - io_dismantle_req(req); - io_put_task(req->task, 1); + io_put_task_remote(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } @@ -980,7 +1016,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); @@ -1076,7 +1112,7 @@ __cold void io_free_req(struct io_kiocb *req) io_req_put_rsrc(req); io_dismantle_req(req); - io_put_task(req->task, 1); + io_put_task_remote(req->task, 1); spin_lock(&ctx->completion_lock); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); @@ -1130,7 +1166,7 @@ static unsigned int handle_tw_list(struct llist_node *node, { unsigned int count = 0; - while (node != last) { + while (node && node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1143,10 +1179,16 @@ static unsigned int handle_tw_list(struct llist_node *node, /* if not contended, grab and improve batching */ *locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } + } else if (!*locked) + *locked = mutex_trylock(&(*ctx)->uring_lock); req->io_task_work.func(req, locked); node = next; count++; + if (unlikely(need_resched())) { + ctx_flush_and_put(*ctx, locked); + *ctx = NULL; + cond_resched(); + } } return count; @@ -1190,23 +1232,29 @@ void tctx_task_work(struct callback_head *cb) task_work); struct llist_node fake = {}; struct llist_node *node; - unsigned int loops = 1; - unsigned int count; + unsigned int loops = 0; + unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx); return; } - node = io_llist_xchg(&tctx->task_list, &fake); - count = handle_tw_list(node, &ctx, &uring_locked, NULL); - node = 
io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - while (node != &fake) { + do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); count += handle_tw_list(node, &ctx, &uring_locked, &fake); + + /* skip expensive cmpxchg if there are items in the list */ + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + io_submit_flush_completions(ctx); + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + } node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - } + } while (node != &fake); ctx_flush_and_put(ctx, &uring_locked); @@ -1241,7 +1289,7 @@ static void io_req_local_work_add(struct io_kiocb *req) percpu_ref_put(&ctx->refs); return; } - /* need it for the following io_cqring_wake() */ + /* needed for the following wake up */ smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { @@ -1252,10 +1300,11 @@ static void io_req_local_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) io_eventfd_signal(ctx); - __io_cqring_wake(ctx); + + if (READ_ONCE(ctx->cq_waiting)) + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); percpu_ref_put(&ctx->refs); } @@ -1296,21 +1345,19 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) { struct llist_node *node; - struct llist_node fake; - struct llist_node *current_final = NULL; - int ret; - unsigned int loops = 1; + unsigned int loops = 0; + int ret = 0; - if (unlikely(ctx->submitter_task != current)) + if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; - - node = io_llist_xchg(&ctx->work_llist, &fake); - ret = 0; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - while (node != 
current_final) { + node = io_llist_xchg(&ctx->work_llist, NULL); + while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1319,26 +1366,20 @@ again: ret++; node = next; } + loops++; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); - if (node != &fake) { - loops++; - current_final = &fake; - node = io_llist_xchg(&ctx->work_llist, &fake); + if (!llist_empty(&ctx->work_llist)) goto again; - } - - if (*locked) + if (*locked) { io_submit_flush_completions(ctx); + if (!llist_empty(&ctx->work_llist)) + goto again; + } trace_io_uring_local_work_run(ctx, ret, loops); return ret; - } -int io_run_local_work(struct io_ring_ctx *ctx) +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { bool locked; int ret; @@ -1346,8 +1387,19 @@ int io_run_local_work(struct io_ring_ctx *ctx) if (llist_empty(&ctx->work_llist)) return 0; - __set_current_state(TASK_RUNNING); - locked = mutex_trylock(&ctx->uring_lock); + locked = true; + ret = __io_run_local_work(ctx, &locked); + /* shouldn't happen! 
*/ + if (WARN_ON_ONCE(!locked)) + mutex_lock(&ctx->uring_lock); + return ret; +} + +static int io_run_local_work(struct io_ring_ctx *ctx) +{ + bool locked = mutex_trylock(&ctx->uring_lock); + int ret; + ret = __io_run_local_work(ctx, &locked); if (locked) mutex_unlock(&ctx->uring_lock); @@ -1365,10 +1417,12 @@ void io_req_task_submit(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else + if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); + else if (req->flags & REQ_F_FORCE_ASYNC) + io_queue_iowq(req, locked); + else + io_queue_sqe(req); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1467,7 +1521,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } } - __io_cq_unlock_post(ctx); + __io_cq_unlock_post_flush(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); @@ -1708,8 +1762,8 @@ unsigned int io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); + req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1719,20 +1773,21 @@ bool io_alloc_async_data(struct io_kiocb *req) int io_req_prep_async(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->prep_async) + if 
(!cdef->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (!io_op_defs[req->opcode].manual_alloc) { + if (!def->manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } - return def->prep_async(req); + return cdef->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -1765,17 +1820,12 @@ queue: } spin_unlock(&ctx->completion_lock); - ret = io_req_prep_async(req); - if (ret) { -fail: - io_req_defer_failed(req, ret); - return; - } io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { ret = -ENOMEM; - goto fail; + io_req_defer_failed(req, ret); + return; } spin_lock(&ctx->completion_lock); @@ -1801,7 +1851,7 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (def->cleanup) def->cleanup(req); @@ -1825,9 +1875,10 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, + unsigned int issue_flags) { - if (req->file || !io_op_defs[req->opcode].needs_file) + if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) @@ -1840,11 +1891,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; - if (unlikely(!io_assign_file(req, issue_flags))) + if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) @@ -1894,7 +1945,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) void 
io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; @@ -1913,7 +1964,7 @@ fail: io_req_task_queue_fail(req, err); return; } - if (!io_assign_file(req, issue_flags)) { + if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; work->flags |= IO_WQ_WORK_CANCEL; goto fail; @@ -2048,13 +2099,16 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); - } else if (unlikely(req->ctx->drain_active)) { - io_drain_req(req); } else { int ret = io_req_prep_async(req); - if (unlikely(ret)) + if (unlikely(ret)) { io_req_defer_failed(req, ret); + return; + } + + if (unlikely(req->ctx->drain_active)) + io_drain_req(req); else io_queue_iowq(req, NULL); } @@ -2106,7 +2160,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def; + const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; @@ -2124,7 +2178,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return -EINVAL; } - def = &io_op_defs[opcode]; + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -2335,7 +2389,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. 
*/ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { unsigned head, mask = ctx->sq_entries - 1; unsigned sq_idx = ctx->cached_sq_head++ & mask; @@ -2353,14 +2407,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1; - return &ctx->sq_sqes[head]; + *sqe = &ctx->sq_sqes[head]; + return true; } /* drop invalid entries */ ctx->cq_extra--; WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; + return false; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) @@ -2381,11 +2436,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &req))) break; - req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { + if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } @@ -2420,13 +2473,13 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; + ktime_t timeout; }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !llist_empty(&ctx->work_llist)); + !llist_empty(&ctx->work_llist); } static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -2445,22 +2498,25 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - struct io_ring_ctx *ctx = iowq->ctx; + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, 
ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || io_has_work(ctx)) + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (io_run_task_work_ctx(ctx) > 0) + if (!llist_empty(&ctx->work_llist)) { + __set_current_state(TASK_RUNNING); + if (io_run_local_work(ctx) > 0) + return 1; + } + if (io_run_task_work() > 0) return 1; if (task_sigpending(current)) return -EINTR; @@ -2469,35 +2525,23 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t *timeout) + struct io_wait_queue *iowq) { - int ret; - unsigned long check_cq; - - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(ctx); - if (ret || io_should_wake(iowq)) - return ret; - - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) - return -EBADR; - } - if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(!llist_empty(&ctx->work_llist))) + return 1; + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + if (iowq->timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) return -ETIME; - - /* - * Run task_work after scheduling. If we got woken because of - * task_work being processed, run it now rather than let the caller - * do another wait loop. - */ - ret = io_run_task_work_sig(ctx); - return ret < 0 ? 
ret : 1; + return 0; } /* @@ -2510,23 +2554,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; int ret; if (!io_allowed_run_tw(ctx)) return -EEXIST; - - do { - /* always run at least 1 task work to process local work */ - ret = io_run_task_work_ctx(ctx); - if (ret < 0) - return ret; - io_cqring_overflow_flush(ctx); - - /* if user messes with these they will just get an early return */ - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - } while (ret > 0); + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); + io_run_task_work(); + io_cqring_overflow_flush(ctx); + /* if user messes with these they will just get an early return */ + if (__io_cqring_events_user(ctx) >= min_events) + return 0; if (sig) { #ifdef CONFIG_COMPAT @@ -2541,36 +2579,69 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.timeout = KTIME_MAX; + + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } trace_io_uring_cqring_wait(ctx, min_events); do { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - finish_wait(&ctx->cq_wait, &iowq.wq); - io_cqring_do_overflow_flush(ctx); + unsigned long check_cq; + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + WRITE_ONCE(ctx->cq_waiting, 1); + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, + TASK_INTERRUPTIBLE); 
+ } + + ret = io_cqring_wait_schedule(ctx, &iowq); + __set_current_state(TASK_RUNNING); + WRITE_ONCE(ctx->cq_waiting, 0); + + if (ret < 0) + break; + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. + */ + io_run_task_work(); + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); + + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_do_overflow_flush(ctx); + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { + ret = -EBADR; + break; + } } - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); - if (__io_cqring_events_user(ctx) >= min_events) + + if (io_should_wake(&iowq)) { + ret = 0; break; + } cond_resched(); - } while (ret > 0); + } while (1); - finish_wait(&ctx->cq_wait, &iowq.wq); + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; @@ -2685,14 +2756,14 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx) { + struct io_kiocb *req; int nr = 0; mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { - struct io_kiocb *req = io_alloc_req(ctx); - + req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } @@ -2764,12 +2835,54 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) kfree(ctx); } +static __cold void io_activate_pollwq_cb(struct callback_head *cb) +{ + struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, + poll_wq_task_work); + + mutex_lock(&ctx->uring_lock); + ctx->poll_activated = true; + mutex_unlock(&ctx->uring_lock); + + /* + * Wake ups for some events between start of polling and activation + * might've been lost due to loose synchronisation. + */ + wake_up_all(&ctx->poll_wq); + percpu_ref_put(&ctx->refs); +} + +static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->completion_lock); + /* already activated or in progress */ + if (ctx->poll_activated || ctx->poll_wq_task_work.func) + goto out; + if (WARN_ON_ONCE(!ctx->task_complete)) + goto out; + if (!ctx->submitter_task) + goto out; + /* + * with ->submitter_task only the submitter task completes requests, we + * only need to sync with it, which is done by injecting a tw + */ + init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); + percpu_ref_get(&ctx->refs); + if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) + percpu_ref_put(&ctx->refs); +out: + spin_unlock(&ctx->completion_lock); +} + static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + if (unlikely(!ctx->poll_activated)) + io_activate_pollwq(ctx); + + poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from 
wq_has_sleeper call in * io_commit_cqring @@ -2792,7 +2905,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * pushes them to do the flush. */ - if (io_cqring_events(ctx) || io_has_work(ctx)) + if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3055,10 +3168,12 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; + cond_resched(); } } - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); @@ -3330,11 +3445,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); @@ -3575,6 +3688,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->task_complete = true; /* + * lazy poll_wq activation relies on ->task_complete for synchronisation + * purposes, see io_activate_pollwq() + */ + if (!ctx->task_complete) + ctx->poll_activated = true; + + /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events * polling again, they can rely on io_sq_thread to do polling @@ -3665,7 +3785,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; if 
(copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3762,7 +3882,7 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_op_defs[i].not_supported) + if (!io_issue_defs[i].not_supported) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; @@ -3867,8 +3987,15 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + /* + * Lazy activation attempts would fail if it was polled before + * submitter_task is set. + */ + if (wq_has_sleeper(&ctx->poll_wq)) + io_activate_pollwq(ctx); + } if (ctx->restrictions.registered) ctx->restricted = 1; @@ -4179,17 +4306,36 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, struct io_ring_ctx *ctx; long ret = -EBADF; struct fd f; + bool use_registered_ring; + + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; if (opcode >= IORING_REGISTER_LAST) return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; + if (use_registered_ring) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. 
+ */ + struct io_uring_task *tctx = current->io_uring; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(f.file)) - goto out_fput; + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + f.file = tctx->registered_rings[fd]; + f.flags = 0; + if (unlikely(!f.file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (!io_is_uring_fops(f.file)) + goto out_fput; + } ctx = f.file->private_data; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ab4b2a1c3b7e..2711865f1e19 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -3,6 +3,8 @@ #include <linux/errno.h> #include <linux/lockdep.h> +#include <linux/resume_user_mode.h> +#include <linux/kasan.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> #include "io-wq.h" @@ -28,8 +30,6 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); -int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -72,7 +72,6 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); -void __io_put_task(struct task_struct *task, int nr); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); @@ -222,6 +221,13 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) +{ + if (wq_has_sleeper(&ctx->poll_wq)) + __wake_up(&ctx->poll_wq, TASK_NORMAL, 
0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + /* requires smb_mb() prior, see wq_has_sleeper() */ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { @@ -270,6 +276,15 @@ static inline int io_run_task_work(void) */ if (test_thread_flag(TIF_NOTIFY_SIGNAL)) clear_notify_signal(); + /* + * PF_IO_WORKER never returns to userspace, so check here if we have + * notify work that needs processing. + */ + if (current->flags & PF_IO_WORKER && + test_thread_flag(TIF_NOTIFY_RESUME)) { + __set_current_state(TASK_RUNNING); + resume_user_mode_work(NULL); + } if (task_work_pending(current)) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -284,42 +299,6 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) -{ - int ret = 0; - int ret2; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - ret = io_run_local_work(ctx); - - /* want to run this after in case more is added */ - ret2 = io_run_task_work(); - - /* Try propagate error in favour of if tasks were run, - * but still make sure to run them if requested - */ - if (ret >= 0) - ret += ret2; - - return ret; -} - -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) -{ - bool locked; - int ret; - - if (llist_empty(&ctx->work_llist)) - return 0; - - locked = true; - ret = __io_run_local_work(ctx, &locked); - /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!locked)) - mutex_lock(&ctx->uring_lock); - return ret; -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -345,19 +324,11 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - static inline void io_get_task_refs(int nr) { struct io_uring_task *tctx = current->io_uring; @@ -372,19 +343,31 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) return !ctx->submit_state.free_list.next; } -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +extern struct kmem_cache *req_cachep; + +static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; + struct io_kiocb *req; + + req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); + kasan_unpoison_object_data(req_cachep, req); + wq_stack_extract(&ctx->submit_state.free_list); + return req; } -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) { - struct io_wq_work_node *node; + if (unlikely(io_req_cache_empty(ctx))) { + if (!__io_alloc_req_refill(ctx)) + return false; + } + *req = io_extract_req(ctx); + return true; +} - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); +static inline bool io_allowed_defer_tw_run(struct 
io_ring_ctx *ctx) +{ + return likely(ctx->submitter_task == current); } static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 15602a136821..8803c0979e2a 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -13,6 +13,11 @@ #include "filetable.h" #include "msg_ring.h" + +/* All valid masks for MSG_RING */ +#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ + IORING_MSG_RING_FLAGS_PASS) + struct io_msg { struct file *file; struct file *src_file; @@ -21,7 +26,10 @@ struct io_msg { u32 len; u32 cmd; u32 src_fd; - u32 dst_fd; + union { + u32 dst_fd; + u32 cqe_flags; + }; u32 flags; }; @@ -91,6 +99,11 @@ static void io_msg_tw_complete(struct callback_head *head) if (current->flags & PF_EXITING) { ret = -EOWNERDEAD; } else { + u32 flags = 0; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + /* * If the target ring is using IOPOLL mode, then we need to be * holding the uring_lock for posting completions. 
Other ring @@ -99,7 +112,7 @@ static void io_msg_tw_complete(struct callback_head *head) */ if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&target_ctx->uring_lock); @@ -114,9 +127,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + u32 flags = 0; int ret; - if (msg->src_fd || msg->dst_fd || msg->flags) + if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS) + return -EINVAL; + if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; @@ -124,15 +140,18 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (io_msg_need_remote(target_ctx)) return io_msg_exec_remote(req, io_msg_tw_complete); + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; io_double_unlock_ctx(target_ctx); } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; } return ret; @@ -241,7 +260,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) msg->src_fd = READ_ONCE(sqe->addr3); msg->dst_fd = READ_ONCE(sqe->file_index); msg->flags = READ_ONCE(sqe->msg_ring_flags); - if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + if (msg->flags & ~IORING_MSG_RING_MASK) return -EINVAL; return 0; 
diff --git a/io_uring/net.c b/io_uring/net.c index fbc34a7c2743..cbd4b725f58c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -62,6 +62,7 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 addr_len; + u16 buf_group; void __user *addr; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -89,6 +90,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; shutdown->how = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -98,8 +100,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -181,7 +182,7 @@ static int io_setup_async_msg(struct io_kiocb *req, if (async_msg->msg.msg_name) async_msg->msg.msg_name = &async_msg->addr; /* if were using fast_iov, set it to the new one */ - if (!kmsg->free_iov) { + if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; } @@ -344,7 +345,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; - struct iovec iov; struct socket *sock; unsigned flags; int min_ret = 0; @@ -378,7 +378,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - ret = import_single_range(ITER_SOURCE, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); if (unlikely(ret)) return ret; @@ -580,6 +580,15 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->opcode == IORING_OP_RECV && sr->len) return -EINVAL; req->flags |= REQ_F_APOLL_MULTISHOT; + /* + * Store the 
buffer group for this multishot receive separately, + * as if we end up doing an io-wq based issue that selects a + * buffer, it has to be committed immediately and that will + * clear ->buf_list. This means we lose the link to the buffer + * list, and the eventual buffer put on completion then cannot + * restore it. + */ + sr->buf_group = req->buf_index; } #ifdef CONFIG_COMPAT @@ -596,6 +605,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) sr->done_io = 0; sr->len = 0; /* get from the provided buffer */ + req->buf_index = sr->buf_group; } /* @@ -764,10 +774,7 @@ retry_multishot: } } - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = len; - iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, kmsg->fast_iov, 1, - len); + iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); } flags = sr->msg_flags; @@ -835,7 +842,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; - struct iovec iov; unsigned int cflags; unsigned flags; int ret, min_ret = 0; @@ -863,7 +869,7 @@ retry_multishot: sr->buf = buf; } - ret = import_single_range(ITER_DEST, sr->buf, len, &iov, &msg.msg_iter); + ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -1074,7 +1080,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; - struct iovec iov; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1116,8 +1121,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) msg.sg_from_iter = io_sg_from_iter; } else { io_notif_set_extended(zc->notif); - ret = import_single_range(ITER_SOURCE, zc->buf, zc->len, &iov, - &msg.msg_iter); + ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); if (unlikely(ret)) return ret; ret = io_notif_account_mem(zc->notif, 
zc->len); diff --git a/io_uring/notif.c b/io_uring/notif.c index c4bb793ebf0e..09dfd0832d19 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -68,9 +68,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) struct io_kiocb *notif; struct io_notif_data *nd; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &notif))) return NULL; - notif = io_alloc_req(ctx); notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3aa0d65c50e3..cca7c5b55208 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -46,11 +46,10 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -const struct io_op_def io_op_defs[] = { +const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, - .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -64,13 +63,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", .prep = io_prep_rw, .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -82,18 +76,12 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, - .name = "FSYNC", .prep = io_fsync_prep, .issue = io_fsync, }, @@ -106,11 +94,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -122,30 +107,24 @@ const struct
io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, - .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, - .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -155,14 +134,9 @@ const struct io_op_def io_op_defs[] = { .pollout = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif @@ -174,29 +148,21 @@ const struct io_op_def io_op_defs[] = { .buffer_select = 1, .ioprio = 1, .manual_alloc = 1, - .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, - .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -206,7 +172,6 @@ const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ - .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ 
-216,14 +181,11 @@ const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, - .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -231,46 +193,36 @@ const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .name = "CONNECT", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, - .prep_async = io_connect_prep_async, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { - .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, - .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { - .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, - .name = "STATX", .prep = io_statx_prep, .issue = io_statx, - .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .needs_file = 1, @@ -282,11 +234,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -298,21 +247,17 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, - .name = "FADVISE", .prep = io_fadvise_prep, .issue = io_fadvise, }, 
[IORING_OP_MADVISE] = { - .name = "MADVISE", + .audit_skip = 1, .prep = io_madvise_prep, .issue = io_madvise, }, @@ -323,13 +268,9 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, - .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, #else .prep = io_eopnotsupp_prep, #endif @@ -341,25 +282,20 @@ const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, - .name = "RECV", #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_OPENAT2] = { - .name = "OPENAT2", .prep = io_openat2_prep, .issue = io_openat2, - .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "EPOLL", #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, @@ -372,21 +308,18 @@ const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "SPLICE", .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, - .name = "PROVIDE_BUFFERS", .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, - .name = "REMOVE_BUFFERS", .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, @@ -395,13 +328,11 @@ const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "TEE", .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, - .name = "SHUTDOWN", #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, @@ -410,72 +341,51 @@ const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_RENAMEAT] = { - .name = "RENAMEAT", .prep = io_renameat_prep, .issue = 
io_renameat, - .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { - .name = "UNLINKAT", .prep = io_unlinkat_prep, .issue = io_unlinkat, - .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { - .name = "MKDIRAT", .prep = io_mkdirat_prep, .issue = io_mkdirat, - .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { - .name = "SYMLINKAT", .prep = io_symlinkat_prep, .issue = io_symlinkat, - .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { - .name = "LINKAT", .prep = io_linkat_prep, .issue = io_linkat, - .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, - .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, - .cleanup = io_msg_ring_cleanup, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, - .name = "FSETXATTR", .prep = io_fsetxattr_prep, .issue = io_fsetxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { - .name = "SETXATTR", .prep = io_setxattr_prep, .issue = io_setxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, - .name = "FGETXATTR", .prep = io_fgetxattr_prep, .issue = io_fgetxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { - .name = "GETXATTR", .prep = io_getxattr_prep, .issue = io_getxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, - .name = "SOCKET", #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, @@ -486,16 +396,12 @@ const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, - .name = "URING_CMD", .iopoll = 1, .iopoll_queue = 1, - .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { - .name = "SEND_ZC", .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, @@ -503,32 +409,243 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = 
io_send_zc_prep, .issue = io_send_zc, - .prep_async = io_send_prep_async, - .cleanup = io_send_zc_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SENDMSG_ZC] = { - .name = "SENDMSG_ZC", .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, +}; + + +const struct io_cold_def io_cold_defs[] = { + [IORING_OP_NOP] = { + .name = "NOP", + }, + [IORING_OP_READV] = { + .async_size = sizeof(struct io_async_rw), + .name = "READV", + .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", + .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_FSYNC] = { + .name = "FSYNC", + }, + [IORING_OP_READ_FIXED] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE_FIXED] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", + .fail = io_rw_fail, + }, + [IORING_OP_POLL_ADD] = { + .name = "POLL_ADD", + }, + [IORING_OP_POLL_REMOVE] = { + .name = "POLL_REMOVE", + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .name = "SYNC_FILE_RANGE", + }, + [IORING_OP_SENDMSG] = { + .name = "SENDMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_sendmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_RECVMSG] = { + .name = "RECVMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_recvmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_TIMEOUT] = { + .async_size = sizeof(struct 
io_timeout_data), + .name = "TIMEOUT", + }, + [IORING_OP_TIMEOUT_REMOVE] = { + .name = "TIMEOUT_REMOVE", + }, + [IORING_OP_ACCEPT] = { + .name = "ACCEPT", + }, + [IORING_OP_ASYNC_CANCEL] = { + .name = "ASYNC_CANCEL", + }, + [IORING_OP_LINK_TIMEOUT] = { + .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", + }, + [IORING_OP_CONNECT] = { + .name = "CONNECT", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_connect), + .prep_async = io_connect_prep_async, +#endif + }, + [IORING_OP_FALLOCATE] = { + .name = "FALLOCATE", + }, + [IORING_OP_OPENAT] = { + .name = "OPENAT", + .cleanup = io_open_cleanup, + }, + [IORING_OP_CLOSE] = { + .name = "CLOSE", + }, + [IORING_OP_FILES_UPDATE] = { + .name = "FILES_UPDATE", + }, + [IORING_OP_STATX] = { + .name = "STATX", + .cleanup = io_statx_cleanup, + }, + [IORING_OP_READ] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE", + .fail = io_rw_fail, + }, + [IORING_OP_FADVISE] = { + .name = "FADVISE", + }, + [IORING_OP_MADVISE] = { + .name = "MADVISE", + }, + [IORING_OP_SEND] = { + .name = "SEND", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .fail = io_sendrecv_fail, + .prep_async = io_send_prep_async, +#endif + }, + [IORING_OP_RECV] = { + .name = "RECV", +#if defined(CONFIG_NET) + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_OPENAT2] = { + .name = "OPENAT2", + .cleanup = io_open_cleanup, + }, + [IORING_OP_EPOLL_CTL] = { + .name = "EPOLL", + }, + [IORING_OP_SPLICE] = { + .name = "SPLICE", + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .name = "PROVIDE_BUFFERS", + }, + [IORING_OP_REMOVE_BUFFERS] = { + .name = "REMOVE_BUFFERS", + }, + [IORING_OP_TEE] = { + .name = "TEE", + }, + [IORING_OP_SHUTDOWN] = { + .name = "SHUTDOWN", + }, + [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", + .cleanup = io_renameat_cleanup, + }, + [IORING_OP_UNLINKAT] = { + .name = 
"UNLINKAT", + .cleanup = io_unlinkat_cleanup, + }, + [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", + .cleanup = io_mkdirat_cleanup, + }, + [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", + .cleanup = io_link_cleanup, + }, + [IORING_OP_LINKAT] = { + .name = "LINKAT", + .cleanup = io_link_cleanup, + }, + [IORING_OP_MSG_RING] = { + .name = "MSG_RING", + .cleanup = io_msg_ring_cleanup, + }, + [IORING_OP_FSETXATTR] = { + .name = "FSETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SETXATTR] = { + .name = "SETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_FGETXATTR] = { + .name = "FGETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_GETXATTR] = { + .name = "GETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SOCKET] = { + .name = "SOCKET", + }, + [IORING_OP_URING_CMD] = { + .name = "URING_CMD", + .async_size = uring_cmd_pdu_size(1), + .prep_async = io_uring_cmd_prep_async, + }, + [IORING_OP_SEND_ZC] = { + .name = "SEND_ZC", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_send_prep_async, + .cleanup = io_send_zc_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_SENDMSG_ZC] = { + .name = "SENDMSG_ZC", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, -#else - .prep = io_eopnotsupp_prep, #endif }, }; @@ -536,7 +653,7 @@ const struct io_op_def io_op_defs[] = { const char *io_uring_get_opcode(u8 opcode) { if (opcode < IORING_OP_LAST) - return io_op_defs[opcode].name; + return io_cold_defs[opcode].name; return "INVALID"; } @@ -544,12 +661,13 @@ void __init io_uring_optable_init(void) { int i; - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST); + BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { - BUG_ON(!io_op_defs[i].prep); - if (io_op_defs[i].prep 
!= io_eopnotsupp_prep) - BUG_ON(!io_op_defs[i].issue); - WARN_ON_ONCE(!io_op_defs[i].name); + for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) { + BUG_ON(!io_issue_defs[i].prep); + if (io_issue_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_issue_defs[i].issue); + WARN_ON_ONCE(!io_cold_defs[i].name); } } diff --git a/io_uring/opdef.h b/io_uring/opdef.h index df7e13d9bfba..c22c8696e749 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -2,7 +2,7 @@ #ifndef IOU_OP_DEF_H #define IOU_OP_DEF_H -struct io_op_def { +struct io_issue_def { /* needs req->file assigned */ unsigned needs_file : 1; /* should block plug */ @@ -29,19 +29,24 @@ struct io_op_def { unsigned iopoll_queue : 1; /* opcode specific path will handle ->async_data allocation if needed */ unsigned manual_alloc : 1; + + int (*issue)(struct io_kiocb *, unsigned int); + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); +}; + +struct io_cold_def { /* size of async data needed, if any */ unsigned short async_size; const char *name; - int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); - int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; -extern const struct io_op_def io_op_defs[]; +extern const struct io_issue_def io_issue_defs[]; +extern const struct io_cold_def io_cold_defs[]; void io_uring_optable_init(void); #endif diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 67178e4bb282..a1b98c81a52d 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -31,6 +31,15 @@ struct io_close { u32 file_slot; }; +static bool io_openat_force_async(struct io_open *open) +{ + /* + * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, + * it'll always -EAGAIN + */ + return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE); +} + static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); 
@@ -61,6 +70,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; + if (io_openat_force_async(open)) + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -108,12 +119,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) nonblock_set = op.open_flag & O_NONBLOCK; resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; + WARN_ON_ONCE(io_openat_force_async(open)); op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } diff --git a/io_uring/poll.c b/io_uring/poll.c index 2ac1366adbd7..8339a92b4510 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -678,7 +678,7 @@ alloc_apoll: int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask = POLLPRI | POLLERR | EPOLLET; diff --git a/io_uring/rw.c b/io_uring/rw.c index 9c3ddd46a1ad..4c233910e200 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -391,7 +391,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, rw->len = sqe_len; } - ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); + ret = import_ubuf(ddir, buf, sqe_len, iter); if (ret) return ERR_PTR(ret); return NULL; @@ -410,7 +410,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, unsigned int issue_flags) { *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) + if (IS_ERR(*iovec)) return PTR_ERR(*iovec); iov_iter_save_state(&s->iter, &s->iter_state); @@ -450,7 +450,10 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) 
struct iovec iovec; ssize_t nr; - if (!iov_iter_is_bvec(iter)) { + if (iter_is_ubuf(iter)) { + iovec.iov_base = iter->ubuf + iter->iov_offset; + iovec.iov_len = iov_iter_count(iter); + } else if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { iovec.iov_base = u64_to_user_ptr(rw->addr); @@ -495,7 +498,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, io->free_iovec = iovec; io->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) + if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) return; if (!iovec) { unsigned iov_off = 0; @@ -516,7 +519,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_op_defs[req->opcode].prep_async) + if (!force && !io_cold_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; diff --git a/io_uring/splice.c b/io_uring/splice.c index 53e4232d0866..2a4bbb719531 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -34,6 +34,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -52,8 +53,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) struct file *in; long ret = 0; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (sp->flags & SPLICE_F_FD_IN_FIXED) in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); @@ -94,8 +94,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) struct file *in; long ret = 0; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (sp->flags & SPLICE_F_FD_IN_FIXED) in = io_file_get_fixed(req, sp->splice_fd_in, 
issue_flags); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 559652380672..0119d3f1a556 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -312,7 +312,7 @@ static int io_sq_thread(void *data) do_exit(0); } -int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { DEFINE_WAIT(wait); @@ -327,7 +327,6 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); - return 0; } __cold int io_sq_offload_create(struct io_ring_ctx *ctx, diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 0c3fbcd1f583..e1b8d508d22d 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -26,4 +26,4 @@ void io_sq_thread_stop(struct io_sq_data *sqd); void io_sq_thread_park(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); -int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); +void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); diff --git a/io_uring/statx.c b/io_uring/statx.c index d8fc933d3f59..abb874209caa 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -48,6 +48,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -56,8 +57,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags) struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); diff --git a/io_uring/sync.c b/io_uring/sync.c index 64e87ea2b8fb..255f68c37e55 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -32,6 +32,8 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); sync->flags = 
READ_ONCE(sqe->sync_range_flags); + req->flags |= REQ_F_FORCE_ASYNC; + return 0; } @@ -41,8 +43,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) int ret; /* sync_file_range always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); @@ -62,6 +63,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -72,8 +74,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags) int ret; /* fsync always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); @@ -91,6 +92,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->addr); sync->mode = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -100,8 +102,8 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) int ret; /* fallocate always requiring blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 6201a9f442c6..e1c810e0b85a 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -75,6 +75,7 @@ static int __io_getxattr_prep(struct io_kiocb *req, } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -109,8 +110,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) struct io_xattr 
*ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_getxattr(mnt_idmap(req->file->f_path.mnt), req->file->f_path.dentry, @@ -127,8 +127,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) struct path path; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); retry: ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); @@ -174,6 +173,7 @@ static int __io_setxattr_prep(struct io_kiocb *req, } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -222,8 +222,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) { int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = __io_setxattr(req, issue_flags, &req->file->f_path); io_xattr_finish(req, ret); @@ -237,8 +236,7 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) struct path path; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); retry: ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index d09aa1c1e3e6..0160e9f2b07c 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -608,7 +608,7 @@ out_unlock: return error; } -static int mqueue_create(struct user_namespace *mnt_userns, struct inode *dir, +static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return mqueue_create_attr(dentry, mode, NULL); @@ -887,7 +887,7 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro, if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return -EINVAL; acc = oflag2acc[oflag & O_ACCMODE]; - return inode_permission(&init_user_ns, d_inode(dentry), acc); + return inode_permission(&nop_mnt_idmap, 
d_inode(dentry), acc); } static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, @@ -979,7 +979,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(&init_user_ns, d_inode(dentry->d_parent), + err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 547c88be8a28..93d0b87f3283 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@ #include <uapi/linux/limits.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/openat2.h> // struct open_how +#include <uapi/linux/fanotify.h> #include "audit.h" @@ -2252,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, if (!dentry) return 0; - rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps); + rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps); if (rc) return rc; @@ -2807,7 +2808,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - get_vfs_caps_from_disk(&init_user_ns, + get_vfs_caps_from_disk(&nop_mnt_idmap, bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; @@ -2877,10 +2878,21 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } -void __audit_fanotify(unsigned int response) +void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - audit_log(audit_context(), GFP_KERNEL, - AUDIT_FANOTIFY, "resp=%u", response); + /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ + switch (friar->hdr.type) { + case FAN_RESPONSE_INFO_NONE: + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, + "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", + response, FAN_RESPONSE_INFO_NONE); + break; + case FAN_RESPONSE_INFO_AUDIT_RULE: + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, + "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", + 
response, friar->hdr.type, friar->rule_number, + friar->subj_trust, friar->obj_trust); + } } void __audit_tk_injoffset(struct timespec64 offset) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index a4a41ee3e80b..e14c822f8911 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks) */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) #ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7dd8af06413..b7017cae6fd1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7782,9 +7782,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); - return 0; end: - btf_free_dtor_kfunc_tab(btf); + if (ret) + btf_free_dtor_kfunc_tab(btf); btf_put(btf); return ret; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4f841e16779e..9948b542a470 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, inode->i_mtime = inode->i_atime; inode->i_ctime = inode->i_atime; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return inode; } @@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, dir->i_ctime = dir->i_mtime; } -static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) return simple_lookup(dir, dentry, flags); } -static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_symlink(struct 
mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); @@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(&init_user_ns, inode, MAY_READ); + int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (ret) return ERR_PTR(ret); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index ebcc3dd0fa19..1db156405b68 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size) if (size <= 192) return size_index[(size - 1) / 8] - 1; - return fls(size - 1) - 1; + return fls(size - 1) - 2; } #define NUM_CACHES 11 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbef0b0967ae..7ee218827259 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3243,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +/* Copy src state preserving dst->parent and dst->live fields */ +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) +{ + struct bpf_reg_state *parent = dst->parent; + enum bpf_reg_liveness live = dst->live; + + *dst = *src; + dst->parent = parent; + dst->live = live; +} + static void save_register_state(struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { int i; - state->stack[spi].spilled_ptr = *reg; + copy_register_state(&state->stack[spi].spilled_ptr, reg); if (size == BPF_REG_SIZE) state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; @@ -3577,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } 
else { for (i = 0; i < size; i++) { @@ -3598,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -9592,7 +9603,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - *dst_reg = *ptr_reg; + copy_register_state(dst_reg, ptr_reg); } ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); @@ -10845,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * to propagate min/max range. */ src_reg->id = ++env->id_gen; - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { @@ -10856,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared otherwise * dst_reg min/max could be incorrectly * propagated into src_reg by find_equal_scalars() @@ -11655,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; + copy_register_state(reg, known_reg); })); } diff --git a/kernel/capability.c b/kernel/capability.c index 860fd22117c1..339a44dfe2f4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -486,11 +486,11 @@ EXPORT_SYMBOL(file_ns_capable); * Return true if the inode uid and gid are within the namespace. 
*/ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct inode *inode) { - return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) && - vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode)); + return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && + vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** @@ -502,13 +502,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ -bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, +bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && - privileged_wrt_inode_uidgid(ns, mnt_userns, inode); + privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c099cf3fa02d..935e8121b21e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) if (!inode) return -ENOMEM; - ret = inode_permission(&init_user_ns, inode, MAY_WRITE); + ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a29c0b13706b..ca826bd1eba3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void) /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. 
As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ -static void update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs) if (top_cs && (task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) continue; - set_cpus_allowed_ptr(task, cs->effective_cpus); + + cpumask_and(new_cpus, cs->effective_cpus, + task_cpu_possible_mask(task)); + set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); } @@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ - if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) && + if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) && partition_is_populated(parent, cs)) return PERR_NOCPUS; @@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); if (adding || deleting) - update_tasks_cpumask(parent); + update_tasks_cpumask(parent, tmp->new_cpus); /* * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. 
@@ -1661,7 +1665,7 @@ update_parent_subparts: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, tmp->new_cpus); /* * On legacy hierarchy, if the effective cpumask of any non- @@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) } } - update_tasks_cpumask(parent); + update_tasks_cpumask(parent, tmpmask.new_cpus); if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmpmask); @@ -2324,6 +2328,7 @@ out: new_prs = -new_prs; spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; + WRITE_ONCE(cs->prs_err, err); spin_unlock_irq(&callback_lock); /* * Update child cpusets, if present. @@ -3347,7 +3352,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs); + update_tasks_cpumask(cs, new_cpus); if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); @@ -3384,7 +3389,7 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs); + update_tasks_cpumask(cs, new_cpus); if (mems_updated) update_tasks_nodemask(cs); } @@ -3691,15 +3696,38 @@ void __init cpuset_init_smp(void) * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_mask, even if this means going outside the - * tasks cpuset. + * tasks cpuset, except when the task is in the top cpuset. 
**/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; + struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - guarantee_online_cpus(tsk, pmask); + rcu_read_lock(); + + cs = task_cs(tsk); + if (cs != &top_cpuset) + guarantee_online_cpus(tsk, pmask); + /* + * Tasks in the top cpuset won't get update to their cpumasks + * when a hotplug online/offline event happens. So we include all + * offline cpus in the allowed cpu list. + */ + if ((cs == &top_cpuset) || cpumask_empty(pmask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + + /* + * We first exclude cpus allocated to partitions. If there is no + * allowable online cpu left, we fall back to all possible cpus. + */ + cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus); + if (!cpumask_intersects(pmask, cpu_online_mask)) + cpumask_copy(pmask, possible_mask); + } + + rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..c4be13e50547 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; - + raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; - raw_spin_lock_irq(&ctx->lock); list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; - raw_spin_unlock_irq(&ctx->lock); } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } - + raw_spin_unlock_irq(&ctx->lock); return epc; } @@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head) static void put_pmu_ctx(struct perf_event_pmu_context *epc) { + struct perf_event_context *ctx = epc->ctx; unsigned long flags; - if (!atomic_dec_and_test(&epc->refcount)) + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in 
_free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; - if (epc->ctx) { - struct perf_event_context *ctx = epc->ctx; + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - /* - * XXX - * - * lockdep_assert_held(&ctx->mutex); - * - * can't because of the call-site in _free_event()/put_event() - * which isn't always called under ctx->mutex. - */ - - WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_del_init(&epc->pmu_ctx_entry); - epc->ctx = NULL; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + if (epc->embedded) return; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8fe1da9614ee..798a9042421f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -114,7 +114,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; - if (WARN_ON(!is_fwnode_irqchip(fwnode))) + if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); @@ -1915,7 +1915,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - debugfs_remove(debugfs_lookup(d->name, domain_dir)); + debugfs_lookup_and_remove(d->name, domain_dir); } void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 955267bbc2be..783a3e6a0b10 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1000,7 +1000,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, fail: msi_unlock_descs(dev); free_fwnode: - kfree(fwnode); + irq_domain_free_fwnode(fwnode); free_bundle: kfree(bundle); return false; @@ -1013,6 
+1013,7 @@ free_bundle: */ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) { + struct fwnode_handle *fwnode = NULL; struct msi_domain_info *info; struct irq_domain *domain; @@ -1025,7 +1026,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; + if (irq_domain_is_msi_device(domain)) + fwnode = domain->fwnode; irq_domain_remove(domain); + irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); unlock: diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 010cf4e6d0b8..728f434de2bb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * then we need to wake the new top waiter up to try * to get the lock. */ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_state(waiter->task, waiter->wake_state); + top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != top_waiter) + wake_up_state(top_waiter->task, top_waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } diff --git a/kernel/module/main.c b/kernel/module/main.c index 48568a0f5651..4ac3fe43e6c8 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2393,7 +2393,8 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE; + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; @@ -2569,20 +2570,35 @@ static int add_unformed_module(struct module *mod) mod->state = MODULE_STATE_UNFORMED; -again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state != MODULE_STATE_LIVE) { + if (old->state == 
MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, finished_loading(mod->name)); if (err) goto out_unlocked; - goto again; + + /* The module might have gone in the meantime. */ + mutex_lock(&module_mutex); + old = find_module_all(mod->name, strlen(mod->name), + true); } - err = -EEXIST; + + /* + * We are here only when the same module was being loaded. Do + * not try to load it again right now. It prevents long delays + * caused by serialized module load failures. It might happen + * when more devices of the same type trigger load of + * a particular module. + */ + if (old && old->state == MODULE_STATE_LIVE) + err = -EEXIST; + else + err = -EBUSY; goto out; } mod_update_bounds(mod); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bb1ee6d7bdde..2a4918a1faa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2951,8 +2951,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, } if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { - if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) + if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) { + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); goto out; + } if (WARN_ON_ONCE(p == current && is_migration_disabled(p) && @@ -8290,12 +8293,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_put_task; + /* + * With non-SMP configs, user_cpus_ptr/user_mask isn't used and + * alloc_user_cpus_ptr() returns NULL. 
+ */ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); - if (IS_ENABLED(CONFIG_SMP) && !user_mask) { + if (user_mask) { + cpumask_copy(user_mask, in_mask); + } else if (IS_ENABLED(CONFIG_SMP)) { retval = -ENOMEM; goto out_put_task; } - cpumask_copy(user_mask, in_mask); + ac = (struct affinity_context){ .new_mask = in_mask, .user_mask = user_mask, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c36aa54ae071..0f8736991427 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7229,10 +7229,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) eenv_task_busy_time(&eenv, p, prev_cpu); for (; pd; pd = pd->next) { + unsigned long util_min = p_util_min, util_max = p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap = 0; unsigned long rq_util_min, rq_util_max; - unsigned long util_min, util_max; unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; @@ -7251,6 +7251,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { + struct rq *rq = cpu_rq(cpu); + eenv.pd_cap += cpu_thermal_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -7269,24 +7271,19 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * much capacity we can get out of the CPU; this is * aligned with sched_cpu_util(). */ - if (uclamp_is_used()) { - if (uclamp_rq_is_idle(cpu_rq(cpu))) { - util_min = p_util_min; - util_max = p_util_max; - } else { - /* - * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation - * only. util_fits_cpu() logic requires to - * operate on non clamped util but must use the - * max-aggregated uclamp_{min, max}. 
- */ - rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - - util_min = max(rq_util_min, p_util_min); - util_max = max(rq_util_max, p_util_max); - } + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. Ie: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); } if (!util_fits_cpu(util, util_min, util_max, cpu)) continue; @@ -8871,16 +8868,23 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) * * Thermal pressure will impact all cpus in this perf domain * equally. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_energy_enabled()) { unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); - struct perf_domain *pd = rcu_dereference(rq->rd->pd); + struct perf_domain *pd; + rcu_read_lock(); + + pd = rcu_dereference(rq->rd->pd); rq->cpu_capacity_inverted = 0; for (; pd; pd = pd->next) { struct cpumask *pd_span = perf_domain_span(pd); unsigned long pd_cap_orig, pd_cap; + /* We can't be inverted against our own pd */ + if (cpumask_test_cpu(cpu_of(rq), pd_span)) + continue; + cpu = cpumask_any(pd_span); pd_cap_orig = arch_scale_cpu_capacity(cpu); @@ -8905,6 +8909,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) break; } } + + rcu_read_unlock(); } trace_sched_cpu_capacity_tp(rq); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8ac8b81bfee6..02e011cabe91 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t) group = t->group; /* - * Wakeup waiters to stop polling. 
Can happen if cgroup is deleted - * from under a polling process. + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. */ - wake_up_interruptible(&t->event_wait); + wake_up_pollfree(&t->event_wait); mutex_lock(&group->trigger_lock); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5897828b9d7e..7e5dff602585 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward); -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) { struct alarm_base *base = &alarm_bases[alarm->type]; + ktime_t now = base->get_ktime(); + + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { + /* + * Same issue as with posix_timer_fn(). Timers which are + * periodic but the signal is ignored can starve the system + * with a very small interval. The real fix which was + * promised in the context of posix_timer_fn() never + * materialized, but someone should really work on it. + * + * To prevent DOS fake @now to be 1 jiffie out which keeps + * the overrun accounting correct but creates an + * inconsistency vs. timer_gettime(2). + */ + ktime_t kj = NSEC_PER_SEC / HZ; + + if (interval < kj) + now = ktime_add(now, kj); + } + + return alarm_forward(alarm, now, interval); +} - return alarm_forward(alarm, base->get_ktime(), interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + return __alarm_forward_now(alarm, interval, false); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (posix_timer_event(ptr, si_private) && ptr->it_interval) { /* * Handle ignored signals and rearm the timer. 
This will go - * away once we handle ignored signals proper. + * away once we handle ignored signals proper. Ensure that + * small intervals cannot starve the system. */ - ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); ++ptr->it_requeue_pending; ptr->it_active = 1; result = ALARMTIMER_RESTART; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 197545241ab8..d7043043f59c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -933,8 +933,8 @@ config RING_BUFFER_RECORD_RECURSION default y help The ring buffer has its own internal recursion. Although when - recursion happens it wont cause harm because of the protection, - but it does cause an unwanted overhead. Enabling this option will + recursion happens it won't cause harm because of the protection, + but it does cause unwanted overhead. Enabling this option will place where recursion was detected into the ftrace "recursed_functions" file. @@ -1017,8 +1017,8 @@ config RING_BUFFER_STARTUP_TEST The test runs for 10 seconds. This will slow your boot time by at least 10 more seconds. - At the end of the test, statics and more checks are done. - It will output the stats of each per cpu buffer. What + At the end of the test, statistics and more checks are done. + It will output the stats of each per cpu buffer: What was written, the sizes, what was read, what was lost, and other similar details. 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f47274de012b..c09792c551bf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -833,6 +833,7 @@ static void do_bpf_send_signal(struct irq_work *entry) work = container_of(entry, struct send_signal_irq_work, irq_work); group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type) @@ -867,7 +868,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. */ - work->task = current; + work->task = get_task_struct(current); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 442438b93fe9..750aa3f08b25 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1248,12 +1248,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu(&hash->rcu, __free_ftrace_hash_rcu); } +/** + * ftrace_free_filter - remove all filters for an ftrace_ops + * @ops - the ops to remove the filters from + */ void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); } +EXPORT_SYMBOL_GPL(ftrace_free_filter); static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { @@ -5839,6 +5844,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi); * * Filters denote which functions should be enabled when tracing is enabled * If @ip is NULL, it fails to update filter. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). 
*/ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) @@ -5858,7 +5867,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); * * Filters denote which functions should be enabled when tracing is enabled * If @ips array or any ip specified within is NULL , it fails to update filter. - */ + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). +*/ int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, int remove, int reset) { @@ -5900,6 +5913,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). */ int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) @@ -5919,6 +5936,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). 
*/ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 6c97cc2d754a..7e9061828c24 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -516,7 +516,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user struct rv_monitor_def *mdef; int retval = -EINVAL; bool enable = true; - char *ptr = buff; + char *ptr; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a555a861b978..c9e40f692650 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9148,9 +9148,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf, if (val > 100) return -EINVAL; - if (!val) - val = 1; - tr->buffer_percent = val; (*ppos)++; @@ -10295,6 +10292,8 @@ void __init early_trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); + + init_events(); } void __init trace_init(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e46a49269be2..085a31b978a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1282,6 +1282,7 @@ struct ftrace_event_field { int offset; int size; int is_signed; + int len; }; struct prog_entry; @@ -1490,6 +1491,7 @@ extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); +extern int init_events(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); extern void __trace_early_add_events(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 33e0b4f8ebe6..6a942fa275c7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int 
__trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type) + int is_signed, int filter_type, int len) { struct ftrace_event_field *field; @@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->len = len; list_add(&field->link, head); @@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type); + is_signed, filter_type, 0); } EXPORT_SYMBOL_GPL(trace_define_field); +static int trace_define_field_ext(struct trace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type, int len) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type, len); +} + #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type); \ + filter_type, 0); \ if (ret) \ return ret; @@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), FILTER_OTHER, 0); \ if (ret) \ return ret; @@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); - else - seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + else if (field->len) + seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, - 
array_descriptor, field->offset, + field->len, field->offset, field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + field->offset, field->size, !!field->is_signed); return 0; } @@ -2379,9 +2399,10 @@ event_define_fields(struct trace_event_call *call) } offset = ALIGN(offset, field->align); - ret = trace_define_field(call, field->type, field->name, + ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, - field->is_signed, field->filter_type); + field->is_signed, field->filter_type, + field->len); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 96acc2b71ac7..e095c3b3a50d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -128,7 +128,7 @@ static bool is_not(const char *str) } /** - * prog_entry - a singe entry in the filter program + * struct prog_entry - a singe entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. @@ -140,16 +140,16 @@ struct prog_entry { }; /** - * update_preds- assign a program entry a label target + * update_preds - assign a program entry a label target * @prog: The program array * @N: The index of the current entry in @prog - * @when_to_branch: What to assign a program entry for its branch condition + * @invert: What to assign a program entry for its branch condition * * The program entry at @N has a target that points to the index of a program * entry that can have its target and when_to_branch fields updated. * Update the current program entry denoted by index @N target field to be * that of the updated entry. 
This will denote the entry to update if - * we are processing an "||" after an "&&" + * we are processing an "||" after an "&&". */ static void update_preds(struct prog_entry *prog, int N, int invert) { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcaf226b7744..5edbf6b1da3f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1988,6 +1988,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 : HIST_FIELD_FN_BUCKET; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); + if (!hist_field->operands[0]) + goto free; hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); if (!hist_field->type) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d960f6b11b5e..58f3946081e2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ .size = sizeof(_type[_len]), .align = __alignof__(_type), \ - is_signed_type(_type), .filter_type = FILTER_OTHER }, + is_signed_type(_type), .filter_type = FILTER_OTHER, \ + .len = _len }, #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 94c1b5eb1dc0..210e1f168392 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -147,9 +147,8 @@ static void osnoise_unregister_instance(struct trace_array *tr) * register/unregister serialization is provided by trace's * trace_types_lock. 
*/ - lockdep_assert_held(&trace_types_lock); - - list_for_each_entry_rcu(inst, &osnoise_instances, list) { + list_for_each_entry_rcu(inst, &osnoise_instances, list, + lockdep_is_held(&trace_types_lock)) { if (inst->tr == tr) { list_del_rcu(&inst->list); found = 1; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57a13b61f186..bd475a00f96d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1535,7 +1535,7 @@ static struct trace_event *events[] __initdata = { NULL }; -__init static int init_events(void) +__init int init_events(void) { struct trace_event *event; int i, ret; @@ -1548,4 +1548,3 @@ __init static int init_events(void) return 0; } -early_initcall(init_events); diff --git a/kernel/umh.c b/kernel/umh.c index 850631518665..fbf872c624cb 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; - if (wait & UMH_KILLABLE) - state |= TASK_KILLABLE; - if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; - retval = wait_for_completion_state(&done, state); - if (!retval) - goto wait_done; - if (wait & UMH_KILLABLE) { + retval = wait_for_completion_state(&done, state | TASK_KILLABLE); + if (!retval) + goto wait_done; + /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; + + /* + * fallthrough; in case of -ERESTARTSYS now do uninterruptible + * wait_for_completion_state(). Since umh_complete() shall call + * complete() in a moment if xchg() above returned NULL, this + * uninterruptible wait_for_completion_state() will not block + * SIGKILL'ed processes for long. 
+ */ } + wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 881c3f84e88a..02ee440f7be3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK select KALLSYMS select CRC32 select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF help Say Y here if you want to enable the memory leak detector. The memory allocation/freeing is traced in a way @@ -1207,7 +1208,7 @@ config SCHED_DEBUG depends on DEBUG_KERNEL && PROC_FS default y help - If you say Y here, the /proc/sched_debug file will be provided + If you say Y here, the /sys/kernel/debug/sched file will be provided that can help debug the scheduler. The runtime overhead of this option is minimal. @@ -1917,7 +1918,7 @@ config FUNCTION_ERROR_INJECTION help Add fault injections into various functions that are annotated with ALLOW_ERROR_INJECTION() in the kernel. BPF may also modify the return - value of theses functions. This is useful to test error paths of code. + value of these functions. This is useful to test error paths of code. If unsure, say N @@ -2566,6 +2567,15 @@ config MEMCPY_KUNIT_TEST If unsure, say N. +config MEMCPY_SLOW_KUNIT_TEST + bool "Include exhaustive memcpy tests" + depends on MEMCPY_KUNIT_TEST + default y + help + Some memcpy tests are quite exhaustive in checking for overlaps + and bit ranges. These can be very slow, so they are split out + as a separate config, in case they need to be disabled. + config IS_SIGNED_TYPE_KUNIT_TEST tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 375575a5a0e3..4dedd61e5192 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -194,7 +194,7 @@ config KCSAN_WEAK_MEMORY Enable support for modeling a subset of weak memory, which allows detecting a subset of data races due to missing memory barriers. 
- Depends on KCSAN_STRICT, because the options strenghtening certain + Depends on KCSAN_STRICT, because the options strengthening certain plain accesses by default (depending on !KCSAN_STRICT) reduce the ability to detect any data races invoving reordered accesses, in particular reordered writes. diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 9555b68bb774..1dcca8f2e194 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -49,3 +49,34 @@ int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); + +int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock); + +int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock_irqsave(lock, *flags); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock_irqrestore(lock, *flags); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f9a3ff37ecd1..d9b3332c8405 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1877,6 +1877,17 @@ int import_single_range(int rw, void __user *buf, size_t len, } EXPORT_SYMBOL(import_single_range); +int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(buf, len))) + return -EFAULT; + + iov_iter_ubuf(i, rw, buf, len); + return 0; +} + /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. @@ -1891,8 +1902,8 @@ EXPORT_SYMBOL(import_single_range); */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { - if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) && - !iov_iter_is_kvec(i) && !iter_is_ubuf(i)) + if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && + !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; diff --git a/lib/kunit/assert.c b/lib/kunit/assert.c index f5b50babe38d..05a09652f5a1 100644 --- a/lib/kunit/assert.c +++ b/lib/kunit/assert.c @@ -241,24 +241,34 @@ void kunit_mem_assert_format(const struct kunit_assert *assert, mem_assert = container_of(assert, struct kunit_mem_assert, assert); - string_stream_add(stream, - KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n", - mem_assert->text->left_text, - mem_assert->text->operation, - mem_assert->text->right_text); + if (!mem_assert->left_value) { + string_stream_add(stream, + KUNIT_SUBTEST_INDENT "Expected %s is not null, but is\n", + mem_assert->text->left_text); + } else if (!mem_assert->right_value) { + 
string_stream_add(stream, + KUNIT_SUBTEST_INDENT "Expected %s is not null, but is\n", + mem_assert->text->right_text); + } else { + string_stream_add(stream, + KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n", + mem_assert->text->left_text, + mem_assert->text->operation, + mem_assert->text->right_text); - string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", - mem_assert->text->left_text); - kunit_assert_hexdump(stream, mem_assert->left_value, - mem_assert->right_value, mem_assert->size); + string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", + mem_assert->text->left_text); + kunit_assert_hexdump(stream, mem_assert->left_value, + mem_assert->right_value, mem_assert->size); - string_stream_add(stream, "\n"); + string_stream_add(stream, "\n"); - string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", - mem_assert->text->right_text); - kunit_assert_hexdump(stream, mem_assert->right_value, - mem_assert->left_value, mem_assert->size); + string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", + mem_assert->text->right_text); + kunit_assert_hexdump(stream, mem_assert->right_value, + mem_assert->left_value, mem_assert->size); - kunit_assert_print_msg(message, stream); + kunit_assert_print_msg(message, stream); + } } EXPORT_SYMBOL_GPL(kunit_mem_assert_format); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c9ebf975e56b..890ba5b3a981 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -21,6 +21,7 @@ #include "try-catch-impl.h" DEFINE_STATIC_KEY_FALSE(kunit_running); +EXPORT_SYMBOL_GPL(kunit_running); #if IS_BUILTIN(CONFIG_KUNIT) /* diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 26e2045d3cda..5a976393c9ae 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -670,12 +670,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn, unsigned char piv) { struct maple_node *node = mte_to_node(mn); + enum maple_type type = mte_node_type(mn); - if (piv >= mt_pivots[piv]) { + if (piv >= mt_pivots[type]) { WARN_ON(1); return 
0; } - switch (mte_node_type(mn)) { + switch (type) { case maple_arange_64: return node->ma64.pivot[piv]; case maple_range_64: @@ -4887,7 +4888,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; - unsigned long max, min, index; + unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) @@ -4909,8 +4910,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); - index = mas->index; - while (index <= max) { + while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; @@ -4941,10 +4941,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, offset); } - if (unlikely(index > max)) { - mas_set_err(mas, -EBUSY); - return false; - } + if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) + goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; @@ -4961,9 +4959,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) return false; ascend: - if (mte_is_root(mas->node)) - mas_set_err(mas, -EBUSY); + if (!mte_is_root(mas->node)) + return false; +no_space: + mas_set_err(mas, -EBUSY); return false; } diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c index 89128551448d..887926f04731 100644 --- a/lib/memcpy_kunit.c +++ b/lib/memcpy_kunit.c @@ -309,6 +309,8 @@ static void set_random_nonzero(struct kunit *test, u8 *byte) static void init_large(struct kunit *test) { + if (!IS_ENABLED(CONFIG_MEMCPY_SLOW_KUNIT_TEST)) + kunit_skip(test, "Slow test skipped. Enable with CONFIG_MEMCPY_SLOW_KUNIT_TEST=y"); /* Get many bit patterns. 
*/ get_random_bytes(large_src, ARRAY_SIZE(large_src)); diff --git a/lib/nlattr.c b/lib/nlattr.c index 9055e8b4d144..489e15bde5c1 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> +#include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> @@ -381,6 +382,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (type <= 0 || type > maxtype) return 0; + type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); @@ -596,6 +598,7 @@ static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, } continue; } + type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); diff --git a/lib/parser.c b/lib/parser.c index bcb23484100e..2b5e2b480253 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -11,6 +11,15 @@ #include <linux/slab.h> #include <linux/string.h> +/* + * max size needed by different bases to express U64 + * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 + * DEC: "18446744073709551615" --> 20 + * OCT: "01777777777777777777777" --> 23 + * pick the max one to define NUMBER_BUF_LEN + */ +#define NUMBER_BUF_LEN 24 + /** * match_one - Determines if a string matches a simple pattern * @s: the string to examine for presence of the pattern @@ -129,14 +138,12 @@ EXPORT_SYMBOL(match_token); static int match_number(substring_t *s, int *result, int base) { char *endp; - char *buf; + char buf[NUMBER_BUF_LEN]; int ret; long val; - buf = match_strdup(s); - if (!buf) - return -ENOMEM; - + if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) + return -ERANGE; ret = 0; val = simple_strtol(buf, &endp, base); if (endp == buf) @@ -145,7 +152,6 @@ static int match_number(substring_t *s, int *result, int base) ret = -ERANGE; else *result = (int) val; - kfree(buf); return ret; } @@ -163,18 +169,15 @@ static int 
match_number(substring_t *s, int *result, int base) */ static int match_u64int(substring_t *s, u64 *result, int base) { - char *buf; + char buf[NUMBER_BUF_LEN]; int ret; u64 val; - buf = match_strdup(s); - if (!buf) - return -ENOMEM; - + if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) + return -ERANGE; ret = kstrtoull(buf, base, &val); if (!ret) *result = val; - kfree(buf); return ret; } @@ -206,14 +209,12 @@ EXPORT_SYMBOL(match_int); */ int match_uint(substring_t *s, unsigned int *result) { - int err = -ENOMEM; - char *buf = match_strdup(s); + char buf[NUMBER_BUF_LEN]; - if (buf) { - err = kstrtouint(buf, 10, result); - kfree(buf); - } - return err; + if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) + return -ERANGE; + + return kstrtouint(buf, 10, result); } EXPORT_SYMBOL(match_uint); diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 497fc93ccf9e..ec847bf4dcb4 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt) mt_set_non_kernel(0); } +static noinline void check_empty_area_window(struct maple_tree *mt) +{ + unsigned long i, nr_entries = 20; + MA_STATE(mas, mt, 0, 0); + + for (i = 1; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 9, + xa_mk_value(i), GFP_KERNEL); + + /* Create another hole besides the one at 0 */ + mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL); + + /* Check lower bounds that don't fit */ + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY); + + /* Check lower bound that does fit */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0); + MT_BUG_ON(mt, mas.index != 5); + MT_BUG_ON(mt, mas.last != 9); + rcu_read_unlock(); + + /* Check one gap that doesn't fit and one that does */ + rcu_read_lock(); + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 
217, 9) != 0); + MT_BUG_ON(mt, mas.index != 161); + MT_BUG_ON(mt, mas.last != 169); + + /* Check one gap that does fit above the min */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0); + MT_BUG_ON(mt, mas.index != 216); + MT_BUG_ON(mt, mas.last != 218); + + /* Check size that doesn't fit any gap */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY); + + /* + * Check size that doesn't fit the lower end of the window but + * does fit the gap + */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY); + + /* + * Check size that doesn't fit the upper end of the window but + * does fit the gap + */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY); + + /* Check mas_empty_area forward */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 8); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 3); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY); + + mas_reset(&mas); + mas_empty_area(&mas, 100, 165, 3); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY); + rcu_read_unlock(); +} + static DEFINE_MTREE(tree); static int maple_tree_seed(void) { @@ -2765,6 +2850,10 @@ static int maple_tree_seed(void) check_bnode_min_spanning(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_empty_area_window(&tree); + mtree_destroy(&tree); + #if defined(BENCH) skip: #endif diff --git a/mm/compaction.c b/mm/compaction.c index ca1603524bbe..8238e83385a7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1839,6 +1839,7 @@ static unsigned long 
fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; + set_pageblock_skip(freepage); break; } } diff --git a/mm/filemap.c b/mm/filemap.c index c4d4ace9cc70..0e20a8d6dd93 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2588,18 +2588,19 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio; int err = 0; + /* "last_index" is the index of the page beyond the end of the read */ last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, fbatch); + filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - filemap_get_read_batch(mapping, index, last_index, fbatch); + filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) @@ -1914,7 +1914,7 @@ static unsigned long collect_longterm_unpinnable_pages( drain_allow = false; } - if (!folio_isolate_lru(folio)) + if (folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index abe6cfd92ffa..1b791b26d72d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3272,8 +3272,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) - pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); if (!is_migration_entry_young(entry)) @@ -3281,6 +3279,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* NOTE: 
this may contain setting soft-dirty on some archs */ if (PageDirty(new) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); + if (is_writable_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + else + pmde = pmd_wrprotect(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7fcdb98c9e68..bdbfeb6fb393 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5051,6 +5051,9 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { + /* No swap on hugetlb */ + WARN_ON_ONCE( + is_swapin_error_entry(pte_to_swp_entry(entry))); /* * We copy the pte marker only if the dst vma has * uffd-wp enabled. diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 833bf2cfd2a3..21e66d7f261d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -246,6 +246,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { + if (!kasan_arch_is_ready()) + return false; + if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b076f597a378..cb762982c8ba 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -191,7 +191,12 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, bool kasan_byte_accessible(const void *addr) { - s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); + s8 shadow_byte; + + if (!kasan_arch_is_ready()) + return true; + + shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 2fba1f51f042..15cfb34d16a1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -291,6 +291,9 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) unsigned long 
shadow_start, shadow_end; int ret; + if (!kasan_arch_is_ready()) + return 0; + if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -459,6 +462,9 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; + if (!kasan_arch_is_ready()) + return; + region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -502,6 +508,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ + if (!kasan_arch_is_ready()) + return (void *)start; + if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -524,6 +533,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { + if (!kasan_arch_is_ready()) + return; + if (!is_vmalloc_or_module_addr(start)) return; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79be13133322..a26a28e3738c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } +/* + * See pmd_trans_unstable() for how the result may change out from + * underneath us, even if we hold mmap_lock in read. + */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) @@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, #endif if (pmd_none(pmde)) return SCAN_PMD_NONE; + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; + if (pmd_devmap(pmde)) + return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, * has higher cost too. It would also probably require locking * the anon_vma. 
*/ - if (vma->anon_vma) { + if (READ_ONCE(vma->anon_vma)) { result = SCAN_PAGE_ANON; goto next; } @@ -1671,6 +1679,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, if ((cc->is_khugepaged || is_target) && mmap_write_trylock(mm)) { /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no + * ->anon_vma exists or the anon_vma is locked. + * We already checked ->anon_vma above, but that check + * is racy because ->anon_vma can be populated under the + * mmap lock in read mode. + */ + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto unlock_next; + } + /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte * markers installed. Skip it only, so the rest mm/vma @@ -2591,6 +2611,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_CGROUP_CHARGE_FAIL: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 92f670edbf51..55dc8b8b0616 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") == 0) + else if (strcmp(str, "on") == 0) { kmemleak_skip_disable = 1; + stack_depot_want_early_init(); + } else return -EINVAL; return 0; @@ -2093,7 +2095,6 @@ void __init kmemleak_init(void) if (kmemleak_error) return; - stack_depot_init(); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -2629,8 +2629,11 @@ struct page *ksm_might_need_to_copy(struct page *page, new_page = NULL; } if (new_page) { - copy_user_highpage(new_page, page, address, vma); - + if (copy_mc_user_highpage(new_page, page, address, vma)) { + put_page(new_page); + 
memory_failure_queue(page_to_pfn(page), 0); + return ERR_PTR(-EHWPOISON); + } SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); diff --git a/mm/madvise.c b/mm/madvise.c index b6ea204d4e23..18c2e2affac4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -329,7 +329,7 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ - return inode_owner_or_capable(&init_user_ns, + return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 685e30e6d27c..d036c7861310 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1640,13 +1640,7 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - /* - * Reserved pages are always initialized by the end of - * memblock_free_all() (by memmap_init() and, if deferred - * initialization is enabled, memmap_init_reserved_pages()), so - * these pages can be released directly to the buddy allocator. 
- */ - __free_pages_core(pfn_to_page(cursor), 0); + memblock_free_pages(pfn_to_page(cursor), cursor, 0); totalram_pages_inc(); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..73afff8062f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,7 +63,6 @@ #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> -#include <linux/parser.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP, - NULL); + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2685,8 +2683,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options, - NULL); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, - NULL)) { + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP, - NULL)) + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_NODES = 0, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t if_tokens = { - { MEMORY_RECLAIM_NODES, "nodes=%s" }, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | - MEMCG_RECLAIM_PROACTIVE; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - int token; - char value[256]; - nodemask_t nodemask = NODE_MASK_ALL; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; + unsigned int reclaim_options; + int err; buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - token = match_token(start, if_tokens, 
args); - match_strlcpy(value, args, sizeof(value)); - switch (token) { - case MEMORY_RECLAIM_NODES: - if (nodelist_parse(value, nodemask) < 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - } - + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options, - &nodemask); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/memory.c b/mm/memory.c index aad226daf41b..f526b9152bef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { - /* - * We're copying the pgtable should only because dst_vma has - * uffd-wp enabled, do sanity check. - */ - WARN_ON_ONCE(!userfaultfd_wp(dst_vma)); - set_pte_at(dst_mm, addr, dst_pte, pte); + if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma)) + set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } if (!userfaultfd_wp(dst_vma)) @@ -3629,8 +3625,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. + * + * This should also cover the case where e.g. the pte changed + * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. + * So is_pte_marker() check is not enough to safely drop the pte. 
*/ - if (is_pte_marker(*vmf->pte)) + if (pte_same(vmf->orig_pte, *vmf->pte)) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3840,6 +3840,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!page)) { ret = VM_FAULT_OOM; goto out_page; + } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) { + ret = VM_FAULT_HWPOISON; + goto out_page; } folio = page_folio(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..f940395667c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + !hugetlb_pmd_shared(pte))) { if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* diff --git a/mm/migrate.c b/mm/migrate.c index a4d3fc65085f..cc5455614e01 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -224,6 +224,8 @@ static bool remove_migration_pte(struct folio *folio, pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); + else + pte = pte_wrprotect(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; diff --git a/mm/mincore.c b/mm/mincore.c index a085a2aeabd8..cd69b9db0081 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -168,7 +168,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma) * for writing; otherwise we'd be including shared non-exclusive * mappings, which opens a side channel. 
*/ - return inode_owner_or_capable(&init_user_ns, + return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 908df12caa26..61cf60015a8b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -245,7 +245,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (is_pte_marker_entry(entry)) { + /* + * Ignore swapin errors unconditionally, + * because any access should sigbus anyway. + */ + if (is_swapin_error_entry(entry)) + continue; /* * If this is uffd-wp pte marker and we'd like * to unprotect it, drop it; the next page diff --git a/mm/mremap.c b/mm/mremap.c index fe587c5d6591..930f65c315c0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we are adding to - * the already existing vma, vma_merge() will merge this extension with - * the already existing vma (expand operation itself) and possibly also - * with the next vma if it becomes adjacent to the expanded vma and - * otherwise compatible. + * Function vma_merge() is called on the extension we + * are adding to the already existing vma, vma_merge() + * will merge this extension with the already existing + * vma (expand operation itself) and possibly also with + * the next vma if it becomes adjacent to the expanded + * vma and otherwise compatible. + * + * However, vma_merge() can currently fail due to + * is_mergeable_vma() check for vm_ops->close (see the + * comment there). Yet this should not prevent vma + * expanding, so perform a simple expand for such vma. + * Ideally the check for close op should be only done + * when a vma would be actually removed due to a merge. 
*/ - vma = vma_merge(mm, vma, extension_start, extension_end, + if (!vma->vm_ops || !vma->vm_ops->close) { + vma = vma_merge(mm, vma, extension_start, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + } else if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + vma = NULL; + } if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0745aedebb37..3bb3484563ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5631,9 +5631,12 @@ EXPORT_SYMBOL(get_zeroed_page); */ void __free_pages(struct page *page, unsigned int order) { + /* get PageHead before we drop reference */ + int head = PageHead(page); + if (put_page_testzero(page)) free_the_page(page, order); - else if (!PageHead(page)) + else if (!head) while (order-- > 0) free_the_page(page + (1 << order), order); } diff --git a/mm/secretmem.c b/mm/secretmem.c index 04c3ac9448a1..afcf46e99cda 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -162,7 +162,7 @@ const struct address_space_operations secretmem_aops = { .migrate_folio = secretmem_migrate_folio, }; -static int secretmem_setattr(struct user_namespace *mnt_userns, +static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -175,7 +175,7 @@ static int secretmem_setattr(struct user_namespace *mnt_userns, if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else - ret = simple_setattr(mnt_userns, dentry, iattr); + ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); diff --git a/mm/shmem.c b/mm/shmem.c index 0005ab2c29af..41f82c5a5e28 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1045,7 +1045,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_getattr(struct user_namespace *mnt_userns, +static int 
shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -1066,7 +1066,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(idmap, inode, stat); if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; @@ -1080,7 +1080,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, return 0; } -static int shmem_setattr(struct user_namespace *mnt_userns, +static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -1089,7 +1089,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns, bool update_mtime = false; bool update_ctime = true; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -1127,9 +1127,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, } } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode->i_ctime = current_time(inode); if (update_mtime) @@ -2327,8 +2327,9 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) #define shmem_initxattrs NULL #endif -static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, + struct inode *dir, umode_t mode, dev_t dev, + unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -2341,7 +2342,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, 
inode = new_inode(sb); if (inode) { inode->i_ino = ino; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_generation = get_random_u32(); @@ -2913,13 +2914,13 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. */ static int -shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, +shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = simple_acl_create(dir, inode); if (error) @@ -2944,13 +2945,13 @@ out_iput: } static int -shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, @@ -2968,22 +2969,22 @@ out_iput: return error; } -static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; - if ((error = shmem_mknod(&init_user_ns, dir, dentry, - mode | S_IFDIR, 0))) + error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); + if (error) return error; inc_nlink(dir); return 0; } -static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return 
shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* @@ -3043,7 +3044,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } -static int shmem_whiteout(struct user_namespace *mnt_userns, +static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; @@ -3053,7 +3054,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns, if (!whiteout) return -ENOMEM; - error = shmem_mknod(&init_user_ns, old_dir, whiteout, + error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) @@ -3076,7 +3077,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns, * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename2(struct user_namespace *mnt_userns, +static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -3096,7 +3097,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns, if (flags & RENAME_WHITEOUT) { int error; - error = shmem_whiteout(&init_user_ns, old_dir, old_dentry); + error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } @@ -3122,7 +3123,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns, return 0; } -static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; @@ -3134,7 +3135,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (len > PAGE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, + inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); if (!inode) return 
-ENOSPC; @@ -3227,7 +3228,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -static int shmem_fileattr_set(struct user_namespace *mnt_userns, +static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -3301,7 +3302,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -3817,7 +3818,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #endif uuid_gen(&sb->s_uuid); - inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, + VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -4042,7 +4044,11 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, +#ifdef CONFIG_SHMEM + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, +#else .fs_flags = FS_USERNS_MOUNT, +#endif }; void __init shmem_init(void) @@ -4194,7 +4200,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) +#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) @@ -4217,8 +4223,11 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG 
| S_IRWXUGO, 0, - flags); + if (is_idmapped_mnt(mnt)) + return ERR_PTR(-EINVAL); + + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, + S_IFREG | S_IRWXUGO, 0, flags); if (unlikely(!inode)) { shmem_unacct_size(flags, size); return ERR_PTR(-ENOSPC); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index b05295bab322..39c3491e28a3 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -246,18 +246,21 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) } EXPORT_SYMBOL(shrinker_debugfs_rename); -void shrinker_debugfs_remove(struct shrinker *shrinker) +struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) { + struct dentry *entry = shrinker->debugfs_entry; + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; - if (!shrinker->debugfs_entry) - return; + if (entry) { + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); + shrinker->debugfs_entry = NULL; + } - debugfs_remove_recursive(shrinker->debugfs_entry); - ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); + return entry; } static int __init shrinker_debugfs_init(void) diff --git a/mm/swap.c b/mm/swap.c index 70e2063ef43a..4c03ecab698e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -158,36 +158,6 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * get_kernel_pages() - pin kernel pages in memory - * @kiov: An array of struct kvec structures - * @nr_segs: number of segments to pin - * @write: pinning for read/write, currently ignored - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_segs long. - * - * Returns number of pages pinned. This may be fewer than the number requested. - * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0. - * Each page returned must be released with a put_page() call when it is - * finished with. 
- */ -int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, - struct page **pages) -{ - int seg; - - for (seg = 0; seg < nr_segs; seg++) { - if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) - return seg; - - pages[seg] = kmap_to_page(kiov[seg].iov_base); - get_page(pages[seg]); - } - - return seg; -} -EXPORT_SYMBOL_GPL(get_kernel_pages); - typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..eb9b0bf1fcdd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1100,6 +1100,7 @@ start_over: goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); + cond_resched(); spin_lock(&swap_avail_lock); nextsi: @@ -1763,12 +1764,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; + bool hwposioned = false; int ret = 1; swapcache = page; page = ksm_might_need_to_copy(page, vma, addr); if (unlikely(!page)) return -ENOMEM; + else if (unlikely(PTR_ERR(page) == -EHWPOISON)) + hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { @@ -1776,15 +1780,19 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - if (unlikely(!PageUptodate(page))) { - pte_t pteval; + if (unlikely(hwposioned || !PageUptodate(page))) { + swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); - pteval = swp_entry_to_pte(make_swapin_error_entry()); - set_pte_at(vma->vm_mm, addr, pte, pteval); - swap_free(entry); + if (hwposioned) { + swp_entry = make_hwpoison_entry(swapcache); + page = swapcache; + } else { + swp_entry = make_swapin_error_entry(); + } + new_pte = swp_entry_to_pte(swp_entry); ret = 0; - goto out; + goto setpte; } /* See do_swap_page() */ @@ -1816,6 +1824,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 
new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(*pte)) new_pte = pte_mkuffd_wp(new_pte); +setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..5b7b8d4f5297 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -741,6 +741,8 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { + struct dentry *debugfs_entry; + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; @@ -749,9 +751,11 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); - shrinker_debugfs_remove(shrinker); + debugfs_entry = shrinker_debugfs_remove(shrinker); up_write(&shrinker_rwsem); + debugfs_remove_recursive(debugfs_entry); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -3323,13 +3327,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm) if (mem_cgroup_disabled()) return; + /* migration can happen before addition */ + if (!mm->lru_gen.memcg) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; - VM_WARN_ON_ONCE(!mm->lru_gen.memcg); VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); @@ -6754,8 +6761,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6770,7 +6776,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), - .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put diff --git 
a/mm/zsmalloc.c b/mm/zsmalloc.c index 9445bee6b014..702bc3fd687a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -113,7 +113,23 @@ * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 -#define OBJ_TAG_BITS 1 + +#ifdef CONFIG_ZPOOL +/* + * The second least-significant bit in the object's header identifies if the + * value stored at the header is a deferred handle from the last reclaim + * attempt. + * + * As noted above, this is valid because we have room for two bits. + */ +#define OBJ_DEFERRED_HANDLE_TAG 2 +#define OBJ_TAG_BITS 2 +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG) +#else +#define OBJ_TAG_BITS 1 +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG +#endif /* CONFIG_ZPOOL */ + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) @@ -222,6 +238,12 @@ struct link_free { * Handle of allocated object. */ unsigned long handle; +#ifdef CONFIG_ZPOOL + /* + * Deferred handle of a reclaimed object. 
+ */ + unsigned long deferred_handle; +#endif }; }; @@ -272,8 +294,6 @@ struct zspage { /* links the zspage to the lru list in the pool */ struct list_head lru; bool under_reclaim; - /* list of unfreed handles whose objects have been reclaimed */ - unsigned long *deferred_handles; #endif struct zs_pool *pool; @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, + int tag) { unsigned long handle; struct zspage *zspage = get_zspage(page); @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) } else handle = *(unsigned long *)obj; - if (!(handle & OBJ_ALLOCATED_TAG)) + if (!(handle & tag)) return false; - *phandle = handle & ~OBJ_ALLOCATED_TAG; + /* Clear all tags before returning the handle */ + *phandle = handle & ~OBJ_TAG_MASK; return true; } +static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +static bool obj_stores_deferred_handle(struct page *page, void *obj, + unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -946,22 +981,36 @@ unlock: } #ifdef CONFIG_ZPOOL +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx); + /* * Free all the deferred handles whose objects are freed in zs_free. 
*/ -static void free_handles(struct zs_pool *pool, struct zspage *zspage) +static void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) { - unsigned long handle = (unsigned long)zspage->deferred_handles; + int obj_idx = 0; + struct page *page = get_first_page(zspage); + unsigned long handle; - while (handle) { - unsigned long nxt_handle = handle_to_obj(handle); + while (1) { + handle = find_deferred_handle_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } cache_free_handle(pool, handle); - handle = nxt_handle; + obj_idx++; } } #else -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +static inline void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) {} #endif static void __free_zspage(struct zs_pool *pool, struct size_class *class, @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(fg != ZS_EMPTY); /* Free all deferred handles from zs_free */ - free_handles(pool, zspage); + free_handles(pool, class, zspage); next = page = get_first_page(zspage); do { @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) #ifdef CONFIG_ZPOOL INIT_LIST_HEAD(&zspage->lru); zspage->under_reclaim = false; - zspage->deferred_handles = NULL; #endif set_freeobj(zspage, 0); @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(int class_size, unsigned long obj) +static void obj_free(int class_size, unsigned long obj, unsigned long *handle) { struct link_free *link; struct zspage *zspage; @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj) zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); - - /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); 
- if (likely(!ZsHugePage(zspage))) - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; - else - f_page->index = 0; + + if (handle) { +#ifdef CONFIG_ZPOOL + /* Stores the (deferred) handle in the object's header */ + *handle |= OBJ_DEFERRED_HANDLE_TAG; + *handle &= ~OBJ_ALLOCATED_TAG; + + if (likely(!ZsHugePage(zspage))) + link->deferred_handle = *handle; + else + f_page->index = *handle; +#endif + } else { + /* Insert this object in containing zspage's freelist */ + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; + set_freeobj(zspage, f_objidx); + } + kunmap_atomic(vaddr); - set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); } @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); #ifdef CONFIG_ZPOOL @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle) * Reclaim needs the handles during writeback. It'll free * them along with the zspage when it's done with them. * - * Record current deferred handle at the memory location - * whose address is given by handle. + * Record current deferred handle in the object's header. */ - record_obj(handle, (unsigned long)zspage->deferred_handles); - zspage->deferred_handles = (unsigned long *)handle; + obj_free(class->size, obj, &handle); spin_unlock(&pool->lock); return; } #endif + obj_free(class->size, obj, NULL); + fullness = fix_fullness_group(class, zspage); if (fullness == ZS_EMPTY) free_zspage(pool, class, zspage); @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, } /* - * Find alloced object in zspage from index object and + * Find object with a certain tag in zspage from index object and * return handle. 
*/ -static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) +static unsigned long find_tagged_obj(struct size_class *class, + struct page *page, int *obj_idx, int tag) { unsigned int offset; int index = *obj_idx; @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_tagged(page, addr + offset, &handle, tag)) break; offset += class->size; @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class, return handle; } +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +/* + * Find object storing a deferred handle in header in zspage from index object + * and return handle. 
+ */ +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + struct zs_compact_control { /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); - obj_free(class->size, used_obj); + obj_free(class->size, used_obj, NULL); } /* Remember last position in this iteration */ @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_destroy_pool); #ifdef CONFIG_ZPOOL +static void restore_freelist(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + unsigned int obj_idx = 0; + unsigned long handle, off = 0; /* off is within-page offset */ + struct page *page = get_first_page(zspage); + struct link_free *prev_free = NULL; + void *prev_page_vaddr = NULL; + + /* in case no free object found */ + set_freeobj(zspage, (unsigned int)(-1UL)); + + while (page) { + void *vaddr = kmap_atomic(page); + struct page *next_page; + + while (off < PAGE_SIZE) { + void *obj_addr = vaddr + off; + + /* skip allocated object */ + if (obj_allocated(page, obj_addr, &handle)) { + obj_idx++; + off += class->size; + continue; + } + + /* free deferred handle from reclaim attempt */ + if (obj_stores_deferred_handle(page, obj_addr, &handle)) + cache_free_handle(pool, handle); + + if (prev_free) + prev_free->next = obj_idx << OBJ_TAG_BITS; + else /* first free object found */ + set_freeobj(zspage, obj_idx); + + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free); + /* if last free object in a previous page, need to unmap */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + obj_idx++; + off += class->size; + } + + /* + * Handle the last (full or 
partial) object on this page. + */ + next_page = get_next_page(page); + if (next_page) { + if (!prev_free || prev_page_vaddr) { + /* + * There is no free object in this page, so we can safely + * unmap it. + */ + kunmap_atomic(vaddr); + } else { + /* update prev_page_vaddr since prev_free is on this page */ + prev_page_vaddr = vaddr; + } + } else { /* this is the last page */ + if (prev_free) { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + prev_free->next = -1UL << OBJ_TAG_BITS; + } + + /* unmap previous page (if not done yet) */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + kunmap_atomic(vaddr); + } + + page = next_page; + off %= PAGE_SIZE; + } +} + static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) { int i, obj_idx, ret = 0; @@ -2561,6 +2728,12 @@ next: return 0; } + /* + * Eviction fails on one of the handles, so we need to restore zspage. + * We need to rebuild its freelist (and free stored deferred handles), + * put it back to the correct size class, and add it to the LRU list. 
+ */ + restore_freelist(pool, class, zspage); putback_zspage(class, zspage); list_add(&zspage->lru, &pool->lru); unlock_zspage(zspage); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index f20f4373ff40..9554abcfd5b4 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -871,6 +871,7 @@ static unsigned int ip_sabotage_in(void *priv, if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { + nf_bridge_info_free(skb); state->okfn(state->net, state->sk, skb); return NF_STOLEN; } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 748be7253248..78c9729a6057 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1015,6 +1015,7 @@ static void caif_sock_destructor(struct sock *sk) return; } sk_stream_kill_queues(&cf_sk->sk); + WARN_ON_ONCE(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } diff --git a/net/can/isotp.c b/net/can/isotp.c index 608f8c24ae46..fc81d77724a1 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -140,7 +140,7 @@ struct isotp_sock { canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; - struct hrtimer rxtimer, txtimer; + struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; @@ -871,7 +871,7 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data) } /* start timer to send next consecutive frame with correct delay */ - hrtimer_start(&so->txtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) @@ -879,49 +879,39 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; - enum hrtimer_restart restart = HRTIMER_NORESTART; - switch (so->tx.state) { - 
case ISOTP_SENDING: - - /* cfecho should be consumed by isotp_rcv_echo() here */ - if (!so->cfecho) { - /* start timeout for unlikely lost echo skb */ - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), - ktime_set(ISOTP_ECHO_TIMEOUT, 0))); - restart = HRTIMER_RESTART; + /* don't handle timeouts in IDLE state */ + if (so->tx.state == ISOTP_IDLE) + return HRTIMER_NORESTART; - /* push out the next consecutive frame */ - isotp_send_cframe(so); - break; - } + /* we did not get any flow control or echo frame in time */ - /* cfecho has not been cleared in isotp_rcv_echo() */ - pr_notice_once("can-isotp: cfecho %08X timeout\n", so->cfecho); - fallthrough; + /* report 'communication error on send' */ + sk->sk_err = ECOMM; + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); - case ISOTP_WAIT_FC: - case ISOTP_WAIT_FIRST_FC: + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); - /* we did not get any flow control frame in time */ + return HRTIMER_NORESTART; +} - /* report 'communication error on send' */ - sk->sk_err = ECOMM; - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); +static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txfrtimer); - /* reset tx state */ - so->tx.state = ISOTP_IDLE; - wake_up_interruptible(&so->wait); - break; + /* start echo timeout handling and cover below protocol error */ + hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), + HRTIMER_MODE_REL_SOFT); - default: - WARN_ONCE(1, "can-isotp: tx timer state %08X cfecho %08X\n", - so->tx.state, so->cfecho); - } + /* cfecho should be consumed by isotp_rcv_echo() here */ + if (so->tx.state == ISOTP_SENDING && !so->cfecho) + isotp_send_cframe(so); - return restart; + return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -1162,6 +1152,10 @@ static int isotp_release(struct socket *sock) /* wait 
for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + /* force state machines to be idle also when a signal occurred */ + so->tx.state = ISOTP_IDLE; + so->rx.state = ISOTP_IDLE; + spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); @@ -1194,6 +1188,7 @@ static int isotp_release(struct socket *sock) } } + hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); @@ -1597,6 +1592,8 @@ static int isotp_init(struct sock *sk) so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; + hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c index f33c47327927..ca4ad6cdd5cb 100644 --- a/net/can/j1939/address-claim.c +++ b/net/can/j1939/address-claim.c @@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); + + if (ecu && ecu->addr == skcb->addr.sa) { + /* The ISO 11783-5 standard, in "4.5.2 - Address claim + * requirements", states: + * d) No CF shall begin, or resume, transmission on the + * network until 250 ms after it has successfully claimed + * an address except when responding to a request for + * address-claimed. + * + * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim + * prioritization" show that the CF begins the transmission + * after 250 ms from the first AC (address-claimed) message + * even if it sends another AC message during that time window + * to resolve the address contention with another CF. 
+ * + * As stated in "4.4.2.3 - Address-claimed message": + * In order to successfully claim an address, the CF sending + * an address claimed message shall not receive a contending + * claim from another CF for at least 250 ms. + * + * As stated in "4.4.3.2 - NAME management (NM) message": + * 1) A commanding CF can + * d) request that a CF with a specified NAME transmit + * the address-claimed message with its current NAME. + * 2) A target CF shall + * d) send an address-claimed message in response to a + * request for a matching NAME + * + * Taking the above arguments into account, the 250 ms wait is + * requested only during network initialization. + * + * Do not restart the timer on AC message if both the NAME and + * the address match and so if the address has already been + * claimed (timer has expired) or the AC message has been sent + * to resolve the contention with another CF (timer is still + * running). + */ + goto out_ecu_put; + } + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 5c722b55fe23..fce9b9ebf13f 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1092,10 +1092,6 @@ static bool j1939_session_deactivate(struct j1939_session *session) bool active; j1939_session_list_lock(priv); - /* This function should be called with a session ref-count of at - * least 2. 
- */ - WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); j1939_session_list_unlock(priv); diff --git a/net/can/raw.c b/net/can/raw.c index 81071cdb0301..ba86782ba8bb 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -132,8 +132,8 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; /* make sure to not pass oversized frames to the socket */ - if ((can_is_canfd_skb(oskb) && !ro->fd_frames && !ro->xl_frames) || - (can_is_canxl_skb(oskb) && !ro->xl_frames)) + if ((!ro->fd_frames && can_is_canfd_skb(oskb)) || + (!ro->xl_frames && can_is_canxl_skb(oskb))) return; /* eliminate multiple filter matches for the same skb */ @@ -670,6 +670,11 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames && !ro->fd_frames) { + ro->fd_frames = ro->xl_frames; + return -EINVAL; + } break; case CAN_RAW_XL_FRAMES: @@ -679,6 +684,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames) + ro->fd_frames = ro->xl_frames; break; case CAN_RAW_JOIN_FILTERS: @@ -786,6 +794,25 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } +static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +{ + /* Classical CAN -> no checks for flags and device capabilities */ + if (can_is_can_skb(skb)) + return false; + + /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ + if (ro->fd_frames && can_is_canfd_skb(skb) && + (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) + return false; + + /* CAN XL -> needs to be enabled and a CAN XL device */ + if (ro->xl_frames && can_is_canxl_skb(skb) && + can_is_canxl_dev_mtu(mtu)) + return false; + + return true; +} + static int raw_sendmsg(struct socket 
*sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -833,20 +860,8 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto free_skb; err = -EINVAL; - if (ro->xl_frames && can_is_canxl_dev_mtu(dev->mtu)) { - /* CAN XL, CAN FD and Classical CAN */ - if (!can_is_canxl_skb(skb) && !can_is_canfd_skb(skb) && - !can_is_can_skb(skb)) - goto free_skb; - } else if (ro->fd_frames && dev->mtu == CANFD_MTU) { - /* CAN FD and Classical CAN */ - if (!can_is_canfd_skb(skb) && !can_is_can_skb(skb)) - goto free_skb; - } else { - /* Classical CAN */ - if (!can_is_can_skb(skb)) - goto free_skb; - } + if (raw_bad_txframe(ro, skb, dev->mtu)) + goto free_skb; sockcm_init(&sockc, sk); if (msg->msg_controllen) { diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..f23e287602b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1869,14 +1869,6 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb) -{ - rtnl_lock(); - __move_netdevice_notifier_net(src_net, dst_net, nb); - rtnl_unlock(); -} - int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -10375,7 +10367,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = atomic_long_read(&src[i]); + dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); diff --git a/net/core/devlink.c b/net/core/devlink.c index 032d6d0a5ce6..0bfc144df8b9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4742,11 +4742,8 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, if (err) return err; - if 
(dest_net && !net_eq(dest_net, curr_net)) { - move_netdevice_notifier_net(curr_net, dest_net, - &devlink->netdevice_nb); + if (dest_net && !net_eq(dest_net, curr_net)) write_pnet(&devlink->_net, dest_net); - } err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); @@ -9979,7 +9976,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, goto err_xa_alloc; devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); + ret = register_netdevice_notifier(&devlink->netdevice_nb); if (ret) goto err_register_netdevice_notifier; @@ -10171,8 +10168,7 @@ void devlink_free(struct devlink *devlink) xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->ports); - WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb)); + WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); xa_erase(&devlinks, devlink->index); @@ -10503,6 +10499,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this @@ -10512,6 +10510,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, netdev); break; case NETDEV_UNREGISTER: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. 
diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..4bac7ea6e025 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f00a79fc301b..4edd2176e238 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -269,7 +269,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || - time_after(tref, n->updated)) + !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); @@ -289,7 +289,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. 
+ */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -1001,12 +1011,14 @@ static void neigh_periodic_work(struct work_struct *work) goto next_elt; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; neigh_mark_dead(n); write_unlock(&n->lock); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5581d22cc191..7b69cf882b8e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -137,12 +137,12 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) return 0; if (ops->id && ops->size) { -cleanup: ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; } +cleanup: kfree(data); out: @@ -304,6 +304,12 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); +/* init code that must occur even if setup_net() is not called. */ +static __net_init void preinit_net(struct net *net) +{ + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); +} + /* * setup_net runs the initializers for the network namespace object. 
*/ @@ -316,7 +322,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -472,6 +477,8 @@ struct net *copy_net_ns(unsigned long flags, rv = -ENOMEM; goto dec_ucounts; } + + preinit_net(net); refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); @@ -1118,6 +1125,7 @@ void __init net_ns_init(void) init_net.key_domain = &init_net_key_domain; #endif down_write(&pernet_ops_rwsem); + preinit_net(&init_net); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4a0eb5593275..a31ff4d83ecc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4100,7 +4100,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_shinfo(skb)->frag_list = NULL; - do { + while (list_skb) { nskb = list_skb; list_skb = list_skb->next; @@ -4146,8 +4146,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; - - } while (list_skb); + } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; diff --git a/net/core/sock.c b/net/core/sock.c index f954d5893e79..6f27c24016fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1531,6 +1531,8 @@ set_sndbuf: ret = -EINVAL; break; } + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() */ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; @@ -3451,7 +3453,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; - sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 22fa2c5bc6ec..a68a7290a3b2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1569,15 +1569,16 @@ void sock_map_unhash(struct sock *sk) psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); } - - saved_unhash = psock->saved_unhash; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); @@ -1590,17 +1591,18 @@ void sock_map_destroy(struct sock *sk) psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->destroy) - sk->sk_prot->destroy(sk); - return; + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); } - - saved_destroy = psock->saved_destroy; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - sk_psock_put(sk, psock); - saved_destroy(sk); + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); @@ -1615,16 +1617,21 @@ void sock_map_close(struct sock *sk, long timeout) if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); - return sk->sk_prot->close(sk, timeout); + saved_close = READ_ONCE(sk->sk_prot)->close; + } else { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); } - - saved_close = psock->saved_close; - sock_map_remove_links(sk, psock); - 
rcu_read_unlock(); - sk_psock_stop(psock); - release_sock(sk); - cancel_work_sync(&psock->work); - sk_psock_put(sk, psock); + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); diff --git a/net/core/stream.c b/net/core/stream.c index cd06750dd329..434446ab14c5 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -209,7 +209,6 @@ void sk_stream_kill_queues(struct sock *sk) sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4260fe466993..b9d7c3dd1cb3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -551,11 +551,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); } return newsk; @@ -615,7 +613,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) @@ -679,7 +677,6 @@ ipv6_pktoptions: np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); memmove(IP6CB(opt_skb), &DCCP_SKB_CB(opt_skb)->header.h6, 
sizeof(struct inet6_skb_parm)); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6c0ec2789943..cf11f10927e1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -347,6 +347,7 @@ lookup_protocol: sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet->mc_loop = 1; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ce9ff3c62e84..3bb890a40ed7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> +#include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> @@ -1022,6 +1023,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) if (type > RTAX_MAX) return false; + type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d1f837579398..f2c43f67187d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1225,9 +1225,6 @@ int inet_csk_listen_start(struct sock *sk) sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); - if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) - sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). 
* It is OK, because this socket enters to hash table only diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 7fcfdfd8f9de..0e3ee1532848 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> +#include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> @@ -25,6 +26,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, return -EINVAL; } + type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 94aad3870c5f..cf26d65ca389 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -6,6 +6,7 @@ #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> +#include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> @@ -639,10 +640,9 @@ EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { - int family = sk->sk_family == AF_INET6 ? 
TCP_BPF_IPV6 : TCP_BPF_IPV4; struct proto *prot = newsk->sk_prot; - if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) + if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f7a84a4acffc..faa47f9ea73a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3127,17 +3127,17 @@ static void add_v4_addrs(struct inet6_dev *idev) offset = sizeof(struct in6_addr) - 4; memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); - if (idev->dev->flags&IFF_POINTOPOINT) { + if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { + scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; + } else { if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) return; addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; - } else { - scope = IPV6_ADDR_COMPATv4; - plen = 96; - pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { @@ -3447,6 +3447,30 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +static void addrconf_init_auto_addrs(struct net_device *dev) +{ + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: + case ARPHRD_IPGRE: + addrconf_gre_config(dev); + break; +#endif + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } +} + static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) @@ -3615,26 +3639,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } - switch (dev->type) { -#if IS_ENABLED(CONFIG_IPV6_SIT) - case ARPHRD_SIT: - addrconf_sit_config(dev); - break; -#endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) - case ARPHRD_IP6GRE: - case ARPHRD_IPGRE: - 
addrconf_gre_config(dev); - break; -#endif - case ARPHRD_LOOPBACK: - init_loopback(dev); - break; - - default: - addrconf_dev_config(dev); - break; - } + addrconf_init_auto_addrs(dev); if (!IS_ERR_OR_NULL(idev)) { if (run_pending) @@ -6397,7 +6402,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; @@ -6408,7 +6413,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev && idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fee9163382c2..847934763868 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -222,6 +222,7 @@ lookup_protocol: np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e624497fa992..9b6818453afe 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -51,7 +51,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; - fl6->flowlabel = np->flow_label; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 60fd91bb5171..c314fdde0097 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -547,7 +547,20 @@ int ip6_forward(struct sk_buff *skb) pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { - hdr->hop_limit--; + /* It's tempting to decrease the hop limit + * here by 1, as we do at the end of the + * function too. + * + * But that would be incorrect, as proxying is + * not forwarding. The ip6_input function + * will handle this packet locally, and it + * depends on the hop limit being unchanged. + * + * One example is the NDP hop limit, that + * always has to stay 255, but other would be + * similar checks around RA packets, where the + * user can even change the desired limit. + */ return ip6_input(skb); } else if (proxied < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..a52a4f12f146 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -272,6 +272,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? 
*saddr : np->saddr; + fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; @@ -1387,14 +1388,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_mask(sk, GFP_ATOMIC)); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) { + if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); - skb_set_owner_r(newnp->pktoptions, newsk); - } } } else { if (!req_unhash && found_dup_sk) { @@ -1466,7 +1464,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + opt_skb = skb_clone_and_charge_r(skb, sk); reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1552,7 +1550,6 @@ ipv6_pktoptions: if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { diff --git a/net/key/af_key.c b/net/key/af_key.c index 2bdbcec781cd..a815f5ab4c49 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1261,7 +1261,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; - x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index c2aae2a6d6a6..97bb4401dd3e 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -213,7 +213,6 @@ __ieee802154_rx_handle_packet(struct 
ieee802154_local *local, ret = ieee802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); - kfree_skb(skb); return; } diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index fc9e728b6333..3150f3f0c872 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -544,9 +544,6 @@ static int mctp_sk_init(struct sock *sk) static void mctp_sk_close(struct sock *sk, long timeout) { - struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); - - del_timer_sync(&msk->key_expiry); sk_common_release(sk); } @@ -580,7 +577,19 @@ static void mctp_sk_unhash(struct sock *sk) spin_lock_irqsave(&key->lock, fl2); __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_CLOSED); } + sock_set_flag(sk, SOCK_DEAD); spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + /* Since there are no more tag allocations (we have removed all of the + * keys), stop any pending expiry events. the timer cannot be re-queued + * as the sk is no longer observable + */ + del_timer_sync(&msk->key_expiry); +} + +static void mctp_sk_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); } static struct proto mctp_proto = { @@ -619,6 +628,7 @@ static int mctp_pf_create(struct net *net, struct socket *sock, return -ENOMEM; sock_init_data(sock, sk); + sk->sk_destruct = mctp_sk_destruct; rc = 0; if (sk->sk_prot->init) diff --git a/net/mctp/route.c b/net/mctp/route.c index f9a80b82dc51..f51a05ec7162 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -147,6 +147,7 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, key->valid = true; spin_lock_init(&key->lock); refcount_set(&key->refs, 1); + sock_hold(key->sk); return key; } @@ -165,6 +166,7 @@ void mctp_key_unref(struct mctp_sk_key *key) mctp_dev_release_key(key->dev, key); spin_unlock_irqrestore(&key->lock, flags); + sock_put(key->sk); kfree(key); } @@ -177,6 +179,11 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) spin_lock_irqsave(&net->mctp.keys_lock, flags); + if 
(sock_flag(&msk->sk, SOCK_DEAD)) { + rc = -EINVAL; + goto out_unlock; + } + hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { if (mctp_key_match(tmp, key->local_addr, key->peer_addr, key->tag)) { @@ -198,6 +205,7 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) hlist_add_head(&key->sklist, &msk->keys); } +out_unlock: spin_unlock_irqrestore(&net->mctp.keys_lock, flags); return rc; @@ -315,8 +323,8 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) { + struct mctp_sk_key *key, *any_key = NULL; struct net *net = dev_net(skb->dev); - struct mctp_sk_key *key; struct mctp_sock *msk; struct mctp_hdr *mh; unsigned long f; @@ -361,13 +369,11 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * key for reassembly - we'll create a more specific * one for future packets if required (ie, !EOM). */ - key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); - if (key) { - msk = container_of(key->sk, + any_key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); + if (any_key) { + msk = container_of(any_key->sk, struct mctp_sock, sk); - spin_unlock_irqrestore(&key->lock, f); - mctp_key_unref(key); - key = NULL; + spin_unlock_irqrestore(&any_key->lock, f); } } @@ -419,14 +425,14 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. 
*/ rc = mctp_key_add(key, msk); - if (rc) { - kfree(key); - } else { + if (!rc) trace_mctp_key_acquire(key); - /* we don't need to release key->lock on exit */ - mctp_key_unref(key); - } + /* we don't need to release key->lock on exit, so + * clean up here and suppress the unlock via + * setting to NULL + */ + mctp_key_unref(key); key = NULL; } else { @@ -473,6 +479,8 @@ out_unlock: spin_unlock_irqrestore(&key->lock, f); mctp_key_unref(key); } + if (any_key) + mctp_key_unref(any_key); out: if (rc) kfree_skb(skb); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 35b5f806fdda..dc5165d3eec4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1428,6 +1428,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, free: kfree(table); out: + mdev->sysctl = NULL; return -ENOBUFS; } @@ -1437,6 +1438,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct net *net = dev_net(dev); struct ctl_table *table; + if (!mdev->sysctl) + return; + table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2ea7eae43bdb..10fe9771a852 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -998,8 +998,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, { int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; - struct mptcp_sock *msk; struct socket *ssock; + struct sock *newsk; int backlog = 1024; int err; @@ -1008,11 +1008,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; - msk = mptcp_sk(entry->lsk->sk); - if (!msk) + newsk = entry->lsk->sk; + if (!newsk) return -EINVAL; - ssock = __mptcp_nmpc_socket(msk); + lock_sock(newsk); + ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + release_sock(newsk); if (!ssock) return -EINVAL; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cd6cc67c2c5..bc6c1f62a690 100644 --- a/net/mptcp/protocol.c +++ 
b/net/mptcp/protocol.c @@ -2897,6 +2897,7 @@ bool __mptcp_close(struct sock *sk, long timeout) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; + int subflows_alive = 0; sk->sk_shutdown = SHUTDOWN_MASK; @@ -2922,6 +2923,8 @@ cleanup: struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + subflows_alive += ssk->sk_state != TCP_CLOSE; + /* since the close timeout takes precedence on the fail one, * cancel the latter */ @@ -2937,6 +2940,12 @@ cleanup: } sock_orphan(sk); + /* all the subflows are closed, only timeout can change the msk + * state, let's not keep resources busy for no reasons + */ + if (subflows_alive == 0) + inet_sk_state_store(sk, TCP_CLOSE); + sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (msk->token) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4b1e6ec1b36..7f2c3727ab23 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (struct sock *)msk; struct socket *sock; + int ret = -EINVAL; /* Limit to first subflow, before the connection establishment */ + lock_sock(sk); sock = __mptcp_nmpc_socket(msk); if (!sock) - return -EINVAL; + goto unlock; - return tcp_setsockopt(sock->sk, level, optname, optval, optlen); + ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); + +unlock: + release_sock(sk); + return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ec54413fb31f..32904c76c6a1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1399,6 +1399,7 @@ void __mptcp_error_report(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = 
mptcp_subflow_tcp_sock(subflow); int err = sock_error(ssk); + int ssk_state; if (!err) continue; @@ -1409,7 +1410,14 @@ void __mptcp_error_report(struct sock *sk) if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) continue; - inet_sk_state_store(sk, inet_sk_state_load(ssk)); + /* We need to propagate only transition to CLOSE state. + * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. + */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + inet_sk_state_store(sk, ssk_state); sk->sk_err = -err; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ @@ -1679,7 +1687,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, if (err) return err; - lock_sock(sf->sk); + lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index d88b92a8ffca..011d414038ea 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -27,22 +27,16 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. 
--RR - - And so for me for SCTP :D -Kiran */ - static const char *const sctp_conntrack_names[] = { - "NONE", - "CLOSED", - "COOKIE_WAIT", - "COOKIE_ECHOED", - "ESTABLISHED", - "SHUTDOWN_SENT", - "SHUTDOWN_RECD", - "SHUTDOWN_ACK_SENT", - "HEARTBEAT_SENT", - "HEARTBEAT_ACKED", + [SCTP_CONNTRACK_NONE] = "NONE", + [SCTP_CONNTRACK_CLOSED] = "CLOSED", + [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", + [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", + [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", + [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", + [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", + [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ @@ -54,13 +48,11 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, - [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, - [SCTP_CONNTRACK_DATA_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -74,8 +66,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT -#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED -#define sDS SCTP_CONNTRACK_DATA_SENT #define sIV SCTP_CONNTRACK_MAX /* @@ -98,10 +88,6 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. 
-HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction - opposite to that of the HEARTBEAT/DATA chunk. Secondary connection - is established. -DATA_SENT - We have seen a DATA/SACK in a new flow. */ /* TODO @@ -115,38 +101,36 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, -/* data/sack */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, 
sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, -/* data/sack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, 
sSR, sCL, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; @@ -158,6 +142,7 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) } #endif +/* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ @@ -258,11 +243,6 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; - case SCTP_CID_DATA: - case SCTP_CID_SACK: - pr_debug("SCTP_CID_DATA/SACK"); - i = 11; - break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", @@ -316,9 +296,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; - } else if (sch->type == SCTP_CID_HEARTBEAT || - sch->type == SCTP_CID_DATA || - sch->type == SCTP_CID_SACK) { + } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; @@ -404,19 +382,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; - } else { - /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, map) && - !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && - !test_bit(SCTP_CID_COOKIE_ECHO, map) && - !test_bit(SCTP_CID_ABORT, map) && - !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && - !test_bit(SCTP_CID_HEARTBEAT, map) && - !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && - sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out; - } + } + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, 
map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; @@ -424,22 +402,29 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { - /* Sec 8.5.1 (A) */ + /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { - /* Sec 8.5.1 (B) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir]) + /* (B) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - /* Sec 8.5.1 (C) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir] && - sch->flags & SCTP_CHUNK_FLAG_T) + /* (C) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { - /* Sec 8.5.1 (D) */ + /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_HEARTBEAT) { @@ -476,11 +461,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; 
} - } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) { - if (ct->proto.sctp.vtag[dir] == 0) { - pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); - ct->proto.sctp.vtag[dir] = sh->vtag; - } } old_state = ct->proto.sctp.state; @@ -518,8 +498,12 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } ct->proto.sctp.state = new_state; - if (old_state != new_state) + if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + if (new_state == SCTP_CONNTRACK_ESTABLISHED && + !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } } spin_unlock_bh(&ct->lock); @@ -533,14 +517,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && - dir == IP_CT_DIR_REPLY && - new_state == SCTP_CONNTRACK_ESTABLISHED) { - pr_debug("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_ASSURED, ct); - } - return NF_ACCEPT; out_unlock: @@ -701,7 +677,6 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, - [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0250725e38a4..460294bd4b60 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -601,8 +601,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, - NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, - NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, @@ 
-887,18 +885,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { - .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = { - .procname = "nf_conntrack_sctp_timeout_data_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { @@ -1042,8 +1028,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); - XASSIGN(HEARTBEAT_ACKED, sn); - XASSIGN(DATA_SENT, sn); #undef XASSIGN #endif } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7325bee7d144..19ea4d3c3553 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); @@ -62,12 +63,11 
@@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; @@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct nft_rbtree_elem *rbe_prev; + struct nft_set_gc_batch *gcb; + + gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); + if (!gcb) + return -ENOMEM; + + /* search for expired end interval coming before this element. */ + do { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev)) + break; + + prev = rb_prev(prev); + } while (prev != NULL); + + rb_erase(&rbe_prev->node, &priv->root); + rb_erase(&rbe->node, &priv->root); + atomic_sub(2, &set->nelems); + + nft_set_gc_batch_add(gcb, rbe); + nft_set_gc_batch_complete(gcb); + + return 0; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. 
+ */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { - bool overlap = false, dup_end_left = false, dup_end_right = false; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; - int d; + int d, err; - /* Detect overlaps as we descend the tree. Set the flag in these cases: - * - * a1. _ _ __>| ?_ _ __| (insert end before existing end) - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) - * a3. _ _ ___? >|_ _ __| (insert start before existing end) - * - * and clear it later on, as we eventually reach the points indicated by - * '?' above, in the cases described below. We'll always meet these - * later, locally, due to tree ordering, and overlaps for the intervals - * that are the closest together are always evaluated last. - * - * b1. _ _ __>| !_ _ __| (insert end before existing start) - * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) - * '--' no nodes falling in this range - * b4. >|_ _ ! (insert start before existing start) - * - * Case a3. resolves to b3.: - * - if the inserted start element is the leftmost, because the '0' - * element in the tree serves as end element - * - otherwise, if an existing end is found immediately to the left. If - * there are existing nodes in between, we need to further descend the - * tree before we can conclude the new start isn't causing an overlap - * - * or to b4., which, preceded by a3., means we already traversed one or - * more existing intervals entirely, from the right. - * - * For a new, rightmost pair of elements, we'll hit cases b3. and b2., - * in that order. 
- * - * The flag is also cleared in two special cases: - * - * b5. |__ _ _!|<_ _ _ (insert start right before existing end) - * b6. |__ _ >|!__ _ _ (insert end right after existing start) - * - * which always happen as last step and imply that no further - * overlapping is possible. - * - * Another special case comes from the fact that start elements matching - * an already existing start element are allowed: insertion is not - * performed but we return -EEXIST in that case, and the error will be - * cleared by the caller if NLM_F_EXCL is not present in the request. - * This way, request for insertion of an exact overlap isn't reported as - * error to userspace if not desired. - * - * However, if the existing start matches a pre-existing start, but the - * end element doesn't match the corresponding pre-existing end element, - * we need to report a partial overlap. This is a local condition that - * can be noticed without need for a tracking flag, by checking for a - * local duplicated end for a corresponding start, from left and right, - * separately. + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. 
*/ - parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); + d = nft_rbtree_cmp(set, rbe, new); + if (d < 0) { p = &parent->rb_left; - - if (nft_rbtree_interval_start(new)) { - if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext) && !*p) - overlap = false; - } else { - if (dup_end_left && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_right = true; - continue; - } - } } else if (d > 0) { - p = &parent->rb_right; + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; - if (nft_rbtree_interval_end(new)) { - if (dup_end_right && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_left = true; - continue; - } - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - overlap = nft_rbtree_interval_end(rbe); - } + p = &parent->rb_right; } else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; + } + } + + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. 
+ */ + for (node = first; node != NULL; node = rb_next(node)) { + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - overlap = false; - if (nft_rbtree_interval_end(rbe)) - p = &parent->rb_left; - else - p = &parent->rb_right; + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* perform garbage collection to avoid bogus overlap reports. */ + if (nft_set_elem_expired(&rbe->ext)) { + err = nft_rbtree_gc_elem(set, priv, rbe); + if (err < 0) + return err; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. */ + if (!rbe_ge) { + rbe_ge = rbe; + continue; + } + + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; + } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; } + } else if (d > 0) { + /* annotate element greater than the new element. */ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } + } - dup_end_left = dup_end_right = false; + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. 
+ */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *ext = &rbe_ge->ext; + return -EEXIST; } - if (overlap) + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *ext = &rbe_le->ext; + return -EEXIST; + } + + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. 
+ */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; @@ -501,23 +563,37 @@ static void nft_rbtree_gc(struct work_struct *work) struct nft_rbtree *priv; struct rb_node *node; struct nft_set *set; + struct net *net; + u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + genmask = nft_genmask_cur(net); write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. 
+ */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + + if (nft_set_elem_mark_busy(&rbe->ext)) { + rbe_end = NULL; continue; + } if (rbe_prev) { rb_erase(&rbe_prev->node, &priv->root); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bca2a470ccad..c64277659753 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -580,7 +580,9 @@ static int netlink_insert(struct sock *sk, u32 portid) if (nlk_sk(sk)->bound) goto err; - nlk_sk(sk)->portid = portid; + /* portid can be read locklessly from netlink_getname(). */ + WRITE_ONCE(nlk_sk(sk)->portid, portid); + sock_hold(sk); err = __netlink_insert(table, sk); @@ -1096,9 +1098,11 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; if (addr->sa_family == AF_UNSPEC) { - sk->sk_state = NETLINK_UNCONNECTED; - nlk->dst_portid = 0; - nlk->dst_group = 0; + /* paired with READ_ONCE() in netlink_getsockbyportid() */ + WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); + /* dst_portid and dst_group can be read locklessly */ + WRITE_ONCE(nlk->dst_portid, 0); + WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) @@ -1119,9 +1123,11 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, err = netlink_autobind(sock); if (err == 0) { - sk->sk_state = NETLINK_CONNECTED; - nlk->dst_portid = nladdr->nl_pid; - nlk->dst_group = ffs(nladdr->nl_groups); + /* paired with READ_ONCE() in netlink_getsockbyportid() */ + WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); + /* dst_portid and dst_group can be read locklessly */ + WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); + WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; @@ -1138,10 +1144,12 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_pad = 0; if (peer) { - nladdr->nl_pid = nlk->dst_portid; - nladdr->nl_groups = 
netlink_group_mask(nlk->dst_group); + /* Paired with WRITE_ONCE() in netlink_connect() */ + nladdr->nl_pid = READ_ONCE(nlk->dst_portid); + nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { - nladdr->nl_pid = nlk->portid; + /* Paired with WRITE_ONCE() in netlink_insert() */ + nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); @@ -1168,8 +1176,9 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); - if (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_portid != nlk_sk(ssk)->portid) { + /* dst_portid and sk_state can be changed in netlink_connect() */ + if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && + READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } @@ -1886,8 +1895,9 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { - dst_portid = nlk->dst_portid; - dst_group = nlk->dst_group; + /* Paired with WRITE_ONCE() in netlink_connect() */ + dst_portid = READ_ONCE(nlk->dst_portid); + dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6f7f4392cffb..5a4cb796150f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -400,6 +400,11 @@ static int nr_listen(struct socket *sock, int backlog) struct sock *sk = sock->sk; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index a8da88db7893..4e7c968cde2d 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ 
-121,6 +121,7 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); goto out; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a71795355aec..fcee6012293b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1004,14 +1004,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) { error = -ENOMEM; - goto err_kfree_key; + goto err_kfree_flow; } ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) - goto err_kfree_flow; + goto err_kfree_key; ovs_flow_mask_key(&new_flow->key, key, true, &mask); @@ -1019,14 +1019,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], key, log); if (error) - goto err_kfree_flow; + goto err_kfree_key; /* Validate actions. 
*/ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; + goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, @@ -1126,10 +1126,10 @@ err_unlock_ovs: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); -err_kfree_flow: - ovs_flow_free(new_flow, false); err_kfree_key: kfree(key); +err_kfree_flow: + ovs_flow_free(new_flow, false); error: return error; } diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 6e38f68f88c2..f2698d2316df 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -449,7 +449,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) err = attach_meter(meter_tbl, meter); if (err) - goto exit_unlock; + goto exit_free_old_meter; ovs_unlock(); @@ -472,6 +472,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_free_old_meter: + ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 1990d496fcfc..e595079c2caf 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -83,7 +83,10 @@ static struct qrtr_node *node_get(unsigned int node_id) node->id = node_id; - radix_tree_insert(&nodes, node_id, node); + if (radix_tree_insert(&nodes, node_id, node)) { + kfree(node); + return NULL; + } return node; } diff --git a/net/rds/message.c b/net/rds/message.c index b47e4f0a1639..c19c93561227 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { - info = list_entry(head, struct rds_msg_zcopy_info, - rs_zcookie_next); - if (info && rds_zcookie_add(info, cookie)) { + info = list_first_entry(head, 
struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 36fefc3957d7..ca2b17f32670 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -488,6 +488,12 @@ static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); @@ -497,8 +503,10 @@ static int rose_listen(struct socket *sock, int backlog) memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; + release_sock(sk); return 0; } + release_sock(sk); return -EOPNOTSUPP; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 4b1b59da5c0b..4d15b6a6169c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -93,7 +93,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); - bstats_update(&ca->tcf_bstats, skb); + tcf_action_update_bstats(&ca->common, skb); action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); @@ -212,8 +212,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ee2a050c887b..6640e75eaa02 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -12,6 +12,7 @@ #include <linux/errno.h> #include <linux/slab.h> #include 
<linux/refcount.h> +#include <linux/rcupdate.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> @@ -339,6 +340,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcf_result cr = {}; int err, balloc = 0; struct tcf_exts e; + bool update_h = false; err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) @@ -456,10 +458,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } } - if (cp->perfect) + if (cp->perfect) { r = cp->perfect + handle; - else - r = tcindex_lookup(cp, handle) ? : &new_filter_result; + } else { + /* imperfect area is updated in-place using rcu */ + update_h = !!tcindex_lookup(cp, handle); + r = &new_filter_result; + } if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -485,7 +490,28 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, rcu_assign_pointer(tp->root, cp); - if (r == &new_filter_result) { + if (update_h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *cf; + + f->result.res = r->res; + tcf_exts_change(&f->result.exts, &r->exts); + + /* imperfect area bucket */ + fp = cp->h + (handle % cp->hash); + + /* lookup the filter, guaranteed to exist */ + for (cf = rcu_dereference_bh_rtnl(*fp); cf; + fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) + if (cf->key == (u16)handle) + break; + + f->next = cf->next; + + cf = rcu_replace_pointer(*fp, f, 1); + tcf_exts_get_net(&cf->result.exts); + tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); + } else if (r == &new_filter_result) { struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index f46643850df8..92f2975b6a82 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -431,7 +431,10 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) while (cl->cmode == HTB_MAY_BORROW && p && mask) { m = mask; while (m) { - int prio = 
ffz(~m); + unsigned int prio = ffz(~m); + + if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio))) + break; m &= ~(1 << prio); if (p->inner.clprio[prio].feed.rb_node) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9a11a499ea2d..c322a61eaeea 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1700,7 +1700,6 @@ static void taprio_reset(struct Qdisc *sch) int i; hrtimer_cancel(&q->advance_timer); - qdisc_synchronize(sch); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 59e653b528b1..6b95d3ba8fe1 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -73,6 +73,12 @@ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, } } + /* If somehow no addresses were found that can be used with this + * scope, it's an error. + */ + if (list_empty(&dest->address_list)) + error = -ENETUNREACH; + out: if (error) sctp_bind_addr_clean(dest); diff --git a/net/sctp/diag.c b/net/sctp/diag.c index a557009e9832..c3d6b92dd386 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -343,11 +343,9 @@ static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) + if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index ca1eba95c293..2f66a2006517 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -196,9 +196,7 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) /* When a data chunk is sent, reset the heartbeat interval. 
*/ expires = jiffies + sctp_transport_timeout(transport); - if ((time_before(transport->hb_timer.expires, expires) || - !timer_pending(&transport->hb_timer)) && - !mod_timer(&transport->hb_timer, + if (!mod_timer(&transport->hb_timer, expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } diff --git a/net/socket.c b/net/socket.c index 888cd618a968..c6c44e26e954 100644 --- a/net/socket.c +++ b/net/socket.c @@ -385,7 +385,7 @@ static const struct xattr_handler sockfs_xattr_handler = { }; static int sockfs_security_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -589,10 +589,10 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -static int sockfs_setattr(struct user_namespace *mnt_userns, +static int sockfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { - int err = simple_setattr(&init_user_ns, dentry, iattr); + int err = simple_setattr(&nop_mnt_idmap, dentry, iattr); if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); @@ -971,9 +971,12 @@ static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - if (sock_flag(sk, SOCK_RCVMARK) && skb) - put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), - &skb->mark); + if (sock_flag(sk, SOCK_RCVMARK) && skb) { + /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ + __u32 mark = skb->mark; + + put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); + } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b35c8701876a..a38733f2197a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2614,6 +2614,7 @@ static int tipc_connect(struct socket *sock, 
struct sockaddr *dest, /* Send a 'SYN-' to destination */ m.msg_name = dest; m.msg_namelen = destlen; + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); /* If connect is in non-blocking case, set MSG_DONTWAIT to * indicate send_msg() is never blocked. @@ -2776,6 +2777,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); __tipc_sendstream(new_sock, &m, 0); release_sock(new_sk); exit: diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9ed978634125..a83d2b4275fa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2427,7 +2427,7 @@ static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx) { struct tls_rec *rec; - rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); + rec = list_first_entry_or_null(&ctx->tx_list, struct tls_rec, list); if (!rec) return false; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f0c2293f1d3b..81ff98298996 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1190,7 +1190,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct net *net = sock_net(sk); - struct user_namespace *ns; // barf... + struct mnt_idmap *idmap; struct unix_address *addr; struct dentry *dentry; struct path parent; @@ -1217,10 +1217,10 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, /* * All right, let's create it. */ - ns = mnt_user_ns(parent.mnt); + idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) - err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); + err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); @@ -1245,7 +1245,7 @@ out_unlock: err = -EINVAL; out_unlink: /* failed after successful mknod? 
unlink what we'd created... */ - vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL); + vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: done_path_create(&parent, dentry); out: diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 3b55502b2965..5c7ad301d742 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -482,6 +482,12 @@ static int x25_listen(struct socket *sock, int backlog) int rc = -EOPNOTSUPP; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + rc = -EINVAL; + release_sock(sk); + return rc; + } + if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index a0f62fa02e06..8cbf45a8bcdc 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -5,6 +5,7 @@ * Based on code and translator idea by: Florian Westphal <fw@strlen.de> */ #include <linux/compat.h> +#include <linux/nospec.h> #include <linux/xfrm.h> #include <net/xfrm.h> @@ -302,7 +303,7 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) nla_for_each_attr(nla, attrs, len, remaining) { int err; - switch (type) { + switch (nlh_src->nlmsg_type) { case XFRM_MSG_NEWSPDINFO: err = xfrm_nla_cpy(dst, nla, nla_len(nla)); break; @@ -437,6 +438,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } + type = array_index_nospec(type, XFRMA_MAX + 1); if (nla_len(nla) < compat_policy[type].len) { NL_SET_ERR_MSG(extack, "Attribute bad length"); return -EOPNOTSUPP; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c06e54a10540..436d29640ac2 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -279,8 +279,7 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), - ipipv6_hdr(skb)); + 
ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + 
.input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e9eb82c5457d..5c61ec04b839 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -3661,7 +3661,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. */ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3676,7 +3677,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. 
*/ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } @@ -3742,6 +3745,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, goto reject; } + if (if_id) + secpath_reset(skb); + xfrm_pols_put(pols, npols); return 1; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 89c731f4f0c7..00afe831c71c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -577,7 +577,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { - long tmo = x->lft.hard_add_expires_seconds + + time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { @@ -594,8 +594,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) next = tmo; } if (x->lft.hard_use_expires_seconds) { - long tmo = x->lft.hard_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + time64_t tmo = x->lft.hard_use_expires_seconds + + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -604,7 +604,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { - long tmo = x->lft.soft_add_expires_seconds + + time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; @@ -616,8 +616,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } } if (x->lft.soft_use_expires_seconds) { - long tmo = x->lft.soft_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + time64_t tmo = x->lft.soft_use_expires_seconds + + (READ_ONCE(x->curlft.use_time) ? 
: now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) @@ -1906,7 +1906,7 @@ out: hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); - if (x1->curlft.use_time) + if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { @@ -1940,8 +1940,8 @@ int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_curlft(x); - if (!x->curlft.use_time) - x->curlft.use_time = ktime_get_real_seconds(); + if (!READ_ONCE(x->curlft.use_time)) + WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { diff --git a/rust/Makefile b/rust/Makefile index ff70c4c916f8..8a521f2b6422 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -50,6 +50,7 @@ core-cfgs = \ --cfg no_fp_fmt_parse alloc-cfgs = \ + --cfg no_borrow \ --cfg no_fmt \ --cfg no_global_oom_handling \ --cfg no_macros \ @@ -359,8 +360,22 @@ rust-analyzer: $(Q)$(srctree)/scripts/generate_rust_analyzer.py $(srctree) $(objtree) \ $(RUST_LIB_SRC) > $(objtree)/rust-project.json +redirect-intrinsics = \ + __eqsf2 __gesf2 __lesf2 __nesf2 __unordsf2 \ + __unorddf2 \ + __muloti4 __multi3 \ + __udivmodti4 __udivti3 __umodti3 + +ifneq ($(or $(CONFIG_ARM64),$(and $(CONFIG_RISCV),$(CONFIG_64BIT))),) + # These intrinsics are defined for ARM64 and RISCV64 + redirect-intrinsics += \ + __ashrti3 \ + __ashlti3 __lshrti3 +endif + $(obj)/core.o: private skip_clippy = 1 $(obj)/core.o: private skip_flags = -Dunreachable_pub +$(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym)) $(obj)/core.o: private rustc_target_flags = $(core-cfgs) $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs $(obj)/target.json FORCE $(call if_changed_dep,rustc_library) diff --git a/rust/alloc/borrow.rs b/rust/alloc/borrow.rs deleted file mode 100644 index dde4957200d4..000000000000 --- a/rust/alloc/borrow.rs +++ /dev/null @@ -1,498 
+0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT - -//! A module for working with borrowed data. - -#![stable(feature = "rust1", since = "1.0.0")] - -use core::cmp::Ordering; -use core::hash::{Hash, Hasher}; -use core::ops::Deref; -#[cfg(not(no_global_oom_handling))] -use core::ops::{Add, AddAssign}; - -#[stable(feature = "rust1", since = "1.0.0")] -pub use core::borrow::{Borrow, BorrowMut}; - -use core::fmt; -#[cfg(not(no_global_oom_handling))] -use crate::string::String; - -use Cow::*; - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, B: ?Sized> Borrow<B> for Cow<'a, B> -where - B: ToOwned, - <B as ToOwned>::Owned: 'a, -{ - fn borrow(&self) -> &B { - &**self - } -} - -/// A generalization of `Clone` to borrowed data. -/// -/// Some types make it possible to go from borrowed to owned, usually by -/// implementing the `Clone` trait. But `Clone` works only for going from `&T` -/// to `T`. The `ToOwned` trait generalizes `Clone` to construct owned data -/// from any borrow of a given type. -#[cfg_attr(not(test), rustc_diagnostic_item = "ToOwned")] -#[stable(feature = "rust1", since = "1.0.0")] -pub trait ToOwned { - /// The resulting type after obtaining ownership. - #[stable(feature = "rust1", since = "1.0.0")] - type Owned: Borrow<Self>; - - /// Creates owned data from borrowed data, usually by cloning. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// let s: &str = "a"; - /// let ss: String = s.to_owned(); - /// - /// let v: &[i32] = &[1, 2]; - /// let vv: Vec<i32> = v.to_owned(); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use = "cloning is often expensive and is not expected to have side effects"] - fn to_owned(&self) -> Self::Owned; - - /// Uses borrowed data to replace owned data, usually by cloning. - /// - /// This is borrow-generalized version of `Clone::clone_from`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// # #![feature(toowned_clone_into)] - /// let mut s: String = String::new(); - /// "hello".clone_into(&mut s); - /// - /// let mut v: Vec<i32> = Vec::new(); - /// [1, 2][..].clone_into(&mut v); - /// ``` - #[unstable(feature = "toowned_clone_into", reason = "recently added", issue = "41263")] - fn clone_into(&self, target: &mut Self::Owned) { - *target = self.to_owned(); - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<T> ToOwned for T -where - T: Clone, -{ - type Owned = T; - fn to_owned(&self) -> T { - self.clone() - } - - fn clone_into(&self, target: &mut T) { - target.clone_from(self); - } -} - -/// A clone-on-write smart pointer. -/// -/// The type `Cow` is a smart pointer providing clone-on-write functionality: it -/// can enclose and provide immutable access to borrowed data, and clone the -/// data lazily when mutation or ownership is required. The type is designed to -/// work with general borrowed data via the `Borrow` trait. -/// -/// `Cow` implements `Deref`, which means that you can call -/// non-mutating methods directly on the data it encloses. If mutation -/// is desired, `to_mut` will obtain a mutable reference to an owned -/// value, cloning if necessary. -/// -/// If you need reference-counting pointers, note that -/// [`Rc::make_mut`][crate::rc::Rc::make_mut] and -/// [`Arc::make_mut`][crate::sync::Arc::make_mut] can provide clone-on-write -/// functionality as well. -/// -/// # Examples -/// -/// ``` -/// use std::borrow::Cow; -/// -/// fn abs_all(input: &mut Cow<[i32]>) { -/// for i in 0..input.len() { -/// let v = input[i]; -/// if v < 0 { -/// // Clones into a vector if not already owned. -/// input.to_mut()[i] = -v; -/// } -/// } -/// } -/// -/// // No clone occurs because `input` doesn't need to be mutated. 
-/// let slice = [0, 1, 2]; -/// let mut input = Cow::from(&slice[..]); -/// abs_all(&mut input); -/// -/// // Clone occurs because `input` needs to be mutated. -/// let slice = [-1, 0, 1]; -/// let mut input = Cow::from(&slice[..]); -/// abs_all(&mut input); -/// -/// // No clone occurs because `input` is already owned. -/// let mut input = Cow::from(vec![-1, 0, 1]); -/// abs_all(&mut input); -/// ``` -/// -/// Another example showing how to keep `Cow` in a struct: -/// -/// ``` -/// use std::borrow::Cow; -/// -/// struct Items<'a, X: 'a> where [X]: ToOwned<Owned = Vec<X>> { -/// values: Cow<'a, [X]>, -/// } -/// -/// impl<'a, X: Clone + 'a> Items<'a, X> where [X]: ToOwned<Owned = Vec<X>> { -/// fn new(v: Cow<'a, [X]>) -> Self { -/// Items { values: v } -/// } -/// } -/// -/// // Creates a container from borrowed values of a slice -/// let readonly = [1, 2]; -/// let borrowed = Items::new((&readonly[..]).into()); -/// match borrowed { -/// Items { values: Cow::Borrowed(b) } => println!("borrowed {b:?}"), -/// _ => panic!("expect borrowed value"), -/// } -/// -/// let mut clone_on_write = borrowed; -/// // Mutates the data from slice into owned vec and pushes a new value on top -/// clone_on_write.values.to_mut().push(3); -/// println!("clone_on_write = {:?}", clone_on_write.values); -/// -/// // The data was mutated. Let's check it out. -/// match clone_on_write { -/// Items { values: Cow::Owned(_) } => println!("clone_on_write contains owned data"), -/// _ => panic!("expect owned data"), -/// } -/// ``` -#[stable(feature = "rust1", since = "1.0.0")] -#[cfg_attr(not(test), rustc_diagnostic_item = "Cow")] -pub enum Cow<'a, B: ?Sized + 'a> -where - B: ToOwned, -{ - /// Borrowed data. - #[stable(feature = "rust1", since = "1.0.0")] - Borrowed(#[stable(feature = "rust1", since = "1.0.0")] &'a B), - - /// Owned data. 
- #[stable(feature = "rust1", since = "1.0.0")] - Owned(#[stable(feature = "rust1", since = "1.0.0")] <B as ToOwned>::Owned), -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized + ToOwned> Clone for Cow<'_, B> { - fn clone(&self) -> Self { - match *self { - Borrowed(b) => Borrowed(b), - Owned(ref o) => { - let b: &B = o.borrow(); - Owned(b.to_owned()) - } - } - } - - fn clone_from(&mut self, source: &Self) { - match (self, source) { - (&mut Owned(ref mut dest), &Owned(ref o)) => o.borrow().clone_into(dest), - (t, s) => *t = s.clone(), - } - } -} - -impl<B: ?Sized + ToOwned> Cow<'_, B> { - /// Returns true if the data is borrowed, i.e. if `to_mut` would require additional work. - /// - /// # Examples - /// - /// ``` - /// #![feature(cow_is_borrowed)] - /// use std::borrow::Cow; - /// - /// let cow = Cow::Borrowed("moo"); - /// assert!(cow.is_borrowed()); - /// - /// let bull: Cow<'_, str> = Cow::Owned("...moo?".to_string()); - /// assert!(!bull.is_borrowed()); - /// ``` - #[unstable(feature = "cow_is_borrowed", issue = "65143")] - #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")] - pub const fn is_borrowed(&self) -> bool { - match *self { - Borrowed(_) => true, - Owned(_) => false, - } - } - - /// Returns true if the data is owned, i.e. if `to_mut` would be a no-op. - /// - /// # Examples - /// - /// ``` - /// #![feature(cow_is_borrowed)] - /// use std::borrow::Cow; - /// - /// let cow: Cow<'_, str> = Cow::Owned("moo".to_string()); - /// assert!(cow.is_owned()); - /// - /// let bull = Cow::Borrowed("...moo?"); - /// assert!(!bull.is_owned()); - /// ``` - #[unstable(feature = "cow_is_borrowed", issue = "65143")] - #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")] - pub const fn is_owned(&self) -> bool { - !self.is_borrowed() - } - - /// Acquires a mutable reference to the owned form of the data. - /// - /// Clones the data if it is not already owned. 
- /// - /// # Examples - /// - /// ``` - /// use std::borrow::Cow; - /// - /// let mut cow = Cow::Borrowed("foo"); - /// cow.to_mut().make_ascii_uppercase(); - /// - /// assert_eq!( - /// cow, - /// Cow::Owned(String::from("FOO")) as Cow<str> - /// ); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - pub fn to_mut(&mut self) -> &mut <B as ToOwned>::Owned { - match *self { - Borrowed(borrowed) => { - *self = Owned(borrowed.to_owned()); - match *self { - Borrowed(..) => unreachable!(), - Owned(ref mut owned) => owned, - } - } - Owned(ref mut owned) => owned, - } - } - - /// Extracts the owned data. - /// - /// Clones the data if it is not already owned. - /// - /// # Examples - /// - /// Calling `into_owned` on a `Cow::Borrowed` returns a clone of the borrowed data: - /// - /// ``` - /// use std::borrow::Cow; - /// - /// let s = "Hello world!"; - /// let cow = Cow::Borrowed(s); - /// - /// assert_eq!( - /// cow.into_owned(), - /// String::from(s) - /// ); - /// ``` - /// - /// Calling `into_owned` on a `Cow::Owned` returns the owned data. The data is moved out of the - /// `Cow` without being cloned. 
- /// - /// ``` - /// use std::borrow::Cow; - /// - /// let s = "Hello world!"; - /// let cow: Cow<str> = Cow::Owned(String::from(s)); - /// - /// assert_eq!( - /// cow.into_owned(), - /// String::from(s) - /// ); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - pub fn into_owned(self) -> <B as ToOwned>::Owned { - match self { - Borrowed(borrowed) => borrowed.to_owned(), - Owned(owned) => owned, - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -#[rustc_const_unstable(feature = "const_deref", issue = "88955")] -impl<B: ?Sized + ToOwned> const Deref for Cow<'_, B> -where - B::Owned: ~const Borrow<B>, -{ - type Target = B; - - fn deref(&self) -> &B { - match *self { - Borrowed(borrowed) => borrowed, - Owned(ref owned) => owned.borrow(), - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> Eq for Cow<'_, B> where B: Eq + ToOwned {} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> Ord for Cow<'_, B> -where - B: Ord + ToOwned, -{ - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - Ord::cmp(&**self, &**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, 'b, B: ?Sized, C: ?Sized> PartialEq<Cow<'b, C>> for Cow<'a, B> -where - B: PartialEq<C> + ToOwned, - C: ToOwned, -{ - #[inline] - fn eq(&self, other: &Cow<'b, C>) -> bool { - PartialEq::eq(&**self, &**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, B: ?Sized> PartialOrd for Cow<'a, B> -where - B: PartialOrd + ToOwned, -{ - #[inline] - fn partial_cmp(&self, other: &Cow<'a, B>) -> Option<Ordering> { - PartialOrd::partial_cmp(&**self, &**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> fmt::Debug for Cow<'_, B> -where - B: fmt::Debug + ToOwned<Owned: fmt::Debug>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - Borrowed(ref b) => fmt::Debug::fmt(b, f), - Owned(ref o) => fmt::Debug::fmt(o, f), - } - } -} - -#[stable(feature = "rust1", since = 
"1.0.0")] -impl<B: ?Sized> fmt::Display for Cow<'_, B> -where - B: fmt::Display + ToOwned<Owned: fmt::Display>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - Borrowed(ref b) => fmt::Display::fmt(b, f), - Owned(ref o) => fmt::Display::fmt(o, f), - } - } -} - -#[stable(feature = "default", since = "1.11.0")] -impl<B: ?Sized> Default for Cow<'_, B> -where - B: ToOwned<Owned: Default>, -{ - /// Creates an owned Cow<'a, B> with the default value for the contained owned value. - fn default() -> Self { - Owned(<B as ToOwned>::Owned::default()) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> Hash for Cow<'_, B> -where - B: Hash + ToOwned, -{ - #[inline] - fn hash<H: Hasher>(&self, state: &mut H) { - Hash::hash(&**self, state) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<T: ?Sized + ToOwned> AsRef<T> for Cow<'_, T> { - fn as_ref(&self) -> &T { - self - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] -impl<'a> Add<&'a str> for Cow<'a, str> { - type Output = Cow<'a, str>; - - #[inline] - fn add(mut self, rhs: &'a str) -> Self::Output { - self += rhs; - self - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] -impl<'a> Add<Cow<'a, str>> for Cow<'a, str> { - type Output = Cow<'a, str>; - - #[inline] - fn add(mut self, rhs: Cow<'a, str>) -> Self::Output { - self += rhs; - self - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] -impl<'a> AddAssign<&'a str> for Cow<'a, str> { - fn add_assign(&mut self, rhs: &'a str) { - if self.is_empty() { - *self = Cow::Borrowed(rhs) - } else if !rhs.is_empty() { - if let Cow::Borrowed(lhs) = *self { - let mut s = String::with_capacity(lhs.len() + rhs.len()); - s.push_str(lhs); - *self = Cow::Owned(s); - } - self.to_mut().push_str(rhs); - } - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] 
-impl<'a> AddAssign<Cow<'a, str>> for Cow<'a, str> { - fn add_assign(&mut self, rhs: Cow<'a, str>) { - if self.is_empty() { - *self = rhs - } else if !rhs.is_empty() { - if let Cow::Borrowed(lhs) = *self { - let mut s = String::with_capacity(lhs.len() + rhs.len()); - s.push_str(lhs); - *self = Cow::Owned(s); - } - self.to_mut().push_str(&rhs); - } - } -} diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs index 233bcd5e4654..3aebf83c9967 100644 --- a/rust/alloc/lib.rs +++ b/rust/alloc/lib.rs @@ -100,7 +100,7 @@ #![cfg_attr(not(no_global_oom_handling), feature(const_alloc_error))] #![feature(const_box)] #![cfg_attr(not(no_global_oom_handling), feature(const_btree_new))] -#![feature(const_cow_is_borrowed)] +#![cfg_attr(not(no_borrow), feature(const_cow_is_borrowed))] #![feature(const_convert)] #![feature(const_size_of_val)] #![feature(const_align_of_val)] @@ -215,6 +215,7 @@ pub mod boxed; mod boxed { pub use std::boxed::Box; } +#[cfg(not(no_borrow))] pub mod borrow; pub mod collections; #[cfg(not(no_global_oom_handling))] diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs index 8ac6c1e3b2a8..f77c7368d534 100644 --- a/rust/alloc/vec/mod.rs +++ b/rust/alloc/vec/mod.rs @@ -72,6 +72,7 @@ use core::ptr::{self, NonNull}; use core::slice::{self, SliceIndex}; use crate::alloc::{Allocator, Global}; +#[cfg(not(no_borrow))] use crate::borrow::{Cow, ToOwned}; use crate::boxed::Box; use crate::collections::TryReserveError; @@ -94,6 +95,7 @@ pub use self::drain::Drain; mod drain; +#[cfg(not(no_borrow))] #[cfg(not(no_global_oom_handling))] mod cow; @@ -3103,6 +3105,7 @@ impl<T, const N: usize> From<[T; N]> for Vec<T> { } } +#[cfg(not(no_borrow))] #[stable(feature = "vec_from_cow_slice", since = "1.14.0")] impl<'a, T> From<Cow<'a, [T]>> for Vec<T> where diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index c48bc284214a..75d85bd6c592 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -7,6 +7,7 @@ */ #include 
<linux/slab.h> +#include <linux/refcount.h> /* `bindgen` gets confused at certain things. */ const gfp_t BINDINGS_GFP_KERNEL = GFP_KERNEL; diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 6c50ee62c56b..7b246454e009 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -41,6 +41,7 @@ mod bindings_raw { #[allow(dead_code)] mod bindings_helper { // Import the generated bindings for types. + use super::bindings_raw::*; include!(concat!( env!("OBJTREE"), "/rust/bindings/bindings_helpers_generated.rs" diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs index f8f39a3e6855..43378357ece9 100644 --- a/rust/compiler_builtins.rs +++ b/rust/compiler_builtins.rs @@ -28,7 +28,7 @@ macro_rules! define_panicking_intrinsics( ($reason: tt, { $($ident: ident, )* }) => { $( #[doc(hidden)] - #[no_mangle] + #[export_name = concat!("__rust", stringify!($ident))] pub extern "C" fn $ident() { panic!($reason); } @@ -61,3 +61,6 @@ define_panicking_intrinsics!("`u128` should not be used", { __udivti3, __umodti3, }); + +// NOTE: if you are adding a new intrinsic here, you should also add it to +// `redirect-intrinsics` in `rust/Makefile`. 
diff --git a/rust/helpers.c b/rust/helpers.c index b4f15eee2ffd..09a4d93f9d62 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -20,6 +20,7 @@ #include <linux/bug.h> #include <linux/build_bug.h> +#include <linux/refcount.h> __noreturn void rust_helper_BUG(void) { @@ -27,6 +28,24 @@ __noreturn void rust_helper_BUG(void) } EXPORT_SYMBOL_GPL(rust_helper_BUG); +refcount_t rust_helper_REFCOUNT_INIT(int n) +{ + return (refcount_t)REFCOUNT_INIT(n); +} +EXPORT_SYMBOL_GPL(rust_helper_REFCOUNT_INIT); + +void rust_helper_refcount_inc(refcount_t *r) +{ + refcount_inc(r); +} +EXPORT_SYMBOL_GPL(rust_helper_refcount_inc); + +bool rust_helper_refcount_dec_and_test(refcount_t *r) +{ + return refcount_dec_and_test(r); +} +EXPORT_SYMBOL_GPL(rust_helper_refcount_dec_and_test); + /* * We use `bindgen`'s `--size_t-is-usize` option to bind the C `size_t` type * as the Rust `usize` type, so we can use it in contexts where Rust diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 53040fa9e897..223564f9f0cc 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -13,7 +13,12 @@ #![no_std] #![feature(allocator_api)] +#![feature(coerce_unsized)] #![feature(core_ffi_c)] +#![feature(dispatch_from_dyn)] +#![feature(generic_associated_types)] +#![feature(receiver_trait)] +#![feature(unsize)] // Ensure conditional compilation based on the kernel configuration works; // otherwise we may silently break things like initcall handling. @@ -31,6 +36,7 @@ mod static_assert; #[doc(hidden)] pub mod std_vendor; pub mod str; +pub mod sync; pub mod types; #[doc(hidden)] diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs index 7a90249ee9b9..0bc1c97e5604 100644 --- a/rust/kernel/prelude.rs +++ b/rust/kernel/prelude.rs @@ -11,15 +11,21 @@ //! use kernel::prelude::*; //! 
``` +#[doc(no_inline)] pub use core::pin::Pin; +#[doc(no_inline)] pub use alloc::{boxed::Box, vec::Vec}; +#[doc(no_inline)] pub use macros::{module, vtable}; pub use super::build_assert; -pub use super::{dbg, pr_alert, pr_crit, pr_debug, pr_emerg, pr_err, pr_info, pr_notice, pr_warn}; +// `super::std_vendor` is hidden, which makes the macro inline for some reason. +#[doc(no_inline)] +pub use super::dbg; +pub use super::{pr_alert, pr_crit, pr_debug, pr_emerg, pr_err, pr_info, pr_notice, pr_warn}; pub use super::static_assert; diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs index 29bf9c2e8aee..30103325696d 100644 --- a/rust/kernel/print.rs +++ b/rust/kernel/print.rs @@ -142,17 +142,24 @@ pub fn call_printk_cont(args: fmt::Arguments<'_>) { macro_rules! print_macro ( // The non-continuation cases (most of them, e.g. `INFO`). ($format_string:path, false, $($arg:tt)+) => ( - // SAFETY: This hidden macro should only be called by the documented - // printing macros which ensure the format string is one of the fixed - // ones. All `__LOG_PREFIX`s are null-terminated as they are generated - // by the `module!` proc macro or fixed values defined in a kernel - // crate. - unsafe { - $crate::print::call_printk( - &$format_string, - crate::__LOG_PREFIX, - format_args!($($arg)+), - ); + // To remain sound, `arg`s must be expanded outside the `unsafe` block. + // Typically one would use a `let` binding for that; however, `format_args!` + // takes borrows on the arguments, but does not extend the scope of temporaries. + // Therefore, a `match` expression is used to keep them around, since + // the scrutinee is kept until the end of the `match`. + match format_args!($($arg)+) { + // SAFETY: This hidden macro should only be called by the documented + // printing macros which ensure the format string is one of the fixed + // ones. All `__LOG_PREFIX`s are null-terminated as they are generated + // by the `module!` proc macro or fixed values defined in a kernel + // crate. 
+ args => unsafe { + $crate::print::call_printk( + &$format_string, + crate::__LOG_PREFIX, + args, + ); + } } ); diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs new file mode 100644 index 000000000000..33da23e3076d --- /dev/null +++ b/rust/kernel/sync.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Synchronisation primitives. +//! +//! This module contains the kernel APIs related to synchronisation that have been ported or +//! wrapped for usage by Rust code in the kernel. + +mod arc; + +pub use arc::{Arc, ArcBorrow, UniqueArc}; diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs new file mode 100644 index 000000000000..f2f1c83d72ba --- /dev/null +++ b/rust/kernel/sync/arc.rs @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A reference-counted pointer. +//! +//! This module implements a way for users to create reference-counted objects and pointers to +//! them. Such a pointer automatically increments and decrements the count, and drops the +//! underlying object when it reaches zero. It is also safe to use concurrently from multiple +//! threads. +//! +//! It is different from the standard library's [`Arc`] in a few ways: +//! 1. It is backed by the kernel's `refcount_t` type. +//! 2. It does not support weak references, which allows it to be half the size. +//! 3. It saturates the reference count instead of aborting when it goes over a threshold. +//! 4. It does not provide a `get_mut` method, so the ref counted object is pinned. +//! +//! [`Arc`]: https://doc.rust-lang.org/std/sync/struct.Arc.html + +use crate::{ + bindings, + error::Result, + types::{ForeignOwnable, Opaque}, +}; +use alloc::boxed::Box; +use core::{ + marker::{PhantomData, Unsize}, + mem::{ManuallyDrop, MaybeUninit}, + ops::{Deref, DerefMut}, + pin::Pin, + ptr::NonNull, +}; + +/// A reference-counted pointer to an instance of `T`. 
+/// +/// The reference count is incremented when new instances of [`Arc`] are created, and decremented +/// when they are dropped. When the count reaches zero, the underlying `T` is also dropped. +/// +/// # Invariants +/// +/// The reference count on an instance of [`Arc`] is always non-zero. +/// The object pointed to by [`Arc`] is always pinned. +/// +/// # Examples +/// +/// ``` +/// use kernel::sync::Arc; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// // Create a ref-counted instance of `Example`. +/// let obj = Arc::try_new(Example { a: 10, b: 20 })?; +/// +/// // Get a new pointer to `obj` and increment the refcount. +/// let cloned = obj.clone(); +/// +/// // Assert that both `obj` and `cloned` point to the same underlying object. +/// assert!(core::ptr::eq(&*obj, &*cloned)); +/// +/// // Destroy `obj` and decrement its refcount. +/// drop(obj); +/// +/// // Check that the values are still accessible through `cloned`. +/// assert_eq!(cloned.a, 10); +/// assert_eq!(cloned.b, 20); +/// +/// // The refcount drops to zero when `cloned` goes out of scope, and the memory is freed. +/// ``` +/// +/// Using `Arc<T>` as the type of `self`: +/// +/// ``` +/// use kernel::sync::Arc; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// impl Example { +/// fn take_over(self: Arc<Self>) { +/// // ... +/// } +/// +/// fn use_reference(self: &Arc<Self>) { +/// // ... +/// } +/// } +/// +/// let obj = Arc::try_new(Example { a: 10, b: 20 })?; +/// obj.use_reference(); +/// obj.take_over(); +/// ``` +/// +/// Coercion from `Arc<Example>` to `Arc<dyn MyTrait>`: +/// +/// ``` +/// use kernel::sync::{Arc, ArcBorrow}; +/// +/// trait MyTrait { +/// // Trait has a function whose `self` type is `Arc<Self>`. +/// fn example1(self: Arc<Self>) {} +/// +/// // Trait has a function whose `self` type is `ArcBorrow<'_, Self>`. 
+/// fn example2(self: ArcBorrow<'_, Self>) {} +/// } +/// +/// struct Example; +/// impl MyTrait for Example {} +/// +/// // `obj` has type `Arc<Example>`. +/// let obj: Arc<Example> = Arc::try_new(Example)?; +/// +/// // `coerced` has type `Arc<dyn MyTrait>`. +/// let coerced: Arc<dyn MyTrait> = obj; +/// ``` +pub struct Arc<T: ?Sized> { + ptr: NonNull<ArcInner<T>>, + _p: PhantomData<ArcInner<T>>, +} + +#[repr(C)] +struct ArcInner<T: ?Sized> { + refcount: Opaque<bindings::refcount_t>, + data: T, +} + +// This is to allow [`Arc`] (and variants) to be used as the type of `self`. +impl<T: ?Sized> core::ops::Receiver for Arc<T> {} + +// This is to allow coercion from `Arc<T>` to `Arc<U>` if `T` can be converted to the +// dynamically-sized type (DST) `U`. +impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::CoerceUnsized<Arc<U>> for Arc<T> {} + +// This is to allow `Arc<U>` to be dispatched on when `Arc<T>` can be coerced into `Arc<U>`. +impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::DispatchFromDyn<Arc<U>> for Arc<T> {} + +// SAFETY: It is safe to send `Arc<T>` to another thread when the underlying `T` is `Sync` because +// it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs +// `T` to be `Send` because any thread that has an `Arc<T>` may ultimately access `T` directly, for +// example, when the reference count reaches zero and `T` is dropped. +unsafe impl<T: ?Sized + Sync + Send> Send for Arc<T> {} + +// SAFETY: It is safe to send `&Arc<T>` to another thread when the underlying `T` is `Sync` for the +// same reason as above. `T` needs to be `Send` as well because a thread can clone an `&Arc<T>` +// into an `Arc<T>`, which may lead to `T` being accessed by the same reasoning as above. +unsafe impl<T: ?Sized + Sync + Send> Sync for Arc<T> {} + +impl<T> Arc<T> { + /// Constructs a new reference counted instance of `T`. 
+ pub fn try_new(contents: T) -> Result<Self> { + // INVARIANT: The refcount is initialised to a non-zero value. + let value = ArcInner { + // SAFETY: There are no safety requirements for this FFI call. + refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + data: contents, + }; + + let inner = Box::try_new(value)?; + + // SAFETY: We just created `inner` with a reference count of 1, which is owned by the new + // `Arc` object. + Ok(unsafe { Self::from_inner(Box::leak(inner).into()) }) + } +} + +impl<T: ?Sized> Arc<T> { + /// Constructs a new [`Arc`] from an existing [`ArcInner`]. + /// + /// # Safety + /// + /// The caller must ensure that `inner` points to a valid location and has a non-zero reference + /// count, one of which will be owned by the new [`Arc`] instance. + unsafe fn from_inner(inner: NonNull<ArcInner<T>>) -> Self { + // INVARIANT: By the safety requirements, the invariants hold. + Arc { + ptr: inner, + _p: PhantomData, + } + } + + /// Returns an [`ArcBorrow`] from the given [`Arc`]. + /// + /// This is useful when the argument of a function call is an [`ArcBorrow`] (e.g., in a method + /// receiver), but we have an [`Arc`] instead. Getting an [`ArcBorrow`] is free when optimised. + #[inline] + pub fn as_arc_borrow(&self) -> ArcBorrow<'_, T> { + // SAFETY: The constraint that the lifetime of the shared reference must outlive that of + // the returned `ArcBorrow` ensures that the object remains alive and that no mutable + // reference can be created. + unsafe { ArcBorrow::new(self.ptr) } + } +} + +impl<T: 'static> ForeignOwnable for Arc<T> { + type Borrowed<'a> = ArcBorrow<'a, T>; + + fn into_foreign(self) -> *const core::ffi::c_void { + ManuallyDrop::new(self).ptr.as_ptr() as _ + } + + unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> ArcBorrow<'a, T> { + // SAFETY: By the safety requirement of this function, we know that `ptr` came from + // a previous call to `Arc::into_foreign`. 
+ let inner = NonNull::new(ptr as *mut ArcInner<T>).unwrap(); + + // SAFETY: The safety requirements of `from_foreign` ensure that the object remains alive + // for the lifetime of the returned value. Additionally, the safety requirements of + // `ForeignOwnable::borrow_mut` ensure that no new mutable references are created. + unsafe { ArcBorrow::new(inner) } + } + + unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self { + // SAFETY: By the safety requirement of this function, we know that `ptr` came from + // a previous call to `Arc::into_foreign`, which guarantees that `ptr` is valid and + // holds a reference count increment that is transferrable to us. + unsafe { Self::from_inner(NonNull::new(ptr as _).unwrap()) } + } +} + +impl<T: ?Sized> Deref for Arc<T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to dereference it. + unsafe { &self.ptr.as_ref().data } + } +} + +impl<T: ?Sized> Clone for Arc<T> { + fn clone(&self) -> Self { + // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero. + // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to increment the refcount. + unsafe { bindings::refcount_inc(self.ptr.as_ref().refcount.get()) }; + + // SAFETY: We just incremented the refcount. This increment is now owned by the new `Arc`. + unsafe { Self::from_inner(self.ptr) } + } +} + +impl<T: ?Sized> Drop for Arc<T> { + fn drop(&mut self) { + // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot + // touch `refcount` after it's decremented to a non-zero value because another thread/CPU + // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to + // freed/invalid memory as long as it is never dereferenced. 
+ let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); + + // INVARIANT: If the refcount reaches zero, there are no other instances of `Arc`, and + // this instance is being dropped, so the broken invariant is not observable. + // SAFETY: Also by the type invariant, we are allowed to decrement the refcount. + let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; + if is_zero { + // The count reached zero, we must free the memory. + // + // SAFETY: The pointer was initialised from the result of `Box::leak`. + unsafe { Box::from_raw(self.ptr.as_ptr()) }; + } + } +} + +impl<T: ?Sized> From<UniqueArc<T>> for Arc<T> { + fn from(item: UniqueArc<T>) -> Self { + item.inner + } +} + +impl<T: ?Sized> From<Pin<UniqueArc<T>>> for Arc<T> { + fn from(item: Pin<UniqueArc<T>>) -> Self { + // SAFETY: The type invariants of `Arc` guarantee that the data is pinned. + unsafe { Pin::into_inner_unchecked(item).inner } + } +} + +/// A borrowed reference to an [`Arc`] instance. +/// +/// For cases when one doesn't ever need to increment the refcount on the allocation, it is simpler +/// to use just `&T`, which we can trivially get from an `Arc<T>` instance. +/// +/// However, when one may need to increment the refcount, it is preferable to use an `ArcBorrow<T>` +/// over `&Arc<T>` because the latter results in a double-indirection: a pointer (shared reference) +/// to a pointer (`Arc<T>`) to the object (`T`). An [`ArcBorrow`] eliminates this double +/// indirection while still allowing one to increment the refcount and getting an `Arc<T>` when/if +/// needed. +/// +/// # Invariants +/// +/// There are no mutable references to the underlying [`Arc`], and it remains valid for the +/// lifetime of the [`ArcBorrow`] instance. 
+/// +/// # Example +/// +/// ``` +/// use crate::sync::{Arc, ArcBorrow}; +/// +/// struct Example; +/// +/// fn do_something(e: ArcBorrow<'_, Example>) -> Arc<Example> { +/// e.into() +/// } +/// +/// let obj = Arc::try_new(Example)?; +/// let cloned = do_something(obj.as_arc_borrow()); +/// +/// // Assert that both `obj` and `cloned` point to the same underlying object. +/// assert!(core::ptr::eq(&*obj, &*cloned)); +/// ``` +/// +/// Using `ArcBorrow<T>` as the type of `self`: +/// +/// ``` +/// use crate::sync::{Arc, ArcBorrow}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// impl Example { +/// fn use_reference(self: ArcBorrow<'_, Self>) { +/// // ... +/// } +/// } +/// +/// let obj = Arc::try_new(Example { a: 10, b: 20 })?; +/// obj.as_arc_borrow().use_reference(); +/// ``` +pub struct ArcBorrow<'a, T: ?Sized + 'a> { + inner: NonNull<ArcInner<T>>, + _p: PhantomData<&'a ()>, +} + +// This is to allow [`ArcBorrow`] (and variants) to be used as the type of `self`. +impl<T: ?Sized> core::ops::Receiver for ArcBorrow<'_, T> {} + +// This is to allow `ArcBorrow<U>` to be dispatched on when `ArcBorrow<T>` can be coerced into +// `ArcBorrow<U>`. +impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::DispatchFromDyn<ArcBorrow<'_, U>> + for ArcBorrow<'_, T> +{ +} + +impl<T: ?Sized> Clone for ArcBorrow<'_, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<T: ?Sized> Copy for ArcBorrow<'_, T> {} + +impl<T: ?Sized> ArcBorrow<'_, T> { + /// Creates a new [`ArcBorrow`] instance. + /// + /// # Safety + /// + /// Callers must ensure the following for the lifetime of the returned [`ArcBorrow`] instance: + /// 1. That `inner` remains valid; + /// 2. That no mutable references to `inner` are created. + unsafe fn new(inner: NonNull<ArcInner<T>>) -> Self { + // INVARIANT: The safety requirements guarantee the invariants. 
+ Self { + inner, + _p: PhantomData, + } + } +} + +impl<T: ?Sized> From<ArcBorrow<'_, T>> for Arc<T> { + fn from(b: ArcBorrow<'_, T>) -> Self { + // SAFETY: The existence of `b` guarantees that the refcount is non-zero. `ManuallyDrop` + // guarantees that `drop` isn't called, so it's ok that the temporary `Arc` doesn't own the + // increment. + ManuallyDrop::new(unsafe { Arc::from_inner(b.inner) }) + .deref() + .clone() + } +} + +impl<T: ?Sized> Deref for ArcBorrow<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariant, the underlying object is still alive with no mutable + // references to it, so it is safe to create a shared reference. + unsafe { &self.inner.as_ref().data } + } +} + +/// A refcounted object that is known to have a refcount of 1. +/// +/// It is mutable and can be converted to an [`Arc`] so that it can be shared. +/// +/// # Invariants +/// +/// `inner` always has a reference count of 1. +/// +/// # Examples +/// +/// In the following example, we make changes to the inner object before turning it into an +/// `Arc<Test>` object (after which point, it cannot be mutated directly). Note that `x.into()` +/// cannot fail. +/// +/// ``` +/// use kernel::sync::{Arc, UniqueArc}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result<Arc<Example>> { +/// let mut x = UniqueArc::try_new(Example { a: 10, b: 20 })?; +/// x.a += 1; +/// x.b += 1; +/// Ok(x.into()) +/// } +/// +/// # test().unwrap(); +/// ``` +/// +/// In the following example we first allocate memory for a ref-counted `Example` but we don't +/// initialise it on allocation. We do initialise it later with a call to [`UniqueArc::write`], +/// followed by a conversion to `Arc<Example>`. 
This is particularly useful when allocation happens +/// in one context (e.g., sleepable) and initialisation in another (e.g., atomic): +/// +/// ``` +/// use kernel::sync::{Arc, UniqueArc}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result<Arc<Example>> { +/// let x = UniqueArc::try_new_uninit()?; +/// Ok(x.write(Example { a: 10, b: 20 }).into()) +/// } +/// +/// # test().unwrap(); +/// ``` +/// +/// In the last example below, the caller gets a pinned instance of `Example` while converting to +/// `Arc<Example>`; this is useful in scenarios where one needs a pinned reference during +/// initialisation, for example, when initialising fields that are wrapped in locks. +/// +/// ``` +/// use kernel::sync::{Arc, UniqueArc}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result<Arc<Example>> { +/// let mut pinned = Pin::from(UniqueArc::try_new(Example { a: 10, b: 20 })?); +/// // We can modify `pinned` because it is `Unpin`. +/// pinned.as_mut().a += 1; +/// Ok(pinned.into()) +/// } +/// +/// # test().unwrap(); +/// ``` +pub struct UniqueArc<T: ?Sized> { + inner: Arc<T>, +} + +impl<T> UniqueArc<T> { + /// Tries to allocate a new [`UniqueArc`] instance. + pub fn try_new(value: T) -> Result<Self> { + Ok(Self { + // INVARIANT: The newly-created object has a ref-count of 1. + inner: Arc::try_new(value)?, + }) + } + + /// Tries to allocate a new [`UniqueArc`] instance whose contents are not initialised yet. + pub fn try_new_uninit() -> Result<UniqueArc<MaybeUninit<T>>> { + Ok(UniqueArc::<MaybeUninit<T>> { + // INVARIANT: The newly-created object has a ref-count of 1. + inner: Arc::try_new(MaybeUninit::uninit())?, + }) + } +} + +impl<T> UniqueArc<MaybeUninit<T>> { + /// Converts a `UniqueArc<MaybeUninit<T>>` into a `UniqueArc<T>` by writing a value into it. 
+ pub fn write(mut self, value: T) -> UniqueArc<T> { + self.deref_mut().write(value); + let inner = ManuallyDrop::new(self).inner.ptr; + UniqueArc { + // SAFETY: The new `Arc` is taking over `ptr` from `self.inner` (which won't be + // dropped). The types are compatible because `MaybeUninit<T>` is compatible with `T`. + inner: unsafe { Arc::from_inner(inner.cast()) }, + } + } +} + +impl<T: ?Sized> From<UniqueArc<T>> for Pin<UniqueArc<T>> { + fn from(obj: UniqueArc<T>) -> Self { + // SAFETY: It is not possible to move/replace `T` inside a `Pin<UniqueArc<T>>` (unless `T` + // is `Unpin`), so it is ok to convert it to `Pin<UniqueArc<T>>`. + unsafe { Pin::new_unchecked(obj) } + } +} + +impl<T: ?Sized> Deref for UniqueArc<T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +impl<T: ?Sized> DerefMut for UniqueArc<T> { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: By the `Arc` type invariant, there is necessarily a reference to the object, so + // it is safe to dereference it. Additionally, we know there is only one reference when + // it's inside a `UniqueArc`, so it is safe to get a mutable reference. + unsafe { &mut self.inner.ptr.as_mut().data } + } +} diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index e84e51ec9716..9d0fdbc55843 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -2,7 +2,220 @@ //! Kernel types. -use core::{cell::UnsafeCell, mem::MaybeUninit}; +use alloc::boxed::Box; +use core::{ + cell::UnsafeCell, + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; + +/// Used to transfer ownership to and from foreign (non-Rust) languages. +/// +/// Ownership is transferred from Rust to a foreign language by calling [`Self::into_foreign`] and +/// later may be transferred back to Rust by calling [`Self::from_foreign`]. +/// +/// This trait is meant to be used in cases when Rust objects are stored in C objects and +/// eventually "freed" back to Rust. 
+pub trait ForeignOwnable: Sized { + /// Type of values borrowed between calls to [`ForeignOwnable::into_foreign`] and + /// [`ForeignOwnable::from_foreign`]. + type Borrowed<'a>; + + /// Converts a Rust-owned object to a foreign-owned one. + /// + /// The foreign representation is a pointer to void. + fn into_foreign(self) -> *const core::ffi::c_void; + + /// Borrows a foreign-owned object. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for + /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow_mut`] + /// for this object must have been dropped. + unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> Self::Borrowed<'a>; + + /// Mutably borrows a foreign-owned object. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for + /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] and + /// [`ForeignOwnable::borrow_mut`] for this object must have been dropped. + unsafe fn borrow_mut(ptr: *const core::ffi::c_void) -> ScopeGuard<Self, fn(Self)> { + // SAFETY: The safety requirements ensure that `ptr` came from a previous call to + // `into_foreign`. + ScopeGuard::new_with_data(unsafe { Self::from_foreign(ptr) }, |d| { + d.into_foreign(); + }) + } + + /// Converts a foreign-owned object back to a Rust-owned one. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for + /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] and + /// [`ForeignOwnable::borrow_mut`] for this object must have been dropped. 
+ unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self; +} + +impl<T: 'static> ForeignOwnable for Box<T> { + type Borrowed<'a> = &'a T; + + fn into_foreign(self) -> *const core::ffi::c_void { + Box::into_raw(self) as _ + } + + unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> &'a T { + // SAFETY: The safety requirements for this function ensure that the object is still alive, + // so it is safe to dereference the raw pointer. + // The safety requirements of `from_foreign` also ensure that the object remains alive for + // the lifetime of the returned value. + unsafe { &*ptr.cast() } + } + + unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self { + // SAFETY: The safety requirements of this function ensure that `ptr` comes from a previous + // call to `Self::into_foreign`. + unsafe { Box::from_raw(ptr as _) } + } +} + +impl ForeignOwnable for () { + type Borrowed<'a> = (); + + fn into_foreign(self) -> *const core::ffi::c_void { + core::ptr::NonNull::dangling().as_ptr() + } + + unsafe fn borrow<'a>(_: *const core::ffi::c_void) -> Self::Borrowed<'a> {} + + unsafe fn from_foreign(_: *const core::ffi::c_void) -> Self {} +} + +/// Runs a cleanup function/closure when dropped. +/// +/// The [`ScopeGuard::dismiss`] function prevents the cleanup function from running. 
+/// +/// # Examples +/// +/// In the example below, we have multiple exit paths and we want to log regardless of which one is +/// taken: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example1(arg: bool) { +/// let _log = ScopeGuard::new(|| pr_info!("example1 completed\n")); +/// +/// if arg { +/// return; +/// } +/// +/// pr_info!("Do something...\n"); +/// } +/// +/// # example1(false); +/// # example1(true); +/// ``` +/// +/// In the example below, we want to log the same message on all early exits but a different one on +/// the main exit path: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example2(arg: bool) { +/// let log = ScopeGuard::new(|| pr_info!("example2 returned early\n")); +/// +/// if arg { +/// return; +/// } +/// +/// // (Other early returns...) +/// +/// log.dismiss(); +/// pr_info!("example2 no early return\n"); +/// } +/// +/// # example2(false); +/// # example2(true); +/// ``` +/// +/// In the example below, we need a mutable object (the vector) to be accessible within the log +/// function, so we wrap it in the [`ScopeGuard`]: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example3(arg: bool) -> Result { +/// let mut vec = +/// ScopeGuard::new_with_data(Vec::new(), |v| pr_info!("vec had {} elements\n", v.len())); +/// +/// vec.try_push(10u8)?; +/// if arg { +/// return Ok(()); +/// } +/// vec.try_push(20u8)?; +/// Ok(()) +/// } +/// +/// # assert_eq!(example3(false), Ok(())); +/// # assert_eq!(example3(true), Ok(())); +/// ``` +/// +/// # Invariants +/// +/// The value stored in the struct is nearly always `Some(_)`, except between +/// [`ScopeGuard::dismiss`] and [`ScopeGuard::drop`]: in this case, it will be `None` as the value +/// will have been returned to the caller. Since [`ScopeGuard::dismiss`] consumes the guard, +/// callers won't be able to use it anymore. 
+pub struct ScopeGuard<T, F: FnOnce(T)>(Option<(T, F)>); + +impl<T, F: FnOnce(T)> ScopeGuard<T, F> { + /// Creates a new guarded object wrapping the given data and with the given cleanup function. + pub fn new_with_data(data: T, cleanup_func: F) -> Self { + // INVARIANT: The struct is being initialised with `Some(_)`. + Self(Some((data, cleanup_func))) + } + + /// Prevents the cleanup function from running and returns the guarded data. + pub fn dismiss(mut self) -> T { + // INVARIANT: This is the exception case in the invariant; it is not visible to callers + // because this function consumes `self`. + self.0.take().unwrap().0 + } +} + +impl ScopeGuard<(), fn(())> { + /// Creates a new guarded object with the given cleanup function. + pub fn new(cleanup: impl FnOnce()) -> ScopeGuard<(), impl FnOnce(())> { + ScopeGuard::new_with_data((), move |_| cleanup()) + } +} + +impl<T, F: FnOnce(T)> Deref for ScopeGuard<T, F> { + type Target = T; + + fn deref(&self) -> &T { + // The type invariants guarantee that `unwrap` will succeed. + &self.0.as_ref().unwrap().0 + } +} + +impl<T, F: FnOnce(T)> DerefMut for ScopeGuard<T, F> { + fn deref_mut(&mut self) -> &mut T { + // The type invariants guarantee that `unwrap` will succeed. + &mut self.0.as_mut().unwrap().0 + } +} + +impl<T, F: FnOnce(T)> Drop for ScopeGuard<T, F> { + fn drop(&mut self) { + // Run the cleanup function if one is still present. + if let Some((data, cleanup)) = self.0.take() { + cleanup(data) + } + } +} /// Stores an opaque value. 
/// diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index d52370cad0b6..a825dbd2c9cf 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -152,6 +152,7 @@ static void __exit ftrace_direct_multi_exit(void) { kthread_stop(simple_tsk); unregister_ftrace_direct_multi(&direct, my_tramp); + ftrace_free_filter(&direct); } module_init(ftrace_direct_multi_init); diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index ec1088922517..d955a2650605 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -79,6 +79,7 @@ static int __init ftrace_direct_multi_init(void) static void __exit ftrace_direct_multi_exit(void) { unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp); + ftrace_free_filter(&direct); } module_init(ftrace_direct_multi_init); diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 836391e5d209..4815a8e32227 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -66,9 +66,13 @@ endif # Don't stop modules_install even if we can't sign external modules. 
# ifeq ($(CONFIG_MODULE_SIG_ALL),y) +ifeq ($(filter pkcs11:%, $(CONFIG_MODULE_SIG_KEY)),) sig-key := $(if $(wildcard $(CONFIG_MODULE_SIG_KEY)),,$(srctree)/)$(CONFIG_MODULE_SIG_KEY) +else +sig-key := $(CONFIG_MODULE_SIG_KEY) +endif quiet_cmd_sign = SIGN $@ - cmd_sign = scripts/sign-file $(CONFIG_MODULE_SIG_HASH) $(sig-key) certs/signing_key.x509 $@ \ + cmd_sign = scripts/sign-file $(CONFIG_MODULE_SIG_HASH) "$(sig-key)" certs/signing_key.x509 $@ \ $(if $(KBUILD_EXTMOD),|| true) else quiet_cmd_sign := diff --git a/scripts/atomic/atomics.tbl b/scripts/atomic/atomics.tbl index fbee2f6190d9..fbee2f6190d9 100755..100644 --- a/scripts/atomic/atomics.tbl +++ b/scripts/atomic/atomics.tbl diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 9a1895747b15..84c730da36dd 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -71,7 +71,9 @@ #include "varasm.h" #include "stor-layout.h" #include "internal-fn.h" +#include "gimple.h" #include "gimple-expr.h" +#include "gimple-iterator.h" #include "gimple-fold.h" #include "context.h" #include "tree-ssa-alias.h" @@ -85,10 +87,8 @@ #include "tree-eh.h" #include "stmt.h" #include "gimplify.h" -#include "gimple.h" #include "tree-phinodes.h" #include "tree-cfg.h" -#include "gimple-iterator.h" #include "gimple-ssa.h" #include "ssa-iterators.h" diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 15fc4626d236..9ee99f9fae8d 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -163,7 +163,7 @@ def get_current_task(cpu): task_ptr_type = task_type.get_type().pointer() if utils.is_target_arch("x86"): - var_ptr = gdb.parse_and_eval("&current_task") + var_ptr = gdb.parse_and_eval("&pcpu_hot.current_task") return per_cpu(var_ptr, cpu).dereference() elif utils.is_target_arch("aarch64"): current_task_addr = gdb.parse_and_eval("$SP_EL0") diff --git a/scripts/tracing/ftrace-bisect.sh b/scripts/tracing/ftrace-bisect.sh index 926701162bc8..bb4f59262bbe
100755 --- a/scripts/tracing/ftrace-bisect.sh +++ b/scripts/tracing/ftrace-bisect.sh @@ -12,7 +12,7 @@ # (note, if this is a problem with function_graph tracing, then simply # replace "function" with "function_graph" in the following steps). # -# # cd /sys/kernel/debug/tracing +# # cd /sys/kernel/tracing # # echo schedule > set_ftrace_filter # # echo function > current_tracer # @@ -20,22 +20,40 @@ # # # echo nop > current_tracer # -# # cat available_filter_functions > ~/full-file +# Starting with v5.1 this can be done with numbers, making it much faster: +# +# The old (slow) way, for kernels before v5.1. +# +# [old-way] # cat available_filter_functions > ~/full-file +# +# [old-way] *** Note *** this process will take several minutes to update the +# [old-way] filters. Setting multiple functions is an O(n^2) operation, and we +# [old-way] are dealing with thousands of functions. So go have coffee, talk +# [old-way] with your coworkers, read facebook. And eventually, this operation +# [old-way] will end. +# +# The new way (using numbers) is an O(n) operation, and usually takes less than a second. +# +# seq `wc -l available_filter_functions | cut -d' ' -f1` > ~/full-file +# +# This will create a sequence of numbers that match the functions in +# available_filter_functions, and when echoing in a number into the +# set_ftrace_filter file, it will enable the corresponding function in +# O(1) time. Making enabling all functions O(n) where n is the number of +# functions to enable. +# +# For either the new or old way, the rest of the operations remain the same. +# # # ftrace-bisect ~/full-file ~/test-file ~/non-test-file # # cat ~/test-file > set_ftrace_filter # -# *** Note *** this will take several minutes. Setting multiple functions is -# an O(n^2) operation, and we are dealing with thousands of functions. So go -# have coffee, talk with your coworkers, read facebook. And eventually, this -# operation will end. 
-# # # echo function > current_tracer # # If it crashes, we know that ~/test-file has a bad function. # # Reboot back to test kernel. # -# # cd /sys/kernel/debug/tracing +# # cd /sys/kernel/tracing # # mv ~/test-file ~/full-file # # If it didn't crash. diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 424b2c1e586d..db7a51acf9db 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1793,7 +1793,7 @@ fail2: return error; } -static int ns_mkdir_op(struct user_namespace *mnt_userns, struct inode *dir, +static int ns_mkdir_op(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct aa_ns *ns, *parent; diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 6dd3cc5309bf..f3715cda59c5 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -313,7 +313,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, d = bprm->file->f_path.dentry; for (i = 0; i < attach->xattr_count; i++) { - size = vfs_getxattr_alloc(&init_user_ns, d, attach->xattrs[i], + size = vfs_getxattr_alloc(&nop_mnt_idmap, d, attach->xattrs[i], &value, value_size, GFP_KERNEL); if (size >= 0) { u32 index, perm; @@ -862,7 +862,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm) const char *info = NULL; int error = 0; bool unsafe = false; - vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_user_ns(bprm->file), + vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(bprm->file), file_inode(bprm->file)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), diff --git a/security/apparmor/file.c b/security/apparmor/file.c index cb3d3060d104..9119ddda6217 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -459,7 +459,7 @@ static int __file_path_perm(const char *op, struct aa_label *label, { struct aa_profile *profile; struct aa_perms perms = {}; - vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_user_ns(file), + vfsuid_t vfsuid = 
i_uid_into_vfsuid(file_mnt_idmap(file), file_inode(file)); struct path_cond cond = { .uid = vfsuid_into_kuid(vfsuid), diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index c6728a629437..d6cc4812ca53 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -227,8 +227,7 @@ static int common_perm(const char *op, const struct path *path, u32 mask, */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { - struct user_namespace *mnt_userns = mnt_user_ns(path->mnt); - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), @@ -273,14 +272,13 @@ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); - struct user_namespace *mnt_userns = mnt_user_ns(dir->mnt); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; @@ -379,7 +377,7 @@ static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_d label = begin_current_label_crit_section(); if (!unconfined(label)) { - struct user_namespace *mnt_userns = mnt_user_ns(old_dir->mnt); + struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; @@ -388,14 +386,14 @@ static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_d struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; - vfsuid = i_uid_into_vfsuid(mnt_userns, d_backing_inode(old_dentry)); + vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond 
cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; - vfsuid = i_uid_into_vfsuid(mnt_userns, d_backing_inode(old_dentry)); + vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, label, &new_path, 0, @@ -460,13 +458,13 @@ static int apparmor_file_open(struct file *file) label = aa_get_newest_cred_label(file->f_cred); if (!unconfined(label)) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, label, &file->f_path, 0, diff --git a/security/apparmor/policy_compat.c b/security/apparmor/policy_compat.c index 9e52e218bf30..cc89d1e88fb7 100644 --- a/security/apparmor/policy_compat.c +++ b/security/apparmor/policy_compat.c @@ -160,8 +160,7 @@ static struct aa_perms *compute_fperms(struct aa_dfa *dfa) if (!table) return NULL; - /* zero init so skip the trap state (state == 0) */ - for (state = 1; state < state_count; state++) { + for (state = 0; state < state_count; state++) { table[state * 2] = compute_fperms_user(dfa, state); table[state * 2 + 1] = compute_fperms_other(dfa, state); } diff --git a/security/commoncap.c b/security/commoncap.c index 1164278b97fd..aec62db55271 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -305,24 +305,24 @@ int cap_inode_need_killpriv(struct dentry *dentry) /** * cap_inode_killpriv - Erase the security markings on an inode * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. 
* - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: 0 if successful, -ve on error. */ -int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry) +int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { int error; - error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS); + error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; @@ -377,7 +377,7 @@ static bool is_v3header(int size, const struct vfs_cap_data *cap) * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ -int cap_inode_getsecurity(struct user_namespace *mnt_userns, +int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -398,7 +398,7 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns, dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; - size = vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS, &tmpbuf, + size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf, sizeof(struct vfs_ns_cap_data), GFP_NOFS); dput(dentry); /* gcc11 complains if we don't check for !tmpbuf */ @@ -420,7 +420,7 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns, kroot = make_kuid(fs_ns, root); /* If this is an idmapped mount shift the kuid.
*/ - vfsroot = make_vfsuid(mnt_userns, fs_ns, kroot); + vfsroot = make_vfsuid(idmap, fs_ns, kroot); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ @@ -510,7 +510,7 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap) /** * cap_convert_nscap - check vfs caps * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: used to retrieve inode to check permissions on * @ivalue: vfs caps value which may be modified by this function * @size: size of @ivalue @@ -518,15 +518,15 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap) * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: On success, return the new size; on error, return < 0.
*/ -int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, +int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; @@ -544,9 +544,9 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, return -EINVAL; if (!validheader(size, cap)) return -EINVAL; - if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP)) + if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; - if (size == XATTR_CAPS_SZ_2 && (mnt_userns == fs_ns)) + if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; @@ -555,7 +555,7 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, if (!vfsuid_valid(vfsrootid)) return -EINVAL; - rootid = from_vfsuid(mnt_userns, fs_ns, vfsrootid); + rootid = from_vfsuid(idmap, fs_ns, vfsrootid); if (!uid_valid(rootid)) return -EINVAL; @@ -626,19 +626,19 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /** * get_vfs_caps_from_disk - retrieve vfs caps from disk * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry from which @inode is retrieved * @cpu_caps: vfs capabilities * * Extract the on-exec-apply capability sets for an executable file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. 
On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. */ -int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, +int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { @@ -695,7 +695,7 @@ int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, return -EINVAL; } - rootvfsuid = make_vfsuid(mnt_userns, fs_ns, rootkuid); + rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid); if (!vfsuid_valid(rootvfsuid)) return -ENODATA; @@ -747,7 +747,7 @@ static int get_file_caps(struct linux_binprm *bprm, struct file *file, if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; - rc = get_vfs_caps_from_disk(file_mnt_user_ns(file), + rc = get_vfs_caps_from_disk(file_mnt_idmap(file), file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) @@ -1016,23 +1016,23 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, /** * cap_inode_removexattr - Determine whether an xattr may be removed * - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions.
On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ -int cap_inode_removexattr(struct user_namespace *mnt_userns, +int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; @@ -1047,7 +1047,7 @@ int cap_inode_removexattr(struct user_namespace *mnt_userns, struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; - if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP)) + if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; return 0; } diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index fa5ff13fa8c9..52b811da6989 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -265,7 +265,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, req_xattr_value_len); continue; } - size = vfs_getxattr_alloc(&init_user_ns, dentry, xattr->name, + size = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr->name, &xattr_value, xattr_size, GFP_NOFS); if (size == -ENOMEM) { error = -ENOMEM; @@ -274,7 +274,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, if (size < 0) continue; - user_space_size = vfs_getxattr(&init_user_ns, dentry, + user_space_size = vfs_getxattr(&nop_mnt_idmap, dentry, xattr->name, NULL, 0); if (user_space_size != size) pr_debug("file %s: xattr %s size mismatch (kernel: %d, user: %d)\n", @@ -331,7 +331,7 @@ static int evm_is_immutable(struct dentry *dentry, struct inode *inode) return 1; /* Do this the hard way */ - rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM, + rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { if (rc == 
-ENODATA) @@ -376,12 +376,12 @@ int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name, xattr_value_len, &data); if (rc == 0) { data.hdr.xattr.sha1.type = EVM_XATTR_HMAC; - rc = __vfs_setxattr_noperm(&init_user_ns, dentry, + rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, &data.hdr.xattr.data[1], SHA1_DIGEST_SIZE + 1, 0); } else if (rc == -ENODATA && (inode->i_opflags & IOP_XATTR)) { - rc = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_EVM); + rc = __vfs_removexattr(&nop_mnt_idmap, dentry, XATTR_NAME_EVM); } return rc; } diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index f02e609460e2..cf24c5255583 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -184,7 +184,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ - rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM, + rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { evm_status = INTEGRITY_FAIL; @@ -436,7 +436,7 @@ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) /* * evm_xattr_change - check if passed xattr value differs from current value - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: requested xattr * @xattr_value: requested xattr value @@ -446,14 +446,14 @@ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) * * Returns 1 if passed xattr value differs from current value, 0 otherwise. 
*/ -static int evm_xattr_change(struct user_namespace *mnt_userns, +static int evm_xattr_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { char *xattr_data = NULL; int rc = 0; - rc = vfs_getxattr_alloc(&init_user_ns, dentry, xattr_name, &xattr_data, + rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name, &xattr_data, 0, GFP_NOFS); if (rc < 0) { rc = 1; @@ -482,7 +482,7 @@ out: * For posix xattr acls only, permit security.evm, even if it currently * doesn't exist, to be updated unless the EVM signature is immutable. */ -static int evm_protect_xattr(struct user_namespace *mnt_userns, +static int evm_protect_xattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { @@ -538,7 +538,7 @@ out: return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_xattr_change(mnt_userns, dentry, xattr_name, xattr_value, + !evm_xattr_change(idmap, dentry, xattr_name, xattr_value, xattr_value_len)) return 0; @@ -553,7 +553,7 @@ out: /** * evm_inode_setxattr - protect the EVM extended attribute - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value @@ -565,7 +565,7 @@ out: * userspace from writing HMAC value. Writing 'security.evm' requires * requires CAP_SYS_ADMIN privileges. 
*/ -int evm_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { @@ -584,20 +584,20 @@ int evm_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) return -EPERM; } - return evm_protect_xattr(mnt_userns, dentry, xattr_name, xattr_value, + return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_removexattr - protect the EVM extended attribute - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that * the current value is valid. */ -int evm_inode_removexattr(struct user_namespace *mnt_userns, +int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { /* Policy permits modification of the protected xattrs even though @@ -606,11 +606,11 @@ int evm_inode_removexattr(struct user_namespace *mnt_userns, if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; - return evm_protect_xattr(mnt_userns, dentry, xattr_name, NULL, 0); + return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0); } #ifdef CONFIG_FS_POSIX_ACL -static int evm_inode_set_acl_change(struct user_namespace *mnt_userns, +static int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { @@ -622,14 +622,14 @@ static int evm_inode_set_acl_change(struct user_namespace *mnt_userns, if (!kacl) return 1; - rc = posix_acl_update_mode(mnt_userns, inode, &mode, &kacl); + rc = posix_acl_update_mode(idmap, inode, &mode, &kacl); if (rc || (inode->i_mode != mode)) return 1; return 0; } #else -static inline int 
evm_inode_set_acl_change(struct user_namespace *mnt_userns, +static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) @@ -640,7 +640,7 @@ static inline int evm_inode_set_acl_change(struct user_namespace *mnt_userns, /** * evm_inode_set_acl - protect the EVM extended attribute from posix acls - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls @@ -649,7 +649,7 @@ static inline int evm_inode_set_acl_change(struct user_namespace *mnt_userns, * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. */ -int evm_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { enum integrity_status evm_status; @@ -678,7 +678,7 @@ int evm_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_inode_set_acl_change(mnt_userns, dentry, acl_name, kacl)) + !evm_inode_set_acl_change(idmap, dentry, acl_name, kacl)) return 0; if (evm_status != INTEGRITY_PASS_IMMUTABLE) @@ -779,14 +779,14 @@ void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) evm_update_evmxattr(dentry, xattr_name, NULL, 0); } -static int evm_attr_change(struct user_namespace *mnt_userns, +static int evm_attr_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; - if (!i_uid_needs_update(mnt_userns, attr, inode) && - !i_gid_needs_update(mnt_userns, attr, inode) && + if (!i_uid_needs_update(idmap, attr, inode) && + !i_gid_needs_update(idmap, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; 
@@ -800,7 +800,7 @@ static int evm_attr_change(struct user_namespace *mnt_userns, * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ -int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; @@ -827,7 +827,7 @@ int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_attr_change(mnt_userns, dentry, attr)) + !evm_attr_change(idmap, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), diff --git a/security/integrity/evm/evm_secfs.c b/security/integrity/evm/evm_secfs.c index 8a9db7dfca7e..9b907c2fee60 100644 --- a/security/integrity/evm/evm_secfs.c +++ b/security/integrity/evm/evm_secfs.c @@ -228,7 +228,7 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf, newattrs.ia_valid = ATTR_MODE; inode = evm_xattrs->d_inode; inode_lock(inode); - err = simple_setattr(&init_user_ns, evm_xattrs, &newattrs); + err = simple_setattr(&nop_mnt_idmap, evm_xattrs, &newattrs); inode_unlock(inode); if (!err) err = count; diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 03b440921e61..d8530e722515 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -254,7 +254,7 @@ static inline void ima_process_queued_keys(void) {} #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */ /* LIM API function definitions */ -int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, +int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, @@ -268,7 +268,7 @@ void ima_store_measurement(struct integrity_iint_cache *iint, 
struct file *file, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); -int process_buffer_measurement(struct user_namespace *mnt_userns, +int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, @@ -285,7 +285,7 @@ void ima_free_template_entry(struct ima_template_entry *entry); const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ -int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, +int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, @@ -318,7 +318,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig); -int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, +int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, @@ -346,7 +346,7 @@ static inline int ima_appraise_measurement(enum ima_hooks func, return INTEGRITY_UNKNOWN; } -static inline int ima_must_appraise(struct user_namespace *mnt_userns, +static inline int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index c1e76282b5ee..9345fd66f5b8 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -163,7 +163,7 @@ err_out: /** * ima_get_action - appraise & measure decision 
based on policy. - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @secid: secid of the task being validated @@ -186,7 +186,7 @@ err_out: * Returns IMA_MEASURE, IMA_APPRAISE mask. * */ -int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, +int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, @@ -196,7 +196,7 @@ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, flags &= ima_policy_flag; - return ima_match_policy(mnt_userns, inode, cred, secid, func, mask, + return ima_match_policy(idmap, inode, cred, secid, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index ee6f7e237f2e..555342d337f9 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -70,7 +70,7 @@ bool is_ima_appraise_enabled(void) * * Return 1 to appraise or hash */ -int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, +int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { u32 secid; @@ -79,7 +79,7 @@ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, return 0; security_current_getsecid_subj(&secid); - return ima_match_policy(mnt_userns, inode, current_cred(), secid, + return ima_match_policy(idmap, inode, current_cred(), secid, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } @@ -98,7 +98,7 @@ static int ima_fix_xattr(struct dentry *dentry, iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } - rc = 
__vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_IMA, + rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); @@ -225,7 +225,7 @@ int ima_read_xattr(struct dentry *dentry, { int ret; - ret = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_IMA, + ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; @@ -456,7 +456,7 @@ int ima_check_blacklist(struct integrity_iint_cache *iint, rc = is_binary_blacklisted(digest, digestsize); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) - process_buffer_measurement(&init_user_ns, NULL, digest, digestsize, + process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); } @@ -622,7 +622,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) /** * ima_inode_post_setattr - reflect file metadata changes - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * * Changes to a dentry's metadata might result in needing to appraise. @@ -630,7 +630,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. 
*/ -void ima_inode_post_setattr(struct user_namespace *mnt_userns, +void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); @@ -641,7 +641,7 @@ void ima_inode_post_setattr(struct user_namespace *mnt_userns, || !(inode->i_opflags & IOP_XATTR)) return; - action = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, POST_SETATTR); + action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = integrity_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); @@ -774,7 +774,7 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, return result; } -int ima_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) diff --git a/security/integrity/ima/ima_asymmetric_keys.c b/security/integrity/ima/ima_asymmetric_keys.c index f6aa0b47a772..caacfe6860b1 100644 --- a/security/integrity/ima/ima_asymmetric_keys.c +++ b/security/integrity/ima/ima_asymmetric_keys.c @@ -60,7 +60,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key, * if the IMA policy is configured to measure a key linked * to the given keyring. */ - process_buffer_measurement(&init_user_ns, NULL, payload, payload_len, + process_buffer_measurement(&nop_mnt_idmap, NULL, payload, payload_len, keyring->description, KEY_CHECK, 0, keyring->description, false, NULL, 0); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 377300973e6c..358578267fea 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -224,7 +224,7 @@ static int process_measurement(struct file *file, const struct cred *cred, * bitmask based on the appraise/audit/measurement policy. * Included is the appraise submask. 
*/ - action = ima_get_action(file_mnt_user_ns(file), inode, cred, secid, + action = ima_get_action(file_mnt_idmap(file), inode, cred, secid, mask, func, &pcr, &template_desc, NULL, &allowed_algos); violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) && @@ -451,7 +451,7 @@ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot) security_current_getsecid_subj(&secid); inode = file_inode(vma->vm_file); - action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode, + action = ima_get_action(file_mnt_idmap(vma->vm_file), inode, current_cred(), secid, MAY_EXEC, MMAP_CHECK, &pcr, &template, NULL, NULL); @@ -638,14 +638,14 @@ EXPORT_SYMBOL_GPL(ima_inode_hash); /** * ima_post_create_tmpfile - mark newly created tmpfile as new - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode of the newly created tmpfile * * No measuring, appraising or auditing of newly created tmpfiles is needed. * Skip calling process_measurement(), but indicate which newly, created * tmpfiles are in policy. */ -void ima_post_create_tmpfile(struct user_namespace *mnt_userns, +void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { struct integrity_iint_cache *iint; @@ -654,7 +654,7 @@ void ima_post_create_tmpfile(struct user_namespace *mnt_userns, if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; - must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, + must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK); if (!must_appraise) return; @@ -671,13 +671,13 @@ void ima_post_create_tmpfile(struct user_namespace *mnt_userns, /** * ima_post_path_mknod - mark as a new inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: newly created dentry * * Mark files created via the mknodat syscall as new, so that the * file data can be written later. 
*/ -void ima_post_path_mknod(struct user_namespace *mnt_userns, +void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { struct integrity_iint_cache *iint; @@ -687,7 +687,7 @@ void ima_post_path_mknod(struct user_namespace *mnt_userns, if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; - must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, + must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK); if (!must_appraise) return; @@ -869,7 +869,7 @@ int ima_post_load_data(char *buf, loff_t size, /** * process_buffer_measurement - Measure the buffer or the buffer data hash - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode associated with the object being measured (NULL for KEY_CHECK) * @buf: pointer to the buffer that needs to be added to the log. * @size: size of buffer(in bytes). @@ -887,7 +887,7 @@ int ima_post_load_data(char *buf, loff_t size, * has been written to the passed location but not added to a measurement entry, * a negative value otherwise. 
*/ -int process_buffer_measurement(struct user_namespace *mnt_userns, +int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, @@ -931,7 +931,7 @@ int process_buffer_measurement(struct user_namespace *mnt_userns, */ if (func) { security_current_getsecid_subj(&secid); - action = ima_get_action(mnt_userns, inode, current_cred(), + action = ima_get_action(idmap, inode, current_cred(), secid, 0, func, &pcr, &template, func_data, NULL); if (!(action & IMA_MEASURE) && !digest) @@ -1011,7 +1011,7 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) if (!f.file) return; - process_buffer_measurement(file_mnt_user_ns(f.file), file_inode(f.file), + process_buffer_measurement(file_mnt_idmap(f.file), file_inode(f.file), buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0, NULL, false, NULL, 0); fdput(f); @@ -1044,7 +1044,7 @@ int ima_measure_critical_data(const char *event_label, if (!event_name || !event_label || !buf || !buf_len) return -ENOPARAM; - return process_buffer_measurement(&init_user_ns, NULL, buf, buf_len, + return process_buffer_measurement(&nop_mnt_idmap, NULL, buf, buf_len, event_name, CRITICAL_DATA, 0, event_label, hash, digest, digest_len); diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 6a68ec270822..fc128a6b4abe 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -552,7 +552,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule, /** * ima_match_rules - determine whether an inode matches the policy rule. 
* @rule: a pointer to a rule - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: a pointer to an inode * @cred: a pointer to a credentials structure for user validation * @secid: the secid of the task to be validated @@ -563,7 +563,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule, * Returns true on rule match, false on failure. */ static bool ima_match_rules(struct ima_rule_entry *rule, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, const char *func_data) @@ -624,11 +624,11 @@ static bool ima_match_rules(struct ima_rule_entry *rule, return false; } if ((rule->flags & IMA_FOWNER) && - !rule->fowner_op(i_uid_into_vfsuid(mnt_userns, inode), + !rule->fowner_op(i_uid_into_vfsuid(idmap, inode), rule->fowner)) return false; if ((rule->flags & IMA_FGROUP) && - !rule->fgroup_op(i_gid_into_vfsgid(mnt_userns, inode), + !rule->fgroup_op(i_gid_into_vfsgid(idmap, inode), rule->fgroup)) return false; for (i = 0; i < MAX_LSM_RULES; i++) { @@ -713,7 +713,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) /** * ima_match_policy - decision based on LSM and other conditions - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: pointer to an inode for which the policy decision is being made * @cred: pointer to a credentials structure for which the policy decision is * being made @@ -732,7 +732,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) * list when walking it. Reads are many orders of magnitude more numerous * than writes so ima_match_policy() is classical RCU candidate. 
*/ -int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, +int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, @@ -752,7 +752,7 @@ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, if (!(entry->action & actmask)) continue; - if (!ima_match_rules(entry, mnt_userns, inode, cred, secid, + if (!ima_match_rules(entry, idmap, inode, cred, secid, func, mask, func_data)) continue; diff --git a/security/integrity/ima/ima_queue_keys.c b/security/integrity/ima/ima_queue_keys.c index 93056c03bf5a..4f0aea155bf9 100644 --- a/security/integrity/ima/ima_queue_keys.c +++ b/security/integrity/ima/ima_queue_keys.c @@ -159,7 +159,7 @@ void ima_process_queued_keys(void) list_for_each_entry_safe(entry, tmp, &ima_keys, list) { if (!timer_expired) - process_buffer_measurement(&init_user_ns, NULL, + process_buffer_measurement(&nop_mnt_idmap, NULL, entry->payload, entry->payload_len, entry->keyring_name, diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 4564faae7d67..6cd0add524cd 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -598,7 +598,7 @@ int ima_eventevmsig_init(struct ima_event_data *event_data, if (!event_data->file) return 0; - rc = vfs_getxattr_alloc(&init_user_ns, file_dentry(event_data->file), + rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { diff --git a/security/keys/key.c b/security/keys/key.c index c45afdd1dfbb..5c0c7df833f8 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -788,38 +788,18 @@ error: goto out; } -/** - * key_create_or_update - Update or create and instantiate a key. 
- * @keyring_ref: A pointer to the destination keyring with possession flag. - * @type: The type of key. - * @description: The searchable description for the key. - * @payload: The data to use to instantiate or update the key. - * @plen: The length of @payload. - * @perm: The permissions mask for a new key. - * @flags: The quota flags for a new key. - * - * Search the destination keyring for a key of the same description and if one - * is found, update it, otherwise create and instantiate a new one and create a - * link to it from that keyring. - * - * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be - * concocted. - * - * Returns a pointer to the new key if successful, -ENODEV if the key type - * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the - * caller isn't permitted to modify the keyring or the LSM did not permit - * creation of the key. - * - * On success, the possession flag from the keyring ref will be tacked on to - * the key ref before it is returned. +/* + * Create or potentially update a key. 
The combined logic behind + * key_create_or_update() and key_create() */ -key_ref_t key_create_or_update(key_ref_t keyring_ref, - const char *type, - const char *description, - const void *payload, - size_t plen, - key_perm_t perm, - unsigned long flags) +static key_ref_t __key_create_or_update(key_ref_t keyring_ref, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags, + bool allow_update) { struct keyring_index_key index_key = { .description = description, @@ -906,14 +886,23 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, goto error_link_end; } - /* if it's possible to update this type of key, search for an existing - * key of the same type and description in the destination keyring and - * update that instead if possible + /* if it's requested and possible to update this type of key, search + * for an existing key of the same type and description in the + * destination keyring and update that instead if possible */ - if (index_key.type->update) { + if (allow_update) { + if (index_key.type->update) { + key_ref = find_key_to_update(keyring_ref, &index_key); + if (key_ref) + goto found_matching_key; + } + } else { key_ref = find_key_to_update(keyring_ref, &index_key); - if (key_ref) - goto found_matching_key; + if (key_ref) { + key_ref_put(key_ref); + key_ref = ERR_PTR(-EEXIST); + goto error_link_end; + } } /* if the client doesn't provide, decide on the permissions we want */ @@ -985,9 +974,83 @@ error: goto error_free_prep; } + +/** + * key_create_or_update - Update or create and instantiate a key. + * @keyring_ref: A pointer to the destination keyring with possession flag. + * @type: The type of key. + * @description: The searchable description for the key. + * @payload: The data to use to instantiate or update the key. + * @plen: The length of @payload. + * @perm: The permissions mask for a new key. + * @flags: The quota flags for a new key. 
+ * + * Search the destination keyring for a key of the same description and if one + * is found, update it, otherwise create and instantiate a new one and create a + * link to it from that keyring. + * + * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be + * concocted. + * + * Returns a pointer to the new key if successful, -ENODEV if the key type + * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the + * caller isn't permitted to modify the keyring or the LSM did not permit + * creation of the key. + * + * On success, the possession flag from the keyring ref will be tacked on to + * the key ref before it is returned. + */ +key_ref_t key_create_or_update(key_ref_t keyring_ref, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags) +{ + return __key_create_or_update(keyring_ref, type, description, payload, + plen, perm, flags, true); +} EXPORT_SYMBOL(key_create_or_update); /** + * key_create - Create and instantiate a key. + * @keyring_ref: A pointer to the destination keyring with possession flag. + * @type: The type of key. + * @description: The searchable description for the key. + * @payload: The data to use to instantiate or update the key. + * @plen: The length of @payload. + * @perm: The permissions mask for a new key. + * @flags: The quota flags for a new key. + * + * Create and instantiate a new key and link to it from the destination keyring. + * + * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be + * concocted. + * + * Returns a pointer to the new key if successful, -EEXIST if a key with the + * same description already exists, -ENODEV if the key type wasn't available, + * -ENOTDIR if the keyring wasn't a keyring, -EACCES if the caller isn't + * permitted to modify the keyring or the LSM did not permit creation of the + * key. 
+ * + * On success, the possession flag from the keyring ref will be tacked on to + * the key ref before it is returned. + */ +key_ref_t key_create(key_ref_t keyring_ref, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags) +{ + return __key_create_or_update(keyring_ref, type, description, payload, + plen, perm, flags, false); +} +EXPORT_SYMBOL(key_create); + +/** * key_update - Update a key's contents. * @key_ref: The pointer (plus possession flag) to the key. * @payload: The data to be used to update the key. diff --git a/security/security.c b/security/security.c index d1571900a8c7..4e1150c44ab7 100644 --- a/security/security.c +++ b/security/security.c @@ -1354,7 +1354,7 @@ int security_inode_permission(struct inode *inode, int mask) return call_int_hook(inode_permission, 0, inode, mask); } -int security_inode_setattr(struct user_namespace *mnt_userns, +int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int ret; @@ -1364,7 +1364,7 @@ int security_inode_setattr(struct user_namespace *mnt_userns, ret = call_int_hook(inode_setattr, 0, dentry, attr); if (ret) return ret; - return evm_inode_setattr(mnt_userns, dentry, attr); + return evm_inode_setattr(idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); @@ -1375,7 +1375,7 @@ int security_inode_getattr(const struct path *path) return call_int_hook(inode_getattr, 0, path); } -int security_inode_setxattr(struct user_namespace *mnt_userns, +int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -1387,7 +1387,7 @@ int security_inode_setxattr(struct user_namespace *mnt_userns, * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. 
*/ - ret = call_int_hook(inode_setxattr, 1, mnt_userns, dentry, name, value, + ret = call_int_hook(inode_setxattr, 1, idmap, dentry, name, value, size, flags); if (ret == 1) @@ -1397,10 +1397,10 @@ int security_inode_setxattr(struct user_namespace *mnt_userns, ret = ima_inode_setxattr(dentry, name, value, size); if (ret) return ret; - return evm_inode_setxattr(mnt_userns, dentry, name, value, size); + return evm_inode_setxattr(idmap, dentry, name, value, size); } -int security_inode_set_acl(struct user_namespace *mnt_userns, +int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { @@ -1408,38 +1408,38 @@ int security_inode_set_acl(struct user_namespace *mnt_userns, if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - ret = call_int_hook(inode_set_acl, 0, mnt_userns, dentry, acl_name, + ret = call_int_hook(inode_set_acl, 0, idmap, dentry, acl_name, kacl); if (ret) return ret; - ret = ima_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + ret = ima_inode_set_acl(idmap, dentry, acl_name, kacl); if (ret) return ret; - return evm_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + return evm_inode_set_acl(idmap, dentry, acl_name, kacl); } -int security_inode_get_acl(struct user_namespace *mnt_userns, +int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - return call_int_hook(inode_get_acl, 0, mnt_userns, dentry, acl_name); + return call_int_hook(inode_get_acl, 0, idmap, dentry, acl_name); } -int security_inode_remove_acl(struct user_namespace *mnt_userns, +int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - ret = call_int_hook(inode_remove_acl, 0, mnt_userns, dentry, acl_name); + ret = call_int_hook(inode_remove_acl, 0, idmap, dentry, acl_name); if (ret) return ret; 
- ret = ima_inode_remove_acl(mnt_userns, dentry, acl_name); + ret = ima_inode_remove_acl(idmap, dentry, acl_name); if (ret) return ret; - return evm_inode_remove_acl(mnt_userns, dentry, acl_name); + return evm_inode_remove_acl(idmap, dentry, acl_name); } void security_inode_post_setxattr(struct dentry *dentry, const char *name, @@ -1465,7 +1465,7 @@ int security_inode_listxattr(struct dentry *dentry) return call_int_hook(inode_listxattr, 0, dentry); } -int security_inode_removexattr(struct user_namespace *mnt_userns, +int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int ret; @@ -1476,15 +1476,15 @@ int security_inode_removexattr(struct user_namespace *mnt_userns, * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ - ret = call_int_hook(inode_removexattr, 1, mnt_userns, dentry, name); + ret = call_int_hook(inode_removexattr, 1, idmap, dentry, name); if (ret == 1) - ret = cap_inode_removexattr(mnt_userns, dentry, name); + ret = cap_inode_removexattr(idmap, dentry, name); if (ret) return ret; ret = ima_inode_removexattr(dentry, name); if (ret) return ret; - return evm_inode_removexattr(mnt_userns, dentry, name); + return evm_inode_removexattr(idmap, dentry, name); } int security_inode_need_killpriv(struct dentry *dentry) @@ -1492,13 +1492,13 @@ int security_inode_need_killpriv(struct dentry *dentry) return call_int_hook(inode_need_killpriv, 0, dentry); } -int security_inode_killpriv(struct user_namespace *mnt_userns, +int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { - return call_int_hook(inode_killpriv, 0, mnt_userns, dentry); + return call_int_hook(inode_killpriv, 0, idmap, dentry); } -int security_inode_getsecurity(struct user_namespace *mnt_userns, +int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -1511,7 +1511,7 @@ int security_inode_getsecurity(struct 
user_namespace *mnt_userns, * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { - rc = hp->hook.inode_getsecurity(mnt_userns, inode, name, buffer, alloc); + rc = hp->hook.inode_getsecurity(idmap, inode, name, buffer, alloc); if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3c5be76a9199..9a5bdfc21314 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3145,7 +3145,7 @@ static bool has_cap_mac_admin(bool audit) return true; } -static int selinux_inode_setxattr(struct user_namespace *mnt_userns, +static int selinux_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -3167,13 +3167,13 @@ static int selinux_inode_setxattr(struct user_namespace *mnt_userns, } if (!selinux_initialized(&selinux_state)) - return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM); + return (inode_owner_or_capable(idmap, inode) ? 
0 : -EPERM); sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ad.type = LSM_AUDIT_DATA_DENTRY; @@ -3240,20 +3240,20 @@ static int selinux_inode_setxattr(struct user_namespace *mnt_userns, &ad); } -static int selinux_inode_set_acl(struct user_namespace *mnt_userns, +static int selinux_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } -static int selinux_inode_get_acl(struct user_namespace *mnt_userns, +static int selinux_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return dentry_has_perm(current_cred(), dentry, FILE__GETATTR); } -static int selinux_inode_remove_acl(struct user_namespace *mnt_userns, +static int selinux_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); @@ -3313,11 +3313,11 @@ static int selinux_inode_listxattr(struct dentry *dentry) return dentry_has_perm(cred, dentry, FILE__GETATTR); } -static int selinux_inode_removexattr(struct user_namespace *mnt_userns, +static int selinux_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { if (strcmp(name, XATTR_NAME_SELINUX)) { - int rc = cap_inode_removexattr(mnt_userns, dentry, name); + int rc = cap_inode_removexattr(idmap, dentry, name); if (rc) return rc; @@ -3383,7 +3383,7 @@ static int selinux_path_notify(const struct path *path, u64 mask, * * Permission check is handled by selinux_inode_getxattr hook. 
*/ -static int selinux_inode_getsecurity(struct user_namespace *mnt_userns, +static int selinux_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -6588,14 +6588,14 @@ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen */ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { - return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SELINUX, + return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0); } static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { int len = 0; - len = selinux_inode_getsecurity(&init_user_ns, inode, + len = selinux_inode_getsecurity(&nop_mnt_idmap, inode, XATTR_SELINUX_SUFFIX, ctx, true); if (len < 0) return len; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 9a82a15685d1..cfcbb748da25 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1207,7 +1207,7 @@ static int smack_inode_getattr(const struct path *path) /** * smack_inode_setxattr - Smack check for setting xattrs - * @mnt_userns: active user namespace + * @idmap: idmap of the mount * @dentry: the object * @name: name of the attribute * @value: value of the attribute @@ -1218,7 +1218,7 @@ static int smack_inode_getattr(const struct path *path) * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_setxattr(struct user_namespace *mnt_userns, +static int smack_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -1334,7 +1334,7 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name) /** * smack_inode_removexattr - Smack check on removexattr - * @mnt_userns: active user namespace + * @idmap: idmap of the mount * @dentry: the object * @name: name of the attribute * @@ -1342,7 +1342,7 @@ static int smack_inode_getxattr(struct 
dentry *dentry, const char *name) * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_removexattr(struct user_namespace *mnt_userns, +static int smack_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode_smack *isp; @@ -1358,7 +1358,7 @@ static int smack_inode_removexattr(struct user_namespace *mnt_userns, if (!smack_privileged(CAP_MAC_ADMIN)) rc = -EPERM; } else - rc = cap_inode_removexattr(mnt_userns, dentry, name); + rc = cap_inode_removexattr(idmap, dentry, name); if (rc != 0) return rc; @@ -1394,14 +1394,14 @@ static int smack_inode_removexattr(struct user_namespace *mnt_userns, /** * smack_inode_set_acl - Smack check for setting posix acls - * @mnt_userns: the userns attached to the mnt this request came from + * @idmap: idmap of the mnt this request came from * @dentry: the object * @acl_name: name of the posix acl * @kacl: the posix acls * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_set_acl(struct user_namespace *mnt_userns, +static int smack_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { @@ -1418,13 +1418,13 @@ static int smack_inode_set_acl(struct user_namespace *mnt_userns, /** * smack_inode_get_acl - Smack check for getting posix acls - * @mnt_userns: the userns attached to the mnt this request came from + * @idmap: idmap of the mnt this request came from * @dentry: the object * @acl_name: name of the posix acl * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_get_acl(struct user_namespace *mnt_userns, +static int smack_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct smk_audit_info ad; @@ -1440,13 +1440,13 @@ static int smack_inode_get_acl(struct user_namespace *mnt_userns, /** * smack_inode_remove_acl - Smack check for getting posix acls - * @mnt_userns: the userns attached to the mnt 
this request came from + * @idmap: idmap of the mnt this request came from * @dentry: the object * @acl_name: name of the posix acl * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_remove_acl(struct user_namespace *mnt_userns, +static int smack_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct smk_audit_info ad; @@ -1462,7 +1462,7 @@ static int smack_inode_remove_acl(struct user_namespace *mnt_userns, /** * smack_inode_getsecurity - get smack xattrs - * @mnt_userns: active user namespace + * @idmap: idmap of the mount * @inode: the object * @name: attribute name * @buffer: where to put the result @@ -1470,7 +1470,7 @@ static int smack_inode_remove_acl(struct user_namespace *mnt_userns, * * Returns the size of the attribute or an error code */ -static int smack_inode_getsecurity(struct user_namespace *mnt_userns, +static int smack_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -3507,7 +3507,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) */ if (isp->smk_flags & SMK_INODE_CHANGED) { isp->smk_flags &= ~SMK_INODE_CHANGED; - rc = __vfs_setxattr(&init_user_ns, dp, inode, + rc = __vfs_setxattr(&nop_mnt_idmap, dp, inode, XATTR_NAME_SMACKTRANSMUTE, TRANS_TRUE, TRANS_TRUE_SIZE, 0); @@ -4686,7 +4686,7 @@ static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { - return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SMACK, + return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SMACK, ctx, ctxlen, 0); } diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 81025f50a542..f901504b5afc 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -541,16 +541,15 @@ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) struct sg_table 
*sgt; void *p; +#ifdef CONFIG_SND_DMA_SGBUF + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return snd_dma_sg_fallback_alloc(dmab, size); +#endif sgt = dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir, DEFAULT_GFP, 0); #ifdef CONFIG_SND_DMA_SGBUF - if (!sgt && !get_dma_ops(dmab->dev.dev)) { - if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) - dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; - else - dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK; + if (!sgt && !get_dma_ops(dmab->dev.dev)) return snd_dma_sg_fallback_alloc(dmab, size); - } #endif if (!sgt) return NULL; @@ -717,19 +716,38 @@ static const struct snd_malloc_ops snd_dma_sg_wc_ops = { /* Fallback SG-buffer allocations for x86 */ struct snd_dma_sg_fallback { + bool use_dma_alloc_coherent; size_t count; struct page **pages; + /* DMA address array; the first page contains #pages in ~PAGE_MASK */ + dma_addr_t *addrs; }; static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab, struct snd_dma_sg_fallback *sgbuf) { - bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; - size_t i; - - for (i = 0; i < sgbuf->count && sgbuf->pages[i]; i++) - do_free_pages(page_address(sgbuf->pages[i]), PAGE_SIZE, wc); + size_t i, size; + + if (sgbuf->pages && sgbuf->addrs) { + i = 0; + while (i < sgbuf->count) { + if (!sgbuf->pages[i] || !sgbuf->addrs[i]) + break; + size = sgbuf->addrs[i] & ~PAGE_MASK; + if (WARN_ON(!size)) + break; + if (sgbuf->use_dma_alloc_coherent) + dma_free_coherent(dmab->dev.dev, size << PAGE_SHIFT, + page_address(sgbuf->pages[i]), + sgbuf->addrs[i] & PAGE_MASK); + else + do_free_pages(page_address(sgbuf->pages[i]), + size << PAGE_SHIFT, false); + i += size; + } + } kvfree(sgbuf->pages); + kvfree(sgbuf->addrs); kfree(sgbuf); } @@ -738,24 +756,36 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) struct snd_dma_sg_fallback *sgbuf; struct page **pagep, *curp; size_t chunk, npages; + dma_addr_t *addrp; dma_addr_t addr; void *p; - bool wc = 
dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; + + /* correct the type */ + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_SG) + dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK; + else if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) + dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL); if (!sgbuf) return NULL; + sgbuf->use_dma_alloc_coherent = cpu_feature_enabled(X86_FEATURE_XENPV); size = PAGE_ALIGN(size); sgbuf->count = size >> PAGE_SHIFT; sgbuf->pages = kvcalloc(sgbuf->count, sizeof(*sgbuf->pages), GFP_KERNEL); - if (!sgbuf->pages) + sgbuf->addrs = kvcalloc(sgbuf->count, sizeof(*sgbuf->addrs), GFP_KERNEL); + if (!sgbuf->pages || !sgbuf->addrs) goto error; pagep = sgbuf->pages; - chunk = size; + addrp = sgbuf->addrs; + chunk = (PAGE_SIZE - 1) << PAGE_SHIFT; /* to fit in low bits in addrs */ while (size > 0) { chunk = min(size, chunk); - p = do_alloc_pages(dmab->dev.dev, chunk, &addr, wc); + if (sgbuf->use_dma_alloc_coherent) + p = dma_alloc_coherent(dmab->dev.dev, chunk, &addr, DEFAULT_GFP); + else + p = do_alloc_pages(dmab->dev.dev, chunk, &addr, false); if (!p) { if (chunk <= PAGE_SIZE) goto error; @@ -767,17 +797,25 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) size -= chunk; /* fill pages */ npages = chunk >> PAGE_SHIFT; + *addrp = npages; /* store in lower bits */ curp = virt_to_page(p); - while (npages--) + while (npages--) { *pagep++ = curp++; + *addrp++ |= addr; + addr += PAGE_SIZE; + } } p = vmap(sgbuf->pages, sgbuf->count, VM_MAP, PAGE_KERNEL); if (!p) goto error; + + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) + set_pages_array_wc(sgbuf->pages, sgbuf->count); + dmab->private_data = sgbuf; /* store the first page address for convenience */ - dmab->addr = snd_sgbuf_get_addr(dmab, 0); + dmab->addr = sgbuf->addrs[0] & PAGE_MASK; return p; error: @@ -787,10 +825,23 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) static void 
snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab) { + struct snd_dma_sg_fallback *sgbuf = dmab->private_data; + + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) + set_pages_array_wb(sgbuf->pages, sgbuf->count); vunmap(dmab->area); __snd_dma_sg_fallback_free(dmab, dmab->private_data); } +static dma_addr_t snd_dma_sg_fallback_get_addr(struct snd_dma_buffer *dmab, + size_t offset) +{ + struct snd_dma_sg_fallback *sgbuf = dmab->private_data; + size_t index = offset >> PAGE_SHIFT; + + return (sgbuf->addrs[index] & PAGE_MASK) | (offset & ~PAGE_MASK); +} + static int snd_dma_sg_fallback_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { @@ -805,8 +856,8 @@ static const struct snd_malloc_ops snd_dma_sg_fallback_ops = { .alloc = snd_dma_sg_fallback_alloc, .free = snd_dma_sg_fallback_free, .mmap = snd_dma_sg_fallback_mmap, + .get_addr = snd_dma_sg_fallback_get_addr, /* reuse vmalloc helpers */ - .get_addr = snd_dma_vmalloc_get_addr, .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; diff --git a/sound/firewire/motu/motu-hwdep.c b/sound/firewire/motu/motu-hwdep.c index a900fc0e7644..88d1f4b56e4b 100644 --- a/sound/firewire/motu/motu-hwdep.c +++ b/sound/firewire/motu/motu-hwdep.c @@ -87,6 +87,10 @@ static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, return -EFAULT; count = consumed; + } else { + spin_unlock_irq(&motu->lock); + + count = 0; } return count; diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 1a868dd9dc4b..890c2f7c33fc 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -144,6 +144,7 @@ static int hda_codec_driver_probe(struct device *dev) error: snd_hda_codec_cleanup_for_unbind(codec); + codec->preset = NULL; return err; } @@ -166,6 +167,7 @@ static int hda_codec_driver_remove(struct device *dev) if (codec->patch_ops.free) codec->patch_ops.free(codec); snd_hda_codec_cleanup_for_unbind(codec); + codec->preset = NULL; 
module_put(dev->driver->owner); return 0; } diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index edd653ece70d..2e728aad6771 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -795,7 +795,6 @@ void snd_hda_codec_cleanup_for_unbind(struct hda_codec *codec) snd_array_free(&codec->cvt_setups); snd_array_free(&codec->spdif_out); snd_array_free(&codec->verbs); - codec->preset = NULL; codec->follower_dig_outs = NULL; codec->spdif_status_reset = 0; snd_array_free(&codec->mixers); @@ -928,7 +927,6 @@ snd_hda_codec_device_init(struct hda_bus *bus, unsigned int codec_addr, codec->depop_delay = -1; codec->fixup_id = HDA_FIXUP_ID_NOT_SET; codec->core.dev.release = snd_hda_codec_dev_release; - codec->core.exec_verb = codec_exec_verb; codec->core.type = HDA_DEV_LEGACY; mutex_init(&codec->spdif_mutex); @@ -999,6 +997,7 @@ int snd_hda_codec_device_new(struct hda_bus *bus, struct snd_card *card, if (snd_BUG_ON(codec_addr > HDA_MAX_CODEC_ADDRESS)) return -EINVAL; + codec->core.exec_verb = codec_exec_verb; codec->card = card; codec->addr = codec_addr; diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 7b1a30a551f6..75e1d00074b9 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1125,6 +1125,7 @@ static const struct hda_device_id snd_hda_id_conexant[] = { HDA_CODEC_ENTRY(0x14f11f87, "SN6140", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f12008, "CX8200", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f120d0, "CX11970", patch_conexant_auto), + HDA_CODEC_ENTRY(0x14f120d1, "SN6180", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15045, "CX20549 (Venice)", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15047, "CX20551 (Waikiki)", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15051, "CX20561 (Hermosa)", patch_conexant_auto), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 6fab7c8fc19a..e103bb3693c0 100644 --- a/sound/pci/hda/patch_realtek.c +++ 
b/sound/pci/hda/patch_realtek.c @@ -832,7 +832,7 @@ do_sku: alc_setup_gpio(codec, 0x02); break; case 7: - alc_setup_gpio(codec, 0x03); + alc_setup_gpio(codec, 0x04); break; case 5: default: @@ -9202,6 +9202,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x142b, "Acer Swift SF314-42", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1430, "Acer TravelMate B311R-31", ALC256_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1466, "Acer Aspire A515-56", ALC255_FIXUP_ACER_HEADPHONE_AND_MIC), + SND_PCI_QUIRK(0x1025, 0x1534, "Acer Predator PH315-54", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), SND_PCI_QUIRK(0x1028, 0x053c, "Dell Latitude E5430", ALC292_FIXUP_DELL_E7X), SND_PCI_QUIRK(0x1028, 0x054b, "Dell XPS one 2710", ALC275_FIXUP_DELL_XPS), @@ -9422,6 +9423,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x89c3, "Zbook Studio G9", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89c6, "Zbook Fury 17 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ca, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x89d3, "HP EliteBook 645 G9 (MB 89D2)", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8a78, "HP Dev One", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x8aa0, "HP ProBook 440 G9 (MB 8A9E)", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8aa3, "HP ProBook 450 G9 (MB 8AA1)", ALC236_FIXUP_HP_GPIO_LED), @@ -9430,8 +9432,21 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b42, 
"HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b45, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b46, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b47, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8b7a, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b7d, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b87, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b8a, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b8b, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b8d, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b92, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8bf0, "HP", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), @@ -9478,6 +9493,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x1e02, "ASUS UX3402", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1e11, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA502), + SND_PCI_QUIRK(0x1043, 0x1e12, "ASUS UM3402", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), @@ -9521,6 +9537,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc812, "Samsung 
Notebook Pen S (NT950SBE-X58)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc830, "Samsung Galaxy Book Ion (NT950XCJ-X716A)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc832, "Samsung Galaxy Book Flex Alpha (NP730QCJ)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), + SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), @@ -9699,6 +9716,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), + SND_PCI_QUIRK(0x1c6c, 0x1251, "Positivo N14KP6-TG", ALC288_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_SET_COEF_DEFAULTS), SND_PCI_QUIRK(0x1d05, 0x1096, "TongFang GMxMRxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x1100, "TongFang GKxNRxx", ALC269_FIXUP_NO_SHUTUP), diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index aea7fae2ca4b..2994f85bc1b9 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -819,6 +819,9 @@ static int add_secret_dac_path(struct hda_codec *codec) return 0; nums = snd_hda_get_connections(codec, spec->gen.mixer_nid, conn, ARRAY_SIZE(conn) - 1); + if (nums < 0) + return nums; + for (i = 0; i < nums; i++) { if (get_wcaps_type(get_wcaps(codec, conn[i])) == AC_WID_AUD_OUT) return 0; diff --git a/sound/pci/lx6464es/lx_core.c b/sound/pci/lx6464es/lx_core.c index d3f58a3d17fb..b5b0d43bb8dc 100644 --- a/sound/pci/lx6464es/lx_core.c +++ b/sound/pci/lx6464es/lx_core.c @@ -493,12 +493,11 @@ int lx_buffer_ask(struct 
lx6464es *chip, u32 pipe, int is_capture, dev_dbg(chip->card->dev, "CMD_08_ASK_BUFFERS: needed %d, freed %d\n", *r_needed, *r_freed); - for (i = 0; i < MAX_STREAM_BUFFER; ++i) { - for (i = 0; i != chip->rmh.stat_len; ++i) - dev_dbg(chip->card->dev, - " stat[%d]: %x, %x\n", i, - chip->rmh.stat[i], - chip->rmh.stat[i] & MASK_DATA_SIZE); + for (i = 0; i < MAX_STREAM_BUFFER && i < chip->rmh.stat_len; + ++i) { + dev_dbg(chip->card->dev, " stat[%d]: %x, %x\n", i, + chip->rmh.stat[i], + chip->rmh.stat[i] & MASK_DATA_SIZE); } } diff --git a/sound/soc/amd/acp-es8336.c b/sound/soc/amd/acp-es8336.c index 2fe8df86053a..89499542c803 100644 --- a/sound/soc/amd/acp-es8336.c +++ b/sound/soc/amd/acp-es8336.c @@ -198,9 +198,11 @@ static int st_es8336_late_probe(struct snd_soc_card *card) int ret; adev = acpi_dev_get_first_match_dev("ESSX8336", NULL, -1); - if (adev) - put_device(&adev->dev); + if (!adev) + return -ENODEV; + codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); if (!codec_dev) dev_err(card->dev, "can not find codec dev\n"); diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 0d283e41f66d..36314753923b 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -230,10 +230,31 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { { .driver_data = &acp6x_card, .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "TIMI"), + DMI_MATCH(DMI_PRODUCT_NAME, "Redmi Book Pro 15 2022"), + } + }, + { + .driver_data = &acp6x_card, + .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Razer"), DMI_MATCH(DMI_PRODUCT_NAME, "Blade 14 (2022) - RZ09-0427"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "RB"), + DMI_MATCH(DMI_PRODUCT_NAME, "Swift SFA16-41"), + } + }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IRBIS"), + DMI_MATCH(DMI_PRODUCT_NAME, "15NBC1011"), + } + }, {} }; diff --git a/sound/soc/codecs/cs42l56.c b/sound/soc/codecs/cs42l56.c index 
26066682c983..3b0e715549c9 100644 --- a/sound/soc/codecs/cs42l56.c +++ b/sound/soc/codecs/cs42l56.c @@ -1191,18 +1191,12 @@ static int cs42l56_i2c_probe(struct i2c_client *i2c_client) if (pdata) { cs42l56->pdata = *pdata; } else { - pdata = devm_kzalloc(&i2c_client->dev, sizeof(*pdata), - GFP_KERNEL); - if (!pdata) - return -ENOMEM; - if (i2c_client->dev.of_node) { ret = cs42l56_handle_of_data(i2c_client, &cs42l56->pdata); if (ret != 0) return ret; } - cs42l56->pdata = *pdata; } if (cs42l56->pdata.gpio_nreset) { diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 9ddf6a35e91c..28a0565c2a95 100755..100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -729,14 +729,16 @@ static int es8326_probe(struct snd_soc_component *component) } dev_dbg(component->dev, "jack-pol %x", es8326->jack_pol); - ret = device_property_read_u8(component->dev, "everest,interrupt-src", &es8326->jack_pol); + ret = device_property_read_u8(component->dev, "everest,interrupt-src", + &es8326->interrupt_src); if (ret != 0) { dev_dbg(component->dev, "interrupt-src return %d", ret); es8326->interrupt_src = ES8326_HP_DET_SRC_PIN9; } dev_dbg(component->dev, "interrupt-src %x", es8326->interrupt_src); - ret = device_property_read_u8(component->dev, "everest,interrupt-clk", &es8326->jack_pol); + ret = device_property_read_u8(component->dev, "everest,interrupt-clk", + &es8326->interrupt_clk); if (ret != 0) { dev_dbg(component->dev, "interrupt-clk return %d", ret); es8326->interrupt_clk = 0x45; diff --git a/sound/soc/codecs/es8326.h b/sound/soc/codecs/es8326.h index 8e5ffe5ee10d..8e5ffe5ee10d 100755..100644 --- a/sound/soc/codecs/es8326.h +++ b/sound/soc/codecs/es8326.h diff --git a/sound/soc/codecs/rt715-sdca-sdw.c b/sound/soc/codecs/rt715-sdca-sdw.c index 3f981a9e7fb6..c54ecf3e6987 100644 --- a/sound/soc/codecs/rt715-sdca-sdw.c +++ b/sound/soc/codecs/rt715-sdca-sdw.c @@ -167,7 +167,7 @@ static int rt715_sdca_read_prop(struct sdw_slave *slave) } /* set the timeout 
values */ - prop->clk_stop_timeout = 20; + prop->clk_stop_timeout = 200; return 0; } diff --git a/sound/soc/codecs/tas5805m.c b/sound/soc/codecs/tas5805m.c index beb4ec629a03..4e38eb7acea1 100644 --- a/sound/soc/codecs/tas5805m.c +++ b/sound/soc/codecs/tas5805m.c @@ -154,6 +154,7 @@ static const uint32_t tas5805m_volume[] = { #define TAS5805M_VOLUME_MIN 0 struct tas5805m_priv { + struct i2c_client *i2c; struct regulator *pvdd; struct gpio_desc *gpio_pdn_n; @@ -165,6 +166,9 @@ struct tas5805m_priv { int vol[2]; bool is_powered; bool is_muted; + + struct work_struct work; + struct mutex lock; }; static void set_dsp_scale(struct regmap *rm, int offset, int vol) @@ -181,13 +185,11 @@ static void set_dsp_scale(struct regmap *rm, int offset, int vol) regmap_bulk_write(rm, offset, v, ARRAY_SIZE(v)); } -static void tas5805m_refresh(struct snd_soc_component *component) +static void tas5805m_refresh(struct tas5805m_priv *tas5805m) { - struct tas5805m_priv *tas5805m = - snd_soc_component_get_drvdata(component); struct regmap *rm = tas5805m->regmap; - dev_dbg(component->dev, "refresh: is_muted=%d, vol=%d/%d\n", + dev_dbg(&tas5805m->i2c->dev, "refresh: is_muted=%d, vol=%d/%d\n", tas5805m->is_muted, tas5805m->vol[0], tas5805m->vol[1]); regmap_write(rm, REG_PAGE, 0x00); @@ -201,6 +203,9 @@ static void tas5805m_refresh(struct snd_soc_component *component) set_dsp_scale(rm, 0x24, tas5805m->vol[0]); set_dsp_scale(rm, 0x28, tas5805m->vol[1]); + regmap_write(rm, REG_PAGE, 0x00); + regmap_write(rm, REG_BOOK, 0x00); + /* Set/clear digital soft-mute */ regmap_write(rm, REG_DEVICE_CTRL_2, (tas5805m->is_muted ? 
DCTRL2_MUTE : 0) | @@ -226,8 +231,11 @@ static int tas5805m_vol_get(struct snd_kcontrol *kcontrol, struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); + mutex_lock(&tas5805m->lock); ucontrol->value.integer.value[0] = tas5805m->vol[0]; ucontrol->value.integer.value[1] = tas5805m->vol[1]; + mutex_unlock(&tas5805m->lock); + return 0; } @@ -243,11 +251,13 @@ static int tas5805m_vol_put(struct snd_kcontrol *kcontrol, snd_soc_kcontrol_component(kcontrol); struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); + int ret = 0; if (!(volume_is_valid(ucontrol->value.integer.value[0]) && volume_is_valid(ucontrol->value.integer.value[1]))) return -EINVAL; + mutex_lock(&tas5805m->lock); if (tas5805m->vol[0] != ucontrol->value.integer.value[0] || tas5805m->vol[1] != ucontrol->value.integer.value[1]) { tas5805m->vol[0] = ucontrol->value.integer.value[0]; @@ -256,11 +266,12 @@ static int tas5805m_vol_put(struct snd_kcontrol *kcontrol, tas5805m->vol[0], tas5805m->vol[1], tas5805m->is_powered); if (tas5805m->is_powered) - tas5805m_refresh(component); - return 1; + tas5805m_refresh(tas5805m); + ret = 1; } + mutex_unlock(&tas5805m->lock); - return 0; + return ret; } static const struct snd_kcontrol_new tas5805m_snd_controls[] = { @@ -294,54 +305,83 @@ static int tas5805m_trigger(struct snd_pcm_substream *substream, int cmd, struct snd_soc_component *component = dai->component; struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); - struct regmap *rm = tas5805m->regmap; - unsigned int chan, global1, global2; switch (cmd) { case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: - dev_dbg(component->dev, "DSP startup\n"); - - /* We mustn't issue any I2C transactions until the I2S - * clock is stable. Furthermore, we must allow a 5ms - * delay after the first set of register writes to - * allow the DSP to boot before configuring it. 
- */ - usleep_range(5000, 10000); - send_cfg(rm, dsp_cfg_preboot, - ARRAY_SIZE(dsp_cfg_preboot)); - usleep_range(5000, 15000); - send_cfg(rm, tas5805m->dsp_cfg_data, - tas5805m->dsp_cfg_len); - - tas5805m->is_powered = true; - tas5805m_refresh(component); + dev_dbg(component->dev, "clock start\n"); + schedule_work(&tas5805m->work); break; case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: - dev_dbg(component->dev, "DSP shutdown\n"); + break; - tas5805m->is_powered = false; + default: + return -EINVAL; + } - regmap_write(rm, REG_PAGE, 0x00); - regmap_write(rm, REG_BOOK, 0x00); + return 0; +} - regmap_read(rm, REG_CHAN_FAULT, &chan); - regmap_read(rm, REG_GLOBAL_FAULT1, &global1); - regmap_read(rm, REG_GLOBAL_FAULT2, &global2); +static void do_work(struct work_struct *work) +{ + struct tas5805m_priv *tas5805m = + container_of(work, struct tas5805m_priv, work); + struct regmap *rm = tas5805m->regmap; - dev_dbg(component->dev, - "fault regs: CHAN=%02x, GLOBAL1=%02x, GLOBAL2=%02x\n", - chan, global1, global2); + dev_dbg(&tas5805m->i2c->dev, "DSP startup\n"); - regmap_write(rm, REG_DEVICE_CTRL_2, DCTRL2_MODE_HIZ); - break; + mutex_lock(&tas5805m->lock); + /* We mustn't issue any I2C transactions until the I2S + * clock is stable. Furthermore, we must allow a 5ms + * delay after the first set of register writes to + * allow the DSP to boot before configuring it. 
+ */ + usleep_range(5000, 10000); + send_cfg(rm, dsp_cfg_preboot, ARRAY_SIZE(dsp_cfg_preboot)); + usleep_range(5000, 15000); + send_cfg(rm, tas5805m->dsp_cfg_data, tas5805m->dsp_cfg_len); + + tas5805m->is_powered = true; + tas5805m_refresh(tas5805m); + mutex_unlock(&tas5805m->lock); +} - default: - return -EINVAL; +static int tas5805m_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct tas5805m_priv *tas5805m = + snd_soc_component_get_drvdata(component); + struct regmap *rm = tas5805m->regmap; + + if (event & SND_SOC_DAPM_PRE_PMD) { + unsigned int chan, global1, global2; + + dev_dbg(component->dev, "DSP shutdown\n"); + cancel_work_sync(&tas5805m->work); + + mutex_lock(&tas5805m->lock); + if (tas5805m->is_powered) { + tas5805m->is_powered = false; + + regmap_write(rm, REG_PAGE, 0x00); + regmap_write(rm, REG_BOOK, 0x00); + + regmap_read(rm, REG_CHAN_FAULT, &chan); + regmap_read(rm, REG_GLOBAL_FAULT1, &global1); + regmap_read(rm, REG_GLOBAL_FAULT2, &global2); + + dev_dbg(component->dev, "fault regs: CHAN=%02x, " + "GLOBAL1=%02x, GLOBAL2=%02x\n", + chan, global1, global2); + + regmap_write(rm, REG_DEVICE_CTRL_2, DCTRL2_MODE_HIZ); + } + mutex_unlock(&tas5805m->lock); } return 0; @@ -354,7 +394,8 @@ static const struct snd_soc_dapm_route tas5805m_audio_map[] = { static const struct snd_soc_dapm_widget tas5805m_dapm_widgets[] = { SND_SOC_DAPM_AIF_IN("DAC IN", "Playback", 0, SND_SOC_NOPM, 0, 0), - SND_SOC_DAPM_DAC("DAC", NULL, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_DAC_E("DAC", NULL, SND_SOC_NOPM, 0, 0, + tas5805m_dac_event, SND_SOC_DAPM_PRE_PMD), SND_SOC_DAPM_OUTPUT("OUT") }; @@ -375,11 +416,14 @@ static int tas5805m_mute(struct snd_soc_dai *dai, int mute, int direction) struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); + mutex_lock(&tas5805m->lock); dev_dbg(component->dev, "set mute=%d (is_powered=%d)\n", mute, 
tas5805m->is_powered); + tas5805m->is_muted = mute; if (tas5805m->is_powered) - tas5805m_refresh(component); + tas5805m_refresh(tas5805m); + mutex_unlock(&tas5805m->lock); return 0; } @@ -434,6 +478,7 @@ static int tas5805m_i2c_probe(struct i2c_client *i2c) if (!tas5805m) return -ENOMEM; + tas5805m->i2c = i2c; tas5805m->pvdd = devm_regulator_get(dev, "pvdd"); if (IS_ERR(tas5805m->pvdd)) { dev_err(dev, "failed to get pvdd supply: %ld\n", @@ -507,6 +552,9 @@ static int tas5805m_i2c_probe(struct i2c_client *i2c) gpiod_set_value(tas5805m->gpio_pdn_n, 1); usleep_range(10000, 15000); + INIT_WORK(&tas5805m->work, do_work); + mutex_init(&tas5805m->lock); + /* Don't register through devm. We need to be able to unregister * the component prior to deasserting PDN# */ @@ -527,6 +575,7 @@ static void tas5805m_i2c_remove(struct i2c_client *i2c) struct device *dev = &i2c->dev; struct tas5805m_priv *tas5805m = dev_get_drvdata(dev); + cancel_work_sync(&tas5805m->work); snd_soc_unregister_component(dev); gpiod_set_value(tas5805m->gpio_pdn_n, 0); usleep_range(10000, 15000); diff --git a/sound/soc/codecs/wsa883x.c b/sound/soc/codecs/wsa883x.c index 966ba4909204..58fdb4e9fd97 100644 --- a/sound/soc/codecs/wsa883x.c +++ b/sound/soc/codecs/wsa883x.c @@ -1359,8 +1359,8 @@ static struct snd_soc_dai_driver wsa883x_dais[] = { .stream_name = "SPKR Playback", .rates = WSA883X_RATES | WSA883X_FRAC_RATES, .formats = WSA883X_FORMATS, - .rate_max = 8000, - .rate_min = 352800, + .rate_min = 8000, + .rate_max = 352800, .channels_min = 1, .channels_max = 1, }, diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 1c9be8a5dcb1..35a52c3a020d 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1141,6 +1141,7 @@ static int fsl_sai_check_version(struct device *dev) sai->verid.version = val & (FSL_SAI_VERID_MAJOR_MASK | FSL_SAI_VERID_MINOR_MASK); + sai->verid.version >>= FSL_SAI_VERID_MINOR_SHIFT; sai->verid.feature = val & FSL_SAI_VERID_FEATURE_MASK; ret = 
regmap_read(sai->regmap, FSL_SAI_PARAM, &val); diff --git a/sound/soc/intel/avs/core.c b/sound/soc/intel/avs/core.c index 2ca24273c491..637501850728 100644 --- a/sound/soc/intel/avs/core.c +++ b/sound/soc/intel/avs/core.c @@ -481,6 +481,29 @@ err_remap_bar0: return ret; } +static void avs_pci_shutdown(struct pci_dev *pci) +{ + struct hdac_bus *bus = pci_get_drvdata(pci); + struct avs_dev *adev = hdac_to_avs(bus); + + cancel_work_sync(&adev->probe_work); + avs_ipc_block(adev->ipc); + + snd_hdac_stop_streams(bus); + avs_dsp_op(adev, int_control, false); + snd_hdac_ext_bus_ppcap_int_enable(bus, false); + snd_hdac_ext_bus_link_power_down_all(bus); + + snd_hdac_bus_stop_chip(bus); + snd_hdac_display_power(bus, HDA_CODEC_IDX_CONTROLLER, false); + + if (avs_platattr_test(adev, CLDMA)) + pci_free_irq(pci, 0, &code_loader); + pci_free_irq(pci, 0, adev); + pci_free_irq(pci, 0, bus); + pci_free_irq_vectors(pci); +} + static void avs_pci_remove(struct pci_dev *pci) { struct hdac_device *hdev, *save; @@ -739,6 +762,7 @@ static struct pci_driver avs_pci_driver = { .id_table = avs_ids, .probe = avs_pci_probe, .remove = avs_pci_remove, + .shutdown = avs_pci_shutdown, .driver = { .pm = &avs_dev_pm, }, diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index 09d1f0f6d686..df157b01df8b 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -497,21 +497,28 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) if (adev) { snprintf(codec_name, sizeof(codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); byt_cht_es8316_dais[dai_index].codecs->name = codec_name; } else { dev_err(dev, "Error cannot find '%s' dev\n", mach->id); return -ENXIO; } + codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); + if (!codec_dev) + return -EPROBE_DEFER; + priv->codec_dev = get_device(codec_dev); + /* override platform name, if required */ byt_cht_es8316_card.dev = 
dev; platform_name = mach->mach_params.platform; ret = snd_soc_fixup_dai_links_platform_name(&byt_cht_es8316_card, platform_name); - if (ret) + if (ret) { + put_device(codec_dev); return ret; + } /* Check for BYTCR or other platform and setup quirks */ dmi_id = dmi_first_match(byt_cht_es8316_quirk_table); @@ -539,13 +546,10 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) /* get the clock */ priv->mclk = devm_clk_get(dev, "pmc_plt_clk_3"); - if (IS_ERR(priv->mclk)) + if (IS_ERR(priv->mclk)) { + put_device(codec_dev); return dev_err_probe(dev, PTR_ERR(priv->mclk), "clk_get pmc_plt_clk_3 failed\n"); - - codec_dev = acpi_get_first_physical_node(adev); - if (!codec_dev) - return -EPROBE_DEFER; - priv->codec_dev = get_device(codec_dev); + } if (quirk & BYT_CHT_ES8316_JD_INVERTED) props[cnt++] = PROPERTY_ENTRY_BOOL("everest,jack-detect-inverted"); diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index 4699ca79f3ea..79e0039c79a3 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -1636,13 +1636,18 @@ static int snd_byt_rt5640_mc_probe(struct platform_device *pdev) if (adev) { snprintf(byt_rt5640_codec_name, sizeof(byt_rt5640_codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); byt_rt5640_dais[dai_index].codecs->name = byt_rt5640_codec_name; } else { dev_err(dev, "Error cannot find '%s' dev\n", mach->id); return -ENXIO; } + codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); + if (!codec_dev) + return -EPROBE_DEFER; + priv->codec_dev = get_device(codec_dev); + /* * swap SSP0 if bytcr is detected * (will be overridden if DMI quirk is detected) @@ -1717,11 +1722,6 @@ static int snd_byt_rt5640_mc_probe(struct platform_device *pdev) byt_rt5640_quirk = quirk_override; } - codec_dev = acpi_get_first_physical_node(adev); - if (!codec_dev) - return -EPROBE_DEFER; - priv->codec_dev = get_device(codec_dev); - if (byt_rt5640_quirk & 
BYT_RT5640_JD_HP_ELITEP_1000G2) { acpi_dev_add_driver_gpios(ACPI_COMPANION(priv->codec_dev), byt_rt5640_hp_elitepad_1000g2_gpios); diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 81ac6eeda2e6..8fca9b82d4d0 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -922,7 +922,6 @@ static int snd_byt_rt5651_mc_probe(struct platform_device *pdev) if (adev) { snprintf(byt_rt5651_codec_name, sizeof(byt_rt5651_codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); byt_rt5651_dais[dai_index].codecs->name = byt_rt5651_codec_name; } else { dev_err(dev, "Error cannot find '%s' dev\n", mach->id); @@ -930,6 +929,7 @@ static int snd_byt_rt5651_mc_probe(struct platform_device *pdev) } codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); if (!codec_dev) return -EPROBE_DEFER; priv->codec_dev = get_device(codec_dev); diff --git a/sound/soc/intel/boards/bytcr_wm5102.c b/sound/soc/intel/boards/bytcr_wm5102.c index 1669eb3bd80f..c0706537f673 100644 --- a/sound/soc/intel/boards/bytcr_wm5102.c +++ b/sound/soc/intel/boards/bytcr_wm5102.c @@ -411,9 +411,9 @@ static int snd_byt_wm5102_mc_probe(struct platform_device *pdev) return -ENOENT; } snprintf(codec_name, sizeof(codec_name), "spi-%s", acpi_dev_name(adev)); - put_device(&adev->dev); codec_dev = bus_find_device_by_name(&spi_bus_type, NULL, codec_name); + acpi_dev_put(adev); if (!codec_dev) return -EPROBE_DEFER; diff --git a/sound/soc/intel/boards/sof_cs42l42.c b/sound/soc/intel/boards/sof_cs42l42.c index e38bd2831e6a..e9d190cb13b0 100644 --- a/sound/soc/intel/boards/sof_cs42l42.c +++ b/sound/soc/intel/boards/sof_cs42l42.c @@ -336,6 +336,9 @@ static int create_spk_amp_dai_links(struct device *dev, links[*id].platforms = platform_component; links[*id].num_platforms = ARRAY_SIZE(platform_component); links[*id].dpcm_playback = 1; + /* firmware-generated echo reference */ + links[*id].dpcm_capture = 1; + 
links[*id].no_pcm = 1; links[*id].cpus = &cpus[*id]; links[*id].num_cpus = 1; diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index 773e5d1d87d4..894b6610b9e2 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -681,7 +681,6 @@ static int sof_es8336_probe(struct platform_device *pdev) if (adev) { snprintf(codec_name, sizeof(codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); dai_links[0].codecs->name = codec_name; /* also fixup codec dai name if relevant */ @@ -692,16 +691,19 @@ static int sof_es8336_probe(struct platform_device *pdev) return -ENXIO; } - ret = snd_soc_fixup_dai_links_platform_name(&sof_es8336_card, - mach->mach_params.platform); - if (ret) - return ret; - codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); if (!codec_dev) return -EPROBE_DEFER; priv->codec_dev = get_device(codec_dev); + ret = snd_soc_fixup_dai_links_platform_name(&sof_es8336_card, + mach->mach_params.platform); + if (ret) { + put_device(codec_dev); + return ret; + } + if (quirk & SOF_ES8336_JD_INVERTED) props[cnt++] = PROPERTY_ENTRY_BOOL("everest,jack-detect-inverted"); diff --git a/sound/soc/intel/boards/sof_nau8825.c b/sound/soc/intel/boards/sof_nau8825.c index a800854c2831..6794a0249a9a 100644 --- a/sound/soc/intel/boards/sof_nau8825.c +++ b/sound/soc/intel/boards/sof_nau8825.c @@ -487,8 +487,6 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].num_codecs = ARRAY_SIZE(max_98373_components); links[id].init = max_98373_spk_codec_init; links[id].ops = &max_98373_ops; - /* feedback stream */ - links[id].dpcm_capture = 1; } else if (sof_nau8825_quirk & SOF_MAX98360A_SPEAKER_AMP_PRESENT) { max_98360a_dai_link(&links[id]); @@ -506,6 +504,9 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].platforms = platform_component; links[id].num_platforms = ARRAY_SIZE(platform_component); 
links[id].dpcm_playback = 1; + /* feedback stream or firmware-generated echo reference */ + links[id].dpcm_capture = 1; + links[id].no_pcm = 1; links[id].cpus = &cpus[id]; links[id].num_cpus = 1; diff --git a/sound/soc/intel/boards/sof_rt5682.c b/sound/soc/intel/boards/sof_rt5682.c index 2eabc4b0fafa..71a11d747622 100644 --- a/sound/soc/intel/boards/sof_rt5682.c +++ b/sound/soc/intel/boards/sof_rt5682.c @@ -761,8 +761,6 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].num_codecs = ARRAY_SIZE(max_98373_components); links[id].init = max_98373_spk_codec_init; links[id].ops = &max_98373_ops; - /* feedback stream */ - links[id].dpcm_capture = 1; } else if (sof_rt5682_quirk & SOF_MAX98360A_SPEAKER_AMP_PRESENT) { max_98360a_dai_link(&links[id]); @@ -789,6 +787,9 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].platforms = platform_component; links[id].num_platforms = ARRAY_SIZE(platform_component); links[id].dpcm_playback = 1; + /* feedback stream or firmware-generated echo reference */ + links[id].dpcm_capture = 1; + links[id].no_pcm = 1; links[id].cpus = &cpus[id]; links[id].num_cpus = 1; diff --git a/sound/soc/intel/boards/sof_ssp_amp.c b/sound/soc/intel/boards/sof_ssp_amp.c index 94d25aeb6e7c..7b74f122e340 100644 --- a/sound/soc/intel/boards/sof_ssp_amp.c +++ b/sound/soc/intel/boards/sof_ssp_amp.c @@ -258,13 +258,12 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, sof_rt1308_dai_link(&links[id]); } else if (sof_ssp_amp_quirk & SOF_CS35L41_SPEAKER_AMP_PRESENT) { cs35l41_set_dai_link(&links[id]); - - /* feedback from amplifier */ - links[id].dpcm_capture = 1; } links[id].platforms = platform_component; links[id].num_platforms = ARRAY_SIZE(platform_component); links[id].dpcm_playback = 1; + /* feedback from amplifier or firmware-generated echo reference */ + links[id].dpcm_capture = 1; links[id].no_pcm = 1; links[id].cpus = &cpus[id]; links[id].num_cpus = 
1; diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index c3be24b2fac5..a79a2fb260b8 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1401,13 +1401,17 @@ static int soc_tplg_dapm_widget_create(struct soc_tplg *tplg, template.num_kcontrols = le32_to_cpu(w->num_kcontrols); kc = devm_kcalloc(tplg->dev, le32_to_cpu(w->num_kcontrols), sizeof(*kc), GFP_KERNEL); - if (!kc) + if (!kc) { + ret = -ENOMEM; goto hdr_err; + } kcontrol_type = devm_kcalloc(tplg->dev, le32_to_cpu(w->num_kcontrols), sizeof(unsigned int), GFP_KERNEL); - if (!kcontrol_type) + if (!kcontrol_type) { + ret = -ENOMEM; goto hdr_err; + } for (i = 0; i < le32_to_cpu(w->num_kcontrols); i++) { control_hdr = (struct snd_soc_tplg_ctl_hdr *)tplg->pos; diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 6bd2888fbb66..d5ccd4d09278 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -318,7 +318,6 @@ static irqreturn_t acp_irq_thread(int irq, void *context) { struct snd_sof_dev *sdev = context; const struct sof_amd_acp_desc *desc = get_chip_info(sdev->pdata); - unsigned int base = desc->dsp_intr_base; unsigned int val, count = ACP_HW_SEM_RETRY_COUNT; val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->ext_intr_stat); @@ -328,28 +327,20 @@ static irqreturn_t acp_irq_thread(int irq, void *context) return IRQ_HANDLED; } - val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET); - if (val & ACP_DSP_TO_HOST_IRQ) { - while (snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset)) { - /* Wait until acquired HW Semaphore lock or timeout */ - count--; - if (!count) { - dev_err(sdev->dev, "%s: Failed to acquire HW lock\n", __func__); - return IRQ_NONE; - } + while (snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset)) { + /* Wait until acquired HW Semaphore lock or timeout */ + count--; + if (!count) { + dev_err(sdev->dev, "%s: Failed to acquire HW lock\n", __func__); + return IRQ_NONE; } - - 
sof_ops(sdev)->irq_thread(irq, sdev); - val |= ACP_DSP_TO_HOST_IRQ; - snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET, val); - - /* Unlock or Release HW Semaphore */ - snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset, 0x0); - - return IRQ_HANDLED; } - return IRQ_NONE; + sof_ops(sdev)->irq_thread(irq, sdev); + /* Unlock or Release HW Semaphore */ + snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset, 0x0); + + return IRQ_HANDLED; }; static irqreturn_t acp_irq_handler(int irq, void *dev_id) @@ -360,8 +351,11 @@ static irqreturn_t acp_irq_handler(int irq, void *dev_id) unsigned int val; val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET); - if (val) + if (val) { + val |= ACP_DSP_TO_HOST_IRQ; + snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET, val); return IRQ_WAKE_THREAD; + } return IRQ_NONE; } diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index 1c3d4887aa30..a642c3067ec5 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -216,6 +216,10 @@ static int hda_link_dma_hw_params(struct snd_pcm_substream *substream, sdev = snd_soc_component_get_drvdata(cpu_dai->component); bus = sof_to_bus(sdev); + hlink = snd_hdac_ext_bus_get_hlink_by_name(bus, codec_dai->component->name); + if (!hlink) + return -EINVAL; + hext_stream = snd_soc_dai_get_dma_data(cpu_dai, substream); if (!hext_stream) { hext_stream = hda_link_stream_assign(bus, substream); @@ -225,10 +229,6 @@ static int hda_link_dma_hw_params(struct snd_pcm_substream *substream, snd_soc_dai_set_dma_data(cpu_dai, substream, (void *)hext_stream); } - hlink = snd_hdac_ext_bus_get_hlink_by_name(bus, codec_dai->component->name); - if (!hlink) - return -EINVAL; - /* set the hdac_stream in the codec dai */ snd_soc_dai_set_stream(codec_dai, hdac_stream(hext_stream), substream->stream); diff --git a/sound/soc/sof/ipc4-mtrace.c b/sound/soc/sof/ipc4-mtrace.c index 70dea8ae706e..0ec6ef681012 
100644 --- a/sound/soc/sof/ipc4-mtrace.c +++ b/sound/soc/sof/ipc4-mtrace.c @@ -344,9 +344,10 @@ static ssize_t sof_ipc4_priority_mask_dfs_write(struct file *file, size_t count, loff_t *ppos) { struct sof_mtrace_priv *priv = file->private_data; - int id, ret; + unsigned int id; char *buf; u32 mask; + int ret; /* * To update Nth mask entry, write: @@ -357,9 +358,9 @@ static ssize_t sof_ipc4_priority_mask_dfs_write(struct file *file, if (IS_ERR(buf)) return PTR_ERR(buf); - ret = sscanf(buf, "%d,0x%x", &id, &mask); + ret = sscanf(buf, "%u,0x%x", &id, &mask); if (ret != 2) { - ret = sscanf(buf, "%d,%x", &id, &mask); + ret = sscanf(buf, "%u,%x", &id, &mask); if (ret != 2) { ret = -EINVAL; goto out; diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index c52752250565..3b3f3cf7af38 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -357,7 +357,7 @@ static inline u64 snd_sof_dsp_read64(struct snd_sof_dev *sdev, u32 bar, } static inline void snd_sof_dsp_update8(struct snd_sof_dev *sdev, u32 bar, - u32 offset, u8 value, u8 mask) + u32 offset, u8 mask, u8 value) { u8 reg; diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c index 7306a2649857..865c367eb2f2 100644 --- a/sound/soc/sof/sof-audio.c +++ b/sound/soc/sof/sof-audio.c @@ -271,9 +271,9 @@ sof_unprepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widg struct snd_sof_widget *swidget = widget->dobj.private; struct snd_soc_dapm_path *p; - /* return if the widget is in use or if it is already unprepared */ - if (!swidget->prepared || swidget->use_count > 1) - return; + /* skip if the widget is in use or if it is already unprepared */ + if (!swidget || !swidget->prepared || swidget->use_count > 0) + goto sink_unprepare; if (widget_ops[widget->id].ipc_unprepare) /* unprepare the source widget */ @@ -281,6 +281,7 @@ sof_unprepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widg swidget->prepared = false; +sink_unprepare: /* unprepare all widgets in the sink paths 
*/ snd_soc_dapm_widget_for_each_sink_path(widget, p) { if (!p->walking && p->sink->dobj.private) { @@ -303,7 +304,7 @@ sof_prepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widget struct snd_soc_dapm_path *p; int ret; - if (!widget_ops[widget->id].ipc_prepare || swidget->prepared) + if (!swidget || !widget_ops[widget->id].ipc_prepare || swidget->prepared) goto sink_prepare; /* prepare the source widget */ @@ -326,7 +327,8 @@ sink_prepare: p->walking = false; if (ret < 0) { /* unprepare the source widget */ - if (widget_ops[widget->id].ipc_unprepare && swidget->prepared) { + if (widget_ops[widget->id].ipc_unprepare && + swidget && swidget->prepared) { widget_ops[widget->id].ipc_unprepare(swidget); swidget->prepared = false; } @@ -429,11 +431,11 @@ sof_walk_widgets_in_order(struct snd_sof_dev *sdev, struct snd_soc_dapm_widget_l for_each_dapm_widgets(list, i, widget) { /* starting widget for playback is AIF type */ - if (dir == SNDRV_PCM_STREAM_PLAYBACK && !WIDGET_IS_AIF(widget->id)) + if (dir == SNDRV_PCM_STREAM_PLAYBACK && widget->id != snd_soc_dapm_aif_in) continue; /* starting widget for capture is DAI type */ - if (dir == SNDRV_PCM_STREAM_CAPTURE && !WIDGET_IS_DAI(widget->id)) + if (dir == SNDRV_PCM_STREAM_CAPTURE && widget->id != snd_soc_dapm_dai_out) continue; switch (op) { diff --git a/sound/synth/emux/emux_nrpn.c b/sound/synth/emux/emux_nrpn.c index 8056422ed7c5..0d6b82ae2955 100644 --- a/sound/synth/emux/emux_nrpn.c +++ b/sound/synth/emux/emux_nrpn.c @@ -349,6 +349,9 @@ int snd_emux_xg_control(struct snd_emux_port *port, struct snd_midi_channel *chan, int param) { + if (param >= ARRAY_SIZE(chan->control)) + return -EINVAL; + return send_converted_effect(xg_effects, ARRAY_SIZE(xg_effects), port, chan, param, chan->control[param], diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 3d13fdf7590c..3ecd1ba7fd4b 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2152,6 +2152,8 @@ static const struct usb_audio_quirk_flags_table 
quirk_flags_table[] = { QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0525, 0xa4ad, /* Hamedal C20 usb camero */ QUIRK_FLAG_IFACE_SKIP_CLOSE), + DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ + QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */ QUIRK_FLAG_FIXED_RATE), diff --git a/tools/gpio/gpio-event-mon.c b/tools/gpio/gpio-event-mon.c index 6c122952c589..5dee2b98ab60 100644 --- a/tools/gpio/gpio-event-mon.c +++ b/tools/gpio/gpio-event-mon.c @@ -86,6 +86,7 @@ int monitor_device(const char *device_name, gpiotools_test_bit(values.bits, i)); } + i = 0; while (1) { struct gpio_v2_line_event event; diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index 85973e55489e..fdb7f5db7308 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -15,10 +15,6 @@ bool mirrored_kernelcore = false; struct page {}; -void __free_pages_core(struct page *page, unsigned int order) -{ -} - void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { diff --git a/tools/testing/selftests/amd-pstate/Makefile b/tools/testing/selftests/amd-pstate/Makefile index 5f195ee756d6..5fd1424db37d 100644 --- a/tools/testing/selftests/amd-pstate/Makefile +++ b/tools/testing/selftests/amd-pstate/Makefile @@ -7,11 +7,6 @@ all: uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -ifeq (x86,$(ARCH)) -TEST_GEN_FILES += ../../../power/x86/amd_pstate_tracer/amd_pstate_trace.py -TEST_GEN_FILES += ../../../power/x86/intel_pstate_tracer/intel_pstate_tracer.py -endif - TEST_PROGS := run.sh TEST_FILES := basic.sh tbench.sh gitsource.sh diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 2cf0c7a3fe23..567e07c19ecc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ 
b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -30,6 +30,8 @@ #define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 +#define __always_unused __attribute__((__unused__)) + #define _FAIL(errnum, fmt...) \ ({ \ error_at_line(0, (errnum), __func__, __LINE__, fmt); \ @@ -321,7 +323,8 @@ static int socket_loopback(int family, int sotype) return socket_loopback_reuseport(family, sotype, -1); } -static void test_insert_invalid(int family, int sotype, int mapfd) +static void test_insert_invalid(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key = 0; u64 value; @@ -338,7 +341,8 @@ static void test_insert_invalid(int family, int sotype, int mapfd) FAIL_ERRNO("map_update: expected EBADF"); } -static void test_insert_opened(int family, int sotype, int mapfd) +static void test_insert_opened(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key = 0; u64 value; @@ -359,7 +363,8 @@ static void test_insert_opened(int family, int sotype, int mapfd) xclose(s); } -static void test_insert_bound(int family, int sotype, int mapfd) +static void test_insert_bound(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -386,7 +391,8 @@ close: xclose(s); } -static void test_insert(int family, int sotype, int mapfd) +static void test_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 value; u32 key; @@ -402,7 +408,8 @@ static void test_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_delete_after_insert(int family, int sotype, int mapfd) +static void test_delete_after_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 value; u32 key; @@ -419,7 +426,8 @@ static void test_delete_after_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_delete_after_close(int family, int 
sotype, int mapfd) +static void test_delete_after_close(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int err, s; u64 value; @@ -442,7 +450,8 @@ static void test_delete_after_close(int family, int sotype, int mapfd) FAIL_ERRNO("map_delete: expected EINVAL/EINVAL"); } -static void test_lookup_after_insert(int family, int sotype, int mapfd) +static void test_lookup_after_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 cookie, value; socklen_t len; @@ -470,7 +479,8 @@ static void test_lookup_after_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_after_delete(int family, int sotype, int mapfd) +static void test_lookup_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int err, s; u64 value; @@ -493,7 +503,8 @@ static void test_lookup_after_delete(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_32_bit_value(int family, int sotype, int mapfd) +static void test_lookup_32_bit_value(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key, value32; int err, s; @@ -523,7 +534,8 @@ close: xclose(s); } -static void test_update_existing(int family, int sotype, int mapfd) +static void test_update_existing(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int s1, s2; u64 value; @@ -551,7 +563,7 @@ close_s1: /* Exercise the code path where we destroy child sockets that never * got accept()'ed, aka orphans, when parent socket gets closed. 
*/ -static void test_destroy_orphan_child(int family, int sotype, int mapfd) +static void do_destroy_orphan_child(int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -582,10 +594,38 @@ close_srv: xclose(s); } +static void test_destroy_orphan_child(struct test_sockmap_listen *skel, + int family, int sotype, int mapfd) +{ + int msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int skb_verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + const struct test { + int progfd; + enum bpf_attach_type atype; + } tests[] = { + { -1, -1 }, + { msg_verdict, BPF_SK_MSG_VERDICT }, + { skb_verdict, BPF_SK_SKB_VERDICT }, + }; + const struct test *t; + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + if (t->progfd != -1 && + xbpf_prog_attach(t->progfd, mapfd, t->atype, 0) != 0) + return; + + do_destroy_orphan_child(family, sotype, mapfd); + + if (t->progfd != -1) + xbpf_prog_detach2(t->progfd, mapfd, t->atype); + } +} + /* Perform a passive open after removing listening socket from SOCKMAP * to ensure that callbacks get restored properly. */ -static void test_clone_after_delete(int family, int sotype, int mapfd) +static void test_clone_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -621,7 +661,8 @@ close_srv: * SOCKMAP, but got accept()'ed only after the parent has been removed * from SOCKMAP, gets cloned without parent psock state or callbacks. */ -static void test_accept_after_delete(int family, int sotype, int mapfd) +static void test_accept_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; const u32 zero = 0; @@ -675,7 +716,8 @@ close_srv: /* Check that child socket that got created and accepted while parent * was in a SOCKMAP is cloned without parent psock state or callbacks. 
*/ -static void test_accept_before_delete(int family, int sotype, int mapfd) +static void test_accept_before_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; const u32 zero = 0, one = 1; @@ -784,7 +826,8 @@ done: return NULL; } -static void test_syn_recv_insert_delete(int family, int sotype, int mapfd) +static void test_syn_recv_insert_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct connect_accept_ctx ctx = { 0 }; struct sockaddr_storage addr; @@ -847,7 +890,8 @@ static void *listen_thread(void *arg) return NULL; } -static void test_race_insert_listen(int family, int socktype, int mapfd) +static void test_race_insert_listen(struct test_sockmap_listen *skel __always_unused, + int family, int socktype, int mapfd) { struct connect_accept_ctx ctx = { 0 }; const u32 zero = 0; @@ -1473,7 +1517,8 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, int family, int sotype) { const struct op_test { - void (*fn)(int family, int sotype, int mapfd); + void (*fn)(struct test_sockmap_listen *skel, + int family, int sotype, int mapfd); const char *name; int sotype; } tests[] = { @@ -1520,7 +1565,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, if (!test__start_subtest(s)) continue; - t->fn(family, sotype, map_fd); + t->fn(skel, family, sotype, map_fd); test_ops_cleanup(map); } } diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 92331053dba3..7bd76b9e0f98 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -826,7 +826,7 @@ out: SEC("kprobe/vfs_link") int BPF_KPROBE(kprobe__vfs_link, - struct dentry* old_dentry, struct user_namespace *mnt_userns, + struct dentry* old_dentry, struct mnt_idmap *idmap, struct inode* dir, struct dentry* new_dentry, struct inode** 
delegated_inode) { diff --git a/tools/testing/selftests/bpf/verifier/search_pruning.c b/tools/testing/selftests/bpf/verifier/search_pruning.c index 68b14fdfebdb..d63fd8991b03 100644 --- a/tools/testing/selftests/bpf/verifier/search_pruning.c +++ b/tools/testing/selftests/bpf/verifier/search_pruning.c @@ -225,3 +225,39 @@ .result_unpriv = ACCEPT, .insn_processed = 15, }, +/* The test performs a conditional 64-bit write to a stack location + * fp[-8], this is followed by an unconditional 8-bit write to fp[-8], + * then data is read from fp[-8]. This sequence is unsafe. + * + * The test would be mistakenly marked as safe w/o dst register parent + * preservation in verifier.c:copy_register_state() function. + * + * Note the usage of BPF_F_TEST_STATE_FREQ to force creation of the + * checkpoint state after conditional 64-bit assignment. + */ +{ + "write tracking and register parent chain bug", + .insns = { + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* r0 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + /* if r0 > r6 goto +1 */ + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_6, 1), + /* *(u64 *)(r10 - 8) = 0xdeadbeef */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0xdeadbeef), + /* r1 = 42 */ + BPF_MOV64_IMM(BPF_REG_1, 42), + /* *(u8 *)(r10 - 8) = r1 */ + BPF_STX_MEM(BPF_B, BPF_REG_FP, BPF_REG_1, -8), + /* r2 = *(u64 *)(r10 - 8) */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_FP, -8), + /* exit(0) */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .flags = BPF_F_TEST_STATE_FREQ, + .errstr = "invalid read from stack off -8+1 size 8", + .result = REJECT, +}, diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 186e1c26867e..75c100de90ff 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -268,6 +268,7 @@ TEST_MATRIX=( # Taking away all CPUs from parent or itself if 
there are tasks # will make the partition invalid. " S+ C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3,A2:2-3 A1:P1,A2:P-1" + " S+ C3:P1:S+ C3 . . T P1 . . 0 A1:3,A2:3 A1:P1,A2:P-1" " S+ $SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" " S+ $SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh index 9c79bbcce5a8..aff0a59f92d9 100755 --- a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh +++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh @@ -246,7 +246,7 @@ test_vlan_ingress_modify() bridge vlan add dev $swp2 vid 300 tc filter add dev $swp1 ingress chain $(IS1 2) pref 3 \ - protocol 802.1Q flower skip_sw vlan_id 200 \ + protocol 802.1Q flower skip_sw vlan_id 200 src_mac $h1_mac \ action vlan modify id 300 \ action goto chain $(IS2 0 0) diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh index 7f35dc3d15df..7f35dc3d15df 100644..100755 --- a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh +++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index beb944fa6fd4..54680dc5887f 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -237,6 +237,11 @@ static void guest_check_s1ptw_wr_in_dirty_log(void) GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); } +static void guest_check_no_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG); +} + static void guest_exec(void) { int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; @@ -304,7 +309,7 @@ static struct uffd_args { /* Returns true to continue the test, and false if it should be skipped. 
*/ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, - struct uffd_args *args, bool expect_write) + struct uffd_args *args) { uint64_t addr = msg->arg.pagefault.address; uint64_t flags = msg->arg.pagefault.flags; @@ -313,7 +318,6 @@ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, "The only expected UFFD mode is MISSING"); - ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write); ASSERT_EQ(addr, (uint64_t)args->hva); pr_debug("uffd fault: addr=%p write=%d\n", @@ -337,19 +341,14 @@ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, return 0; } -static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg) -{ - return uffd_generic_handler(mode, uffd, msg, &pt_args, true); -} - -static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg) +static int uffd_pt_handler(int mode, int uffd, struct uffd_msg *msg) { - return uffd_generic_handler(mode, uffd, msg, &data_args, true); + return uffd_generic_handler(mode, uffd, msg, &pt_args); } -static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg) +static int uffd_data_handler(int mode, int uffd, struct uffd_msg *msg) { - return uffd_generic_handler(mode, uffd, msg, &data_args, false); + return uffd_generic_handler(mode, uffd, msg, &data_args); } static void setup_uffd_args(struct userspace_mem_region *region, @@ -471,9 +470,12 @@ static bool handle_cmd(struct kvm_vm *vm, int cmd) { struct userspace_mem_region *data_region, *pt_region; bool continue_test = true; + uint64_t pte_gpa, pte_pg; data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); pt_region = vm_get_mem_region(vm, MEM_REGION_PT); + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + pte_pg = (pte_gpa - pt_region->region.guest_phys_addr) / getpagesize(); if (cmd == CMD_SKIP_TEST) continue_test = false; @@ -486,13 +488,13 @@ static bool handle_cmd(struct 
kvm_vm *vm, int cmd) TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), "Missing write in dirty log"); if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) - TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0), + TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, pte_pg), "Missing s1ptw write in dirty log"); if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), "Unexpected write in dirty log"); if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) - TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0), + TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, pte_pg), "Unexpected s1ptw write in dirty log"); return continue_test; @@ -797,7 +799,7 @@ static void help(char *name) .expected_events = { .uffd_faults = _uffd_faults, }, \ } -#define TEST_DIRTY_LOG(_access, _with_af, _test_check) \ +#define TEST_DIRTY_LOG(_access, _with_af, _test_check, _pt_check) \ { \ .name = SCAT3(dirty_log, _access, _with_af), \ .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ @@ -805,13 +807,12 @@ static void help(char *name) .guest_prepare = { _PREPARE(_with_af), \ _PREPARE(_access) }, \ .guest_test = _access, \ - .guest_test_check = { _CHECK(_with_af), _test_check, \ - guest_check_s1ptw_wr_in_dirty_log}, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ .expected_events = { 0 }, \ } #define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ - _uffd_faults, _test_check) \ + _uffd_faults, _test_check, _pt_check) \ { \ .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ @@ -820,16 +821,17 @@ static void help(char *name) _PREPARE(_access) }, \ .guest_test = _access, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ - .guest_test_check = { _CHECK(_with_af), _test_check }, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_write_handler, \ + .uffd_pt_handler = 
uffd_pt_handler, \ .expected_events = { .uffd_faults = _uffd_faults, }, \ } #define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ { \ - .name = SCAT3(ro_memslot, _access, _with_af), \ + .name = SCAT2(ro_memslot, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .mmio_handler = _mmio_handler, \ @@ -840,6 +842,7 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_no_syndrome, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .guest_test = _access, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ .expected_events = { .fail_vcpu_runs = 1 }, \ @@ -848,9 +851,9 @@ static void help(char *name) #define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ _test_check) \ { \ - .name = SCAT3(ro_memslot, _access, _with_af), \ + .name = SCAT2(ro_memslot, _access), \ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .guest_test_check = { _test_check }, \ @@ -862,7 +865,7 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ .guest_test = _access, \ .guest_test_check = { _test_check }, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ @@ -874,11 +877,12 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_uffd, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ 
.uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_write_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ .mmio_handler = _mmio_handler, \ .expected_events = { .mmio_exits = _mmio_exits, \ .uffd_faults = _uffd_faults }, \ @@ -889,10 +893,11 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_no_syndrome, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ .guest_test = _access, \ .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_write_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ .expected_events = { .fail_vcpu_runs = 1, \ .uffd_faults = _uffd_faults }, \ @@ -933,44 +938,51 @@ static struct test_desc tests[] = { * (S1PTW). */ TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), - /* no_af should also lead to a PT write. */ + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), - /* Note how that cas invokes the read handler. */ + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), /* * Can't test guest_at with_af as it's IMPDEF whether the AF is set. * The S1PTW fault should still be marked as a write. 
*/ TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 1), + uffd_no_handler, uffd_pt_handler, 1), TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_write_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_write_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_write_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), /* * Try accesses when the data and PT memory regions are both * tracked for dirty logging. */ - TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log), - /* no_af should also lead to a PT write. 
*/ - TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log), - TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log), - TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log), - TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_ld_preidx, with_af, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), /* * Access when the data and PT memory regions are both marked for @@ -980,29 +992,43 @@ static struct test_desc tests[] = { * fault, and nothing in the dirty log. Any S1PTW should result in * a write in the dirty log and a userfaultfd write. 
*/ - TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2, - guest_check_no_write_in_dirty_log), - /* no_af should also lead to a PT write. */ - TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2, - guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler, - 2, guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1, - guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2, - guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler, - 2, guest_check_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2, - guest_check_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler, - 2, guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, + uffd_data_handler, + 2, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, uffd_no_handler, 1, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, + uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, + 
uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, - uffd_data_write_handler, 2, - guest_check_write_in_dirty_log), - + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), /* - * Try accesses when the data memory region is marked read-only + * Access when both the PT and data regions are marked read-only * (with KVM_MEM_READONLY). Writes with a syndrome result in an * MMIO exit, writes with no syndrome (e.g., CAS) result in a * failed vcpu run, and reads/execs with and without syndroms do @@ -1018,7 +1044,7 @@ static struct test_desc tests[] = { TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), /* - * Access when both the data region is both read-only and marked + * The PT and data regions are both read-only and marked * for dirty logging at the same time. The expected result is that * for writes there should be no write in the dirty log. The * readonly handling is the same as if the memslot was not marked @@ -1043,7 +1069,7 @@ static struct test_desc tests[] = { guest_check_no_write_in_dirty_log), /* - * Access when the data region is both read-only and punched with + * The PT and data regions are both read-only and punched with * holes tracked with userfaultfd. The expected result is the * union of both userfaultfd and read-only behaviors. For example, * write accesses result in a userfaultfd write fault and an MMIO @@ -1051,22 +1077,15 @@ static struct test_desc tests[] = { * no userfaultfd write fault. Reads result in userfaultfd getting * triggered. 
*/ - TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, - uffd_data_read_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, - uffd_data_read_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, - uffd_no_handler, 1), - TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, - uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, uffd_no_handler, 1), + TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, uffd_data_handler, 2), TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, - uffd_data_write_handler, 2), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, - uffd_data_read_handler, 2), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, - uffd_no_handler, 1), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, - uffd_no_handler, 1), + uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, uffd_no_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, uffd_no_handler, 1), { 0 } }; diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index ea0978f22db8..251794f83719 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -241,7 +241,7 @@ int main(int argc, char **argv) while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { switch (opt) { case 'p': - reclaim_period_ms = atoi_non_negative("Reclaim period", optarg); + reclaim_period_ms = atoi_positive("Reclaim period", optarg); break; case 't': token = atoi_paranoid(optarg); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index dae510c263b4..13c75dc18c10 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ 
b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -434,6 +434,7 @@ static void *juggle_shinfo_state(void *arg) int main(int argc, char *argv[]) { struct timespec min_ts, max_ts, vm_ts; + struct kvm_xen_hvm_attr evt_reset; struct kvm_vm *vm; pthread_t thread; bool verbose; @@ -962,10 +963,8 @@ int main(int argc, char *argv[]) } done: - struct kvm_xen_hvm_attr evt_reset = { - .type = KVM_XEN_ATTR_TYPE_EVTCHN, - .u.evtchn.flags = KVM_XEN_EVTCHN_RESET, - }; + evt_reset.type = KVM_XEN_ATTR_TYPE_EVTCHN; + evt_reset.u.evtchn.flags = KVM_XEN_EVTCHN_RESET; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); alarm(0); diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh index 2d89cb0ad288..330d0b1ceced 100755 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ b/tools/testing/selftests/net/cmsg_ipv6.sh @@ -6,7 +6,7 @@ ksft_skip=4 NS=ns IP6=2001:db8:1::1/64 TGT6=2001:db8:1::2 -TMPF=`mktemp` +TMPF=$(mktemp --suffix ".pcap") cleanup() { diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index c245476fa29d..63c3eaec8d30 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -10,8 +10,10 @@ ret=0 PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} IP="ip -netns testns" +IP_PEER="ip -netns peerns" RTABLE=100 +RTABLE_PEER=101 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -20,7 +22,9 @@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4" +TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" + +SELFTEST_PATH="" log_test() { @@ -52,6 +56,31 @@ log_section() echo "######################################################################" } +check_nettest() +{ + if which nettest > /dev/null 2>&1; then + return 0 + fi + + # Add the selftest directory to PATH if not already done + if [ "${SELFTEST_PATH}" = "" ]; then + SELFTEST_PATH="$(dirname $0)" + 
PATH="${PATH}:${SELFTEST_PATH}" + + # Now retry with the new path + if which nettest > /dev/null 2>&1; then + return 0 + fi + + if [ "${ret}" -eq 0 ]; then + ret="${ksft_skip}" + fi + echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')" + fi + + return 1 +} + setup() { set -e @@ -72,6 +101,39 @@ cleanup() ip netns del testns } +setup_peer() +{ + set -e + + ip netns add peerns + $IP_PEER link set dev lo up + + ip link add name veth0 netns testns type veth \ + peer name veth1 netns peerns + $IP link set dev veth0 up + $IP_PEER link set dev veth1 up + + $IP address add 192.0.2.10 peer 192.0.2.11/32 dev veth0 + $IP_PEER address add 192.0.2.11 peer 192.0.2.10/32 dev veth1 + + $IP address add 2001:db8::10 peer 2001:db8::11/128 dev veth0 nodad + $IP_PEER address add 2001:db8::11 peer 2001:db8::10/128 dev veth1 nodad + + $IP_PEER address add 198.51.100.11/32 dev lo + $IP route add table $RTABLE_PEER 198.51.100.11/32 via 192.0.2.11 + + $IP_PEER address add 2001:db8::1:11/128 dev lo + $IP route add table $RTABLE_PEER 2001:db8::1:11/128 via 2001:db8::11 + + set +e +} + +cleanup_peer() +{ + $IP link del dev veth0 + ip netns del peerns +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -190,6 +252,37 @@ fib_rule6_test() fi } +# Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly +# taken into account when connecting the socket and when sending packets. +fib_rule6_connect_test() +{ + local dsfield + + if ! check_nettest; then + echo "SKIP: Could not run test without nettest tool" + return + fi + + setup_peer + $IP -6 rule add dsfield 0x04 table $RTABLE_PEER + + # Combine the base DS Field value (0x04) with all possible ECN values + # (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3). + # The ECN bits shouldn't influence the result of the test. + for dsfield in 0x04 0x05 0x06 0x07; do + nettest -q -6 -B -t 5 -N testns -O peerns -U -D \ + -Q "${dsfield}" -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 
0 "rule6 dsfield udp connect (dsfield ${dsfield})" + + nettest -q -6 -B -t 5 -N testns -O peerns -Q "${dsfield}" \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})" + done + + $IP -6 rule del dsfield 0x04 table $RTABLE_PEER + cleanup_peer +} + fib_rule4_del() { $IP rule del $1 @@ -296,6 +389,37 @@ fib_rule4_test() fi } +# Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken +# into account when connecting the socket and when sending packets. +fib_rule4_connect_test() +{ + local dsfield + + if ! check_nettest; then + echo "SKIP: Could not run test without nettest tool" + return + fi + + setup_peer + $IP -4 rule add dsfield 0x04 table $RTABLE_PEER + + # Combine the base DS Field value (0x04) with all possible ECN values + # (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3). + # The ECN bits shouldn't influence the result of the test. + for dsfield in 0x04 0x05 0x06 0x07; do + nettest -q -B -t 5 -N testns -O peerns -D -U -Q "${dsfield}" \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 0 "rule4 dsfield udp connect (dsfield ${dsfield})" + + nettest -q -B -t 5 -N testns -O peerns -Q "${dsfield}" \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 
0 "rule4 dsfield tcp connect (dsfield ${dsfield})" + done + + $IP -4 rule del dsfield 0x04 table $RTABLE_PEER + cleanup_peer +} + run_fibrule_tests() { log_section "IPv4 fib rule" @@ -345,6 +469,8 @@ do case $t in fib_rule6_test|fib_rule6) fib_rule6_test;; fib_rule4_test|fib_rule4) fib_rule4_test;; + fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; + fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; help) echo "Test names: $TESTS"; exit 0;; diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 1c4f866de7d7..3d8e4ebda1b6 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -914,14 +914,14 @@ sysctl_set() local value=$1; shift SYSCTL_ORIG[$key]=$(sysctl -n $key) - sysctl -qw $key=$value + sysctl -qw $key="$value" } sysctl_restore() { local key=$1; shift - sysctl -qw $key=${SYSCTL_ORIG["$key"]} + sysctl -qw $key="${SYSCTL_ORIG[$key]}" } forwarding_enable() diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d11d3d566608..079f8f46849d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -498,6 +498,12 @@ kill_events_pids() kill_wait $evts_ns2_pid } +kill_tests_wait() +{ + kill -SIGUSR1 $(ip netns pids $ns2) $(ip netns pids $ns1) + wait +} + pm_nl_set_limits() { local ns=$1 @@ -1694,6 +1700,7 @@ chk_subflow_nr() local subflow_nr=$3 local cnt1 local cnt2 + local dump_stats if [ -n "${need_title}" ]; then printf "%03u %-36s %s" "${TEST_COUNT}" "${TEST_NAME}" "${msg}" @@ -1711,7 +1718,12 @@ chk_subflow_nr() echo "[ ok ]" fi - [ "${dump_stats}" = 1 ] && ( ss -N $ns1 -tOni ; ss -N $ns1 -tOni | grep token; ip -n $ns1 mptcp endpoint ) + if [ "${dump_stats}" = 1 ]; then + ss -N $ns1 -tOni + ss -N $ns1 -tOni | grep token + ip -n $ns1 mptcp endpoint + dump_stats + fi } chk_link_usage() @@ -3049,7 
+3061,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow & + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow 2>/dev/null & wait_mpj $ns1 pm_nl_check_endpoint 1 "creation" \ @@ -3062,14 +3074,14 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint 0 "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - wait + kill_tests_wait fi if reset "delete and re-add"; then pm_nl_set_limits $ns1 1 1 pm_nl_set_limits $ns2 1 1 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - run_tests $ns1 $ns2 10.0.1.1 4 0 0 slow & + run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 2>/dev/null & wait_mpj $ns2 pm_nl_del_endpoint $ns2 2 10.0.2.2 @@ -3079,7 +3091,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow wait_mpj $ns2 chk_subflow_nr "" "after re-add" 2 - wait + kill_tests_wait fi } diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c index 7900fa98eccb..ee9a72982705 100644 --- a/tools/testing/selftests/net/nettest.c +++ b/tools/testing/selftests/net/nettest.c @@ -87,6 +87,7 @@ struct sock_args { int use_setsockopt; int use_freebind; int use_cmsg; + uint8_t dsfield; const char *dev; const char *server_dev; int ifindex; @@ -580,6 +581,36 @@ static int set_reuseaddr(int sd) return rc; } +static int set_dsfield(int sd, int version, int dsfield) +{ + if (!dsfield) + return 0; + + switch (version) { + case AF_INET: + if (setsockopt(sd, SOL_IP, IP_TOS, &dsfield, + sizeof(dsfield)) < 0) { + log_err_errno("setsockopt(IP_TOS)"); + return -1; + } + break; + + case AF_INET6: + if (setsockopt(sd, SOL_IPV6, IPV6_TCLASS, &dsfield, + sizeof(dsfield)) < 0) { + log_err_errno("setsockopt(IPV6_TCLASS)"); + return -1; + } + break; + + default: + log_error("Invalid address family\n"); + return -1; + } + + return 0; +} + static int str_to_uint(const char *str, int min, int max, unsigned int *value) 
{ int number; @@ -1317,6 +1348,9 @@ static int msock_init(struct sock_args *args, int server) (char *)&one, sizeof(one)) < 0) log_err_errno("Setting SO_BROADCAST error"); + if (set_dsfield(sd, AF_INET, args->dsfield) != 0) + goto out_err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto out_err; else if (args->use_setsockopt && @@ -1445,6 +1479,9 @@ static int lsock_init(struct sock_args *args) if (set_reuseport(sd) != 0) goto err; + if (set_dsfield(sd, args->version, args->dsfield) != 0) + goto err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto err; else if (args->use_setsockopt && @@ -1658,6 +1695,9 @@ static int connectsock(void *addr, socklen_t alen, struct sock_args *args) if (set_reuseport(sd) != 0) goto err; + if (set_dsfield(sd, args->version, args->dsfield) != 0) + goto err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto err; else if (args->use_setsockopt && @@ -1862,7 +1902,7 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args) return client_status; } -#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" +#define GETOPT_STR "sr:l:c:Q:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" #define OPT_FORCE_BIND_KEY_IFINDEX 1001 #define OPT_NO_BIND_KEY_IFINDEX 1002 @@ -1893,6 +1933,8 @@ static void print_usage(char *prog) " -D|R datagram (D) / raw (R) socket (default stream)\n" " -l addr local address to bind to in server mode\n" " -c addr local address to bind to in client mode\n" + " -Q dsfield DS Field value of the socket (the IP_TOS or\n" + " IPV6_TCLASS socket option)\n" " -x configure XFRM policy on socket\n" "\n" " -d dev bind socket to given device name\n" @@ -1971,6 +2013,13 @@ int main(int argc, char *argv[]) args.has_local_ip = 1; args.client_local_addr_str = optarg; break; + case 'Q': + if (str_to_uint(optarg, 0, 255, &tmp) != 0) { + fprintf(stderr, "Invalid DS Field\n"); + return 1; + } + args.dsfield = tmp; + break; case 'p': if (str_to_uint(optarg, 1, 65535, &tmp) != 0) { 
fprintf(stderr, "Invalid port\n"); diff --git a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh index 704997ffc244..8c3ac0a72545 100755 --- a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh +++ b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh @@ -293,19 +293,11 @@ setup-vm() { elif [[ -n $vtype && $vtype == "vnifilterg" ]]; then # Add per vni group config with 'bridge vni' api if [ -n "$group" ]; then - if [ "$family" == "v4" ]; then - if [ $mcast -eq 1 ]; then - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group - else - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group - fi - else - if [ $mcast -eq 1 ]; then - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group6 $group - else - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote6 $group - fi - fi + if [ $mcast -eq 1 ]; then + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group + else + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group + fi fi fi done diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index dc932fd65363..640bc43452fa 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -7,6 +7,7 @@ readonly GREEN='\033[0;92m' readonly YELLOW='\033[0;33m' readonly RED='\033[0;31m' readonly NC='\033[0m' # No Color +readonly TESTPORT=8000 readonly KSFT_PASS=0 readonly KSFT_FAIL=1 @@ -56,11 +57,26 @@ trap wake_children EXIT run_one() { local -r args=$@ + local nr_socks=0 + local i=0 + local -r timeout=10 + + ./udpgso_bench_rx -p "$TESTPORT" & + ./udpgso_bench_rx -p "$TESTPORT" -t & + + # Wait for the above test program to get ready to receive connections. 
+ while [ "$i" -lt "$timeout" ]; do + nr_socks="$(ss -lnHi | grep -c "\*:${TESTPORT}")" + [ "$nr_socks" -eq 2 ] && break + i=$((i + 1)) + sleep 1 + done + if [ "$nr_socks" -ne 2 ]; then + echo "timed out while waiting for udpgso_bench_rx" + exit 1 + fi - ./udpgso_bench_rx & - ./udpgso_bench_rx -t & - - ./udpgso_bench_tx ${args} + ./udpgso_bench_tx -p "$TESTPORT" ${args} } run_in_netns() { diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 6a193425c367..4058c7451e70 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -250,7 +250,7 @@ static int recv_msg(int fd, char *buf, int len, int *gso_size) static void do_flush_udp(int fd) { static char rbuf[ETH_MAX_MTU]; - int ret, len, gso_size, budget = 256; + int ret, len, gso_size = 0, budget = 256; len = cfg_read_all ? sizeof(rbuf) : 0; while (budget--) { @@ -336,6 +336,8 @@ static void parse_opts(int argc, char **argv) cfg_verify = true; cfg_read_all = true; break; + default: + exit(1); } } diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c index f1fdaa270291..477392715a9a 100644 --- a/tools/testing/selftests/net/udpgso_bench_tx.c +++ b/tools/testing/selftests/net/udpgso_bench_tx.c @@ -62,6 +62,7 @@ static int cfg_payload_len = (1472 * 42); static int cfg_port = 8000; static int cfg_runtime_ms = -1; static bool cfg_poll; +static int cfg_poll_loop_timeout_ms = 2000; static bool cfg_segment; static bool cfg_sendmmsg; static bool cfg_tcp; @@ -235,16 +236,17 @@ static void flush_errqueue_recv(int fd) } } -static void flush_errqueue(int fd, const bool do_poll) +static void flush_errqueue(int fd, const bool do_poll, + unsigned long poll_timeout, const bool poll_err) { if (do_poll) { struct pollfd fds = {0}; int ret; fds.fd = fd; - ret = poll(&fds, 1, 500); + ret = poll(&fds, 1, poll_timeout); if (ret == 0) { - if (cfg_verbose) + if ((cfg_verbose) && 
(poll_err)) fprintf(stderr, "poll timeout\n"); } else if (ret < 0) { error(1, errno, "poll"); @@ -254,6 +256,20 @@ static void flush_errqueue(int fd, const bool do_poll) flush_errqueue_recv(fd); } +static void flush_errqueue_retry(int fd, unsigned long num_sends) +{ + unsigned long tnow, tstop; + bool first_try = true; + + tnow = gettimeofday_ms(); + tstop = tnow + cfg_poll_loop_timeout_ms; + do { + flush_errqueue(fd, true, tstop - tnow, first_try); + first_try = false; + tnow = gettimeofday_ms(); + } while ((stat_zcopies != num_sends) && (tnow < tstop)); +} + static int send_tcp(int fd, char *data) { int ret, done = 0, count = 0; @@ -413,7 +429,8 @@ static int send_udp_segment(int fd, char *data) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]", + error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] " + "[-L secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]", filepath); } @@ -423,7 +440,7 @@ static void parse_opts(int argc, char **argv) int max_len, hdrlen; int c; - while ((c = getopt(argc, argv, "46acC:D:Hl:mM:p:s:PS:tTuvz")) != -1) { + while ((c = getopt(argc, argv, "46acC:D:Hl:L:mM:p:s:PS:tTuvz")) != -1) { switch (c) { case '4': if (cfg_family != PF_UNSPEC) @@ -452,6 +469,9 @@ static void parse_opts(int argc, char **argv) case 'l': cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000; break; + case 'L': + cfg_poll_loop_timeout_ms = strtoul(optarg, NULL, 10) * 1000; + break; case 'm': cfg_sendmmsg = true; break; @@ -490,6 +510,8 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_zerocopy = true; break; + default: + exit(1); } } @@ -677,7 +699,7 @@ int main(int argc, char **argv) num_sends += send_udp(fd, buf[i]); num_msgs++; if ((cfg_zerocopy && ((num_msgs & 0xF) == 0)) || cfg_tx_tstamp) - flush_errqueue(fd, cfg_poll); + flush_errqueue(fd, cfg_poll, 500, true); if (cfg_msg_nr && num_msgs >= cfg_msg_nr) break; @@ 
-696,7 +718,7 @@ int main(int argc, char **argv) } while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop)); if (cfg_zerocopy || cfg_tx_tstamp) - flush_errqueue(fd, true); + flush_errqueue_retry(fd, num_sends); if (close(fd)) error(1, errno, "close"); diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index a634f47d1e56..9a127a8fe176 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -17,7 +17,6 @@ #include <stdio.h> #include <unistd.h> #include <sys/mman.h> -#define __USE_GNU #include <fcntl.h> #define MIN_FREE_PAGES 20 diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h index 813baf13f62a..51a919083d9b 100644 --- a/tools/virtio/linux/bug.h +++ b/tools/virtio/linux/bug.h @@ -1,13 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef BUG_H -#define BUG_H +#ifndef _LINUX_BUG_H +#define _LINUX_BUG_H #include <asm/bug.h> #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) -#define BUILD_BUG_ON(x) - #define BUG() abort() -#endif /* BUG_H */ +#endif /* _LINUX_BUG_H */ diff --git a/tools/virtio/linux/build_bug.h b/tools/virtio/linux/build_bug.h new file mode 100644 index 000000000000..cdbb75e28a60 --- /dev/null +++ b/tools/virtio/linux/build_bug.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BUILD_BUG_H +#define _LINUX_BUILD_BUG_H + +#define BUILD_BUG_ON(x) + +#endif /* _LINUX_BUILD_BUG_H */ diff --git a/tools/virtio/linux/cpumask.h b/tools/virtio/linux/cpumask.h new file mode 100644 index 000000000000..307da69d6b26 --- /dev/null +++ b/tools/virtio/linux/cpumask.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CPUMASK_H +#define _LINUX_CPUMASK_H + +#include <linux/kernel.h> + +#endif /* _LINUX_CPUMASK_H */ diff --git a/tools/virtio/linux/gfp.h b/tools/virtio/linux/gfp.h new file mode 100644 index 000000000000..43d146f236f1 --- /dev/null +++ b/tools/virtio/linux/gfp.h @@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GFP_H +#define __LINUX_GFP_H + +#include <linux/topology.h> + +#endif diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 21593bf97755..8b877167933d 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -10,6 +10,7 @@ #include <stdarg.h> #include <linux/compiler.h> +#include <linux/log2.h> #include <linux/types.h> #include <linux/overflow.h> #include <linux/list.h> diff --git a/tools/virtio/linux/kmsan.h b/tools/virtio/linux/kmsan.h new file mode 100644 index 000000000000..272b5aa285d5 --- /dev/null +++ b/tools/virtio/linux/kmsan.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include <linux/gfp.h> + +inline void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ +} + +#endif /* _LINUX_KMSAN_H */ diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h index 369ee308b668..74d9e1825748 100644 --- a/tools/virtio/linux/scatterlist.h +++ b/tools/virtio/linux/scatterlist.h @@ -2,6 +2,7 @@ #ifndef SCATTERLIST_H #define SCATTERLIST_H #include <linux/kernel.h> +#include <linux/bug.h> struct scatterlist { unsigned long page_link; diff --git a/tools/virtio/linux/topology.h b/tools/virtio/linux/topology.h new file mode 100644 index 000000000000..910794afb993 --- /dev/null +++ b/tools/virtio/linux/topology.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TOPOLOGY_H +#define _LINUX_TOPOLOGY_H + +#include <linux/cpumask.h> + +#endif /* _LINUX_TOPOLOGY_H */ diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 495ceabffe88..9584eb57e0ed 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -336,7 +336,7 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, return -ENXIO; } -static void kvm_vfio_destroy(struct kvm_device *dev) +static void kvm_vfio_release(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; 
struct kvm_vfio_group *kvg, *tmp; @@ -355,7 +355,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev) kvm_vfio_update_coherency(dev); kfree(kv); - kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ } static int kvm_vfio_create(struct kvm_device *dev, u32 type); @@ -363,7 +363,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type); static struct kvm_device_ops kvm_vfio_ops = { .name = "kvm-vfio", .create = kvm_vfio_create, - .destroy = kvm_vfio_destroy, + .release = kvm_vfio_release, .set_attr = kvm_vfio_set_attr, .has_attr = kvm_vfio_has_attr, }; |